#!/usr/bin/perl
# Collect tag statistics from a POS-tagged corpus.
#
# Usage: <script> CORPUS LEX NGRAM
#   CORPUS  input file; each line holds whitespace-separated "word/tag" tokens
#   LEX     output file; one "word tag count" line per distinct word/tag pair
#   NGRAM   output file; "tag count" lines for every tag, followed by
#           "prev_tag tag count" lines for every pair of adjacent tags
#           (pairs never span a line break)
use strict;
use warnings;

die "Usage: $0 CORPUS LEX NGRAM\n" unless @ARGV >= 3;
my ( $corpus, $lex, $ngram ) = @ARGV;

# Three-argument open keeps the mode separate from the file name, so names
# with leading/trailing blanks or mode characters are handled literally.
open my $corpus_fh, '<', $corpus
    or die "Can not open the $corpus file: $!\n";
open my $lex_fh, '>', $lex
    or die "Can not open the $lex file: $!\n";
open my $ngram_fh, '>', $ngram
    or die "Can not open the $ngram file: $!\n";

my %wplex;      # "word tag"     => occurrences
my %unigram;    # "tag"          => occurrences
my %bigram;     # "prev_tag tag" => occurrences

while ( my $line = <$corpus_fh> ) {
    chomp $line;

    # Tag of the previous token on this line; empty at the start of a line.
    my $pretag = "";

    # split ' ' splits on runs of whitespace and ignores leading blanks.
    for my $unit ( split ' ', $line ) {

        # Only the first two "/"-separated fields are used, so a token such
        # as "a/b/c" is read as word "a" with tag "b".
        my ( $word, $pos ) = split m{/}, $unit;

        # A token without a tag cannot contribute to any count. Skip it and
        # break the bigram chain so its neighbours are not paired together.
        if ( !defined $pos || $pos eq "" ) {
            warn "Skipping token without a tag on line $.: '$unit'\n";
            $pretag = "";
            next;
        }

        $wplex{"$word $pos"}++;
        $unigram{$pos}++;
        $bigram{"$pretag $pos"}++ if $pretag ne "";
        $pretag = $pos;
    }
}
close $corpus_fh;

# Keys are sorted so that repeated runs produce identical files.
for my $wpkey ( sort keys %wplex ) {
    print {$lex_fh} "$wpkey $wplex{$wpkey}\n";
}

for my $unikey ( sort keys %unigram ) {
    print {$ngram_fh} "$unikey $unigram{$unikey}\n";
}

for my $bikey ( sort keys %bigram ) {
    print {$ngram_fh} "$bikey $bigram{$bikey}\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close $lex_fh   or die "Can not write the $lex file: $!\n";
close $ngram_fh or die "Can not write the $ngram file: $!\n";