#!/usr/bin/perl

use strict;

my %test_dist;       # a hash to hold testing data symbol distribution
my $test_dir;        # directory where testing corpora reside
my @test_names;      # testing corpora file names

my %train_dist;      # a hash to hold training data symbol distribution
my $train_dir;       # directory where training distributions reside
my @train_names;     # training distribution file names

my $longest_name;    # length of longest training distribution file name

my $current_ce_val;  # value of the current cross entropy
my $lowest_ce_val;   # value of the lowest cross entropy
my $lowest_ce_name;  # name of the model with the lowest cross entropy

eval { set_parameters(); };
die $@ if $@;

eval { process_test_corpora(); };
die $@ if $@;

exit(0);

sub set_parameters {

    # process user-supplied arguments and convert
    # paths to UNIX-style with forward slashes
    for (my $i = 0; $i <= $#ARGV; $i++) {
        if ($ARGV[$i] =~ /^(\-d|\-\-directory)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            $train_dir = $ARGV[$i];
        }
        elsif ($ARGV[$i] =~ /^(\-s|\-\-distribution)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            push @train_names, $ARGV[$i];
            $longest_name = length($ARGV[$i]);
        }
        elsif ($ARGV[$i] =~ /^(\-D|\-\-Directory)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            $test_dir = $ARGV[$i];
        }
        elsif ($ARGV[$i] =~ /^(\-C|\-\-Corpus)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            push @test_names, $ARGV[$i];
        }
        elsif ($ARGV[$i] =~ /^(\-h|\-\-help)$/) {
            print_usage();
            exit(0);
        }
        else {
            die "Unknown argument specified [" . $ARGV[$i] . "]\n";
        }
    }

    # set default corpora directory if none has been supplied
    # or tack on a final / if there isn't one
    if (!$test_dir || $test_dir =~ /^$/) {
        $test_dir = "./";
    }
    else {
        $test_dir .= "\/" if ($test_dir !~ /\/$/);
    }

    # get list of corpora in the corpora directory
    # if a corpora file name has not been supplied
    if ($#test_names < 0) {
        opendir(DIR, $test_dir) || die "$! [" . $test_dir . "]\n";
        @test_names = grep(/^\w+\_test\_corpus\.txt$/, readdir(DIR));
        closedir(DIR) || die "$!\n";
        die "No corpora in directory \'" . $test_dir . "\'\n"
            if ($#test_names < 0);
    }

    # set default distribution directory if none has been supplied
    # or tack on a final / if there isn't one
    if (!$train_dir || $train_dir =~ /^$/) {
        $train_dir = "./";
    }
    else {
        $train_dir .= "\/" if ($train_dir !~ /\/$/);
    }

    # get list of distributions in the distribution directory
    # if a distribution file name has not been supplied
    if ($#train_names < 0) {
        opendir(DIR, $train_dir) || die "$! [" . $train_dir . "]\n";
        @train_names = grep(/^\w+\_train\_distribution\.txt$/, readdir(DIR));
        closedir(DIR) || die "$!\n";
        die "No distributions in directory \'" . $train_dir . "\'\n"
            if ($#train_names < 0);
    }

    # get the length of the longest training distribution file name
    # this will be used for formatting output later
    foreach my $train_name (@train_names) {
        $longest_name = length($train_name)
            if (length($train_name) > $longest_name);
    }
}

sub process_test_corpora {

    # go through each test corpus and generate the symbol distribution,
    # then test against each of the training distribution files using
    # the cross entropy measure; simple add-one smoothing is used for
    # previously unseen symbols in the testing corpora
    foreach my $test_name (@test_names) {
        print "\n" . $test_name . "\n";

        undef $lowest_ce_val;
        undef $lowest_ce_name;

        eval { generate_test_distribution($test_name); };
        die $@ . " [" . $test_name . "]\n" if $@;

        foreach my $train_name (@train_names) {
            print "\t" .
                sprintf("%-" . ($longest_name + 1) . "s", $train_name) .
"=> "; my @chunks = split(/\//, $train_name); my ($language, $extraneous) = split(/\_/, $chunks[$#chunks]); eval { load_train_distribution($train_name); }; die $@ . " [" . $train_name . "]\n" if $@; eval { smooth_train_distribution(); }; die $@ . " [" . $train_name . "]\n" if $@; eval { calculate_cross_entropy(); }; die $@ . " [" . $train_name . "]\n" if $@; print $current_ce_val . "\n"; if (!defined($lowest_ce_val) || $current_ce_val < $lowest_ce_val) { $lowest_ce_val = $current_ce_val; $lowest_ce_name = $language; } } print "\tPREDICTION: " . $lowest_ce_name . "\n"; } print "\n"; } sub generate_test_distribution { undef %test_dist; my $test_name = shift; open(CORP, $test_dir . $test_name) || die "$! [" . $test_dir . $test_name . "]\n"; while () { chomp; s/ //g; my @line_symbols = split(/ */, $_); foreach my $line_symbol (@line_symbols) { $test_dist{total}++; $line_symbol = lc($line_symbol); $test_dist{unique}++ if (!$test_dist{symbols}{$line_symbol}); $test_dist{symbols}{$line_symbol}++; } } close(CORP); } sub load_train_distribution { undef %train_dist; my $train_name = shift; my $line_num = 0; open(DIST, $train_dir . $train_name) || die "$! [" . $train_dir . $train_name . "]\n"; while () { chomp; if ($line_num == 0) { my ($total, $unique) = split(/ +/, $_); $train_dist{total} = $total; $train_dist{unique} = $unique; $line_num++; next; } my ($symbol, $count) = split(/ +/, $_); $train_dist{symbols}{$symbol} = $count; } close(DIST); } sub smooth_train_distribution { # if a symbol from the test corpus does not occur in the training # distribution, add the symbol to the training distribution with a # count of one and add one to the total number of tokens in the # training distribution foreach my $symbol (sort { $a cmp $b } keys %{$test_dist{symbols}}) { my $match = $symbol; $match = "\\" . $symbol if ($symbol !~ /(\w|\d)/); if (!grep(/^$match$/, keys %{$train_dist{symbols}})) { $train_dist{symbols}{$symbol}++; $train_dist{total}++; } } } sub calculate_cross_entropy { my $cross_entropy = 0; foreach my $symbol (sort { $a cmp $b } keys %{$test_dist{symbols}}) { $cross_entropy += -(calculate_test_symbol_freq($symbol) * log(calculate_train_symbol_freq($symbol))); } $current_ce_val = $cross_entropy; } sub calculate_test_symbol_freq { my $symbol = shift; my $test_symbol_freq = $test_dist{symbols}{$symbol} / $test_dist{total}; return $test_symbol_freq; } sub calculate_train_symbol_freq { my $symbol = shift; my $train_symbol_freq = $train_dist{symbols}{$symbol} / $train_dist{total}; return $train_symbol_freq; } sub print_usage { print <