#!/usr/bin/perl

use strict;

my %test_dist;       # a hash to hold testing data symbol distribution
my $test_dir;        # directory where testing corpora reside
my @test_names;      # testing corpora file names

my %train_dist;      # a hash to hold training data symbol distribution
my $train_dir;       # directory where training distributions reside
my @train_names;     # training distribution file names

my $longest_name;    # length of longest training distribution file name

my $current_ce_val;  # value of the current cross entropy
my $lowest_ce_val;   # value of the lowest cross entropy
my $lowest_ce_name;  # name of the model with the lowest cross entropy

eval { set_parameters(); };
die $@ if $@;

eval { process_test_corpora(); };
die $@ if $@;

exit(0);

sub set_parameters {

    # process user-supplied arguments and convert
    # paths to UNIX-style with forward slashes
    for (my $i = 0; $i <= $#ARGV; $i++) {
        if ($ARGV[$i] =~ /^(\-d|\-\-directory)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            $train_dir = $ARGV[$i];
        }
        elsif ($ARGV[$i] =~ /^(\-s|\-\-distribution)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            push @train_names, $ARGV[$i];
            $longest_name = length($ARGV[$i]);
        }
        elsif ($ARGV[$i] =~ /^(\-D|\-\-Directory)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            $test_dir = $ARGV[$i];
        }
        elsif ($ARGV[$i] =~ /^(\-C|\-\-Corpus)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            push @test_names, $ARGV[$i];
        }
        elsif ($ARGV[$i] =~ /^(\-h|\-\-help)$/) {
            print_usage();
            exit(0);
        }
        else {
            die "Unknown argument specified [" . $ARGV[$i] . "]\n";
        }
    }

    # set default corpora directory if none has been supplied
    # or tack on a final / if there isn't one
    if (!$test_dir || $test_dir =~ /^$/) {
        $test_dir = "./";
    }
    else {
        $test_dir .= "\/" if ($test_dir !~ /\/$/);
    }

    # get list of corpora in the corpora directory
    # if a corpora file name has not been supplied
    if ($#test_names < 0) {
        opendir(DIR, $test_dir) || die "$! [" . $test_dir . "]\n";
        @test_names = grep(/^\w+\_test\_corpus\.txt$/, readdir(DIR));
        closedir(DIR) || die "$!\n";
        die "No corpora in directory \'" . $test_dir . "\'\n"
            if ($#test_names < 0);
    }

    # set default distribution directory if none has been supplied
    # or tack on a final / if there isn't one
    if (!$train_dir || $train_dir =~ /^$/) {
        $train_dir = "./";
    }
    else {
        $train_dir .= "\/" if ($train_dir !~ /\/$/);
    }

    # get list of distributions in the distribution directory
    # if a distribution file name has not been supplied
    if ($#train_names < 0) {
        opendir(DIR, $train_dir) || die "$! [" . $train_dir . "]\n";
        @train_names = grep(/^\w+\_train\_distribution\.txt$/, readdir(DIR));
        closedir(DIR) || die "$!\n";
        die "No distributions in directory \'" . $train_dir . "\'\n"
            if ($#train_names < 0);
    }

    # get the length of the longest training distribution file name
    # this will be used for formatting output later
    foreach my $train_name (@train_names) {
        $longest_name = length($train_name)
            if (length($train_name) > $longest_name);
    }
}

sub process_test_corpora {

    # go through each test corpus and generate the symbol distribution,
    # then test against each of the training distribution files using
    # the cross entropy measure; simple add-one smoothing is used for
    # previously unseen symbols in the testing corpora
    foreach my $test_name (@test_names) {
        print "\n" . $test_name . "\n";

        undef $lowest_ce_val;
        undef $lowest_ce_name;

        eval { generate_test_distribution($test_name); };
        die $@ . " [" . $test_name . "]\n" if $@;

        foreach my $train_name (@train_names) {
            print "\t" .
                sprintf("%-" . ($longest_name + 1) . "s", $train_name) .
"=> "; my @chunks = split(/\//, $train_name); my ($language, $extraneous) = split(/\_/, $chunks[$#chunks]); eval { load_train_distribution($train_name); }; die $@ . " [" . $train_name . "]\n" if $@; eval { smooth_train_distribution(); }; die $@ . " [" . $train_name . "]\n" if $@; eval { calculate_cross_entropy(); }; die $@ . " [" . $train_name . "]\n" if $@; print $current_ce_val . "\n"; if (!defined($lowest_ce_val) || $current_ce_val < $lowest_ce_val) { $lowest_ce_val = $current_ce_val; $lowest_ce_name = $language; } } print "\tPREDICTION: " . $lowest_ce_name . "\n"; } print "\n"; } sub generate_test_distribution { undef %test_dist; my $test_name = shift; open(CORP, $test_dir . $test_name) || die "$! [" . $test_dir . $test_name . "]\n"; while () { chomp; s/ //g; my @line_symbols = split(/ */, $_); foreach my $line_symbol (@line_symbols) { $test_dist{total}++; $line_symbol = lc($line_symbol); $test_dist{unique}++ if (!$test_dist{symbols}{$line_symbol}); $test_dist{symbols}{$line_symbol}++; } } close(CORP); } sub load_train_distribution { undef %train_dist; my $train_name = shift; my $line_num = 0; open(DIST, $train_dir . $train_name) || die "$! [" . $train_dir . $train_name . "]\n"; while () { chomp; if ($line_num == 0) { my ($total, $unique) = split(/ +/, $_); $train_dist{total} = $total; $train_dist{unique} = $unique; $line_num++; next; } my ($symbol, $count) = split(/ +/, $_); $train_dist{symbols}{$symbol} = $count; } close(DIST); } sub smooth_train_distribution { # if a symbol from the test corpus does not occur in the training # distribution, add the symbol to the training distribution with a # count of one and add one to the total number of tokens in the # training distribution foreach my $symbol (sort { $a cmp $b } keys %{$test_dist{symbols}}) { my $match = $symbol; $match = "\\" . $symbol if ($symbol !~ /(\w|\d)/); if (!grep(/^$match$/, keys %{$train_dist{symbols}})) { $train_dist{symbols}{$symbol}++; $train_dist{total}++; } } } sub calculate_cross_entropy { my $cross_entropy = 0; foreach my $symbol (sort { $a cmp $b } keys %{$test_dist{symbols}}) { $cross_entropy += -(calculate_test_symbol_freq($symbol) * log(calculate_train_symbol_freq($symbol))); } $current_ce_val = $cross_entropy; } sub calculate_test_symbol_freq { my $symbol = shift; my $test_symbol_freq = $test_dist{symbols}{$symbol} / $test_dist{total}; return $test_symbol_freq; } sub calculate_train_symbol_freq { my $symbol = shift; my $train_symbol_freq = $train_dist{symbols}{$symbol} / $train_dist{total}; return $train_symbol_freq; } sub print_usage { print <