#!/usr/bin/perl

use strict;

# File-level state shared by every subroutine below; the subs read and
# write these variables directly instead of passing them as arguments.
my $train_dir;		# corpora directory; set_parameters() makes it end in "/"
my @train_names;	# corpus file names to process
my $longest_name;	# longest corpus file name length, used to align output
my %train_dist;		# symbol distribution of the corpus being processed

# Step 1: work out which corpora to process. Any error is rethrown as is.
eval { set_parameters(); };
if ($@) {
	die $@;
}

# Step 2: build and write one distribution file per corpus.
eval { process_train_corpora(); };
if ($@) {
	die $@;
}

exit(0);

# Parse the command line (@ARGV) into the file-level settings $train_dir,
# @train_names and $longest_name.
#
# Takes no arguments; the return value is not meaningful.
# Prints the usage text and exits with status 0 on -h/--help.
# Dies when an option is unknown, when -d/-c is given without a value,
# when the corpora directory cannot be opened or closed, or when no
# corpus was named and the directory holds no *_train_corpus.txt file.
sub set_parameters {

	# process user-supplied arguments
	for (my $i = 0; $i <= $#ARGV; $i++) {
		my $option = $ARGV[$i];
		if ($option =~ /^(?:-d|--directory|-c|--corpus)$/) {
			# both options consume the next argument; without this check a
			# trailing "-d" or "-c" would silently produce an undefined value
			die "Missing value for argument [" . $option . "]\n" if ($i >= $#ARGV);

			# normalise Windows-style path separators to forward slashes
			$ARGV[++$i] =~ s{\\}{/}g;

			if ($option =~ /^(?:-d|--directory)$/) {
				$train_dir = $ARGV[$i];
			} else {
				push @train_names, $ARGV[$i];
			}
		} elsif ($option =~ /^(?:-h|--help)$/) {
			print_usage();
			exit(0);
		} else {
			die "Unknown argument specified [" . $option . "]\n";
		}
	}

	# set default corpora directory if none has been supplied
	# or tack on a final / if there isn't one; the test is on
	# definedness/emptiness so a directory named "0" is honoured
	if (!defined($train_dir) || $train_dir eq "") {
		$train_dir = "./";
	} elsif ($train_dir !~ /\/$/) {
		$train_dir .= "/";
	}

	# get list of corpora in the corpora directory
	# if a corpus file name has not been supplied
	if (!@train_names) {
		opendir(my $dir_handle, $train_dir) || die "$! [" . $train_dir . "]\n";
		@train_names = grep(/^\w+_train_corpus\.txt$/, readdir($dir_handle));
		closedir($dir_handle) || die "$!\n";
		die "No training corpora in directory '" . $train_dir . "'\n" if (!@train_names);
	}

	# get the length of the longest corpus file name
	# this will be used for formatting output later
	$longest_name = 0;
	foreach my $train_name (@train_names) {
		my $name_length = length($train_name);
		$longest_name = $name_length if ($name_length > $longest_name);
	}

}

sub process_train_corpora {

	# For every corpus in @train_names, first count its symbols and then
	# write the distribution file. In that file the first line holds the
	# total number of tokens and the number of unique tokens; each
	# following line holds a symbol and its count.
	# A failure in either step is rethrown with the corpus name appended.
	my @steps = (\&generate_train_distribution, \&print_train_distribution);

	for my $corpus (@train_names) {
		for my $step (@steps) {
			eval { $step->($corpus); };
			if ($@) {
				die $@ . " [" . $corpus . "]\n";
			}
		}
	}

	# finish the progress output started by the per-corpus steps
	print "\n\n";

}

# Count the symbols of one training corpus into the file-level %train_dist.
#
# Argument: the corpus file name, which is appended to $train_dir.
# Effects:  resets %train_dist, then fills
#             $train_dist{total}            number of symbols seen
#             $train_dist{unique}           number of distinct symbols
#             $train_dist{symbols}{$symbol} count per lower-cased symbol
#           and prints the padded corpus name followed by "=> " to STDOUT.
# Dies when the corpus file cannot be opened.
sub generate_train_distribution {

	my $train_name = shift;
	my $corpus_path = $train_dir . $train_name;

	undef %train_dist;

	print "\n" . sprintf("%-" . ($longest_name + 1) . "s", $train_name) . "=> ";

	# three-argument open: the name comes from the command line or from
	# readdir, so it must never be interpreted as a mode or a pipe
	open(my $corpus_handle, "<", $corpus_path) || die "$! [" . $corpus_path . "]\n";

	# NOTE(review): no encoding layer is applied, so a multi-byte encoded
	# corpus is counted byte by byte -- confirm this is intended
	while (my $line = <$corpus_handle>) {
		chomp($line);

		# spaces and tabs are not symbols
		$line =~ s/[ \t]+//g;

		# an empty pattern splits the line into single characters
		foreach my $character (split(//, $line)) {
			my $symbol = lc($character);
			$train_dist{total}++;
			$train_dist{unique}++ if (!$train_dist{symbols}{$symbol});
			$train_dist{symbols}{$symbol}++;
		}
	}
	close($corpus_handle);

}

# Write the distribution currently held in %train_dist to
# <language>_train_distribution.txt inside $train_dir.
#
# Argument: the corpus file name the distribution was generated from.
# Output:   first line "<total> <unique>", then one "<symbol> <count>"
#           line per symbol in string order, with no trailing newline.
#           The distribution file name is also printed to STDOUT.
# Dies when the distribution file cannot be opened or fully written.
sub print_train_distribution {

	my $train_name = shift;

	# only the last path component carries the language name
	my @chunks = split(/\//, $train_name);
	my $base_name = $chunks[$#chunks];

	# The language is everything in front of "_train_corpus.txt". Cutting at
	# the first underscore would map "old_english_..." and "old_norse_..."
	# to the same distribution file. Names that do not follow the convention
	# keep the previous rule (text before the first underscore).
	my $language;
	if (defined($base_name) && $base_name =~ /^(.+)_train_corpus\.txt$/) {
		$language = $1;
	} else {
		($language) = split(/_/, $base_name);
	}
	my $train_dist_name = $language . "_train_distribution.txt";
	my $dist_path = $train_dir . $train_dist_name;

	print $train_dist_name;

	open(my $dist_handle, ">", $dist_path) || die "$! [" . $dist_path . "]\n";

	# an empty corpus leaves both counters undefined; report them as 0
	print $dist_handle ($train_dist{total} || 0) . " " . ($train_dist{unique} || 0);
	foreach my $symbol (sort { $a cmp $b } keys %{$train_dist{symbols}}) {
		print $dist_handle "\n" . $symbol . " " . $train_dist{symbols}{$symbol};
	}

	# buffered write errors (e.g. a full disk) only surface at close
	close($dist_handle) || die "$! [" . $dist_path . "]\n";

}

sub print_usage {

	# The help text is kept verbatim in a non-interpolating here-document
	# and written to STDOUT in one go.
	my $usage_text = <<'USAGE_TEXT';

NAME
  langdist -- A tool to generate language symbol distributions
              from training corpora

SYNOPSIS
  perl langdist [-d corpora directory] [-c corpus file name] [-h]

OPTIONS
  -d, --directory    training corpora directory
  -c, --corpus       training corpus file name
  -h, --help         print this help menu

NOTES
  langdist expects training corpus file names to follow a specific
  naming convention: *_train_corpus.txt, where * is the language
  of the training corpus. It will then generate distribution files
  with names of a similar format: *_train_distribution.txt, where
  once again * is the language of the training corpus.

USAGE_TEXT

	print $usage_text;

}
