#!/usr/bin/perl

use strict;
use POSIX;

$| = 1;

my @window = ("#");
my %contexts = ();
my $corpus_file_name = "corpus.txt";
my $output_file_name = "";
my $print_to_file = 0;
my $max_token_count = 50000;
my $max_type_count = 10000;
my $set_token_count = 0;
my $set_type_count = 0;

eval { process_options(); }; die $@ if $@;
eval { generate_context_vectors(); }; die $@ if $@;
eval { print_context_vector_similarities(); }; die $@ if $@;

sub process_options {
	for (my $i = 0; $i <= $#ARGV; $i++) {
		if ($ARGV[$i] =~ /(^\-c|\-\-corpus\-file-name)$/) {
			$corpus_file_name = $ARGV[++$i];
		} elsif ($ARGV[$i] =~ /(^\-o|\-\-output\-file-name)$/) {
			$output_file_name = $ARGV[++$i];
			$print_to_file = 1;
		} elsif ($ARGV[$i] =~ /(^\-k|\-\-max\-token\-count)$/) {
			$max_token_count = $ARGV[++$i];
			$set_token_count = 1;
		} elsif ($ARGV[$i] =~ /(^\-p|\-\-max\-type\-count)$/) {
			$max_type_count = $ARGV[++$i];
			$set_type_count = 1;
		} elsif ($ARGV[$i] =~ /(^\-h|\-\-help)$/) {
			print_help();
			exit(0);
		}
	}
}

sub generate_context_vectors {

	print "Generating context vectors\n";

	my $corpus_line_number = 0;
	my $token_count = 0;
	my $type_count = 0;
	my $OUT = "";

	open(COR, "<$corpus_file_name") || die "$! [". $corpus_file_name . "]\n";

	VECTOR: {
		while (<COR>) {
			chomp;					# trim trailing new line
			$corpus_line_number++;	# update corpus line number
			next if (/^ *$/);		# skip this line if it's blank
			tr/A-Z/a-z/;			# lowercase everything
			s/[^a-z]+/ /g;			# turn all non-letters into spaces
			s/(^ +| +$)//g;			# trim leading/trailing spaces

			my @tokens = split(/ /, $_);

			foreach my $token (@tokens) {
				$token_count++;
				$type_count++ if (!grep(/^$token$/, keys %contexts));
				$contexts{$token}{count}++;

				if ($token_count == 1) {
					push @window, $token;
				} elsif ($token_count == 2) {
					push @window, $token;
					$contexts{$window[1]}{left}{$window[0]}++;
					$contexts{$window[1]}{right}{$window[2]}++;
				} else {
					shift @window;
					push @window, $token;
					$contexts{$window[1]}{left}{$window[0]}++;
					$contexts{$window[1]}{right}{$window[2]}++;
				}

				last VECTOR if (($set_token_count && $token_count == $max_token_count) || ($set_type_count && $type_count == $max_type_count));
			}

			for (my $i = 0; $i < length($OUT); $i++) {
				print "\b";
			}

			$OUT = "    " . sprintf("Corpus Line: %-10s Tokens: %-10s Types: %-10s", $corpus_line_number, $token_count, $type_count);

			print $OUT;
		}
	}

	close(COR);

	shift @window;
	push @window, "#";
	$contexts{$window[1]}{left}{$window[0]}++;
	$contexts{$window[1]}{right}{$window[2]}++;

	for (my $i = 0; $i < length($OUT); $i++) {
		print "\b";
	}

	$OUT = "    " . sprintf("Corpus Line: %-10s Tokens: %-10s Types: %-10s", $corpus_line_number, $token_count, $type_count);

	print $OUT . "\n";
}

sub get_vector_dot_product {
	my ($first_token, $second_token) = @_;
	my $vector_dot_product = 0;
	my %left_matches = ();
	my %right_matches = ();

	return $contexts{$first_token}{dot_products}{$second_token} if (defined($contexts{$first_token}{dot_products}{$second_token}));
	return $contexts{$second_token}{dot_products}{$first_token} if (defined($contexts{$second_token}{dot_products}{$first_token}));

	foreach my $left_token (keys %{$contexts{$first_token}{left}}, keys %{$contexts{$second_token}{left}}) {
		$left_matches{$left_token}++;
	}

	foreach my $left_token (keys %left_matches) {
		if ($left_matches{$left_token} > 1) {
			$vector_dot_product += $contexts{$first_token}{left}{$left_token} * $contexts{$second_token}{left}{$left_token};
		}
	}

	foreach my $right_token (keys %{$contexts{$first_token}{right}}, keys %{$contexts{$second_token}{right}}) {
		$right_matches{$right_token}++;
	}

	foreach my $right_token (keys %right_matches) {
		if ($right_matches{$right_token} > 1) {
			$vector_dot_product += $contexts{$first_token}{right}{$right_token} * $contexts{$second_token}{right}{$right_token};
		}
	}

	$contexts{$first_token}{dot_products}{$second_token} = $vector_dot_product;
	# $contexts{$second_token}{dot_products}{$first_token} = $vector_dot_product;

	return $vector_dot_product;
}

sub get_vector_length {
	my $token = shift;
	my $context_vector_length = 0;

	return $contexts{$token}{length} if (defined($contexts{$token}{length}));

	foreach my $left_token (keys %{$contexts{$token}{left}}) {
		$context_vector_length += pow($contexts{$token}{left}{$left_token}, 2);
	}

	foreach my $right_token (keys %{$contexts{$token}{right}}) {
		$context_vector_length += pow($contexts{$token}{right}{$right_token}, 2);
	}

	$context_vector_length = pow($context_vector_length, 0.5);
	$contexts{$token}{length} = $context_vector_length;

	return $context_vector_length;
}

sub get_vector_similarity {
	my ($first_token, $second_token) = @_;
	my $vector_similarity = 0;

	return $contexts{$first_token}{similarities}{$second_token} if (defined($contexts{$first_token}{similarities}{$second_token}));
	return $contexts{$second_token}{similarities}{$first_token} if (defined($contexts{$second_token}{similarities}{$first_token}));

	$vector_similarity = get_vector_dot_product($first_token, $second_token) / (get_vector_length($first_token) * get_vector_length($second_token));

	$contexts{$first_token}{similarities}{$second_token} = $vector_similarity;
	# $contexts{$second_token}{similarities}{$first_token} = $vector_similarity;

	return $vector_similarity;
}

sub print_context_vector_similarities {
	my $total = keys %contexts;
	$total = ($total > 500) ? 500 : $total;
	my $matches = 0;
	my $current = 0;
	my $percent = 0;
	my $OUT = "";

	if ($print_to_file) {
		print "Printing context vector similarities\n";
		open(OUT, ">$output_file_name") || die "$! [" . $output_file_name . "]\n";
		select(OUT);
		$| = 1;
	}

	my $notice = "    Getting context vector similarities";

	foreach my $token (sort { $contexts{$b}{count} <=> $contexts{$a}{count} } keys %contexts) {
		select(OUT) if ($print_to_file);
		$| = 1;

		print $token . "\n";

		print $notice if (!$print_to_file);

		foreach my $sub_token (sort { $a cmp $b } keys %contexts) {
			get_vector_similarity($token, $sub_token);
		}

		if (!$print_to_file) {
			for (my $i = 0; $i < length($notice); $i++) {
				print "\b";
			}
			for (my $i = 0; $i < length($notice); $i++) {
				print " ";
			}
			for (my $i = 0; $i < length($notice); $i++) {
				print "\b";
			}
		}

		$matches = 0;

		foreach my $sub_token (sort { $contexts{$token}{similarities}{$b} <=> $contexts{$token}{similarities}{$a} } keys %{$contexts{$token}{similarities}}) {
			if ($token ne $sub_token && $contexts{$token}{similarities}{$sub_token} > 0) {
				print "    " . $sub_token . " " . $contexts{$token}{similarities}{$sub_token} . "\n";
				$matches++;
			}

			last if ($matches == 15);
		}

		$current++;

		if ($print_to_file) {
			select(STDOUT);
			$| = 1;

			$percent = int(($current / $total) * 100);

			for (my $i = 0; $i < length($OUT); $i++) {
				print "\b";
			}

			$OUT = "    Processed " . $current . " out of " . $total . " types (" . $percent . "\%)";

			print $OUT;
		}

		last if ($current == 500);
		
	}

	if ($print_to_file) {
		close(OUT);
		select(STDOUT);
		$| = 1;
	}

	print "\n";
}

sub print_help {
	print <<HELP;

NAME
  convecs -- A tool to find the 15 most similar words for the 500 most
             frequent words in a corpus based on the cosine between
             their context vectors

SYNOPSIS
  perl convecs [-cokph]

OPTIONS
  -c, --corpus-file-name  path to the corpus used for input
  -o, --output-file-name  path to file where output will be printed
  -k, --max-token-count   maximum number of tokens to read from corpus
  -p, --max-type-count    maximum number of types to read from corpus
  -h, --help              print this help menu

NOTES
  If no corpus file name is supplied, convecs will look in the current
  directory for a file named corpus.txt to use as training data. Default
  behavior prints output to STDOUT unless an output file name is given.
  The default maximum token count is 50000 and the default maximum type
  count is 10000.

HELP
}

exit(0);
