#!/usr/bin/perl

use strict;

my %probabilities = ();
my %alphas = ();
my %betas = ();
my @states = ("C", "V");
my @characters = ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", " ");
my $hmm_string = "";
my $use_corpus = 0;
my $print_probabilities = 0;
my $longest_probability = 0;

eval { process_options(); }; die $@ if $@;
eval { assign_initial_probabilities(); }; die $@ if $@;
eval { print_initial_probabilities(); } if ($print_probabilities); die $@ if $@;
eval { assign_transition_probabilities(); }; die $@ if $@;
eval { print_transition_probabilities(); } if ($print_probabilities); die $@ if $@;
eval { assign_emission_probabilities(); }; die $@ if $@;
eval { print_emission_probabilities(); } if ($print_probabilities); die $@ if $@;
eval { calculate_probabilities(); }; die $@ if $@;


sub process_options {
	for (my $i = 0; $i <= $#ARGV; $i++) {
		if ($ARGV[$i] =~ /^(\-p|\-\-print\-probabilities)$/) {
			$print_probabilities = 1;
		} elsif ($ARGV[$i] =~ /^(\-h|\-\-help)$/) {
			eval { print_help(); }; die $@ if $@;
			exit(0);
		}
	}

	if ($ARGV[0] =~ /^\-{1,2}\w+/ || $ARGV[0] =~ /^ *$/) {
		print "\nThe first argument must be a string, or the path to a file containing strings\n";
		exit(0);
	}

	$hmm_string = $ARGV[0];
}

sub assign_initial_probabilities {
	$longest_probability = 0;

	my $random_number_sum = 0;

	# assign a non-zero random number value as the initial probability for each state
	foreach my $state (@states) {
		my $state_random_number = rand;

		while ($state_random_number == 0) {
			$state_random_number = rand;
		}

		$probabilities{initial}{$state} = $state_random_number;

		$random_number_sum += $state_random_number;
	}

	# normalize the above-assigned probabilities so they sum to one (form a distribution)
	foreach my $state (keys %{$probabilities{initial}}) {
		$probabilities{initial}{$state} /= $random_number_sum;
		$longest_probability = length($probabilities{initial}{$state}) if (length($probabilities{initial}{$state}) > $longest_probability);
	}
}

sub assign_transition_probabilities {
	$longest_probability = 0;

	foreach my $state (@states) {
		my $random_number_sum = 0;

		# assign a non-zero random number value as the initial probability for each transition
		foreach my $next_state (@states) {
			my $transition_random_number = rand;

			while ($transition_random_number == 0) {
				$transition_random_number = rand;
			}

			$probabilities{transition}{$state}{$next_state} = $transition_random_number;

			$random_number_sum += $transition_random_number;
		}

		# normalize the above-assigned probabilities so they sum to one (form a distribution)
		foreach my $next_state (keys %{$probabilities{transition}{$state}}) {
			$probabilities{transition}{$state}{$next_state} /= $random_number_sum;
			$longest_probability = length($probabilities{transition}{$state}{$next_state}) if (length($probabilities{transition}{$state}{$next_state}) > $longest_probability);
		}
	}
}

sub assign_emission_probabilities {
	$longest_probability = 0;

	foreach my $state (@states) {
		my $random_number_sum = 0;

		foreach my $next_state (@states) {
			foreach my $character (@characters) {
				# assign a non-zero random number value as the emission probability for each character at each transition
				my $emission_random_number = rand;

				while ($emission_random_number == 0) {
					$emission_random_number = rand;
				}

				$probabilities{emission}{$state}{$next_state}{$character} = $emission_random_number;

				$random_number_sum += $emission_random_number;
			}
		}

		# normalize the above-assigned probabilities so they sum to one (form a distribution)
		foreach my $next_state (keys %{$probabilities{emission}{$state}}) {
			foreach my $character (keys %{$probabilities{emission}{$state}{$next_state}}) {
				$probabilities{emission}{$state}{$next_state}{$character} /= $random_number_sum;
				$longest_probability = length($probabilities{emission}{$state}{$next_state}{$character}) if (length($probabilities{emission}{$state}{$next_state}{$character}) > $longest_probability);
			}
		}
	}
}

sub calculate_probabilities {
	my $line = 1;
	if (-f $hmm_string) {
		open(HMM, "<$hmm_string") || die "$! [" . $hmm_string . "]\n";
		while (<HMM>) {
			chomp;
			tr/A-Z/a-z/;
			s/[^a-z ]//g;
			print "\n" . get_character_string("\_", 24) . "HMM String " . $line . get_character_string("\_", 24) . "\n";
			eval { calculate_forward_probability($_); }; die $@ if $@;
			eval { calculate_backward_probability($_); }; die $@ if $@;
			print "\n";
			$line++;
		}
		close(HMM);
		print "\n";
	} else {
		print "\n";
		eval { calculate_forward_probability($hmm_string); }; die $@ if $@;
		eval { calculate_backward_probability($hmm_string); }; die $@ if $@;
		print "\n\n";
	}
}

sub calculate_forward_probability {
	my $string = shift;
	my $string_length = length($string);
	my $total_probability = 0;
	undef %alphas;

	print "FORWARD PROBABILITIES (ALPHA)\n";
	print "Time=1\n";
	# set alpha_i(1) = pi(i)
	foreach my $state (@states) {
		$alphas{$state}{0} = $probabilities{initial}{$state};
		print "Pr[State(1)=" . $state . "]=" . $alphas{$state}{0} . "\n";
	}

	for (my $time = 1; $time <= $string_length; $time++) {
		my $character = lc(substr($string, ($time - 1), 1));
		print "Obs(" . $time . ")='" . $character . "'\n\n";
		print "Time=" . ($time + 1) . "\n";
		foreach my $state (@states) {
			foreach my $previous_state (@states) {
				$alphas{$state}{$time} += $alphas{$previous_state}{($time - 1)} * $probabilities{transition}{$previous_state}{$state} * $probabilities{emission}{$previous_state}{$state}{$character};
			}
			$total_probability += $alphas{$state}{$time} if ($time == ($string_length));
			my $observation_string = "";
			if ($time == 1) {
				$observation_string = " \&\& Obs(1)";
			} elsif ($time == 2) {
				$observation_string = " \&\& Obs(1),Obs(2)";
			} elsif ($time > 2) {
				$observation_string = " \&\& Obs(1)\,\.\.\.\,Obs(" . $time . ")";
			}
			print "Pr[State(" . ($time + 1) . ")=" . $state . $observation_string . "]=" . $alphas{$state}{$time} . "\n";
		}
	}

	print "\nPr[Observation|Model]=" . $total_probability . "\n\n";
}

sub calculate_backward_probability {
	my $string = shift;
	my $string_length = length($string);
	my $total_probability = 0;
	undef %betas;

	print "\nBACKWARD PROBABILITIES (BETA)\n";
	print "Time=" . ($string_length + 1) . "\n";
	# set beta_i(T+1) = 1
	foreach my $state (@states) {
		$betas{$state}{$string_length} = 1;
		print "Pr[State(" . ($string_length + 1) . ")=" . $state . "]=1\n";
	}

	for (my $time = ($string_length - 1); $time >= 0; $time--) {
		my $character = lc(substr($string, $time, 1));
		print "Obs(" . ($time + 1) . ")='" . $character . "'\n\n";
		print "Time=" . ($time + 1) . "\n";
		foreach my $state (@states) {
			foreach my $next_state (@states) {
				$betas{$state}{$time} += $betas{$next_state}{($time + 1)} * $probabilities{transition}{$state}{$next_state} * $probabilities{emission}{$state}{$next_state}{$character};
			}
			$total_probability += ($betas{$state}{$time} * $probabilities{initial}{$state}) if ($time == 0);
			my $observation_string = "";
			if ($time == ($string_length - 1)) {
				$observation_string = " \&\& Obs(" . ($time + 1) . ")";
			} elsif ($time == ($string_length - 2)) {
				$observation_string = " \&\& Obs(" . ($time + 1) . "),Obs(" . $string_length . ")";
			} elsif ($time < ($string_length - 2)) {
				$observation_string = " \&\& Obs(" . ($time + 1) .")\,\.\.\.\,Obs(" . $string_length . ")";
			}
			print "Pr[State(" . ($time + 1) . ")=" . $state . $observation_string . "]=" . $betas{$state}{$time} . "\n";
		}
	}

	print "\nPr[Observation|Model]=" . $total_probability . "\n";
}


sub print_initial_probabilities {
	print "\nINITIAL STATE PROBABILITIES\n";
	print   "+" . get_character_string("\-", $longest_probability + 8) . "+\n";
	foreach my $state (sort { $a cmp $b } keys %{$probabilities{initial}}) {
		print sprintf("%-" . ($longest_probability + 8) . "s", "| Pr[" . $state . "]=" . $probabilities{initial}{$state}) . " |\n";
	}
	print   "+" . get_character_string("\-", $longest_probability + 8) . "+\n";
}

sub print_transition_probabilities {
	print "\nINITIAL TRANSITION PROBABILITIES\n";
	print   "+" . get_character_string("\-", $longest_probability + 11) . "+\n";
	foreach my $state (sort { $a cmp $b } keys %{$probabilities{transition}}) {
		foreach my $next_state (sort { $a cmp $b } keys %{$probabilities{transition}{$state}}) {
			print sprintf("%-" . ($longest_probability + 11) . "s", "| Pr[" . $state . "->" . $next_state . "]=" . $probabilities{transition}{$state}{$next_state}) . " |\n";
		}
	}
	print   "+" . get_character_string("\-", $longest_probability + 11) . "+\n";
}

sub print_emission_probabilities {
	print "\nINITIAL EMISSION PROBABILITIES\n";
	print   "+" . get_character_string("\-", $longest_probability + 13) . "+\n";
	foreach my $state (sort { $a cmp $b } keys %{$probabilities{emission}}) {
		foreach my $next_state (sort { $a cmp $b } keys %{$probabilities{emission}{$state}}) {
			foreach my $character (sort { $a cmp $b } keys %{$probabilities{emission}{$state}{$next_state}}) {
				print sprintf("%-" . ($longest_probability + 13) . "s", "| Pr[" . $state . "-" . $character . "->" . $next_state . "]=" . $probabilities{emission}{$state}{$next_state}{$character}) . " |\n";
			}
		}
	}
	print   "+" . get_character_string("\-", $longest_probability + 13) . "+\n\n";
}

sub get_character_string {
	my ($character, $length) = @_;
	my $character_string = "";
	for (my $i = 0; $i < $length; $i++) {
		$character_string .= $character;
	}
	return $character_string;
}

sub print_help {
	print <<HELP;

NAME
  hmm -- A tool to find the probability of a string using a randomly
         generated arc-emission hidden markov model and the forward
         and backward algorithms (alpha and beta)

SYNOPSIS
  perl hmm string|file [-ph]

OPTIONS
  -p, --print-probabilities  prints the randomly generated initial state,
                             transition, and symbol emission probabilities
  -h, --help                 print this help menu

NOTES
  hmm randomly generates a two-state hidden markov model that it uses to
  analyze the supplied string, or strings stored in a file. In particular,
  the two states are named 'C' & 'V' and there are 4 transitions: self-
  connects and transitions to the other state in the system. The initial
  probabilities for each state are randomly generated and their sum equals 1.
  Also, the probabilities for each transition are randomly generated and the
  sum for all the transitions leaving any state is equal to 1 (forms a
  distribution). Next, an emission probability for each of the 26 letters
  plus the space character is generated for each transition. Once again, the
  probabilities for the characters and transitions define a distribution (i.e.
  the sum of the emission probabilities for all 27 symbols, over both the
  self-connect and the transition to the other state in the system, equals 1).

HELP
}


exit(0);
