#!/usr/bin/perl

use strict;

$| = 1;

my %alignments;
my @first_string_letters;
my @second_string_letters;

my @vowels;
my @consonants;

my $print_matrix = 0;
my $is_detailed = 0;
my $is_genomic = 0;
my $identity_cost = 0.0;
my $vowel_vowel_mismatch_cost = 0.5;
my $consonant_consonant_mismatch_cost = 0.6;
my $vowel_consonant_mismatch_cost = 1.0;
my $vowel_insertion_cost = 1.0;
my $consonant_insertion_cost = 1.0;
my $favored_direction = 1;

my @mismatches = qw (I V C O L T);
my @directions = qw(H D V);

my $max_cost_length = 0;
my $print_width = 79;
my $start_time = time;

eval { process_options(); }; die $@ if $@;
eval { get_string_alignment(); }; die $@ if $@;

sub process_options {
	print "\n";

	for (my $i = 0; $i <= $#ARGV; $i++) {
		if ($ARGV[$i] =~ /^(\-s|\-\-show\-matrix)$/) {
			$print_matrix = 1;
		} elsif ($ARGV[$i] =~ /^(\-d|\-\-detailed)$/) {
			$is_detailed = 1;
		} elsif ($ARGV[$i] =~ /^(\-g|\-\-genomic)$/) {
			$is_genomic = 1;
		} elsif ($ARGV[$i] =~ /^(\-i|\-\-identity)$/) {
			$identity_cost = $ARGV[++$i];
		} elsif ($ARGV[$i] =~ /^(\-v|\-\-vowel)$/) {
			$vowel_vowel_mismatch_cost = $ARGV[++$i];
		} elsif ($ARGV[$i] =~ /^(\-c|\-\-consonant)$/) {
			$consonant_consonant_mismatch_cost = $ARGV[++$i];
		} elsif ($ARGV[$i] =~ /^(\-o|\-\-vowel\-consonant)$/) {
			$vowel_consonant_mismatch_cost = $ARGV[++$i];
		} elsif ($ARGV[$i] =~ /^(\-l|\-\-vowel\-insertion)$/) {
			$vowel_insertion_cost = $ARGV[++$i];
		} elsif ($ARGV[$i] =~ /^(\-t|\-\-consonant\-insertion)$/) {
			$consonant_insertion_cost = $ARGV[++$i];
		} elsif ($ARGV[$i] =~ /^(\-f|\-\-favored\-direction)$/) {
			$_ = $ARGV[++$i];
			SWITCH: {
				$favored_direction = 0, last SWITCH if /^H$/i;
				$favored_direction = 2, last SWITCH if /^V$/i;
				$favored_direction = 1;
			}			
		} elsif ($ARGV[$i] =~ /^(\-p|\-\-print\-width)$/) {
			$print_width = $ARGV[++$i];
		} elsif ($ARGV[$i] =~ /^(\-h|\-\-help)$/) {
			print_help();
			exit(0);
		}
	}

	if ($ARGV[0] =~ /^\-{1,2}\w+/ || $ARGV[1] =~ /^\-{1,2}\w+/) {
		print "The first two arguments must be strings\n";
		exit(0);
	}

	if (!$is_genomic) {
		@vowels = qw(A E I O U);
		@consonants = qw(B C D F G H J K L M N P Q R S T V W X Y Z);
	} else {
		@vowels = qw(A G);
		@consonants = qw (C T U);
	}

	my $string_1;
	my $string_2;

	if (-f $ARGV[0]) {
		$string_1 = get_string_from_file($ARGV[0]);
	} else {
		$string_1 = $ARGV[0];
	}

	if (-f $ARGV[1]) {
		$string_2 = get_string_from_file($ARGV[1]);
	} else {
		$string_2 = $ARGV[1];
	}

	if (length($string_1) < length($string_2)) {
		my $temp_string = $string_1;
		$string_1 = $string_2;
		$string_1 = $temp_string;
	}

	$string_1 =~ tr/a-z/A-Z/;
	@first_string_letters = split(//, $string_1);
	unshift @first_string_letters, " ";

	$string_2 =~ tr/a-z/A-Z/;
	@second_string_letters = split(//, $string_2);
	unshift @second_string_letters, " ";
}

sub get_string_alignment {
	my $total = ($#first_string_letters + 1) * ($#second_string_letters + 1);
	my $current = 0;
	my $percent = 0;
	my $current_time = time;
	my $remaining_time;
	my $OUT;
	for (my $i = 0; $i <= $#first_string_letters; $i++) {
		for (my $j = 0; $j <= $#second_string_letters; $j++) {
			if ($i == 0 && $j == 0) {
				$alignments{$i}{$j}{cost} = get_alignment_cost($first_string_letters[$i], $second_string_letters[$j]);
				$alignments{$i}{$j}{direction} = 1;
			} elsif ($i == 0) {
				$alignments{$i}{$j}{cost} = get_alignment_cost(" ", $second_string_letters[$j]) + $alignments{$i}{$j-1}{cost};
				$alignments{$i}{$j}{direction} = 0;
			} elsif ($j == 0) {
				$alignments{$i}{$j}{cost} = get_alignment_cost($first_string_letters[$i], " ") + $alignments{$i-1}{$j}{cost};
				$alignments{$i}{$j}{direction} = 2;
			} else {
				my ($cost, $direction) = get_min_cost(
					get_alignment_cost(" ", $second_string_letters[$j]) + $alignments{$i}{$j-1}{cost},
					get_alignment_cost($first_string_letters[$i], $second_string_letters[$j]) + $alignments{$i-1}{$j-1}{cost},
					get_alignment_cost($first_string_letters[$i], " ") + $alignments{$i-1}{$j}{cost});
				$alignments{$i}{$j}{cost} = $cost;
				$alignments{$i}{$j}{direction} = $direction;
			}
			$max_cost_length = length($alignments{$i}{$j}{cost}) if ($max_cost_length < length($alignments{$i}{$j}{cost}));
			$current++;
			for (my $i = 0; $i < length($OUT); $i++) {
				print "\b";
			}
			$percent = ($current / $total) * 100;
			$OUT = "Processed " . $current . " out of " . $total . " cells (" . sprintf("%d", $percent) . "%)";
			$OUT .= " [" . get_formatted_time(get_remaining_time($current / $total)) . "]";
			print $OUT;
		}
	}

	for (my $i = 0; $i < length($OUT); $i++) {
		print "\b";
	}
	$OUT = "Processed " . $current . " out of " . $total . " cells (" . sprintf("%d", $percent) . "%)";
	$OUT .= " [" . get_formatted_time(get_elapsed_time()) . "]";
	print $OUT;

	print "\n\n";
	print_alignment_strings();
	print_alignment_matrix() if ($print_matrix);
	print "\n";
}

sub print_alignment_strings {
	my $i = $#first_string_letters;
	my $j = $#second_string_letters;
	my $min_cost = $alignments{$i}{$j}{cost};
	my $first_string;
	my $second_string;
	my $alignment_string_1;
	my $alignment_string_2;

	if ($alignments{$i}{$j}{direction} == 0) {
		$first_string = "\ ";
		$second_string = $second_string_letters[$j--];
		$alignment_string_1 = "\*";
		$alignment_string_2 = "\|";
	} elsif ($alignments{$i}{$j}{direction} == 1) {
		if ($first_string_letters[$i] eq $second_string_letters[$j]) {
			$alignment_string_1 = "\|";
			$alignment_string_2 = "\|";
		} else {
			$alignment_string_1 = "\:";
			$alignment_string_2 = "\:";
		}
		$first_string = $first_string_letters[$i--];
		$second_string = $second_string_letters[$j--];
	} elsif ($alignments{$i}{$j}{direction} == 2) {
		$first_string = $first_string_letters[$i--];
		$second_string = "\ ";
		$alignment_string_1 = "\|";
		$alignment_string_2 = "\*";
	}

	while ($i >= 0 && $j >= 0) {
		if ($alignments{$i}{$j}{direction} == 0) {
			$first_string = "\ " . $first_string;
			$second_string = $second_string_letters[$j--] . $second_string;
			$alignment_string_1 = "\*" . $alignment_string_1;
			$alignment_string_2 = "\|" . $alignment_string_2;
		} elsif ($alignments{$i}{$j}{direction} == 1) {
			if ($first_string_letters[$i] eq $second_string_letters[$j]) {
				$alignment_string_1 = "\|" . $alignment_string_1;
				$alignment_string_2 = "\|" . $alignment_string_2;
			} else {
				$alignment_string_1 = "\:" . $alignment_string_1;
				$alignment_string_2 = "\:" . $alignment_string_2;
			}
			$first_string = $first_string_letters[$i--] . $first_string;
			$second_string = $second_string_letters[$j--] . $second_string;
		} elsif ($alignments{$i}{$j}{direction} == 2) {
			$first_string = $first_string_letters[$i--] . $first_string;
			$second_string = "\ " . $second_string;
			$alignment_string_1 = "\|" . $alignment_string_1;
			$alignment_string_2 = "\*" . $alignment_string_2;
		}
	}

	$first_string =~ s/^ //;
	$second_string =~ s/^ //;
	$alignment_string_1 =~ s/^\|//;
	$alignment_string_2 =~ s/^\|//;

	print "BEST STRING ALIGNMENT (Cost->" . $min_cost . "):\n\n";

	if (length($first_string) <= $print_width) {
		print $first_string . "\n";
		print $alignment_string_1 . "\n" . $alignment_string_2 . "\n" if ($is_detailed);
		print $second_string . "\n";
	} else {
		my $lines = int(length($first_string) / $print_width);
		my $leftover = length($first_string) % $print_width;
		for (my $i = 0; $i < $lines; $i++) {
			print substr($first_string, ($i * $print_width), $print_width) . "\n";
			if ($is_detailed) {
				print substr($alignment_string_1, ($i * $print_width), $print_width) . "\n";
				print substr($alignment_string_2, ($i * $print_width), $print_width) . "\n";
			}
			print substr($second_string, ($i * $print_width), $print_width) . "\n\n";
		}
		if ($leftover) {
			print substr($first_string, ($lines * $print_width)) . "\n";
			if ($is_detailed) {
				print substr($alignment_string_1, ($lines * $print_width)) . "\n";
				print substr($alignment_string_2, ($lines * $print_width)) . "\n";
			}
			print substr($second_string, ($lines * $print_width)) . "\n\n";
		}
	}
}

sub print_alignment_matrix {
	print "\nALIGNMENT COST MATRIX:";
	print "\n+---" . get_repeated_characters("+--" . get_repeated_characters("-", $max_cost_length), $#second_string_letters + 1) . "+";
	print "\n|   |" . join(" " . get_repeated_characters(" ", $max_cost_length) . "|", @second_string_letters) . get_repeated_characters(" ", ($max_cost_length + 1)) . "|\n";
	print "+---" . get_repeated_characters("+--" . get_repeated_characters("-", $max_cost_length), $#second_string_letters + 1) . "+\n";
	foreach my $i (sort { $a <=> $b } keys %alignments) {
		print "| " . $first_string_letters[$i] . " ";
		foreach my $j (sort { $a <=> $b } keys %{$alignments{$i}}) {
			print "|" . $directions[$alignments{$i}{$j}{direction}] . " " . sprintf("%-" . $max_cost_length . "s", $alignments{$i}{$j}{cost});
		}
		print "|\n";
		print "+---" . get_repeated_characters("+--" . get_repeated_characters("-", $max_cost_length), $#second_string_letters + 1) . "+\n";
	}
}

sub get_remaining_time {
	my $percent_completed = shift;
	my $percent_remaining = 1 - $percent_completed;
	return ($percent_remaining * (get_elapsed_time() / $percent_completed));
}

sub get_elapsed_time {
	my $hour = 0;
	my $min = 0;
	my $sec = 0;
	my $current_time = time;
	my $elapsed_time = $current_time - $start_time;
	return $elapsed_time;
}

sub get_formatted_time {
	my $hour = 0;
	my $min = 0;
	my $sec = 0;
	my $current_time = shift;
	$sec = $current_time % 60;
	$min = int($current_time / 60);
	if ($min >= 60) {
		$min = $min % 60;
		$hour = int($min / 60);
	}
	return sprintf("%02s", $hour) . ":" . sprintf("%02s", $min) . ":" . sprintf("%02s", $sec);
}

sub get_string_from_file {
	my $file = shift;
	my $string;
	$file =~ s/\\/\//g;
	open(FIL, "<$file") || die "$! [" . $file . "]\n";
	while (<FIL>) {
		next if (/^\>/);
		chomp;
		$string .= $_;
	}
	close(FIL);
	return $string;
}

sub get_repeated_characters {
	my ($character, $count) = @_;
	my $repeated_characters;
	for (my $i = 0; $i < $count; $i++) {
		$repeated_characters .= $character;
	}
	return $repeated_characters;
}

sub get_min_cost {
	my @choices = @_;
	my $min = shift @choices;
	my $index = 0;

	for (my $i = 0; $i <= $#choices; $i++) {
		my $temp_choice = $choices[$i];
		if ($temp_choice < $min || ($temp_choice == $min && ($i + 1) == $favored_direction)) {
			$min = $temp_choice;
			$index = $i + 1;
		}
	}

	return ($min, $index);
}	

sub get_alignment_cost {
	my ($first_letter, $second_letter) = @_;
	my $cost = $identity_cost;

	if ($first_letter ne $second_letter) {
		if (is_vowel($first_letter) && is_vowel($second_letter)) {
			$cost = $vowel_vowel_mismatch_cost;
		} elsif (is_consonant($first_letter) && is_consonant($second_letter)) {
			$cost = $consonant_consonant_mismatch_cost;
		} elsif (($first_letter eq " " && is_vowel($second_letter)) || (is_vowel($first_letter) && $second_letter eq " ")) {
			$cost = $vowel_insertion_cost;
		} elsif (($first_letter eq " " && is_consonant($second_letter)) || (is_consonant($first_letter) && $second_letter eq " ")) {
			$cost = $consonant_insertion_cost;
		} else {
			$cost = $vowel_consonant_mismatch_cost;
		}
	}

	return $cost;
}

sub is_vowel {
	my $letter = shift;
	return grep(/^$letter$/, @vowels);
}

sub is_consonant {
	my $letter = shift;
	return grep(/^$letter$/, @consonants);
}

sub print_help {
	print <<HELP;

NAME
  stralign -- A tool to find the best alignment between two strings

SYNOPSIS
  perl stralign string-1|file-1 string-2|file-2 [-sdgivcoltfph]

OPTIONS
  -s, --show-matrix          show the alignment cost matrix
  -d, --detailed             show detailed string alignment
  -g, --genomic              use genetic alphabet
  -i, --identity             identity match cost
  -v, --vowel                mismatched vowels (purines) cost
  -c, --consonant            mismatched consonants (pyrimidines) cost
  -o, --vowel-consonant      vowel-consonant (purine-pyrimidine) mismatch cost
  -l, --vowel-insertion      vowel (purine) character insertion cost
  -t, --consonant-insertion  consonant (pyrimidine) character insertion cost
  -f, --favored-direction    favored cell direction for cost ties
  -p, --print-width          where to break long strings when printing
  -h, --help                 print this help menu

NOTES
  The first two arguments to stralign must be either the strings to be
  aligned or the names of files where these strings are saved. If reading
  a string from a file, all lines that begin with '>' will be treated as
  comments.

  When viewing the alignment cost matrix, H implies horizontal, D diagonal,
  and V vertical, and they give the direction to the cell from which the
  minimal alignment cost was generated.

  The costs for each misalignment type can be user-specified at runtime, but
  the following are the default values:

  Mismatch Type        Default Cost
  ---------------      ------------
  Identity             0.0
  Vowel                0.5
  Consonant            0.6
  Vowel-Consonant      1.0
  Vowel Insertion      1.0
  Consonant Insertion  1.0

  The default favored direction is diagonal, but can be specified using the
  following (case-insensitive) codes:

  Favored Direction  Code
  -----------------  ----
  Horizontal         H|h
  Diagonal           D|d
  Vertical           V|v

  Below is an example of datailed output for genomic data. Perfectly matched
  characters are joined by two pipes ('|'), mismatched characters that were
  kept in the analysis are joined by two colons (':'), and dangling characters
  have a pipe and asterisk ('|' and '*') attached.

  GTTAATGTAGCTT AATA A   AAAGTGAGGCACTGAA
  |||:|||||||||*||:|*|***||||::||:|||||||
  |||:||||||||||||:||||||||||::||:|||||||
  GTTTATGTAGCTTAAACATACCCAAAGCAAGACACTGAA

  Detailed output for regular strings has the same format.

  It may be of interest to note that when using genomic data, it is possible
  to find complimentary (i.e. binding) patterns by reversing cost settings.
  That is, make the identity cost high and the vowel-consonant (purine-
  pyrimidine) cost zero. The present level of granularity does not allow for
  specifying specifically interesting matches (e.g. A--T, or G--C), but simply
  gives a first-order picture of complimentarity.


HELP
}

exit(0);
