#!/usr/bin/perl use strict; $| = 1; my %alignments; my @first_string_letters; my @second_string_letters; my @vowels; my @consonants; my $print_matrix = 0; my $is_detailed = 0; my $is_genomic = 0; my $identity_cost = 0.0; my $vowel_vowel_mismatch_cost = 0.5; my $consonant_consonant_mismatch_cost = 0.6; my $vowel_consonant_mismatch_cost = 1.0; my $vowel_insertion_cost = 1.0; my $consonant_insertion_cost = 1.0; my $favored_direction = 1; my @mismatches = qw (I V C O L T); my @directions = qw(H D V); my $max_cost_length = 0; my $print_width = 79; my $start_time = time; eval { process_options(); }; die $@ if $@; eval { get_string_alignment(); }; die $@ if $@; sub process_options { print "\n"; for (my $i = 0; $i <= $#ARGV; $i++) { if ($ARGV[$i] =~ /^(\-s|\-\-show\-matrix)$/) { $print_matrix = 1; } elsif ($ARGV[$i] =~ /^(\-d|\-\-detailed)$/) { $is_detailed = 1; } elsif ($ARGV[$i] =~ /^(\-g|\-\-genomic)$/) { $is_genomic = 1; } elsif ($ARGV[$i] =~ /^(\-i|\-\-identity)$/) { $identity_cost = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /^(\-v|\-\-vowel)$/) { $vowel_vowel_mismatch_cost = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /^(\-c|\-\-consonant)$/) { $consonant_consonant_mismatch_cost = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /^(\-o|\-\-vowel\-consonant)$/) { $vowel_consonant_mismatch_cost = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /^(\-l|\-\-vowel\-insertion)$/) { $vowel_insertion_cost = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /^(\-t|\-\-consonant\-insertion)$/) { $consonant_insertion_cost = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /^(\-f|\-\-favored\-direction)$/) { $_ = $ARGV[++$i]; SWITCH: { $favored_direction = 0, last SWITCH if /^H$/i; $favored_direction = 2, last SWITCH if /^V$/i; $favored_direction = 1; } } elsif ($ARGV[$i] =~ /^(\-p|\-\-print\-width)$/) { $print_width = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /^(\-h|\-\-help)$/) { print_help(); exit(0); } } if ($ARGV[0] =~ /^\-{1,2}\w+/ || $ARGV[1] =~ /^\-{1,2}\w+/) { print "The first two arguments must be strings\n"; exit(0); } if (!$is_genomic) { @vowels = qw(A E I O U); @consonants = qw(B C D F G H J K L M N P Q R S T V W X Y Z); } else { @vowels = qw(A G); @consonants = qw (C T U); } my $string_1; my $string_2; if (-f $ARGV[0]) { $string_1 = get_string_from_file($ARGV[0]); } else { $string_1 = $ARGV[0]; } if (-f $ARGV[1]) { $string_2 = get_string_from_file($ARGV[1]); } else { $string_2 = $ARGV[1]; } if (length($string_1) < length($string_2)) { my $temp_string = $string_1; $string_1 = $string_2; $string_1 = $temp_string; } $string_1 =~ tr/a-z/A-Z/; @first_string_letters = split(//, $string_1); unshift @first_string_letters, " "; $string_2 =~ tr/a-z/A-Z/; @second_string_letters = split(//, $string_2); unshift @second_string_letters, " "; } sub get_string_alignment { my $total = ($#first_string_letters + 1) * ($#second_string_letters + 1); my $current = 0; my $percent = 0; my $current_time = time; my $remaining_time; my $OUT; for (my $i = 0; $i <= $#first_string_letters; $i++) { for (my $j = 0; $j <= $#second_string_letters; $j++) { if ($i == 0 && $j == 0) { $alignments{$i}{$j}{cost} = get_alignment_cost($first_string_letters[$i], $second_string_letters[$j]); $alignments{$i}{$j}{direction} = 1; } elsif ($i == 0) { $alignments{$i}{$j}{cost} = get_alignment_cost(" ", $second_string_letters[$j]) + $alignments{$i}{$j-1}{cost}; $alignments{$i}{$j}{direction} = 0; } elsif ($j == 0) { $alignments{$i}{$j}{cost} = get_alignment_cost($first_string_letters[$i], " ") + $alignments{$i-1}{$j}{cost}; $alignments{$i}{$j}{direction} = 2; } else { my ($cost, $direction) = get_min_cost( get_alignment_cost(" ", $second_string_letters[$j]) + $alignments{$i}{$j-1}{cost}, get_alignment_cost($first_string_letters[$i], $second_string_letters[$j]) + $alignments{$i-1}{$j-1}{cost}, get_alignment_cost($first_string_letters[$i], " ") + $alignments{$i-1}{$j}{cost}); $alignments{$i}{$j}{cost} = $cost; $alignments{$i}{$j}{direction} = $direction; } $max_cost_length = length($alignments{$i}{$j}{cost}) if ($max_cost_length < length($alignments{$i}{$j}{cost})); $current++; for (my $i = 0; $i < length($OUT); $i++) { print "\b"; } $percent = ($current / $total) * 100; $OUT = "Processed " . $current . " out of " . $total . " cells (" . sprintf("%d", $percent) . "%)"; $OUT .= " [" . get_formatted_time(get_remaining_time($current / $total)) . "]"; print $OUT; } } for (my $i = 0; $i < length($OUT); $i++) { print "\b"; } $OUT = "Processed " . $current . " out of " . $total . " cells (" . sprintf("%d", $percent) . "%)"; $OUT .= " [" . get_formatted_time(get_elapsed_time()) . "]"; print $OUT; print "\n\n"; print_alignment_strings(); print_alignment_matrix() if ($print_matrix); print "\n"; } sub print_alignment_strings { my $i = $#first_string_letters; my $j = $#second_string_letters; my $min_cost = $alignments{$i}{$j}{cost}; my $first_string; my $second_string; my $alignment_string_1; my $alignment_string_2; if ($alignments{$i}{$j}{direction} == 0) { $first_string = "\ "; $second_string = $second_string_letters[$j--]; $alignment_string_1 = "\*"; $alignment_string_2 = "\|"; } elsif ($alignments{$i}{$j}{direction} == 1) { if ($first_string_letters[$i] eq $second_string_letters[$j]) { $alignment_string_1 = "\|"; $alignment_string_2 = "\|"; } else { $alignment_string_1 = "\:"; $alignment_string_2 = "\:"; } $first_string = $first_string_letters[$i--]; $second_string = $second_string_letters[$j--]; } elsif ($alignments{$i}{$j}{direction} == 2) { $first_string = $first_string_letters[$i--]; $second_string = "\ "; $alignment_string_1 = "\|"; $alignment_string_2 = "\*"; } while ($i >= 0 && $j >= 0) { if ($alignments{$i}{$j}{direction} == 0) { $first_string = "\ " . $first_string; $second_string = $second_string_letters[$j--] . $second_string; $alignment_string_1 = "\*" . $alignment_string_1; $alignment_string_2 = "\|" . $alignment_string_2; } elsif ($alignments{$i}{$j}{direction} == 1) { if ($first_string_letters[$i] eq $second_string_letters[$j]) { $alignment_string_1 = "\|" . $alignment_string_1; $alignment_string_2 = "\|" . $alignment_string_2; } else { $alignment_string_1 = "\:" . $alignment_string_1; $alignment_string_2 = "\:" . $alignment_string_2; } $first_string = $first_string_letters[$i--] . $first_string; $second_string = $second_string_letters[$j--] . $second_string; } elsif ($alignments{$i}{$j}{direction} == 2) { $first_string = $first_string_letters[$i--] . $first_string; $second_string = "\ " . $second_string; $alignment_string_1 = "\|" . $alignment_string_1; $alignment_string_2 = "\*" . $alignment_string_2; } } $first_string =~ s/^ //; $second_string =~ s/^ //; $alignment_string_1 =~ s/^\|//; $alignment_string_2 =~ s/^\|//; print "BEST STRING ALIGNMENT (Cost->" . $min_cost . "):\n\n"; if (length($first_string) <= $print_width) { print $first_string . "\n"; print $alignment_string_1 . "\n" . $alignment_string_2 . "\n" if ($is_detailed); print $second_string . "\n"; } else { my $lines = int(length($first_string) / $print_width); my $leftover = length($first_string) % $print_width; for (my $i = 0; $i < $lines; $i++) { print substr($first_string, ($i * $print_width), $print_width) . "\n"; if ($is_detailed) { print substr($alignment_string_1, ($i * $print_width), $print_width) . "\n"; print substr($alignment_string_2, ($i * $print_width), $print_width) . "\n"; } print substr($second_string, ($i * $print_width), $print_width) . "\n\n"; } if ($leftover) { print substr($first_string, ($lines * $print_width)) . "\n"; if ($is_detailed) { print substr($alignment_string_1, ($lines * $print_width)) . "\n"; print substr($alignment_string_2, ($lines * $print_width)) . "\n"; } print substr($second_string, ($lines * $print_width)) . "\n\n"; } } } sub print_alignment_matrix { print "\nALIGNMENT COST MATRIX:"; print "\n+---" . get_repeated_characters("+--" . get_repeated_characters("-", $max_cost_length), $#second_string_letters + 1) . "+"; print "\n| |" . join(" " . get_repeated_characters(" ", $max_cost_length) . "|", @second_string_letters) . get_repeated_characters(" ", ($max_cost_length + 1)) . "|\n"; print "+---" . get_repeated_characters("+--" . get_repeated_characters("-", $max_cost_length), $#second_string_letters + 1) . "+\n"; foreach my $i (sort { $a <=> $b } keys %alignments) { print "| " . $first_string_letters[$i] . " "; foreach my $j (sort { $a <=> $b } keys %{$alignments{$i}}) { print "|" . $directions[$alignments{$i}{$j}{direction}] . " " . sprintf("%-" . $max_cost_length . "s", $alignments{$i}{$j}{cost}); } print "|\n"; print "+---" . get_repeated_characters("+--" . get_repeated_characters("-", $max_cost_length), $#second_string_letters + 1) . "+\n"; } } sub get_remaining_time { my $percent_completed = shift; my $percent_remaining = 1 - $percent_completed; return ($percent_remaining * (get_elapsed_time() / $percent_completed)); } sub get_elapsed_time { my $hour = 0; my $min = 0; my $sec = 0; my $current_time = time; my $elapsed_time = $current_time - $start_time; return $elapsed_time; } sub get_formatted_time { my $hour = 0; my $min = 0; my $sec = 0; my $current_time = shift; $sec = $current_time % 60; $min = int($current_time / 60); if ($min >= 60) { $min = $min % 60; $hour = int($min / 60); } return sprintf("%02s", $hour) . ":" . sprintf("%02s", $min) . ":" . sprintf("%02s", $sec); } sub get_string_from_file { my $file = shift; my $string; $file =~ s/\\/\//g; open(FIL, "<$file") || die "$! [" . $file . "]\n"; while () { next if (/^\>/); chomp; $string .= $_; } close(FIL); return $string; } sub get_repeated_characters { my ($character, $count) = @_; my $repeated_characters; for (my $i = 0; $i < $count; $i++) { $repeated_characters .= $character; } return $repeated_characters; } sub get_min_cost { my @choices = @_; my $min = shift @choices; my $index = 0; for (my $i = 0; $i <= $#choices; $i++) { my $temp_choice = $choices[$i]; if ($temp_choice < $min || ($temp_choice == $min && ($i + 1) == $favored_direction)) { $min = $temp_choice; $index = $i + 1; } } return ($min, $index); } sub get_alignment_cost { my ($first_letter, $second_letter) = @_; my $cost = $identity_cost; if ($first_letter ne $second_letter) { if (is_vowel($first_letter) && is_vowel($second_letter)) { $cost = $vowel_vowel_mismatch_cost; } elsif (is_consonant($first_letter) && is_consonant($second_letter)) { $cost = $consonant_consonant_mismatch_cost; } elsif (($first_letter eq " " && is_vowel($second_letter)) || (is_vowel($first_letter) && $second_letter eq " ")) { $cost = $vowel_insertion_cost; } elsif (($first_letter eq " " && is_consonant($second_letter)) || (is_consonant($first_letter) && $second_letter eq " ")) { $cost = $consonant_insertion_cost; } else { $cost = $vowel_consonant_mismatch_cost; } } return $cost; } sub is_vowel { my $letter = shift; return grep(/^$letter$/, @vowels); } sub is_consonant { my $letter = shift; return grep(/^$letter$/, @consonants); } sub print_help { print <' will be treated as comments. When viewing the alignment cost matrix, H implies horizontal, D diagonal, and V vertical, and they give the direction to the cell from which the minimal alignment cost was generated. The costs for each misalignment type can be user-specified at runtime, but the following are the default values: Mismatch Type Default Cost --------------- ------------ Identity 0.0 Vowel 0.5 Consonant 0.6 Vowel-Consonant 1.0 Vowel Insertion 1.0 Consonant Insertion 1.0 The default favored direction is diagonal, but can be specified using the following (case-insensitive) codes: Favored Direction Code ----------------- ---- Horizontal H|h Diagonal D|d Vertical V|v Below is an example of datailed output for genomic data. Perfectly matched characters are joined by two pipes ('|'), mismatched characters that were kept in the analysis are joined by two colons (':'), and dangling characters have a pipe and asterisk ('|' and '*') attached. GTTAATGTAGCTT AATA A AAAGTGAGGCACTGAA |||:|||||||||*||:|*|***||||::||:||||||| |||:||||||||||||:||||||||||::||:||||||| GTTTATGTAGCTTAAACATACCCAAAGCAAGACACTGAA Detailed output for regular strings has the same format. It may be of interest to note that when using genomic data, it is possible to find complimentary (i.e. binding) patterns by reversing cost settings. That is, make the identity cost high and the vowel-consonant (purine- pyrimidine) cost zero. The present level of granularity does not allow for specifying specifically interesting matches (e.g. A--T, or G--C), but simply gives a first-order picture of complimentarity. HELP } exit(0);