#!/usr/bin/perl
use strict;

my %train_dist;   # a hash to hold training data symbol distribution
my $train_dir;    # directory where training corpora reside
my @train_names;  # training corpora file names
my $longest_name; # length of longest training corpus file name

eval { set_parameters(); };
die $@ if $@;
eval { process_train_corpora(); };
die $@ if $@;
exit(0);

sub set_parameters {
    # process user-supplied arguments
    for (my $i = 0; $i <= $#ARGV; $i++) {
        if ($ARGV[$i] =~ /^(-d|--directory)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            $train_dir = $ARGV[$i];
        }
        elsif ($ARGV[$i] =~ /^(-c|--corpus)$/) {
            $ARGV[++$i] =~ s/\\/\//g;
            push @train_names, $ARGV[$i];
            $longest_name = length($ARGV[$i]);
        }
        elsif ($ARGV[$i] =~ /^(-h|--help)$/) {
            print_usage();
            exit(0);
        }
        else {
            die "Unknown argument specified [" . $ARGV[$i] . "]\n";
        }
    }

    # set default corpora directory if none has been supplied,
    # or tack on a final / if there isn't one
    if (!$train_dir || $train_dir =~ /^$/) {
        $train_dir = "./";
    }
    else {
        $train_dir .= "/" if ($train_dir !~ /\/$/);
    }

    # get the list of corpora in the corpora directory
    # if a corpus file name has not been supplied
    if ($#train_names < 0) {
        opendir(DIR, $train_dir) || die "$! [" . $train_dir . "]\n";
        @train_names = grep(/^\w+_train_corpus\.txt$/, readdir(DIR));
        closedir(DIR) || die "$!\n";
        die "No training corpora in directory '" . $train_dir . "'\n"
            if ($#train_names < 0);
    }

    # get the length of the longest corpus file name;
    # this will be used for formatting output later
    foreach my $train_name (@train_names) {
        $longest_name = length($train_name)
            if (length($train_name) > $longest_name);
    }
}

sub process_train_corpora {
    # go through each corpus and generate the symbol distribution,
    # then output the data to the distribution file; the first line
    # will contain the total number of tokens and the number of
    # unique tokens while the rest will have a symbol & its count
    foreach my $train_name (@train_names) {
        eval { generate_train_distribution($train_name); };
        die $@ . " [" . $train_name . "]\n" if $@;
        eval { print_train_distribution($train_name); };
        die $@ . " [" . $train_name . "]\n" if $@;
    }
    print "\n\n";
}

sub generate_train_distribution {
    undef %train_dist;
    my $train_name = shift;
    print "\n"
        . sprintf("%-" . ($longest_name + 1) . "s", $train_name) . "=> ";
    open(CORP, $train_dir . $train_name)
        || die "$! [" . $train_dir . $train_name . "]\n";
    while (<CORP>) {
        chomp;
        s/ //g;   # drop spaces; every remaining character is a symbol
        my @line_symbols = split(//, $_);
        foreach my $line_symbol (@line_symbols) {
            $train_dist{total}++;
            $line_symbol = lc($line_symbol);
            $train_dist{unique}++ if (!$train_dist{symbols}{$line_symbol});
            $train_dist{symbols}{$line_symbol}++;
        }
    }
    close(CORP);
}

sub print_train_distribution {
    my $train_name = shift;
    my @chunks = split(/\//, $train_name);
    my ($language, $extraneous) = split(/_/, $chunks[$#chunks]);
    my $train_dist_name = $language . "_train_distribution.txt";
    print $train_dist_name;
    open(DIST, ">" . $train_dir . $train_dist_name)
        || die "$! [" . $train_dir . $train_dist_name . "]\n";
    print DIST $train_dist{total} . " " . $train_dist{unique};
    foreach my $symbol (sort { $a cmp $b } keys %{$train_dist{symbols}}) {
        print DIST "\n" . $symbol . " " . $train_dist{symbols}{$symbol};
    }
    close(DIST);
}

sub print_usage {
    # the body of this usage message was lost from the source listing
    print <<USAGE;
[usage text missing from the source]
USAGE
}
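# ----------------------------------------------------------------------
# Illustration (not in the original source): given a one-line training
# corpus containing "ab ba", generate_train_distribution() counts four
# total symbols and two unique ones, so print_train_distribution()
# writes a distribution file of the form
#
#   4 2
#   a 2
#   b 2
#
# i.e. "<total> <unique>" on the first line, then one "<symbol> <count>"
# pair per line in sorted order.
# ----------------------------------------------------------------------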
#!/usr/bin/perl
# NOTE: the opening of this second script was lost from the source
# listing; the declarations and the lines marked "reconstructed" below
# are inferred from how the surviving code uses them.
use strict;

my %lexicon;                # token => { count => number of occurrences }
my $total_token_count = 0;  # running count of all tokens seen
my $unique_token_count = 0; # running count of distinct tokens seen
my $max_chunk_size;         # longest character chunk (n-gram) to collect
my $training_data_file;     # path to the training data (reconstructed)
my $lexicon_file;           # path to the output lexicon (reconstructed)

# the original argument handling that set $max_chunk_size,
# $training_data_file, and $lexicon_file was lost from the listing
eval { process_training_data(); };  # driver call reconstructed to match
die $@ if $@;                       # the pattern of the other scripts

sub process_training_data {  # sub name reconstructed
    my $length = get_training_data_length();  # reconstructed
    my $line = 0;                             # reconstructed
    my $OUT;                                  # reconstructed
    open(SEN, "<$training_data_file") || die "$!\n";  # reconstructed
    while (<SEN>) {
        chomp;
        s/( +|[^a-zA-Z])//g;   # strip everything but letters
        next if ($_ =~ /^$/);  # skip lines left empty
        my @sentence_chars = split(//, $_);
        for (my $i = 0; $i <= $#sentence_chars; $i++) {
            my $chunk;
            # grow a chunk one character at a time, counting each
            # prefix of it as a token, up to the maximum chunk size
            for (my $j = 0; $j < $max_chunk_size; $j++) {
                if ($i <= ($#sentence_chars - $j)) {
                    $chunk .= $sentence_chars[$i + $j];
                    $total_token_count++;
                    $unique_token_count++ if (!$lexicon{lc($chunk)}{count});
                    $lexicon{lc($chunk)}{count}++;
                }
            }
        }
        # overwrite the progress message in place
        for (my $i = 0; $i < length($OUT); $i++) { print "\b"; }
        $line++;
        my $percent = sprintf("%3d", (($line / $length) * 100));
        $percent =~ s/ +//g;
        $OUT = " Processed training data line " . $line . " out of "
             . $length . " (" . $percent . "%)";
        print $OUT;
    }
    close(SEN);
    print "\n";
    eval { print_lexicon(); };
    die $@ if $@;
}

sub get_training_data_length {
    my $training_data_file_length = 0;
    open(DAT, "<$training_data_file") || die "$!\n";
    while (<DAT>) { $training_data_file_length++; }
    close(DAT);
    return $training_data_file_length;
}

sub print_lexicon {
    print "Printing lexicon to file\n";
    my $tokens = 0;
    my $OUT;
    open(LEX, ">$lexicon_file") || die "$!\n";
    print LEX $total_token_count . " " . $unique_token_count . " "
            . $max_chunk_size;
    foreach my $token (sort { $lexicon{$b}{count} <=> $lexicon{$a}{count} }
                       keys %lexicon) {
        print LEX "\n" . $token . " " . $lexicon{$token}{count};
        for (my $i = 0; $i < length($OUT); $i++) { print "\b"; }
        $tokens++;
        my $percent = sprintf("%3d", (($tokens / $unique_token_count) * 100));
        $percent =~ s/ +//g;
        if ($tokens == 1) {
            $OUT = " Printed " . $tokens . " token out of "
                 . $unique_token_count . " (" . $percent . "%)";
        }
        else {
            $OUT = " Printed " . $tokens . " tokens out of "
                 . $unique_token_count . " (" . $percent . "%)";
        }
        print $OUT;
    }
    close(LEX);
}

exit(0);
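# ----------------------------------------------------------------------
# Illustration (not in the original source): the chunk loop above
# collects every character n-gram of length 1 .. $max_chunk_size. A
# minimal standalone sketch of the same idea, for the line "abc" with
# a maximum chunk size of 2:
#
#   my %lexicon;
#   my @chars = split(//, "abc");
#   for my $i (0 .. $#chars) {
#       my $chunk = "";
#       for my $j (0 .. 1) {              # chunk sizes 1 and 2
#           last if $i + $j > $#chars;
#           $chunk .= $chars[$i + $j];
#           $lexicon{$chunk}++;
#       }
#   }
#   # %lexicon is now (a => 1, ab => 1, b => 1, bc => 1, c => 1),
#   # i.e. 5 total tokens, all of them unique
# ----------------------------------------------------------------------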
"%)"; } print $OUT; } } exit(0); #!/usr/bin/perl use strict; use POSIX qw(floor); my %stems; my $corpus; my $sentence; my $output_data_file; my $minimum_stem_size = 1; my $training_percentage = 100; my $random = 0; my $verbose = 0; my $print_to_file = 0; my $use_heuristic = 0; my $prefix_size = 0; eval { process_options(); }; die $@ if $@; eval { find_stems_and_suffixes(); }; die $@ if $@; eval { print_stems_and_suffixes(); }; die $@ if $@; sub process_options { for (my $i = 0; $i <= $#ARGV; $i++) { if ($ARGV[$i] =~ /(^\-m|\-\-minimum\-stem\-size)$/) { $minimum_stem_size = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /(^\-s|\-\-sentence)$/) { $sentence = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /(^\-c|\-\-corpus)$/) { $corpus = $ARGV[++$i]; $corpus =~ s/\\/\//g; } elsif ($ARGV[$i] =~ /(^\-o|\-\-output\-data\-file)$/) { $output_data_file = $ARGV[++$i]; $output_data_file =~ s/\\/\//g; $print_to_file = 1; } elsif ($ARGV[$i] =~ /(^\-p|\-\-corpus\-percentage)$/) { $training_percentage = $ARGV[++$i]; $training_percentage =~ s/\%//g; } elsif ($ARGV[$i] =~ /(^\-r|\-\-random\-data)$/) { $random = 1; } elsif ($ARGV[$i] =~ /(^\-v|\-\-verbose)$/) { $verbose = 1; } elsif ($ARGV[$i] =~ /(^\-u|\-\-use\-heuristic)$/) { $use_heuristic = 1; } elsif ($ARGV[$i] =~ /(^\-x|\-\-prefix\-size)$/) { $prefix_size = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /(^\-h|\-\-help)$/) { print_help(); exit(0); } } die "You must supply a sentence or the path to a corpus\n" if (($sentence =~ /^$/ || $sentence eq "") && ($corpus =~ /^$/ || $corpus eq "")); } sub print_help { print < 1 +1 for a next character entropy = 1 and a total suffix count > 1 +1 for a total suffix count > 1 and an entropy increase > 0 from the current character to the next +2 for a unique suffix count > 1 and an entropy increase > 0 from the current character to the next +4 for a unique suffix count > 1 one of which is the null string +(# of occurrences) for each bare word without suffixes The default behavior is to not use the heuristic, so the -u, or --use-heuristic argument must be explicitly supplied. If a prefix size is not supplied, all possible morphemes with weights greater than or equal 4 are suggested. Otherwise, if the substring of characters in the currently considered stem equal in length to the prefix size matches an already posited morpheme of weight >= 4, then the stem is considered a concatenation of the shorter morpheme and a suffix, and so will not be added to the list of predicted stems. If a corpus percentage is supplied, then sucfreq will only use the supplied percentage of the input corpus in the prediction of word boundaries. Furthermore, if the random data argument is supplied, sucfreq will pick a random point in the corpus from which to generate input. Supplying the verbose argument will print stems with their associated suffixes. And, unless an output file is supplied, all output is directed to the standard output (STDOUT). Since language is a global phenomenon, sucfreq can only supply local solutions (for English text, in particular). Also, without adding ad-hoc, non-mathematic/non-statistical heuristic devices, sucfreq is doomed to failure in certain cases. For example, irregular verbs will cause problems (e.g. 'went' will not be associated with other 'go' forms) and the dropping or changing of characters before stems (e.g. dropping 'e' before adding 'ing') will also present difficulties. 
Therefore, unless a priori knowledge of the language in question is at hand, using purely statistical methods will not produce perfect solutions to the task of discovering morphemes/word boundaries in a body of text. HELP } sub find_stems_and_suffixes { print "Finding Stems and Suffixes\n"; if ($sentence !~ /^$/ && $sentence ne "") { eval { find_stems_and_suffixes_in_sentence(); }; die $@ if $@; } else { eval { find_stems_and_suffixes_in_corpus(); }; die $@ if $@; } } sub find_stems_and_suffixes_in_sentence { $sentence =~ tr/A-Z/a-z/; my @words = split(/[^a-zA-Z]+/, $sentence); foreach my $word (@words) { for (my $i = $minimum_stem_size; $i <= length($word); $i++) { my $stem = substr($word, 0, $i); my $suffix = substr($word, $i); $stems{$stem}{$suffix}++; } } } sub find_stems_and_suffixes_in_corpus { my $length = get_corpus_length(); my $random_length = floor(($training_percentage / 100) * $length); my $random_start_line; if ($random) { $random_start_line = floor(rand($length - (($training_percentage / 100) * $length))); } else { $random_start_line = 0; } my $random_line = 0; my $random_percent = 0; my $line = 0; my $OUT; open(DAT, "<$corpus") || die "$! [" . $corpus . "]\n"; while () { if ($line < $random_start_line) { $line++; next; } chomp; next if (/^ *$/); tr/A-Z/a-z/; my @words = split(/[^a-zA-Z]+/, $_); foreach my $word (@words) { for (my $i = $minimum_stem_size; $i <= length($word); $i++) { my $stem = substr($word, 0, $i); my $suffix = substr($word, $i); $stems{$stem}{$suffix}++; } } for (my $i = 0; $i < length($OUT); $i++) { print "\b"; } $random_line++; $random_percent = ($random_line / $random_length) * 100; if ($random_line == 1) { $OUT = " Processed corpus line " . $random_line . " out of " . $random_length . " (" . sprintf("%d", $random_percent) . "%)"; } else { $OUT = " Processed corpus line " . $random_line . " out of " . $random_length . " (" . sprintf("%d", $random_percent) . "%)"; } print $OUT; last if ($random_line >= $random_length); } close(DAT); print "\n"; } sub print_stems_and_suffixes { print "Printing Stems"; print " and Suffixes" if ($verbose); print " ("; print "not " if (!$use_heuristic); print "using heuristic)\n"; if ($print_to_file) { open(OUT, ">$output_data_file") || die "$! [" . $output_data_file . 
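# ----------------------------------------------------------------------
# Illustration (not in the original source): suppose the corpus yields
# the stem "walk" with the suffix set { "", "s", "ed", "ing" }. The
# suffixes start with four distinct characters ("", s, e, i), each seen
# once, so the next character entropy is
#     -4 * (1/4) * log2(1/4) = 2 bits  ->  +1 (entropy > 1)
# and, because more than one unique suffix exists and one of them is
# the null string, the null suffix rule adds +4, for a weight of at
# least 5. Whether the two "entropy increase" rules also fire depends
# on the look-behind entropy of "wal", which this sketch leaves open.
# Under the heuristic (-u), any weight >= 4 makes "walk" a predicted
# stem.
# ----------------------------------------------------------------------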
"]\n"; $| = 1; } my $current_stem = "9999999999"; my $tentative_stems = 0; my $predicted_stems = 0; my $lines = keys %stems; my $line = 0; my $OUT; foreach my $stem (sort { $a cmp $b } keys %stems) { my $weight = 0; my $entire_suffix_entropy = get_entire_suffix_entropy($stem); my $suffix_entropy = get_suffix_entropy($stem); my $entropy_difference = $entire_suffix_entropy - $suffix_entropy; my $look_behind_entropy = get_look_behind_entropy($stem); $tentative_stems++; # entropy measures $weight += 1 if ($suffix_entropy > 1); $weight += 1 if ($suffix_entropy == 1 && get_total_suffix_count($stem) > 1); $weight += 1 if (get_total_suffix_count($stem) > 1 && ($suffix_entropy - $look_behind_entropy) > 0); $weight += 2 if (get_unique_suffix_count($stem) > 1 && ($suffix_entropy - $look_behind_entropy) > 0); # null suffix measures $weight += get_total_suffix_count($stem) if (get_unique_suffix_count($stem) == 1 && get_total_suffix_count($stem) > 1 && is_stem_suffix($stem, "")); $weight += 4 if (get_unique_suffix_count($stem) > 1 && is_stem_suffix($stem, "")); # fix single letter matches $weight = 0 if (length($stem) == 1 && !is_stem_suffix($stem, "")); if (!$use_heuristic || ($use_heuristic && $weight >= 4)) { my $stem_match = ($prefix_size > 0 && length($stem) >= $prefix_size) ? substr($current_stem, 0, $prefix_size) : "\$"; if ($stem !~ /^$stem_match/) { $predicted_stems++; $current_stem = $stem; select(OUT) if ($print_to_file); $| = 1; print " " if (!$print_to_file); print $stem . " (WEIGHT = " . $weight . ")\n"; if ($verbose) { foreach my $suffix (sort { $a cmp $b } keys %{$stems{$stem}}) { print " " if (!$print_to_file); print " " . $suffix . ": " . $stems{$stem}{$suffix} . "\n"; } } } } if ($print_to_file) { select(STDOUT) if ($print_to_file); for (my $i = 0; $i < length($OUT); $i++) { print "\b"; } $line++; my $percent = ($line / $lines) * 100; if ($line == 1) { $OUT = " Processed stem " . $line . " out of " . $lines . " (" . sprintf("%d", $percent) . "%)"; } else { $OUT = " Processed stem " . $line . " out of " . $lines . " (" . sprintf("%d", $percent) . "%)"; } print $OUT; select(OUT); } } if ($print_to_file) { close(OUT); select(STDOUT); $| = 1; print "\n"; } if ($tentative_stems == 1) { print "Found 1 tentative stem\n"; } else { print "Found " . $tentative_stems . " tentative stems\n"; } if ($predicted_stems == 1) { print "Predicted 1 stem\n"; } else { print "Predicted " . $predicted_stems . 
" stems\n"; } } sub get_suffix_entropy { my $stem = shift; my %characters; my $character_count = 0; my $suffix_entropy = 0; foreach my $suffix (keys %{$stems{$stem}}) { $characters{substr($suffix, 0, 1)}++; $character_count++; } foreach my $character (keys %characters) { my $character_probability = ($characters{$character} / $character_count); $suffix_entropy -= $character_probability * (log($character_probability) / log(2)); } return $suffix_entropy; } sub get_entire_suffix_entropy { my $stem = shift; my $entire_suffix_entropy = 0; foreach my $suffix (keys %{$stems{$stem}}) { my $suffix_probability = ($stems{$stem}{$suffix} / get_total_suffix_count($stem)); $entire_suffix_entropy -= $suffix_probability * (log($suffix_probability) / log(2)); } return $entire_suffix_entropy; } sub get_look_behind_entropy { my $stem = shift; $stem = substr($stem, 0, length($stem) - 1); my %characters; my $character_count = 0; my $look_behind_entropy = 0; foreach my $suffix (keys %{$stems{$stem}}) { $characters{substr($suffix, 0, 1)}++; $character_count++; } foreach my $character (keys %characters) { my $character_probability = ($characters{$character} / $character_count); $look_behind_entropy -= $character_probability * (log($character_probability) / log(2)); } return $look_behind_entropy; } sub get_total_suffix_count { my $stem = shift; my $total_suffix_count = 0; foreach my $suffix (keys %{$stems{$stem}}) { $total_suffix_count += $stems{$stem}{$suffix}; } return $total_suffix_count; } sub get_unique_suffix_count { my $stem = shift; my @suffixes = keys %{$stems{$stem}}; my $unique_suffix_count = $#suffixes + 1; return $unique_suffix_count; } sub is_stem_suffix { my ($stem, $suffix) = @_; my $is_suffix = 0; foreach my $temp_suffix (keys %{$stems{$stem}}) { $is_suffix = 1 if ($temp_suffix =~ /^$suffix$/); } return $is_suffix; } sub get_corpus_length { my $corpus_length = 0; open(DAT, "<$corpus") || die "$! [" . $corpus . "]\n"; while () { chomp; next if ($_ =~ /^$/); $corpus_length++; } close(DAT); return $corpus_length; } exit(0);