# License Start: 
#                    Carnegie Mellon University                      
#                      Copyright (c) 2004                            
#                       All Rights Reserved.                         
#
# Permission is hereby granted, free of charge, to use and distribute
# this software and its documentation without restriction, including 
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of this work, and to    
# permit persons to whom this work is furnished to do so, subject to 
# the following conditions:                                          
#  1. The code must retain the above copyright notice, this list of  
#     conditions and the following disclaimer.                       
#  2. Any modifications must be clearly marked as such.              
#  3. Original authors' names are not deleted.                       
#  4. The authors' names are not used to endorse or promote products 
#     derived from this software without specific prior written      
#     permission.                                                    
#
# CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK       
# DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING    
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT 
# SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE    
# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES  
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,        
# ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF     
# THIS SOFTWARE.                                                     
#
# Author: Satanjeev "Bano" Banerjee satanjeev@cmu.edu
# Author: Alon Lavie alavie@cs.cmu.edu
#
# License End.


package porter_stem;


require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(setUpDataStructures);

use Lingua::Stem::Snowball;

# this will create the alignment ready data structures by matching
# only words that are identical after stemming (but not identical
# before stemming)
{
# Initialize a default stemmer with 
my $stemmer = Lingua::Stem::Snowball->new( lang => 'en', encoding => 'UTF-8', );

sub setUpDataStructures
{
    # initialize the porter stemmer (this will get called every time
    # this module is called, but that's okay since this module will
    # likely be called just once, and it doesn't hurt to do the
    # initialization multiple times if need be)
    porter_initialise();

    my $firstStringWordsRef = shift;
    my $secondStringWordsRef = shift;
    my $string2OriginalPosRef = shift;
    my $string2MatchedPosRef = shift;
    my $multiChoiceWordIndexesRef = shift;
    my $posChoicesRef = shift;
    my $alreadyAlignedFirstStringRef = shift;
    my $alreadyAlignedSecondStringRef = shift;
    my $lang = shift;
    
    # Set the stemmer language
    $stemmer->lang($lang);
    
    my @firstStringStems = ();
    my @secondStringStems = ();

    my $i;
    
    # Take the first string, stem the words and create a hash that
    # maps words to their (list of) positions in the string. Do this
    # only for words that are not already aligned in the previous
    # stages

    my %string1Pos = ();
    for ($i = 0; $i <= $#{$firstStringWordsRef}; $i++) 
    { 
	next if (defined ${$alreadyAlignedFirstStringRef}{$i});
	
	my $wordStem = $stemmer->stem(${$firstStringWordsRef}[$i]);
	$firstStringStems[$i] = $wordStem;
	push @{$string1Pos{$wordStem}}, $i; 
    }
    
    # Do the same for the second string
    my %string2Pos = ();
    for ($i = 0; $i <= $#{$secondStringWordsRef}; $i++) 
    { 
	next if (defined ${$alreadyAlignedSecondStringRef}{$i});
	
	my $wordStem = $stemmer->stem(${$secondStringWordsRef}[$i]);
	$secondStringStems[$i] = $wordStem;
	push @{$string2Pos{$wordStem}}, $i;
    }

    # Now to construct the data structures for the alignment module 
    my $index = 0;
    for ($i = 0; $i <= $#secondStringStems; $i++)
    {
	# skip if already aligned
	next if (defined ${$alreadyAlignedSecondStringRef}{$i});

	# skip if this stem doesn't occur in first string
	next unless (defined $string1Pos{$secondStringStems[$i]}); 
    
	${$string2OriginalPosRef}[$index] = $i; # position in original 2nd string

	# check if occurs only once in both first and second string
	if (($#{$string1Pos{$secondStringStems[$i]}} == 0) && 
            ($#{$string2Pos{$secondStringStems[$i]}} == 0))
	{
	    ${$string2MatchedPosRef}[$index] = ${$string1Pos{$secondStringStems[$i]}}[0]; 
        }

        else 
	{ 
	    # okay, so this word has multiple pos choices
	    ${$multiChoiceWordIndexesRef}{$index} = 1;
	    
	    ${$string2MatchedPosRef}[$index] = $secondStringStems[$i]; # this will be the key into the $posChoicesRef hash
	    
	    # create pos choices hash element for this stem, if not already created!
	    unless (defined ${$posChoicesRef}{${$string2MatchedPosRef}[$index]})
	    {
		# put in all the first string positions for this word
		@{${$posChoicesRef}{${$string2MatchedPosRef}[$index]}} = @{$string1Pos{${$string2MatchedPosRef}[$index]}};
		
		# let d = number of occurrences in second string minus num
		# occ in second string for this word. Need d instances of
		# "-1" (skip) choices in the choices array.
		my $k; 
		for ($k = 0; $k < ($#{$string2Pos{${$string2MatchedPosRef}[$index]}} - $#{$string1Pos{${$string2MatchedPosRef}[$index]}}); $k++)
		{
		    push @{${$posChoicesRef}{${$string2MatchedPosRef}[$index]}}, "-1";
		}
	    }
	}
    
        $index++;
    }
}

}
########################################
#
# The official Perl Porter Stemmer
#
# from http://www.tartarus.org/~martin/PorterStemmer/perl.txt
#
########################################

local %step2list;
local %step3list;
local ($c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);


sub porter_stem
{  my ($stem, $suffix, $firstch);
   my $w = shift;
   if (length($w) < 3) { return $w; } # length at least 3
   # now map initial y to Y so that the patterns never treat it as vowel:
   $w =~ /^./; $firstch = $&;
   if ($firstch =~ /^y/) { $w = ucfirst $w; }

   # Step 1a
   if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
   elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
   # Step 1b
   if ($w =~ /eed$/) { if ($` =~ /$mgr0/o) { chop($w); } }
   elsif ($w =~ /(ed|ing)$/)
   {  $stem = $`;
      if ($stem =~ /$_v/o)
      {  $w = $stem;
         if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
         elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
         elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; }
      }
   }
   # Step 1c
   if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/o) { $w = $stem."i"; } }

   # Step 2
   if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/)
   { $stem = $`; $suffix = $1;
     if ($stem =~ /$mgr0/o) { $w = $stem . $step2list{$suffix}; }
   }

   # Step 3

   if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/)
   { $stem = $`; $suffix = $1;
     if ($stem =~ /$mgr0/o) { $w = $stem . $step3list{$suffix}; }
   }

   # Step 4

   if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
   { $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } }
   elsif ($w =~ /(s|t)(ion)$/)
   { $stem = $` . $1; if ($stem =~ /$mgr1/o) { $w = $stem; } }


   #  Step 5

   if ($w =~ /e$/)
   { $stem = $`;
     if ($stem =~ /$mgr1/o or
         ($stem =~ /$meq1/o and not $stem =~ /^${C}${v}[^aeiouwxy]$/o))
        { $w = $stem; }
   }
   if ($w =~ /ll$/ and $w =~ /$mgr1/o) { chop($w); }

   # and turn initial Y back to y
   if ($firstch =~ /^y/) { $w = lcfirst $w; }
   return $w;
}

sub porter_initialise {

   %step2list =
   ( 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble',
     'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate',
     'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al',
     'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');

   %step3list =
   ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'');


   $c =    "[^aeiou]";          # consonant
   $v =    "[aeiouy]";          # vowel
   $C =    "${c}[^aeiouy]*";    # consonant sequence
   $V =    "${v}[aeiou]*";      # vowel sequence

   $mgr0 = "^(${C})?${V}${C}";               # [C]VC... is m>0
   $meq1 = "^(${C})?${V}${C}(${V})?" . '$';  # [C]VC[V] is m=1
   $mgr1 = "^(${C})?${V}${C}${V}${C}";       # [C]VCVC... is m>1
   $_v   = "^(${C})?${v}";                   # vowel in stem

}

1;
