package Lingua::PT::Stemmer;
$Lingua::PT::Stemmer::VERSION = '0.02';

use 5.006;
use strict;
use warnings;

# Accented letters used by the rule tables, written as escapes so that this
# file stays pure ASCII.  The words handed to strip()/stem() must use these
# same code points (Latin-1 bytes or decoded character strings); raw UTF-8
# byte sequences will not match any accented rule.
my $aa = "\xe1";    # a with acute
my $ea = "\xe9";    # e with acute
my $ia = "\xed";    # i with acute
my $oa = "\xf3";    # o with acute
my $ua = "\xfa";    # u with acute
my $at = "\xe3";    # a with tilde
my $ot = "\xf5";    # o with tilde
my $ac = "\xe2";    # a with circumflex
my $ec = "\xea";    # e with circumflex
my $oc = "\xf4";    # o with circumflex
my $ag = "\xe0";    # a with grave
my $cc = "\xe7";    # c with cedilla

# Suffix rule tables.  Each entry reads
#     suffix => [ minimum stem length, replacement ]
# i.e. the suffix is replaced only when at least that many characters
# precede it in the word.
my %rule;

$rule{plural} = {
    "ns"       => [ 1, "m" ],
    "${ot}es"  => [ 3, "${at}o" ],
    "${at}es"  => [ 1, "${at}o" ],
    "ais"      => [ 1, "al" ],
    "${ea}is"  => [ 2, "el" ],
    "eis"      => [ 2, "el" ],
    "${oa}is"  => [ 2, "ol" ],
    "is"       => [ 2, "il" ],
    "les"      => [ 2, "l" ],
    "res"      => [ 3, "r" ],
    "s"        => [ 2, "" ],
};

$rule{femin} = {
    "ona"       => [ 3, "${at}o" ],
    "${at}"     => [ 2, "${at}o" ],
    "ora"       => [ 3, "or" ],
    "na"        => [ 4, "no" ],
    "inha"      => [ 3, "inho" ],
    "esa"       => [ 3, "${ec}s" ],
    "osa"       => [ 3, "oso" ],
    "${ia}aca"  => [ 3, "${ia}aco" ],
    "ica"       => [ 3, "ico" ],
    "ada"       => [ 3, "ado" ],
    "ida"       => [ 3, "ido" ],
    "${ia}da"   => [ 3, "ido" ],
    "ima"       => [ 3, "imo" ],
    "iva"       => [ 3, "ivo" ],
    "eira"      => [ 3, "eiro" ],
};

$rule{augment} = {
    "d${ia}ssimo"     => [ 5, '' ],
    "abil${ia}ssimo"  => [ 5, '' ],
    "${ia}ssimo"      => [ 3, '' ],
    "${ea}simo"       => [ 3, '' ],
    "${ea}rrimo"      => [ 4, '' ],
    "zinho"           => [ 2, '' ],
    "quinho"          => [ 4, "c" ],
    "uinho"           => [ 4, '' ],
    "adinho"          => [ 3, '' ],
    "inho"            => [ 3, '' ],
    "alh${at}o"       => [ 4, '' ],
    "u${cc}a"         => [ 4, '' ],
    "a${cc}o"         => [ 4, '' ],
    "ad${at}o"        => [ 4, '' ],
    "${aa}zio"        => [ 3, '' ],
    "arraz"           => [ 4, '' ],
    "arra"            => [ 3, '' ],
    "z${at}o"         => [ 2, '' ],
    "${at}o"          => [ 3, '' ],
};

$rule{noun} = {
    "encialista"   => [ 4, '' ],
    "alista"       => [ 5, '' ],
    "agem"         => [ 3, '' ],
    "iamento"      => [ 4, '' ],
    "amento"       => [ 3, '' ],
    "imento"       => [ 3, '' ],
    "alizado"      => [ 4, '' ],
    "atizado"      => [ 4, '' ],
    "izado"        => [ 5, '' ],
    "ativo"        => [ 4, '' ],
    "tivo"         => [ 4, '' ],
    "ivo"          => [ 4, '' ],
    "ado"          => [ 2, '' ],
    "ido"          => [ 3, '' ],
    "ador"         => [ 3, '' ],
    "edor"         => [ 3, '' ],
    "idor"         => [ 4, '' ],
    "at${oa}ria"   => [ 5, '' ],
    "or"           => [ 2, '' ],
    "abilidade"    => [ 5, '' ],
    "icionista"    => [ 4, '' ],
    "cionista"     => [ 5, '' ],
    "ional"        => [ 4, '' ],
    "${ec}ncia"    => [ 3, '' ],
    "${ac}ncia"    => [ 4, '' ],
    "edouro"       => [ 3, '' ],
    "queiro"       => [ 3, 'c' ],
    "eiro"         => [ 3, '' ],
    "oso"          => [ 3, '' ],
    "aliza${cc}"   => [ 5, '' ],
    "ismo"         => [ 3, '' ],
    "iza${cc}"     => [ 5, '' ],
    "a${cc}"       => [ 3, '' ],
    "i${cc}"       => [ 3, '' ],
    "${aa}rio"     => [ 3, '' ],
    "${ea}rio"     => [ 6, '' ],
    "${ec}s"       => [ 4, '' ],
    "eza"          => [ 3, '' ],
    "ez"           => [ 4, '' ],
    "esco"         => [ 4, '' ],
    "ante"         => [ 2, '' ],
    "${aa}stico"   => [ 4, '' ],
    "${aa}tico"    => [ 3, '' ],
    "ico"          => [ 4, '' ],
    "ividade"      => [ 5, '' ],
    "idade"        => [ 5, '' ],
    "oria"         => [ 4, '' ],
    "encial"       => [ 5, '' ],
    "ista"         => [ 4, '' ],
    "quice"        => [ 4, 'c' ],
    "ice"          => [ 4, '' ],
    "${ia}aco"     => [ 3, '' ],
    "ente"         => [ 4, '' ],
    "inal"         => [ 3, '' ],
    "ano"          => [ 4, '' ],
    "${aa}vel"     => [ 2, '' ],
    "${ia}vel"     => [ 5, '' ],
    "ura"          => [ 4, '' ],
    "ual"          => [ 3, '' ],
    "ial"          => [ 3, '' ],
    "al"           => [ 4, '' ],
};

$rule{verb} = {
    "ar${ia}amo"   => [ 2, '' ],
    "eria"         => [ 3, '' ],
    "${aa}ssemo"   => [ 2, '' ],
    "ermo"         => [ 3, '' ],
    "er${ia}amo"   => [ 2, '' ],
    "esse"         => [ 3, '' ],
    "${ec}ssemo"   => [ 2, '' ],
    "este"         => [ 3, '' ],
    "ir${ia}amo"   => [ 3, '' ],
    "${ia}amo"     => [ 3, '' ],
    "${ia}ssemo"   => [ 3, '' ],
    "iram"         => [ 3, '' ],
    "${aa}ramo"    => [ 2, '' ],
    "${ia}ram"     => [ 3, '' ],
    "${aa}rei"     => [ 2, '' ],
    "irde"         => [ 2, '' ],
    "aremo"        => [ 2, '' ],
    "irei"         => [ 3, '' ],
    "ariam"        => [ 2, '' ],
    "irem"         => [ 3, '' ],
    "ar${ia}ei"    => [ 2, '' ],
    "iria"         => [ 3, '' ],
    "${aa}ssei"    => [ 2, '' ],
    "irmo"         => [ 3, '' ],
    "assem"        => [ 2, '' ],
    "isse"         => [ 3, '' ],
    "${aa}vamo"    => [ 2, '' ],
    "iste"         => [ 4, '' ],
    "${ec}ramo"    => [ 3, '' ],
    "amo"          => [ 2, '' ],
    "eremo"        => [ 3, '' ],
    "ara"          => [ 2, '' ],
    "eriam"        => [ 3, '' ],
    "ar${aa}"      => [ 2, '' ],
    "er${ia}ei"    => [ 3, '' ],
    "are"          => [ 2, '' ],
    "${ec}ssei"    => [ 3, '' ],
    "ava"          => [ 2, '' ],
    "essem"        => [ 3, '' ],
    "emo"          => [ 2, '' ],
    "${ia}ramo"    => [ 3, '' ],
    "era"          => [ 3, '' ],
    "iremo"        => [ 3, '' ],
    "er${aa}"      => [ 3, '' ],
    "iriam"        => [ 3, '' ],
    "ere"          => [ 3, '' ],
    "ir${ia}ei"    => [ 3, '' ],
    "iam"          => [ 3, '' ],
    "${ia}ssei"    => [ 3, '' ],
    "${ia}ei"      => [ 3, '' ],
    "issem"        => [ 3, '' ],
    "imo"          => [ 3, '' ],
    "ando"         => [ 2, '' ],
    "ira"          => [ 3, '' ],
    "endo"         => [ 3, '' ],
    "ir${aa}"      => [ 3, '' ],
    "indo"         => [ 3, '' ],
    "ire"          => [ 3, '' ],
    "ondo"         => [ 3, '' ],
    "omo"          => [ 3, '' ],
    "aram"         => [ 2, '' ],
    "ai"           => [ 2, '' ],
    "arde"         => [ 2, '' ],
    "am"           => [ 2, '' ],
    "arei"         => [ 2, '' ],
    "ear"          => [ 4, '' ],
    "arem"         => [ 2, '' ],
    "ar"           => [ 2, '' ],
    "aria"         => [ 2, '' ],
    "uei"          => [ 3, '' ],
    "armo"         => [ 2, '' ],
    "ei"           => [ 3, '' ],
    "asse"         => [ 2, '' ],
    "em"           => [ 2, '' ],
    "aste"         => [ 2, '' ],
    "er"           => [ 2, '' ],
    "avam"         => [ 2, '' ],
    "eu"           => [ 3, '' ],
    "${aa}vei"     => [ 2, '' ],
    "ia"           => [ 3, '' ],
    "eram"         => [ 3, '' ],
    "ir"           => [ 3, '' ],
    "erde"         => [ 3, '' ],
    "iu"           => [ 3, '' ],
    "erei"         => [ 3, '' ],
    "ou"           => [ 3, '' ],
    "${ec}rei"     => [ 3, '' ],
    "i"            => [ 3, '' ],
    "erem"         => [ 3, '' ],
};

# Accented letter => unaccented replacement.  The circumflex and grave
# forms (a-circumflex, o-circumflex, a-grave) are included so that the
# accent step leaves no accented vowel behind.
$rule{accent} = {
    $aa => 'a',
    $ag => 'a',
    $at => 'a',
    $ac => 'a',
    $ea => 'e',
    $ec => 'e',
    $ia => 'i',
    $oa => 'o',
    $ot => 'o',
    $oc => 'o',
    $ua => 'u',
    $cc => 'c',
};

# Suffix tables compiled once at load time: for every rule set, a list of
# [ compiled pattern, replacement ] pairs ordered longest suffix first, so
# the most specific suffix is always tried before a shorter one.  The
# suffixes are internal constants without regex metacharacters, so they are
# interpolated as they are.
my %suffix_rules;
for my $set (qw/plural femin augment noun verb/) {
    my $table = $rule{$set};
    my @compiled;
    for my $suffix ( sort { length($b) <=> length($a) } keys %{$table} ) {
        my ( $min_stem, $replacement ) = @{ $table->{$suffix} };
        push @compiled, [ qr/^(.{$min_stem,})$suffix$/, $replacement ];
    }
    $suffix_rules{$set} = \@compiled;
}

# One pattern matching any single letter present in the accent map.
my $accent_re = do {
    my $class = join '', keys %{ $rule{accent} };
    qr/([$class])/;
};

# strip($cmd, $word)
#
# Applies a single stemming step to $word and returns the resulting word.
#
#   $cmd  - step name: 'plural', 'femin', 'augment', 'noun', 'verb'
#           (suffix tables), 'adv', 'vowel' or 'accent'.  Any other name
#           leaves the word unchanged.
#   $word - the word to reduce; it is not modified in place.
sub strip {
    my ( $cmd, $word ) = @_;

    if ( $cmd eq 'accent' ) {
        # Replace every mapped accented letter by its plain counterpart.
        $word =~ s/$accent_re/$rule{accent}{$1}/g;
    }
    elsif ( $cmd eq 'adv' ) {
        # Only a *trailing* "mente" is an adverb suffix; anchoring at the
        # end keeps words such as "complementei" intact.
        $word =~ s/(.{4,})mente$/$1/;
    }
    elsif ( $cmd eq 'vowel' ) {
        # a, e and o are tried in this order, one after another, so a word
        # may lose more than one trailing vowel as long as at least three
        # characters remain in front of each.
        for my $vowel (qw/a e o/) {
            $word =~ s/(.{3,})$vowel$/$1/;
        }
    }
    elsif ( my $rules = $suffix_rules{$cmd} ) {
        # First (i.e. longest) matching suffix wins; later rules are
        # not consulted.
        for my $entry ( @{$rules} ) {
            my ( $pattern, $replacement ) = @{$entry};
            last if $word =~ s/$pattern/$1$replacement/;
        }
    }

    return $word;
}

# stem(@words) / stem(\@words)
#
# Stems every word and returns the stems in input order: as a list in list
# context, as an array reference otherwise.  The caller's data is never
# modified.
#
# Steps, in the order applied: plural reduction (only for words ending in
# "s"), feminine reduction (only for words ending in "a"), then augment,
# adv, noun, verb, vowel and accent unconditionally.
sub stem {
    # Work on a copy so that the loop below can rewrite the words in place.
    my @stems = ref $_[0] ? @{ $_[0] } : @_;

    for my $word (@stems) {
        $word = strip( 'plural', $word ) if $word =~ /s$/;
        $word = strip( 'femin',  $word ) if $word =~ /a$/;
        for my $step (qw/augment adv noun verb vowel accent/) {
            $word = strip( $step, $word );
        }
    }

    return wantarray ? @stems : \@stems;
}

1;

__END__

=head1 NAME

Lingua::PT::Stemmer - Portuguese language stemming

=head1 SYNOPSIS

    use Lingua::PT::Stemmer;

    Lingua::PT::Stemmer::stem(\@words);

    # or

    Lingua::PT::Stemmer::stem(@words);

=head1 DESCRIPTION

This module implements a Portuguese stemming algorithm proposed in the
paper B<A Stemming Algorithm for the Portuguese Language> by
B<Viviane Moreira Orengo> and B<Christian Huyck>.

The eight steps of the stemming algorithm are listed as follows:

=over 8

=item * Plural Reduction

=item * Feminine Reduction

=item * Adverb Reduction

=item * Augmentative/Diminutive Reduction

=item * Noun Suffix Reduction

=item * Verb Suffix Reduction

=item * Vowel Reduction

=item * Accents Removal

=back

=head1 SEE ALSO

Orengo, V. M. and Huyck, C., I<A Stemming Algorithm for the Portuguese
Language>, Proceedings of SPIRE 2001.

=head1 COPYRIGHT

xern E<lt>xern@cpan.orgE<gt>

This module is free software; you can redistribute it or modify it under
the same terms as Perl itself.

=cut