package Lingua::Stem::Snowball::Da;
use strict;
use bytes;
# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2,
# *NOT* "earlier versions", as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# -+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
#####
use constant DEBUG=>0;
use vars qw(%cache $VERSION);
$Lingua::Stem::Snowball::Da::VERSION = 1.01;
# special characters
my $aa = chr(229); # å
my $ae = chr(230); # æ
my $oe = chr(248); # &oring;
# delete the s if a "s ending" is preceeded by one
# of these characters.
my %s_ending = (
a => 1,
b => 1,
c => 1,
d => 1,
f => 1,
g => 1,
h => 1,
j => 1,
k => 1,
l => 1,
"m" => 1,
n => 1,
o => 1,
p => 1,
r => 1,
t => 1,
v => 1,
"y" => 1,
z => 1,
$aa => 1,
);
# danish vowels.
my $vowels = "aeiouy$ae$aa$oe";
my %vowels = (
a=>1,
e=>1,
i=>1,
o=>1,
u=>1,
"y"=>1,
$ae=>1,
$aa=>1,
$oe=>1,
);
# ####
# the endings in step 1
# XXX: these must be sorted by length
# to save time we've done it already.
my @endings = (
["erendes"],
["erende", "hedens"],
["erens", "endes", "heden", "ethed", "ernes", "erets", "heder", "erede"],
["ende", "enes", "ered", "eren", "erer", "eres", "eret", "erne", "heds"],
["hed", "ers", "ene", "ere", "ens", "ets"],
["er", "es", "et", "en"],
["e"],
);
%Lingua::Stem::Snowball::Da::cache = ();
sub new {
my $pkg = shift;
$pkg = ref $pkg || $pkg;
my %arg = @_;
my $self = {};
bless $self, $pkg;
$self->{USE_CACHE} = $arg{use_cache} || 0;
return $self;
}
sub step1 {
my ($rs, $word) = @_;
# ### STEP 1
my $endinglen = 8;
foreach (@endings) {
$endinglen--;
my $endingw = substr($rs, -$endinglen); # do this once.
foreach (@$_) {
# only continue if the word has this ending at all.
next unless $endingw eq $_;
warn "matched $_ in $word" if DEBUG;
# a) delete the ending.
return substr($word, 0, -$endinglen);
}
}
if (substr($rs, -1) eq 's') { # b)
# check if it has a valid "s ending"...
if ((length $rs == 1) ?
exists $s_ending{substr($word, -2, -1)} :
exists $s_ending{substr($rs, -2, -1)}) {
warn "Valid s eding $word" if DEBUG;
# ...delete the last character (which is a s)
return substr($word, 0, -1);
}
}
return $word;
}
sub stem {
my ($self, $word) = @_;
my $orig_word;
warn " --- start : $word ---" if DEBUG;
if ($self->{USE_CACHE}) {
$orig_word = $word;
return $cache{$word} if defined $cache{$word};
}
my ($rs, $lslen, $rslen) = getsides($word);
return $word unless $lslen >= 3;
$word = step1($rs, $word);
# ### STEP 2
warn "Step 2" if DEBUG;
($rs, $lslen, $rslen) = getsides($word);
return $word unless $lslen >= 3;
if (substr($rs, -2) =~ /gd|dt|gt|kt/) {
warn "delete last letter $word in step 2" if DEBUG;
$word = substr($word, 0, - 1);
($rs, $lslen, $rslen) = getsides($word);
return $word unless $lslen >= 3;
}
# ### STEP 3
if (substr($rs, -4) eq "igst") {
warn "st as in igst deleted in $word" if DEBUG;
$word = substr($word, 0, -2);
($rs, $lslen, $rslen) = getsides($word);
return $word unless $lslen >= 3;
}
if (substr($rs, -4) eq "l${oe}st") {
warn "t as in l${oe}st deleted in $word" if DEBUG;
$word = substr($word, 0, -1);
($rs, $lslen, $rslen) = getsides($word);
return $word unless $lslen >= 3;
}
for (qw/elig lig els ig/) {
my $len = length;
if (substr($rs, -$len) eq $_) {
warn "delete $_ in $word" if DEBUG;
$word = substr($word, 0, -$len);
($rs) = getsides($word);
if (substr($rs, -2) =~ /gd|dt|gt|kt/) {
warn "delete last letter $word in step 2 again" if DEBUG;
$word = substr($word, 0, - 1);
($rs, $lslen, $rslen) = getsides($word);
}
last;
}
}
return $word unless $lslen >= 3 && length $word > 3;
# ### STEP 4
if ($word =~ /([^$vowels])\1$/o) {
warn "delete double konsonant in $word" if DEBUG;
$word = substr($word, 0, -1);
}
if ($self->{USE_CACHE}) {
$cache{$orig_word} = $word;
}
warn " --- end : $word ---" if DEBUG;
return $word;
}
sub getsides {
my $word = shift;
# ###
# find the first vowel with a non-vowel after it.
my($found_vowel, $nonv_position, $curpos) = (0, -1, 0);
#$found_vowel = 1 if exists $vowels{substr($word,0,1)};
foreach (split//, $word) {
$curpos++;
if (exists $vowels{$_}) {
$found_vowel = 1;
next;
} elsif ($found_vowel) {
$nonv_position = $curpos;
last;
}
}
# got nothing: return false
return undef if $nonv_position == -1;
my($rs, $lslen); # left side and right side.
# ###
# length of the left side must be atleast 3 chars.
if ($nonv_position < 3) {
$lslen = length substr($word, 0, 3);
$rs = substr($word, 3);
} else {
$lslen = $nonv_position;
$rs = substr($word, $nonv_position);
}
return($rs, $lslen, length $rs);
}
1;
__END__
=head1 NAME
Lingua::Stem::Snowball::Da - Porters stemming algorithm for Denmark
=head1 SYNOPSIS
use Lingua::Stem::Snowball::Da
my $stemmer = new Lingua::Stem::Snowball::Da (use_cache => 1);
foreach my $word (@words) {
my $stemmed = $stemmer->stem($word);
print $stemmed, "\n";
}
=head1 DESCRIPTION
The stem function takes a scalar as a parameter and stems the word
according to Martin Porters Danish stemming algorithm,
which can be found at the Snowball website: L.
It also supports caching if you pass the use_cache option when constructing
a new L:S:S:D object.
=head2 EXPORT
Lingua::Stem::Snowball::Da has nothing to export.
=head1 AUTHOR
Dennis Haney Edavh@davh.dkE
Ask Solem Hoel, Eask@unixmonks.netE (Swedish version)
=head1 SEE ALSO
L. L. L. L.
=cut