Add option to protect expressions - eg URLs

This commit is contained in:
Barry Haddow 2013-10-07 09:06:49 +01:00
parent 90fe3514bc
commit 67c294a7d8

View File

@ -22,6 +22,8 @@ use Thread;
# Location of the nonbreaking_prefixes data directory, relative to $RealBin.
my $mydir = "$RealBin/../share/nonbreaking_prefixes";
# Known nonbreaking prefixes (abbreviations) for $language; the script warns
# on STDERR when this is still empty after loading.
my %NONBREAKING_PREFIX = ();
# Regular expressions whose matches must survive tokenization untouched
# (e.g. URLs).  Filled from $protected_patterns_file, one pattern per line.
my @protected_patterns = ();
# Path of the protected-patterns file, set by the -protected option.
# An empty string means no protected patterns are loaded.
my $protected_patterns_file = "";
# Language code used to select abbreviations; defaults to "en".
my $language = "en";
my $QUIET = 0;
# When true, the usage text is printed and the script exits.
my $HELP = 0;
@ -42,6 +44,8 @@ while (@ARGV)
/^-x$/ && ($SKIP_XML = 1, next);
/^-a$/ && ($AGGRESSIVE = 1, next);
/^-time$/ && ($TIMING = 1, next);
# Option to add list of regexps to be protected
/^-protected/ && ($protected_patterns_file = shift, next);
/^-threads$/ && ($NUM_THREADS = int(shift), next);
/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
/^-penn$/ && ($PENN = 1, next);
@ -64,6 +68,7 @@ if ($HELP)
print " -b ... disable Perl buffering.\n";
print " -time ... enable processing time calculation.\n";
print " -penn ... use Penn treebank-like tokenization.\n";
# The option is parsed as -protected (see the argument loop), so the help
# text must advertise that exact spelling.
print " -protected FILE ... specify file with patterns to be protected in tokenisation.\n";
exit;
}
@ -82,6 +87,16 @@ if (scalar(%NONBREAKING_PREFIX) eq 0)
print STDERR "Warning: No known abbreviations for language '$language'\n";
}
# Load protected patterns: when a file was given with -protected, read it
# line by line and append every non-empty line to @protected_patterns as a
# regular expression.  Dies if the file cannot be opened.
if ($protected_patterns_file)
{
  # Three-argument open with a lexical handle: the file name comes from the
  # command line and must never be interpreted as an open mode or a pipe.
  open(my $pp_fh, '<', $protected_patterns_file)
    || die "Unable to open $protected_patterns_file: $!";
  while (my $pattern = <$pp_fh>) {
    chomp $pattern;
    # Tolerate CRLF line endings; a stray carriage return inside the
    # pattern would otherwise become part of the regexp.
    $pattern =~ s/\r$//;
    # Skip blank lines: an empty pattern matches the empty string
    # everywhere and yields an empty "protected" string, which is useless
    # and makes the later substitution pattern empty.
    next if $pattern eq "";
    push @protected_patterns, $pattern;
  }
  close($pp_fh);
}
my @batch_sentences = ();
my @thread_list = ();
my $count_sentences = 0;
@ -212,6 +227,19 @@ sub tokenize
$text =~ s/\s+/ /g;
$text =~ s/[\000-\037]//g;
# Find protected patterns: collect every distinct substring of $text that is
# matched by one of the user-supplied regexps in @protected_patterns.
my @protected = ();
my %seen_protected = ();
foreach my $protected_pattern (@protected_patterns) {
  # Scalar-context //g visits ALL matches of the pattern, not just the
  # first one.  The matched text is taken from @- / @+ (offsets of the
  # whole match) instead of from capture groups, so groups inside the
  # user's own pattern are not mistaken for separate protected strings.
  while ($text =~ /$protected_pattern/g) {
    my $match = substr($text, $-[0], $+[0] - $-[0]);
    # An empty match protects nothing and would turn the substitution
    # below into an empty pattern; repeated strings only need one slot
    # because the substitution below is global.
    next if $match eq "" || $seen_protected{$match}++;
    push @protected, $match;
  }
}
# Replace each protected string by a numbered placeholder made only of
# letters and digits; the index in @protected identifies the original text.
for (my $i = 0; $i < scalar(@protected); ++$i) {
  my $subst = sprintf("THISISPROTECTED%.3d", $i);
  # \Q...\E: the protected text is matched literally, not as a regexp.
  $text =~ s,\Q$protected[$i]\E,$subst,g;
}
# separate out all "other" special characters
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
@ -303,6 +331,12 @@ sub tokenize
$text =~ s/^ //g;
$text =~ s/ $//g;
# restore protected
# Put each original string from @protected back in place of its numbered
# placeholder; the placeholder number is the string's index in @protected.
foreach my $index (0 .. $#protected) {
  my $placeholder = sprintf("THISISPROTECTED%.3d", $index);
  $text =~ s/$placeholder/$protected[$index]/g;
}
#restore multi-dots
while($text =~ /DOTDOTMULTI/)
{