mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 07:42:21 +03:00
Add option to protect expressions - eg URLs
This commit is contained in:
parent
90fe3514bc
commit
67c294a7d8
@ -22,6 +22,8 @@ use Thread;
|
||||
my $mydir = "$RealBin/../share/nonbreaking_prefixes";
|
||||
|
||||
my %NONBREAKING_PREFIX = ();
|
||||
my @protected_patterns = ();
|
||||
my $protected_patterns_file = "";
|
||||
my $language = "en";
|
||||
my $QUIET = 0;
|
||||
my $HELP = 0;
|
||||
@ -42,6 +44,8 @@ while (@ARGV)
|
||||
/^-x$/ && ($SKIP_XML = 1, next);
|
||||
/^-a$/ && ($AGGRESSIVE = 1, next);
|
||||
/^-time$/ && ($TIMING = 1, next);
|
||||
# Option to add list of regexps to be protected
|
||||
/^-protected/ && ($protected_patterns_file = shift, next);
|
||||
/^-threads$/ && ($NUM_THREADS = int(shift), next);
|
||||
/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
|
||||
/^-penn$/ && ($PENN = 1, next);
|
||||
@ -64,6 +68,7 @@ if ($HELP)
|
||||
print " -b ... disable Perl buffering.\n";
|
||||
print " -time ... enable processing time calculation.\n";
|
||||
print " -penn ... use Penn treebank-like tokenization.\n";
|
||||
# The flag is spelled -protected: that is what the option parser matches.
print " -protected FILE ... specify file with patterns to be protected in tokenisation.\n";
|
||||
exit;
|
||||
}
|
||||
|
||||
@ -82,6 +87,16 @@ if (scalar(%NONBREAKING_PREFIX) eq 0)
|
||||
print STDERR "Warning: No known abbreviations for language '$language'\n";
|
||||
}
|
||||
|
||||
# Load protected patterns: the file named by the -protected option holds one
# regular expression per line. Each non-empty line is appended to
# @protected_patterns.
if ($protected_patterns_file)
{
    # Three-argument open with a lexical handle: characters such as '>' or '|'
    # in the file name can no longer change the open mode, and the handle does
    # not leak into the package namespace.
    open(my $pp_fh, '<', $protected_patterns_file)
        or die "Unable to open $protected_patterns_file: $!";
    while (my $pattern = <$pp_fh>) {
        # Strip the line terminator, including a stray CR from CRLF files.
        $pattern =~ s/[\r\n]+\z//;
        # A blank line is not a usable pattern (it would match the empty
        # string at every position), so ignore it.
        next if $pattern eq '';
        push @protected_patterns, $pattern;
    }
    close($pp_fh);
}
|
||||
|
||||
my @batch_sentences = ();
|
||||
my @thread_list = ();
|
||||
my $count_sentences = 0;
|
||||
@ -212,6 +227,19 @@ sub tokenize
|
||||
$text =~ s/\s+/ /g;
|
||||
$text =~ s/[\000-\037]//g;
|
||||
|
||||
# Find protected patterns: collect every distinct substring of $text that
# matches one of the user-supplied regular expressions.
my @protected = ();
my %seen_protected = ();
foreach my $protected_pattern (@protected_patterns) {
    # Match with /g in scalar context so that ALL occurrences are visited, not
    # only the first one. The pattern is wrapped in (?:...) so that it is never
    # the empty pattern (which Perl would treat as "reuse the last successful
    # pattern"). The matched text is read from the match offsets @- / @+, so
    # capture groups inside the user's pattern cannot add stray entries.
    while ($text =~ /(?:$protected_pattern)/g) {
        my $match_length = $+[0] - $-[0];
        # A zero-length match carries nothing to protect.
        next if $match_length == 0;
        my $matched = substr($text, $-[0], $match_length);
        # Each distinct string needs only one placeholder, because the
        # substitution below replaces every occurrence of it.
        next if $seen_protected{$matched}++;
        push @protected, $matched;
    }
}

# Replace each protected string by a numbered alphanumeric placeholder.
for (my $i = 0; $i < scalar(@protected); ++$i) {
    my $subst = sprintf("THISISPROTECTED%.3d", $i);
    $text =~ s,\Q$protected[$i]\E,$subst,g;
}
|
||||
|
||||
# separate out all "other" special characters
|
||||
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
|
||||
|
||||
@ -303,6 +331,12 @@ sub tokenize
|
||||
$text =~ s/^ //g;
|
||||
$text =~ s/ $//g;
|
||||
|
||||
# restore protected: put the original strings back in place of their
# THISISPROTECTEDnnn placeholders.
#
# The placeholders are restored from the highest index down. "%.3d" pads to a
# minimum of three digits, so from index 1000 onwards a placeholder such as
# THISISPROTECTED1000 starts with the shorter THISISPROTECTED100; handling the
# longer one first keeps the shorter one from clobbering it.
for (my $i = scalar(@protected) - 1; $i >= 0; --$i) {
    my $subst = sprintf("THISISPROTECTED%.3d", $i);
    $text =~ s/\Q$subst\E/$protected[$i]/g;
}
|
||||
|
||||
#restore multi-dots
|
||||
while($text =~ /DOTDOTMULTI/)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user