mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-05 02:22:21 +03:00
Merge pull request #168 from tofula/master
Use named capture groups to make the 'protected patterns' recognition regexp safer
This commit is contained in:
commit
59119c0044
@ -176,6 +176,13 @@ sub ngrams {
|
||||
return { md5(encode_utf8($sent)) => 1 };
|
||||
} else {
|
||||
my @words = split /\s+/, $sent;
|
||||
|
||||
#factors
|
||||
if ( $sent =~ m/[|]/) {
|
||||
my $use_index = 0; # default factor is the first one
|
||||
@words = map { ( split /[|]/, $_ ) [$use_index] } @words;
|
||||
}
|
||||
|
||||
my $out;
|
||||
if ($n == 1) {
|
||||
foreach my $w (@words) {
|
||||
|
@ -243,9 +243,9 @@ sub tokenize
|
||||
my @protected = ();
|
||||
foreach my $protected_pattern (@protected_patterns) {
|
||||
my $t = $text;
|
||||
while ($t =~ /($protected_pattern)(.*)$/) {
|
||||
push @protected, $1;
|
||||
$t = $2;
|
||||
while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
|
||||
push @protected, $+{PATTERN};
|
||||
$t = $+{TAIL};
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user