Merge branch 'master' of github.com:moses-smt/mosesdecoder

This commit is contained in:
Barry Haddow 2019-09-30 15:33:33 +01:00
commit 01a8ec41e8
2 changed files with 26 additions and 8 deletions

View File

@ -3,9 +3,13 @@
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#use strict;
use strict;
use warnings;
use Getopt::Std;
our ($opt_q, %count);
our $M = 0;
getopts('q');
my $target = shift;
@ -22,19 +26,19 @@ while (<TARGET>) {
unless (defined $opt_q) {
print STDERR "\r$M" if ++$M%1000 == 0;
}
@T = split;
my @T = split;
$_ = <SOURCE>;
@S = split;
my @S = split;
$_ = <ALIGN>;
@A = split;
my @A = split;
my(@source_links,@target_links);
for( $i=0; $i<=$#A; $i+=2 ) {
for(my $i=0; $i<=$#A; $i+=2 ) {
$target_links[$A[$i]]++;
$source_links[$A[$i+1]]++;
}
for( $i=0; $i<=$#A; $i+=2 ) {
for(my $i=0; $i<=$#A; $i+=2 ) {
if ($target_links[$A[$i]] == 1 && $source_links[$A[$i+1]] == 1 &&
$T[$A[$i]] eq $S[$A[$i+1]])
{
@ -46,6 +50,6 @@ while (<TARGET>) {
}
}
foreach $w (sort keys %count) {
foreach my $w (sort keys %count) {
print "$w\n" if $count{$w}==1;
}

View File

@ -167,6 +167,20 @@ sub preprocess {
}{$1\n$2}gx;
}
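# Urdu support: insert a line break after sentence-final punctuation
# (. ? ! or U+06D4, or a run of periods) and any quote/bracket characters
# that follow it, when the next token -- after optional opening punctuation --
# begins with a character in the range U+0600-U+06FF.
# https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode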
if ($language eq 'ur') {
$text =~ s{
( (?: [\.\?!\x{06d4}] | \.\.+ )
[\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
)
\s+
( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
[\x{0600}-\x{06ff}]
)
}{$1\n$2}gx;
}
# Special punctuation cases are covered. Check all remaining periods.
my $word;
my $i;
@ -179,7 +193,7 @@ sub preprocess {
my $starting_punct = $2;
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
# Not breaking;
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
} elsif ($words[$i] =~ /(\.?)[\p{IsUpper}\-]+(\.+)$/) {
# Not breaking - upper case acronym
} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
# The next word has a bunch of initial quotes, maybe a