Merge branch 'master' of github.com:moses-smt/mosesdecoder

2024-08-16 23:10:31 +03:00 · 2019-09-30 15:33:33 +01:00 · 2019-09-30 15:33:33 +01:00 · 01a8ec41e8
commit 01a8ec41e8
parent 768944d851 b21b071a66
2 changed files with 26 additions and 8 deletions
--- a/scripts/OSM/extract-singletons.perl
+++ b/scripts/OSM/extract-singletons.perl
@@ -3,9 +3,13 @@
 # This file is part of moses.  Its use is licensed under the GNU Lesser General
 # Public License version 2.1 or, at your option, any later version.

-#use strict;
+use strict;
 use warnings;
 use Getopt::Std;
+
+our ($opt_q, %count);
+our $M = 0;
+
 getopts('q');

 my $target = shift;
@@ -22,19 +26,19 @@ while (<TARGET>) {
    unless (defined $opt_q) {
 	print STDERR "\r$M" if ++$M%1000 == 0;
    }
-    @T = split;
+    my @T = split;
    $_ = <SOURCE>;
-    @S = split;
+    my @S = split;
    $_ = <ALIGN>;
-    @A = split;
+    my @A = split;

    my(@source_links,@target_links);
-    for( $i=0; $i<=$#A; $i+=2 ) {
+    for(my $i=0; $i<=$#A; $i+=2 ) {
 	$target_links[$A[$i]]++;
 	$source_links[$A[$i+1]]++;
    }

-    for( $i=0; $i<=$#A; $i+=2 ) {
+    for(my $i=0; $i<=$#A; $i+=2 ) {
 	if ($target_links[$A[$i]] == 1 && $source_links[$A[$i+1]] == 1 &&
 	    $T[$A[$i]] eq $S[$A[$i+1]])
 	{
@@ -46,6 +50,6 @@ while (<TARGET>) {
    }
 }

-foreach $w (sort keys %count) {
+foreach my $w (sort keys %count) {
    print "$w\n" if $count{$w}==1;
 }
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -167,6 +167,20 @@ sub preprocess {
        }{$1\n$2}gx;
  }

+  # Urdu support
+  # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
+  if ($language eq 'ur') {
+    $text =~ s{
+            ( (?: [\.\?!\x{06d4}] | \.\.+ )
+              [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
+              )
+            \s+
+            ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
+              [\x{0600}-\x{06ff}]
+              )
+        }{$1\n$2}gx;
+  }
+
 	# Special punctuation cases are covered. Check all remaining periods.
 	my $word;
 	my $i;
@@ -179,7 +193,7 @@ sub preprocess {
 			my $starting_punct = $2;
 			if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
 				# Not breaking;
-			} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
+			} elsif ($words[$i] =~ /(\.?)[\p{IsUpper}\-]+(\.+)$/) {
 				# Not breaking - upper case acronym
 			} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
 				# The next word has a bunch of initial quotes, maybe a