mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 04:43:03 +03:00
Proper spacing
This commit is contained in:
parent
f46ee7c5ac
commit
d03df21e88
@ -27,10 +27,10 @@ my $LIST_ITEM = 0;
|
|||||||
while (@ARGV) {
|
while (@ARGV) {
|
||||||
$_ = shift;
|
$_ = shift;
|
||||||
/^-l$/ && ($language = shift, next);
|
/^-l$/ && ($language = shift, next);
|
||||||
/^-p$/ && ($prefixfile = shift, next);
|
/^-p$/ && ($prefixfile = shift, next);
|
||||||
/^-q$/ && ($QUIET = 1, next);
|
/^-q$/ && ($QUIET = 1, next);
|
||||||
/^-h$/ && ($HELP = 1, next);
|
/^-h$/ && ($HELP = 1, next);
|
||||||
/^-i$/ && ($LIST_ITEM = 1, next);
|
/^-i$/ && ($LIST_ITEM = 1, next);
|
||||||
/^-b$/ && ($|++, next); # no output buffering
|
/^-b$/ && ($|++, next); # no output buffering
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -39,7 +39,7 @@ if ($HELP) {
|
|||||||
print "-q: quiet mode\n";
|
print "-q: quiet mode\n";
|
||||||
print "-b: no output buffering (for use in bidirectional pipes)\n";
|
print "-b: no output buffering (for use in bidirectional pipes)\n";
|
||||||
print "-p: use a custom prefix file, overriding the installed one\n";
|
print "-p: use a custom prefix file, overriding the installed one\n";
|
||||||
print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
|
print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
|
||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
if (!$QUIET) {
|
if (!$QUIET) {
|
||||||
@ -53,17 +53,17 @@ if ($language eq "yue" || $language eq "zh" || $language eq "ja") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ($prefixfile ne "") {
|
if ($prefixfile ne "") {
|
||||||
print STDERR "Loading non-breaking prefixes from $prefixfile\n";
|
print STDERR "Loading non-breaking prefixes from $prefixfile\n";
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
$prefixfile = "$mydir/nonbreaking_prefix.$language";
|
$prefixfile = "$mydir/nonbreaking_prefix.$language";
|
||||||
|
|
||||||
# Default to English, if we don't have a language-specific prefix file.
|
# Default to English, if we don't have a language-specific prefix file.
|
||||||
if (!(-e $prefixfile)) {
|
if (!(-e $prefixfile)) {
|
||||||
$prefixfile = "$mydir/nonbreaking_prefix.en";
|
$prefixfile = "$mydir/nonbreaking_prefix.en";
|
||||||
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
|
print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
|
||||||
die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
|
die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (-e "$prefixfile") {
|
if (-e "$prefixfile") {
|
||||||
@ -122,21 +122,20 @@ sub preprocess {
|
|||||||
|
|
||||||
##### Add sentence breaks as needed #####
|
##### Add sentence breaks as needed #####
|
||||||
|
|
||||||
# Sentences can start with upper-case, numbers, or Indic characters
|
# Sentences can start with upper-case, numbers, or Indic characters
|
||||||
my $sentence_start = "\\p{IsUpper}0-9";
|
my $sentence_start = "\\p{IsUpper}0-9";
|
||||||
$sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr");
|
$sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr");
|
||||||
$sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu";
|
$sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu";
|
||||||
$sentence_start .= "\\p{Block: Bengali}" if ($language eq "as" || $language eq "bn" || $language eq "mni");
|
$sentence_start .= "\\p{Block: Bengali}" if ($language eq "as" || $language eq "bn" || $language eq "mni");
|
||||||
$sentence_start .= "\\p{Block: Kannada}" if $language eq "kn";
|
$sentence_start .= "\\p{Block: Kannada}" if $language eq "kn";
|
||||||
$sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml";
|
$sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml";
|
||||||
$sentence_start .= "\\p{Block: Oriya}" if $language eq "or";
|
$sentence_start .= "\\p{Block: Oriya}" if $language eq "or";
|
||||||
$sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa";
|
$sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa";
|
||||||
$sentence_start .= "\\p{Block: Tamil}" if $language eq "ta";
|
$sentence_start .= "\\p{Block: Tamil}" if $language eq "ta";
|
||||||
$sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
|
$sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
|
||||||
$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
|
$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
|
||||||
|
|
||||||
|
# we include danda and double danda (U+0964 and U+0965) as sentence split characters
|
||||||
# we include danda and double danda (U+0964 and U+0965) as sentence split characters
|
|
||||||
|
|
||||||
# Non-period end of sentence markers (?!) followed by sentence starters.
|
# Non-period end of sentence markers (?!) followed by sentence starters.
|
||||||
$text =~ s/([?!\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
|
$text =~ s/([?!\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
|
||||||
@ -153,15 +152,15 @@ sub preprocess {
|
|||||||
# and are followed by a sentence starter punctuation and upper case.
|
# and are followed by a sentence starter punctuation and upper case.
|
||||||
$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
|
$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
|
||||||
|
|
||||||
|
|
||||||
#NOTE: Korean no longer handled here.
|
#NOTE: Korean no longer handled here, because Korean has spaces.
|
||||||
if ($is_cjk == 1) {
|
if ($is_cjk == 1) {
|
||||||
# Chinese uses unusual end-of-sentence markers. These are NOT
|
# Chinese uses unusual end-of-sentence markers. These are NOT
|
||||||
# followed by whitespace. Nor is there any idea of capitalization.
|
# followed by whitespace. Nor is there any idea of capitalization.
|
||||||
# There does not appear to be any unicode category for full-stops
|
# There does not appear to be any unicode category for full-stops
|
||||||
# in general, so list them here. U+3002 U+FF0E U+FF1F U+FF01
|
# in general, so list them here. U+3002 U+FF0E U+FF1F U+FF01
|
||||||
#$text =~ s/([。.?!♪])/$1\n/g;
|
#$text =~ s/([。.?!♪])/$1\n/g;
|
||||||
$text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
|
$text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
|
||||||
|
|
||||||
# A normal full-stop or other Western sentence enders followed
|
# A normal full-stop or other Western sentence enders followed
|
||||||
# by an ideograph is an end-of-sentence, always.
|
# by an ideograph is an end-of-sentence, always.
|
||||||
@ -175,33 +174,34 @@ sub preprocess {
|
|||||||
# spaces here, so that later processing stages can tokenize readily.
|
# spaces here, so that later processing stages can tokenize readily.
|
||||||
# Note that this handles mixed latinate+CJK.
|
# Note that this handles mixed latinate+CJK.
|
||||||
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
|
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
|
||||||
# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
|
|
||||||
# what we want from sentence-splitter!
|
# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
|
||||||
|
# what we want from sentence-splitter!
|
||||||
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
|
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
|
||||||
#$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
|
#$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
|
||||||
#$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
|
#$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
|
||||||
#$text =~ s/ +/ /g;
|
#$text =~ s/ +/ /g;
|
||||||
}
|
}
|
||||||
|
|
||||||
# Urdu support
|
# Urdu support
|
||||||
# https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
|
# https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
|
||||||
if ($language eq 'ur') {
|
if ($language eq 'ur') {
|
||||||
$text =~ s{
|
$text =~ s{
|
||||||
( (?: [\.\?!\x{06d4}] | \.\.+ )
|
( (?: [\.\?!\x{06d4}] | \.\.+ )
|
||||||
[\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
|
[\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
|
||||||
)
|
)
|
||||||
\s+
|
\s+
|
||||||
( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
|
( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
|
||||||
[\x{0600}-\x{06ff}]
|
[\x{0600}-\x{06ff}]
|
||||||
)
|
)
|
||||||
}{$1\n$2}gx;
|
}{$1\n$2}gx;
|
||||||
}
|
}
|
||||||
|
|
||||||
# Special punctuation cases are covered. Check all remaining periods.
|
# Special punctuation cases are covered. Check all remaining periods.
|
||||||
my $word;
|
my $word;
|
||||||
my $i;
|
my $i;
|
||||||
my @words = split(/\h/,$text);
|
my @words = split(/\h/,$text);
|
||||||
#print "NOW $text\n";
|
#print "NOW $text\n";
|
||||||
$text = "";
|
$text = "";
|
||||||
for ($i=0;$i<(scalar(@words)-1);$i++) {
|
for ($i=0;$i<(scalar(@words)-1);$i++) {
|
||||||
#print "Checking $words[$i] $words[$i+1]\n";
|
#print "Checking $words[$i] $words[$i+1]\n";
|
||||||
@ -211,19 +211,19 @@ sub preprocess {
|
|||||||
my $starting_punct = $2;
|
my $starting_punct = $2;
|
||||||
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
|
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
|
||||||
# Not breaking;
|
# Not breaking;
|
||||||
# print "NBP1 $words[$i] $words[$i+1]\n";
|
## print "NBP1 $words[$i] $words[$i+1]\n";
|
||||||
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
|
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
|
||||||
# Not breaking - upper case acronym
|
# Not breaking - upper case acronym
|
||||||
# print "NBP2 $words[$i] $words[$i+1]\n";
|
#print "NBP2 $words[$i] $words[$i+1]\n";
|
||||||
} elsif ($LIST_ITEM
|
} elsif ($LIST_ITEM
|
||||||
&& ($i == 0 || substr($words[$i-1], -1) eq "\n")
|
&& ($i == 0 || substr($words[$i-1], -1) eq "\n")
|
||||||
&& $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) {
|
&& $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) {
|
||||||
#Maybe list item - non breaking
|
# Maybe list item - non breaking
|
||||||
# print "NBP3 $words[$i] $words[$i+1]\n";
|
#print "NBP3 $words[$i] $words[$i+1]\n";
|
||||||
} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
|
} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
|
||||||
# The next word has a bunch of initial quotes, maybe a
|
# The next word has a bunch of initial quotes, maybe a
|
||||||
# space, then either upper case or a number
|
# space, then either upper case or a number
|
||||||
# print "MAYBE $words[$i] $words[$i+1]\n";
|
#print "MAYBE $words[$i] $words[$i+1]\n";
|
||||||
$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
|
$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
|
||||||
# We always add a return for these, unless we have a
|
# We always add a return for these, unless we have a
|
||||||
# numeric non-breaker and a number start.
|
# numeric non-breaker and a number start.
|
||||||
|
Loading…
Reference in New Issue
Block a user