mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-25 21:03:22 +03:00
Proper spacing
This commit is contained in:
parent
f46ee7c5ac
commit
d03df21e88
@ -135,7 +135,6 @@ sub preprocess {
|
|||||||
$sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
|
$sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
|
||||||
$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
|
$sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko";
|
||||||
|
|
||||||
|
|
||||||
# we include danda and double danda (U+0964 and U+0965) as sentence split characters
|
# we include danda and double danda (U+0964 and U+0965) as sentence split characters
|
||||||
|
|
||||||
# Non-period end of sentence markers (?!) followed by sentence starters.
|
# Non-period end of sentence markers (?!) followed by sentence starters.
|
||||||
@ -154,7 +153,7 @@ sub preprocess {
|
|||||||
$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
|
$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
|
||||||
|
|
||||||
|
|
||||||
#NOTE: Korean no longer handled here.
|
#NOTE: Korean no longer handled here, because Korean has spaces.
|
||||||
if ($is_cjk == 1) {
|
if ($is_cjk == 1) {
|
||||||
# Chinese uses unusual end-of-sentence markers. These are NOT
|
# Chinese uses unusual end-of-sentence markers. These are NOT
|
||||||
# followed by whitespace. Nor is there any idea of capitalization.
|
# followed by whitespace. Nor is there any idea of capitalization.
|
||||||
@ -175,6 +174,7 @@ sub preprocess {
|
|||||||
# spaces here, so that later processing stages can tokenize readily.
|
# spaces here, so that later processing stages can tokenize readily.
|
||||||
# Note that this handles mixed latinate+CJK.
|
# Note that this handles mixed latinate+CJK.
|
||||||
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
|
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
|
||||||
|
|
||||||
# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
|
# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
|
||||||
# what we want from sentence-splitter!
|
# what we want from sentence-splitter!
|
||||||
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
|
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
|
||||||
@ -211,7 +211,7 @@ sub preprocess {
|
|||||||
my $starting_punct = $2;
|
my $starting_punct = $2;
|
||||||
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
|
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
|
||||||
# Not breaking;
|
# Not breaking;
|
||||||
# print "NBP1 $words[$i] $words[$i+1]\n";
|
## print "NBP1 $words[$i] $words[$i+1]\n";
|
||||||
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
|
} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
|
||||||
# Not breaking - upper case acronym
|
# Not breaking - upper case acronym
|
||||||
#print "NBP2 $words[$i] $words[$i+1]\n";
|
#print "NBP2 $words[$i] $words[$i+1]\n";
|
||||||
|
Loading…
Reference in New Issue
Block a user