mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-05 02:22:21 +03:00
Handle punctuation+CJK combinations.
This commit is contained in:
parent
6fb2c97029
commit
2e48f83ab4
@ -128,13 +128,19 @@ sub preprocess {
|
||||
|
||||
# A normal full-stop or other Western sentence ender followed
|
||||
# by an ideograph is an end-of-sentence, always.
|
||||
$text =~ s/([\.?!]) *(\p{InCJK})/$1\n$2/g;
|
||||
$text =~ s/([\.?!]) *(\p{CJK})/$1\n$2/g;
|
||||
|
||||
# Split close-paren-then-comma into two.
|
||||
$text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;
|
||||
|
||||
# Chinese does not use any sort of white-space between ideographs.
|
||||
# Nominally, each single ideograph corresponds to one word. Add
|
||||
# spaces here, so that later processing stages can tokenize readily.
|
||||
# Note that this handles mixed latinate+CJK.
|
||||
$text =~ s/(\p{InCJK})/ $1 /g;
|
||||
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
|
||||
$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g
|
||||
$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
|
||||
$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
|
||||
$text =~ s/ +/ /g;
|
||||
|
||||
# Special punctuation cases are covered. Check all remaining periods.
|
||||
|
Loading…
Reference in New Issue
Block a user