mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-08-16 15:00:33 +03:00
add fi/sv-specific colon handling in tokenizer.perl
This commit is contained in:
parent
5be48ce9db
commit
4a7f16b366
@@ -257,8 +257,17 @@ sub tokenize
|
|||||||
$text =~ s/^ //g;
|
$text =~ s/^ //g;
|
||||||
$text =~ s/ $//g;
|
$text =~ s/ $//g;
|
||||||
|
|
||||||
# seperate out all "other" special characters
|
# separate out all "other" special characters
|
||||||
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
|
if (($language eq "fi") or ($language eq "sv")) {
|
||||||
|
# in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
|
||||||
|
# USA:n, 20:een, EU:ssa, USA:s, S:t
|
||||||
|
$text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
|
||||||
|
# if a colon is not immediately followed by lower-case characters, separate it out anyway
|
||||||
|
$text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
|
||||||
|
}
|
||||||
|
|
||||||
# aggressive hyphen splitting
|
# aggressive hyphen splitting
|
||||||
if ($AGGRESSIVE)
|
if ($AGGRESSIVE)
|
||||||
|
Loading…
Reference in New Issue
Block a user