basic support for Gujarati and Hindi, backported from one of the many upstreams

This commit is contained in:
Barry Haddow 2018-10-30 14:16:16 +00:00
parent 979dd5a403
commit d2b558728f
2 changed files with 16 additions and 2 deletions

View File

@ -12,11 +12,11 @@ if ($type =~ /^s/i) {
print "<doc docid=\"doc\">\n";
}
elsif ($type =~ /^t/i) {
print "<tstset setid=\"test\" tgtlang=\"any\" srclang=\"any\">\n";
print "<tstset setid=\"test\" trglang=\"any\" srclang=\"any\">\n";
print "<doc sysid=\"moses\" docid=\"doc\">\n";
}
elsif ($type =~ /^r/i) {
print "<refset setid=\"test\" tgtlang=\"any\" srclang=\"any\">\n";
print "<refset setid=\"test\" trglang=\"any\" srclang=\"any\">\n";
print "<doc sysid=\"ref\" docid=\"doc\">\n";
}
else {

View File

@ -151,6 +151,20 @@ sub preprocess {
$text =~ s/ +/ /g;
}
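# Hindi and Gujarati do not capitalise the first character of a sentence, so
# a sentence start is recognised by a Devanagari (U+0900-U+097F) or Gujarati
# (U+0A80-U+0AFF) character instead of an upper-case letter.
# Hindi also traditionally uses a danda (U+0964) as a sentence separator; the
# double danda (U+0965) is accepted here as well, alongside ". ? !" and "..".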
if ($language eq 'hi' || $language eq 'gu') {
$text =~ s{
( (?: [\.\?!\x{0964}\x{0965}] | \.\.+ )
[\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
)
\s+
( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
[\x{0900}-\x{097F}\x{0a80}-\x{0aff}]
)
}{$1\n$2}gx;
}
# Special punctuation cases are covered. Check all remaining periods.
my $word;
my $i;