added some heuristics for Czech quotation marks

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1567 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
bojar 2008-02-22 15:07:46 +00:00
parent 6af3140978
commit 7f3e34207a

View File

@ -7,6 +7,7 @@
# Read and write text through Perl's :utf8 I/O layer.
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
use strict;
use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
# Language code that detokenize compares against "en" and "cs"; starts as "en".
# NOTE(review): presumably overridden by a command-line option -- confirm.
my $language = "en";
# NOTE(review): looks like a quiet-mode flag, off by default -- confirm where it is read.
my $QUIET = 0;
@ -84,9 +85,21 @@ sub detokenize {
$text = $text.$prependSpace.$words[$i].$words[$i+1];
$i++; # advance over the dash
$prependSpace = "";
} elsif ($words[$i] =~ /^[\'\"]+$/) {
} elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
#combine punctuation smartly
if (($quoteCount{$words[$i]} % 2) eq 0) {
my $normalized_quo = $words[$i];
$normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
$quoteCount{$normalized_quo} = 0
if !defined $quoteCount{$normalized_quo};
if ($language eq "cs" && $words[$i] eq "„") {
# this is always the starting quote in Czech
$quoteCount{$normalized_quo} = 0;
}
if ($language eq "cs" && $words[$i] eq "“") {
# this is usually the ending quote in Czech
$quoteCount{$normalized_quo} = 1;
}
if (($quoteCount{$normalized_quo} % 2) eq 0) {
if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
#single quote for possessives ending in s... "The Jones' house"
#left shift
@ -96,14 +109,14 @@ sub detokenize {
#right shift
$text = $text.$prependSpace.$words[$i];
$prependSpace = "";
$quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
$quoteCount{$normalized_quo} ++;
}
} else {
#left shift
$text=$text.$words[$i];
$prependSpace = " ";
$quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
$quoteCount{$normalized_quo} ++;
}