mosesdecoder/scripts/tokenizer/normalize-punctuation.perl
2013-05-16 17:03:37 +01:00

74 lines
1.5 KiB
Perl
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/perl -w
use strict;
my ($language) = @ARGV;
while(<STDIN>) {
s/\r//g;
# remove extra spaces
s/\(/ \(/g;
s/\)/\) /g; s/ +/ /g;
s/\) ([\.\!\:\?\;\,])/\)$1/g;
s/\( /\(/g;
s/ \)/\)/g;
s/(\d) \%/$1\%/g;
s/ :/:/g;
s/ ;/;/g;
# normalize unicode punctuation
s/„/\"/g;
s/“/\"/g;
s/”/\"/g;
s//-/g;
s/—/ - /g; s/ +/ /g;
s/´/\'/g;
s/([a-z])([a-z])/$1\'$2/gi;
s/([a-z])([a-z])/$1\'$2/gi;
s//\"/g;
s//\"/g;
s//\"/g;
s/''/\"/g;
s/´´/\"/g;
s/…/.../g;
# French quotes
s/ « / \"/g;
s/« /\"/g;
s/«/\"/g;
s/ » /\" /g;
s/ »/\"/g;
s/»/\"/g;
# handle pseudo-spaces
s/ \%/\%/g;
s/nº /nº /g;
s/ :/:/g;
s/ ºC/ ºC/g;
s/ cm/ cm/g;
s/ \?/\?/g;
s/ \!/\!/g;
s/ ;/;/g;
s/, /, /g; s/ +/ /g;
# English "quotation," followed by comma, style
if ($language eq "en") {
s/\"([,\.]+)/$1\"/g;
}
# Czech is confused
elsif ($language eq "cs" || $language eq "cz") {
}
# German/Spanish/French "quotation", followed by comma, style
else {
s/,\"/\",/g;
s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
}
print STDERR $_ if //;
if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") {
s/(\d) (\d)/$1,$2/g;
}
else {
s/(\d) (\d)/$1.$2/g;
}
print $_;
}