mosesdecoder/scripts/tokenizer/replace-unicode-punctuation.perl

56 lines
872 B
Perl
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
# Drain the command line.  The only switch acted upon is -b, which turns on
# autoflush ($|) for the currently selected output handle; every other
# argument is consumed and ignored.
until (!@ARGV) {
    $_ = shift @ARGV;
    if (/^-b$/) {
        $| = 1;    # not buffered (flush each line)
    }
}
#binmode(STDIN, ":utf8");
#binmode(STDOUT, ":utf8");
# Map CJK / fullwidth punctuation and fullwidth digits in one line of text to
# ASCII and return the converted line.
#
# The line is processed as the raw bytes read from STDIN (this script applies
# no encoding layer to its handles), so each non-ASCII pattern below stands
# for the UTF-8 byte sequence of the character written in the source.
#
# NOTE(review): every line tagged with a code point had an empty or damaged
# pattern (e.g. s//,/g, which Perl resolves to the last successfully matched
# regex, and s/ */. /g, which matches at every position).  Those patterns are
# reconstructed from their replacement text; confirm against the upstream
# Moses script.
sub replace_unicode_punctuation {
    my ($line) = @_;

    # Alias $line to $_ so the substitutions below apply to it directly.
    for ($line) {
        s/,/,/g;            # U+FF0C FULLWIDTH COMMA
        s/。 */. /g;          # also swallows spaces following the full stop
        s/、/,/g;
        s/”/"/g;
        s/“/"/g;
        s/∶/:/g;             # U+2236 RATIO -- NOTE(review): presumed; confirm
        s/:/:/g;            # U+FF1A FULLWIDTH COLON
        s/?/\?/g;           # U+FF1F FULLWIDTH QUESTION MARK
        s/《/"/g;
        s/》/"/g;
        s/)/\)/g;           # U+FF09 FULLWIDTH RIGHT PARENTHESIS
        s/!/\!/g;           # U+FF01 FULLWIDTH EXCLAMATION MARK
        s/(/\(/g;           # U+FF08 FULLWIDTH LEFT PARENTHESIS
        s/;/;/g;            # U+FF1B FULLWIDTH SEMICOLON
        # NOTE(review): the pattern that produced '"' here was lost.  It is
        # restored as U+FF02 FULLWIDTH QUOTATION MARK to agree with the
        # replacement.  Upstream may instead have matched U+FF11 (fullwidth
        # digit one) on this line, which would turn the digit 1 into a quote;
        # U+FF11 is mapped to "1" with the other digits below -- confirm.
        s/"/"/g;
        s/」/"/g;
        s/「/"/g;
        s/0/0/g;            # U+FF10 .. U+FF19 FULLWIDTH DIGITS
        s/1/1/g;            # was the only digit missing from the list
        s/2/2/g;
        s/3/3/g;
        s/4/4/g;
        s/5/5/g;
        s/6/6/g;
        s/7/7/g;
        s/8/8/g;
        s/9/9/g;
        s/. */. /g;         # U+FF0E FULLWIDTH FULL STOP, plus trailing spaces
        s/~/\~/g;           # U+FF5E FULLWIDTH TILDE
        s/’/\'/g;            # U+2019 RIGHT SINGLE QUOTATION MARK -- NOTE(review): presumed; confirm
        s/…/\.\.\./g;
        s/━/\-/g;
        s/〈/\</g;
        s/〉/\>/g;
        s/【/\[/g;
        s/】/\]/g;
        s/%/\%/g;           # U+FF05 FULLWIDTH PERCENT SIGN
    }

    return $line;
}

# Filter STDIN to STDOUT one line at a time.
while (my $line = <STDIN>) {
    print replace_unicode_punctuation($line);
}