mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-21 08:07:14 +03:00
Preprocessor for tokenization that fixes known errors in the raw input.
This commit is contained in:
parent
a8d66cd68d
commit
e70766a6de
34
scripts/tokenizer/pre-tokenizer.perl
Executable file
34
scripts/tokenizer/pre-tokenizer.perl
Executable file
@ -0,0 +1,34 @@
|
||||
#!/usr/bin/perl -W
|
||||
# script for preprocessing language data prior to tokenization
|
||||
# Started by Ulrich Germann, after noticing systematic preprocessing errors
|
||||
# in some of the English Europarl data.
|
||||
|
||||
use strict;
|
||||
use Getopt::Std;
|
||||
|
||||
binmode(STDIN, ":utf8");
|
||||
binmode(STDOUT, ":utf8");
|
||||
|
||||
# Print a one-line description of this script followed by its invocation
# synopsis on standard output.  Returns the result of print (true on success).
sub usage {
    my @message = (
        "Script for preprocessing of raw language data prior to tokenization\n",
        "Usage: $0 -l <language tag>\n",
    );
    return print @message;
}
|
||||
|
||||
# Parse the command line.  Getopt::Std::getopts takes a compact spec in which
# a trailing colon marks a switch that expects a value: -l <language tag>
# takes one, -h is a plain flag.  (The Getopt::Long-style spec 'l=s h' handed
# to getopt() made every character in it, including 'h', a value-taking
# switch, so a bare -h never triggered the usage message.)
my %args;
getopts('l:h', \%args)
    or die "$0: invalid command line; run with -h for usage\n";

# -h: show the usage text and stop without reading any input.
if ($args{'h'}) {
    usage();
    exit(0);
}
|
||||
|
||||
# Copy the input to STDOUT line by line, applying language-specific repairs.
# A missing -l is treated like any unsupported language (plain pass-through);
# the defined() test avoids an "uninitialized value" warning in that case.
#
# NOTE(review): only STDIN is given a :utf8 layer by the binmode calls near
# the top of the file; files named on the command line are read by <> without
# it -- confirm whether file arguments are meant to be supported.
my $language = defined $args{'l'} ? $args{'l'} : '';

if ($language eq 'en') {
    while (my $line = <>) {
        # Rejoin an "s" that was split from its apostrophe by a stray space,
        # e.g. "Commission' s" becomes "Commission's".  The \b keeps words
        # that merely start with "s" (as in "Commission' said") untouched.
        $line =~ s/([[:alpha:]]\') s\b/$1s/g;
        print $line;
    }
}
else {
    # No known fixes for other languages: echo the input unchanged.
    print while <>;
}
|
Loading…
Reference in New Issue
Block a user