Preprocessor for tokenization that fixes known errors in the raw input.

This commit is contained in:
Ulrich Germann 2014-02-22 00:28:33 +00:00
parent a8d66cd68d
commit e70766a6de

View File

@ -0,0 +1,34 @@
#!/usr/bin/perl -W
# script for preprocessing language data prior to tokenization
# Started by Ulrich Germann, after noticing systematic preprocessing errors
# in some of the English Europarl data.
use strict;
use Getopt::Std;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
# Print a one-line description of this script followed by its command-line
# synopsis to the currently selected output handle.
# Returns the result of the final print (true on success).
sub usage
{
    my $summary  = "Script for preprocessing of raw language data prior to tokenization\n";
    my $synopsis = "Usage: $0 -l <language tag>\n";

    print $summary;
    return print $synopsis;
}
# Main program: parse the command line, then copy the input to STDOUT,
# applying language-specific repairs when a known language tag is given.
#
# Options:
#   -l <tag>  language tag of the input; only "en" triggers any rewriting,
#             every other value (or no -l at all) passes the input through
#   -h        print the usage message and exit

# The binmode() calls near the top of the script only cover STDIN/STDOUT,
# but <> opens files named on the command line by itself.  Make those
# implicit opens decode UTF-8 as well, so that [[:alpha:]] sees characters
# rather than bytes and the output is not double-encoded.
use open IN => ':utf8';

my %args;

# Getopt::Std::getopts() spec: "l:" takes a value, "h" is a plain flag.
# (getopt() without the trailing "s" treats every listed character as a
# switch that takes an argument, so a lone -h would never be set.)
if (!getopts('l:h', \%args))
{
    usage();
    exit(1);
}

if ($args{'h'})
{
    usage();
    exit(0);
}

# Treat a missing -l like an unknown language: pass the input through
# unchanged instead of comparing against undef.
my $lang = defined $args{'l'} ? $args{'l'} : '';

if ($lang eq "en")
{
    while (my $line = <>)
    {
        # Re-attach an "s" that was split from the preceding apostrophe by
        # a single space, e.g. "Parliament' s" -> "Parliament's".
        $line =~ s/([[:alpha:]]') s\b/$1s/g;
        print $line;
    }
}
else
{
    print while <>;
}