Add pre-tokenization cleaning script, in case the training data has bad, overly long lines which blow up some taggers/segmenters, e.g. MADA.

This commit is contained in:
Hieu Hoang 2015-04-19 11:21:07 +04:00
parent f98de4dc83
commit 637e8a17e8
3 changed files with 147 additions and 1 deletions

View File

@ -7,8 +7,15 @@ get-corpus
default-name: corpus/txt
rerun-on-change: input-extension output-extension
template: IN OUT $input-extension $output-extension
# Step pre-tok-clean: takes raw-stem as input and produces pre-tok-cleaned
# (default output name corpus/pre-tok-cleaned). The command template expands to
# the value of the pre-tok-clean setting followed by: IN, the input and output
# language extensions, OUT, and OUT.lines-retained.
# NOTE(review): pass-unless presumably means the step is bypassed when the
# pre-tok-clean setting is not defined -- confirm against the EMS documentation.
pre-tok-clean
in: raw-stem
out: pre-tok-cleaned
default-name: corpus/pre-tok-cleaned
pass-unless: pre-tok-clean
template: $pre-tok-clean IN $input-extension $output-extension OUT OUT.lines-retained
parallelizable: yes
tokenize
in: raw-stem
in: pre-tok-cleaned
out: tokenized-stem
default-name: corpus/tok
pass-unless: input-tokenizer output-tokenizer

View File

@ -0,0 +1,46 @@
#!/usr/bin/env perl
#
# Pre-tokenization cleaning of a parallel corpus.
#
# Usage:
#   <script> minChars maxChars inputStem source target outputStem linesRetained
#
# Reads inputStem.source and inputStem.target in lock-step (both as UTF-8) and
# writes to outputStem.source / outputStem.target only those sentence pairs in
# which BOTH sides have a length -- in characters, excluding the line
# terminator -- within [minChars, maxChars]. The 1-based line number of every
# retained pair is written to the linesRetained file, one number per line.
#
# Dies if a file cannot be opened or closed, or if the two input files do not
# have the same number of lines.

use strict;
use warnings;

die "usage: $0 minChars maxChars inputStem source target outputStem linesRetained\n"
    unless @ARGV >= 7;

my ($minChars, $maxChars, $inputStem, $source, $target, $outputStem, $linesRetained) = @ARGV;

open(my $inSource, "<:encoding(UTF-8)", "$inputStem.$source")
    or die "cannot open $inputStem.$source: $!";
open(my $inTarget, "<:encoding(UTF-8)", "$inputStem.$target")
    or die "cannot open $inputStem.$target: $!";
open(my $outSource, ">:encoding(UTF-8)", "$outputStem.$source")
    or die "cannot open $outputStem.$source: $!";
open(my $outTarget, ">:encoding(UTF-8)", "$outputStem.$target")
    or die "cannot open $outputStem.$target: $!";
open(my $retained, ">:encoding(UTF-8)", "$linesRetained")
    or die "cannot open $linesRetained: $!";

my $lineNum = 0;
while (my $lineSource = <$inSource>) {
    ++$lineNum;

    # The corpus is parallel: a missing target line means the two files are
    # misaligned, so stop instead of silently pairing with an empty line.
    my $lineTarget = <$inTarget>;
    die "$inputStem.$target has fewer lines than $inputStem.$source (no line $lineNum)\n"
        unless defined $lineTarget;

    chomp($lineSource);
    chomp($lineTarget);

    my $lenSource = length($lineSource);
    my $lenTarget = length($lineTarget);

    # Drop the pair if either side is too short or too long.
    next if $lenSource < $minChars || $lenSource > $maxChars
         || $lenTarget < $minChars || $lenTarget > $maxChars;

    print {$outSource} "$lineSource\n";
    print {$outTarget} "$lineTarget\n";
    print {$retained} "$lineNum\n";
}

# Left-over target lines also indicate a misaligned corpus.
die "$inputStem.$target has more lines than $inputStem.$source ($lineNum)\n"
    unless eof($inTarget);

close($inSource);
close($inTarget);

# Buffered write errors only surface at close, so check every output handle.
close($outSource) or die "cannot close $outputStem.$source: $!";
close($outTarget) or die "cannot close $outputStem.$target: $!";
close($retained)  or die "cannot close $linesRetained: $!";

View File

@ -0,0 +1,93 @@
#!/usr/bin/env perl
#
# Wrapper that runs MADAMIRA over text read from STDIN and prints the
# extracted words to STDOUT.
#
# Options:
#   --mada-dir=DIR  directory containing MADAMIRA.jar and
#                   samples/sampleConfigFile.xml (required)
#   --tmpdir=DIR    parent directory for the per-run work directory
#                   (default: "tmp"); the work directory is DIR/madamira.<pid>
#   --scheme=NAME   parsed (default "D2") but not used anywhere in this script
#   --keep-tmp      parsed, but currently has no effect because the removal of
#                   the work directory is commented out below
#
# Steps: copy STDIN to a file, split it into 10000-line chunks, run MADAMIRA
# on the chunks through GNU parallel, concatenate the *.mada outputs, and print
# the text of every ";;WORD" line followed by a space, with a newline for each
# "SENTENCE BREAK" line.
#
# Dies if --mada-dir is missing, if a file cannot be opened, or if one of the
# external commands exits with a non-zero status.

use warnings;
use strict;
use File::Temp qw/tempfile/;
use Getopt::Long "GetOptions";
use File::Basename;
use FindBin qw($RealBin);
use Cwd 'abs_path';

my $TMPDIR = "tmp";
my $SCHEME = "D2";
my $KEEP_TMP = 0;
my $MADA_DIR;

GetOptions(
    "scheme=s" => \$SCHEME,
    "tmpdir=s" => \$TMPDIR,
    "keep-tmp" => \$KEEP_TMP,
    "mada-dir=s" => \$MADA_DIR
    ) or die("ERROR: unknown options");

# Without this the shell command below would be built from an undefined value.
die("ERROR: --mada-dir is required\n") unless defined $MADA_DIR;

# Run a shell command, discarding its STDOUT, and die if it fails.
# Backticks are used (not system) so that nothing the child prints on STDOUT
# leaks into this script's own STDOUT, which carries the result.
sub run_cmd {
    my ($command) = @_;
    my $ignoredOutput = `$command`;
    die "ERROR: command failed (status $?): $command\n" if $? != 0;
    return;
}

# abs_path can return undef for an unresolvable path; the work directory
# would then silently end up directly under "/".
my $absTmpDir = abs_path($TMPDIR);
die "ERROR: cannot resolve tmpdir '$TMPDIR'\n"
    unless defined $absTmpDir && length $absTmpDir;
$TMPDIR = $absTmpDir;
print STDERR "TMPDIR=$TMPDIR \n";

#binmode(STDIN, ":utf8");
#binmode(STDOUT, ":utf8");

# Per-process work directory, so that concurrent runs do not collide.
$TMPDIR = "$TMPDIR/madamira.$$";
run_cmd("mkdir -p $TMPDIR");
run_cmd("mkdir -p $TMPDIR/split");
run_cmd("mkdir -p $TMPDIR/out");

# Copy STDIN verbatim to a file so that it can be split.
my $infile = "$TMPDIR/input";
print STDERR $infile."\n";

open(my $tmpFh, ">", $infile) or die "ERROR: cannot write $infile: $!";
while (my $inputLine = <STDIN>) {
    print {$tmpFh} $inputLine;
}
close($tmpFh) or die "ERROR: cannot close $infile: $!";

my $cmd;

# split input file
# Use gsplit when it is available and answers --help, otherwise plain split.
my $SPLIT_EXEC = `gsplit --help 2>/dev/null`;
if ($SPLIT_EXEC) {
    $SPLIT_EXEC = 'gsplit';
}
else {
    $SPLIT_EXEC = 'split';
}

$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x";
run_cmd($cmd);

$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $MADA_DIR/samples/sampleConfigFile.xml ::: $TMPDIR/split/x*";
print STDERR "Executing: $cmd\n";
run_cmd($cmd);

# The shell expands x*.mada in sorted order; with the fixed-width numeric
# suffixes produced by "split -a 7 -d" that is the original chunk order.
$cmd = "cat $TMPDIR/out/x*.mada > $infile.mada";
print STDERR "Executing: $cmd\n";
run_cmd($cmd);

# get stuff out of mada output
open(my $madaFh, "<", "$infile.mada") or die "ERROR: cannot read $infile.mada: $!";
#binmode($madaFh, ":utf8");
while (my $line = <$madaFh>) {
    # chomp, not chop: only strip a line terminator, never a real character
    # (matters when the last line is not newline-terminated).
    chomp($line);
    #print STDERR "line=$line \n";

    if (index($line, "SENTENCE BREAK") == 0) {
        # new sentence
        #print STDERR "BREAK\n";
        print "\n";
    }
    elsif (index($line, ";;WORD") == 0) {
        # word
        # Skips the 7-character ";;WORD " prefix.
        # NOTE(review): the length (length - 8) also drops the final character
        # of the line -- presumably a trailing separator in the MADA output;
        # confirm against the actual file format.
        my $word = substr($line, 7, length($line) - 8);
        #print STDERR "FOund $word\n";
        print "$word ";
    }
    else {
        #print STDERR "NADA\n";
    }
}
close($madaFh);

# NOTE(review): removal of the work directory is disabled, so it is kept
# whether or not --keep-tmp is given.
if ($KEEP_TMP == 0) {
    # `rm -rf $TMPDIR`;
}