Optionally output which lines are retained

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1576 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
hieuhoang1972 2008-02-27 18:38:31 +00:00
parent 1f722ffb1e
commit cb1f0e56dc

View File

@ -11,11 +11,11 @@ my $enc = "utf8"; # encoding of the input and output files
GetOptions(
"help" => \$help,
"lowercase|lc" => \$lc,
"encoding=s" => \$enc,
"encoding" => \$enc,
) or exit(1);
if (scalar(@ARGV) != 6 || $help) {
print "syntax: clean-corpus-n.perl corpus l1 l2 clean-corpus min max\n";
if (scalar(@ARGV) < 6 || $help) {
print "syntax: clean-corpus-n.perl corpus l1 l2 clean-corpus min max [lines retained file]\n";
exit;
}
@ -26,6 +26,12 @@ my $out = $ARGV[3];
my $min = $ARGV[4];
my $max = $ARGV[5];
# Optional seventh argument: path of a file that receives the 1-based input
# line number of every sentence pair written to the cleaned output.
# An empty string means the feature is disabled.
my $linesRetainedFile = "";
if (scalar(@ARGV) > 6) {
    $linesRetainedFile = $ARGV[6];
    # Three-argument open keeps the mode separate from the user-supplied
    # filename, so a name such as ">foo" or "foo |" cannot change how the
    # file is opened. $! reports why the open failed.
    open(LINES_RETAINED, '>', $linesRetainedFile)
        or die "Can't write $linesRetainedFile: $!";
}
print STDERR "clean-corpus.perl: processing $corpus.$l1 & .$l2 to $out, cutoff $min-$max\n";
my $opn = undef;
@ -61,7 +67,6 @@ binmode(EO, $binmode);
my $innr = 0;
my $outnr = 0;
my $factored_flag;
while(my $f = <F>) {
$innr++;
print STDERR "." if $innr % 10000 == 0;
@ -70,9 +75,6 @@ while(my $f = <F>) {
die "$corpus.$l2 is too short!" if !defined $e;
chomp($e);
chomp($f);
if ($innr == 1) {
$factored_flag = ($e =~ /\|/ || $f =~ /\|/);
}
#if lowercasing, lowercase
if ($lc) {
@ -80,11 +82,11 @@ while(my $f = <F>) {
$f = lc($f);
}
$e =~ s/\|//g unless $factored_flag;
# $e =~ s/\|//g; # kinda hurts in factored input
$e =~ s/\s+/ /g;
$e =~ s/^ //;
$e =~ s/ $//;
$f =~ s/\|//g unless $factored_flag;
# $f =~ s/\|//g; # kinda hurts in factored input
$f =~ s/\s+/ /g;
$f =~ s/^ //;
$f =~ s/ $//;
@ -109,7 +111,16 @@ while(my $f = <F>) {
$outnr++;
print FO $f."\n";
print EO $e."\n";
if ($linesRetainedFile ne "") {
print LINES_RETAINED $innr."\n";
}
}
# Close the retained-lines file if one was requested. Buffered write errors
# (e.g. a full disk) only surface at close time, so the result is checked
# instead of being silently dropped.
if ($linesRetainedFile ne "") {
    close(LINES_RETAINED) or die "Can't close $linesRetainedFile: $!";
}
print STDERR "\n";
my $e = <E>;
die "$corpus.$l2 is too long!" if defined $e;