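Re-apply commit 1fb51dc (use `gunzip -c` instead of `zcat`)

Also be more tolerant of XML input: when XML is ignored for word counting, replace each tag with a space instead of deleting it, so that words separated only by a tag are not fused together.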
This commit is contained in:
Rico Sennrich 2015-04-03 14:55:05 +01:00
parent c52d7be025
commit 8d8097632b

View File

@@ -49,7 +49,7 @@ my $l1input = "$corpus.$l1";
if (-e $l1input) {
$opn = $l1input;
} elsif (-e $l1input.".gz") {
-$opn = "zcat $l1input.gz |";
+$opn = "gunzip -c $l1input.gz |";
} else {
die "Error: $l1input does not exist";
}
@@ -59,7 +59,7 @@ my $l2input = "$corpus.$l2";
if (-e $l2input) {
$opn = $l2input;
} elsif (-e $l2input.".gz") {
-$opn = "zcat $l2input.gz |";
+$opn = "gunzip -c $l2input.gz |";
} else {
die "Error: $l2input does not exist";
}
@@ -154,7 +154,7 @@ print STDERR "Input sentences: $innr Output sentences: $outnr\n";
sub word_count {
my ($line) = @_;
if ($ignore_xml) {
-$line =~ s/<\S[^>]*\S>//g;
+$line =~ s/<\S[^>]*\S>/ /g;
$line =~ s/\s+/ /g;
$line =~ s/^ //g;
$line =~ s/ $//g;