Merge pull request #28 from amittai/amittai

three small things
This commit is contained in:
Hieu Hoang 2013-02-27 07:33:52 -08:00
commit 68dbe85545
3 changed files with 8 additions and 7 deletions

View File

@@ -13,10 +13,10 @@ chomp(@OUT);
while(<SRC>) {
chomp;
if (/^<srcset/) {
-s/<srcset/<tstset trglang="$language"/;
+s/<srcset/<tstset trglang="$language"/i;
}
elsif (/^<\/srcset/) {
-s/<\/srcset/<\/tstset/;
+s/<\/srcset/<\/tstset/i;
}
elsif (/^<doc/i) {
s/ *sysid="[^\"]+"//;
@@ -26,10 +26,10 @@ while(<SRC>) {
my $line = shift(@OUT);
$line = "" if $line =~ /NO BEST TRANSLATION/;
if (/<\/seg>/) {
-s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
+s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
}
else {
-s/(<seg[^>]+> *)[^<]*/$1$line/;
+s/(<seg[^>]+> *)[^<]*/$1$line/i;
}
}
print $_."\n";

View File

@@ -171,7 +171,7 @@ if ($TIMING)
# tokenize a batch of texts saved in an array
# input: an array containing a batch of texts
-# return: another array cotaining a batch of tokenized texts for the input array
+# return: another array containing a batch of tokenized texts for the input array
sub tokenize_batch
{
my(@text_list) = @_;

View File

@@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
if (-e $l1input) {
$opn = $l1input;
} elsif (-e $l1input.".gz") {
-$opn = "zcat $l1input.gz |";
+$opn = "gunzip -c $l1input.gz |";
} else {
die "Error: $l1input does not exist";
}
@@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
if (-e $l2input) {
$opn = $l2input;
} elsif (-e $l2input.".gz") {
-$opn = "zcat $l2input.gz |";
+$opn = "gunzip -c $l2input.gz |";
} else {
die "Error: $l2input does not exist";
}
@@ -160,3 +160,4 @@ sub word_count {
my @w = split(/ /,$line);
return scalar @w;
}