mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
commit
68dbe85545
@ -13,10 +13,10 @@ chomp(@OUT);
|
||||
while(<SRC>) {
|
||||
chomp;
|
||||
if (/^<srcset/) {
|
||||
s/<srcset/<tstset trglang="$language"/;
|
||||
s/<srcset/<tstset trglang="$language"/i;
|
||||
}
|
||||
elsif (/^<\/srcset/) {
|
||||
s/<\/srcset/<\/tstset/;
|
||||
s/<\/srcset/<\/tstset/i;
|
||||
}
|
||||
elsif (/^<doc/i) {
|
||||
s/ *sysid="[^\"]+"//;
|
||||
@ -26,10 +26,10 @@ while(<SRC>) {
|
||||
my $line = shift(@OUT);
|
||||
$line = "" if $line =~ /NO BEST TRANSLATION/;
|
||||
if (/<\/seg>/) {
|
||||
s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
|
||||
s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
|
||||
}
|
||||
else {
|
||||
s/(<seg[^>]+> *)[^<]*/$1$line/;
|
||||
s/(<seg[^>]+> *)[^<]*/$1$line/i;
|
||||
}
|
||||
}
|
||||
print $_."\n";
|
||||
|
@ -171,7 +171,7 @@ if ($TIMING)
|
||||
|
||||
# tokenize a batch of texts saved in an array
|
||||
# input: an array containing a batch of texts
|
||||
# return: another array cotaining a batch of tokenized texts for the input array
|
||||
# return: another array containing a batch of tokenized texts for the input array
|
||||
sub tokenize_batch
|
||||
{
|
||||
my(@text_list) = @_;
|
||||
|
@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
|
||||
if (-e $l1input) {
|
||||
$opn = $l1input;
|
||||
} elsif (-e $l1input.".gz") {
|
||||
$opn = "zcat $l1input.gz |";
|
||||
$opn = "gunzip -c $l1input.gz |";
|
||||
} else {
|
||||
die "Error: $l1input does not exist";
|
||||
}
|
||||
@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
|
||||
if (-e $l2input) {
|
||||
$opn = $l2input;
|
||||
} elsif (-e $l2input.".gz") {
|
||||
$opn = "zcat $l2input.gz |";
|
||||
$opn = "gunzip -c $l2input.gz |";
|
||||
} else {
|
||||
die "Error: $l2input does not exist";
|
||||
}
|
||||
@ -160,3 +160,4 @@ sub word_count {
|
||||
my @w = split(/ /,$line);
|
||||
return scalar @w;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user