mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-09-20 23:58:15 +03:00
a handy script to count words that passed through the decoder unchanged (mostly because they're unknown); can exclude numbers and punctuation
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@667 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
5cb683cd7e
commit
8f504a1d9b
69
scripts/analysis/nontranslated_words.pl
Executable file
69
scripts/analysis/nontranslated_words.pl
Executable file
@ -0,0 +1,69 @@
|
||||
#!/usr/bin/perl
|
||||
#
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long;
|
||||
|
||||
# Command-line switches; all default to "off".
my $ignore_numbers = 0;    # --ignore-numbers: do not count copied numeric tokens
my $ignore_punct   = 0;    # --ignore-punct:   do not count copied punctuation tokens
my $usage          = 0;    # --help:           print the usage text and quit

# Unknown or malformed options make GetOptions return false; bail out then.
GetOptions(
    "help"           => \$usage,
    "ignore-numbers" => \$ignore_numbers,
    "ignore-punct"   => \$ignore_punct,
) or exit 1;

# Positional arguments left over after option parsing:
# the source file and the decoder output (hypothesis) file.
my $src = shift;
my $tgt = shift;

# Show the usage text when explicitly requested or when either file name is
# missing. The message goes to STDERR and the exit status is 1 in both cases.
if ($usage || !defined $src || !defined $tgt) {
    print STDERR <<'END_USAGE';
nontranslated_words.pl srcfile hypothesisfile
...counts the number of words that are equal in src and hyp. These are
   typically unknown words.
Options:
  --ignore-numbers  ... numbers usually do not get translated, but do
                        not count them (it is not an error)
  --ignore-punct    ... same for punct, do not include it in the count
END_USAGE
    exit 1;
}
|
||||
|
||||
# Open both inputs for reading as UTF-8 text. Three-argument open with an
# explicit '<' mode keeps a file name that starts with '>' or '|' from being
# interpreted as an open mode or a command.
open my $src_fh, '<:encoding(UTF-8)', $src or die "Can't read $src: $!";
open my $tgt_fh, '<:encoding(UTF-8)', $tgt or die "Can't read $tgt: $!";

# Running totals, reported after the loop.
my $nr         = 0;    # number of sentence pairs read
my $outtoks    = 0;    # tokens seen in the hypothesis file
my $intoks     = 0;    # tokens seen in the source file
my $copiedtoks = 0;    # hypothesis tokens that also occur in the paired source line

# Walk both files in lockstep: line N of the source pairs with line N of the
# hypothesis file.
while (my $src_line = <$src_fh>) {
    $nr++;
    chomp $src_line;
    $src_line =~ s/^\s+|\s+$//g;    # trim so split yields no empty leading field
    my @src_toks = split /\s+/, $src_line;

    # Set of source tokens on this line, for constant-time membership tests.
    my %in_source = map { ($_, 1) } @src_toks;
    $intoks += scalar @src_toks;

    my $tgt_line = <$tgt_fh>;
    die "$tgt too short!" if !defined $tgt_line;
    $tgt_line =~ s/^\s+|\s+$//g;

    foreach my $outtok (split /\s+/, $tgt_line) {
        $outtoks++;

        # This word did not appear in the input, we generated it.
        next if !exists $in_source{$outtok};

        # Optionally skip numbers: an optional minus sign, then digits with an
        # optional [.,]digits tail, or a bare [.,]digits. At least one digit is
        # required, so a lone "-" is not mistaken for a number.
        next if $ignore_numbers
            && $outtok =~ /\A-?(?:[0-9]+(?:[.,][0-9]+)?|[.,][0-9]+)\z/;

        # Optionally skip tokens made up solely of punctuation characters.
        next if $ignore_punct && $outtok =~ /\A[[:punct:]]+\z/;

        $copiedtoks++;
    }
}

# A hypothesis file with lines left over means the two files are not
# parallel; report it instead of silently ignoring the surplus lines.
my $surplus_line = <$tgt_fh>;
warn "$tgt too long!\n" if defined $surplus_line;

close $src_fh;
close $tgt_fh;
|
||||
|
||||
# Share of hypothesis tokens that were copied from the source, in percent.
# An empty hypothesis file has zero output tokens; report 0 % in that case
# rather than dying with "Illegal division by zero".
my $copied_pct = $outtoks ? $copiedtoks / $outtoks * 100 : 0;

# Tab-separated summary on STDOUT. The last line also records which token
# classes were excluded from the copied-token count.
print "Sentences:\t$nr\n",
      "Source tokens:\t$intoks\n",
      "Output tokens:\t$outtoks\n",
      "Output tokens appearing also in input sent:\t$copiedtoks\t",
      sprintf("%.2f %%", $copied_pct),
      "\t", ($ignore_punct   ? "ignoring" : "including"), " punctuation",
      "\t", ($ignore_numbers ? "ignoring" : "including"), " numbers",
      "\n";
|
Loading…
Reference in New Issue
Block a user