2006-08-12 01:26:24 +04:00
|
|
|
#!/usr/bin/perl
|
2007-03-15 01:22:36 +03:00
|
|
|
|
|
|
|
# $Id$
|
2006-09-21 16:23:42 +04:00
|
|
|
# Reads a source and hypothesis file and counts equal tokens. Some of these
|
|
|
|
# are punctuation, some are numbers, but most of the remaining are simply
|
|
|
|
# unknown words that the decoder just copied. This script tells you how often
|
|
|
|
# this happens.
|
2006-08-12 01:26:24 +04:00
|
|
|
#
|
2006-09-21 16:23:42 +04:00
|
|
|
# Ondrej Bojar
|
|
|
|
|
2006-08-12 01:26:24 +04:00
|
|
|
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
use Getopt::Long;
|
|
|
|
|
|
|
|
my $ignore_numbers = 0;
|
|
|
|
my $ignore_punct = 0;
|
|
|
|
my $usage = 0;
|
2006-08-12 01:48:39 +04:00
|
|
|
my $top = 10;
|
2006-08-12 01:26:24 +04:00
|
|
|
|
|
|
|
GetOptions(
|
|
|
|
"help" => \$usage,
|
2006-08-12 01:48:39 +04:00
|
|
|
"top=i" => \$top,
|
2006-08-12 01:26:24 +04:00
|
|
|
"ignore-numbers" => \$ignore_numbers,
|
|
|
|
"ignore-punct" => \$ignore_punct,
|
|
|
|
) or exit 1;
|
|
|
|
my $src = shift;
|
|
|
|
my $tgt = shift;
|
|
|
|
|
|
|
|
if ($usage || !defined $src || !defined $tgt) {
|
|
|
|
print STDERR "nontranslated_words.pl srcfile hypothesisfile
|
|
|
|
...counts the number of words that are equal in src and hyp. These are
|
|
|
|
typically unknown words.
|
|
|
|
Options:
|
2006-08-12 01:48:39 +04:00
|
|
|
--top=N ... list N top copied tokens
|
2006-08-12 01:26:24 +04:00
|
|
|
--ignore-numbers ... numbers usually do not get translated, but do
|
|
|
|
not count them (it is not an error)
|
|
|
|
--ignore-punct ... same for punct, do not include it in the count
|
|
|
|
";
|
|
|
|
exit 1;
|
|
|
|
}
|
|
|
|
|
2006-08-13 06:44:28 +04:00
|
|
|
binmode(STDOUT, ":utf8");
|
|
|
|
binmode(STDERR, ":utf8");
|
|
|
|
|
2006-08-12 01:26:24 +04:00
|
|
|
open SRC, $src or die "Can't read $src";
|
|
|
|
open TGT, $tgt or die "Can't read $tgt";
|
|
|
|
binmode(SRC, ":utf8");
|
|
|
|
binmode(TGT, ":utf8");
|
|
|
|
|
|
|
|
my $nr=0;
|
|
|
|
my $outtoks = 0;
|
|
|
|
my $intoks = 0;
|
|
|
|
my $copiedtoks = 0;
|
2006-08-12 01:48:39 +04:00
|
|
|
my %copiedtok;
|
2006-08-12 01:26:24 +04:00
|
|
|
while (<SRC>) {
|
|
|
|
$nr++;
|
|
|
|
chomp;
|
|
|
|
s/^\s+|\s+$//g;
|
|
|
|
my @src = split /\s+/;
|
|
|
|
my %src = map {($_,1)} @src;
|
|
|
|
$intoks += scalar @src;
|
|
|
|
my $t = <TGT>;
|
|
|
|
die "$tgt too short!" if !defined $t;
|
|
|
|
$t =~ s/^\s+|\s+$//g;
|
|
|
|
foreach my $outtok (split /\s+/, $t) {
|
|
|
|
$outtoks++;
|
|
|
|
next if !defined $src{$outtok}; # this word did not appear in input, we generated it
|
|
|
|
next if $ignore_numbers && $outtok =~ /^-?[0-9]*([.,][0-9]+)?$/;
|
|
|
|
next if $ignore_punct && $outtok =~ /^[[:punct:]]+$/;
|
|
|
|
$copiedtoks++;
|
2006-08-12 01:48:39 +04:00
|
|
|
$copiedtok{$outtok}++;
|
2006-08-12 01:26:24 +04:00
|
|
|
}
|
|
|
|
}
|
2006-08-12 01:48:39 +04:00
|
|
|
my $t = <TGT>;
|
|
|
|
die "$tgt too long!" if defined $t;
|
2006-08-12 01:26:24 +04:00
|
|
|
close SRC;
|
|
|
|
close TGT;
|
|
|
|
|
|
|
|
print "Sentences:\t$nr
|
|
|
|
Source tokens:\t$intoks
|
|
|
|
Output tokens:\t$outtoks
|
|
|
|
Output tokens appearing also in input sent:\t$copiedtoks\t"
|
|
|
|
.sprintf("%.2f %%", $copiedtoks/$outtoks*100)
|
|
|
|
."\t".($ignore_punct?"ignoring":"including")." punctuation"
|
|
|
|
."\t".($ignore_numbers?"ignoring":"including")." numbers"
|
|
|
|
."\n";
|
2006-08-12 01:48:39 +04:00
|
|
|
|
|
|
|
if ($top) {
|
|
|
|
my $cnt = 0;
|
|
|
|
print "Top $top copied tokens:\n";
|
|
|
|
foreach my $t (sort {$copiedtok{$b}<=>$copiedtok{$a} || $a cmp $b} keys %copiedtok) {
|
|
|
|
print "$copiedtok{$t}\t$t\n";
|
|
|
|
last if $cnt > $top;
|
|
|
|
$cnt++;
|
|
|
|
}
|
|
|
|
}
|