mosesdecoder/scripts/analysis/suspicious_tokenization.pl

80 lines
1.8 KiB
Perl
Raw Normal View History

#!/usr/bin/perl
# Collects and prints all n-grams that appear in the given corpus both
# tokenized as well as untokenized.
# Ondrej Bojar
use strict;
use warnings;
use Getopt::Long;
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
my $usage = 0;
my $lowercase = 0;
my $n = 2;
GetOptions(
"n=i" => \$n, # the n-grams to search for (default: bigrams)
"lc|lowercase" => \$lowercase, # ignore case
"h|help|usage" => \$usage, # show info
) or exit 1;
my $nl = 0;
my $ngrams;
my $words;
while (<>) {
$nl++;
print STDERR "." if $nl % 100000 == 0;
print STDERR "($nl)" if $nl % 500000 == 0;
chomp;
$_ = lc($_) if $lowercase;
my @words = split /\s+/;
foreach my $w (@words) {
$words->{$w}++;
}
$ngrams = ngrams($n, \@words, $ngrams); # add ngram counts from this
}
print STDERR "Done.\n";
# Find suspicious
my $report;
foreach my $ngr (keys %$ngrams) {
my $w = $ngr;
$w =~ s/ //g;
my $untokcnt = $words->{$w};
next if ! $untokcnt; # never seen untokenized
my $tokcnt = $ngrams->{$ngr};
$report->{$ngr}->{"tok"} = $tokcnt;
$report->{$ngr}->{"untok"} = $untokcnt;
$report->{$ngr}->{"diff"} = abs($untokcnt-$tokcnt);
$report->{$ngr}->{"sum"} = $untokcnt+$tokcnt;
}
# Report
foreach my $ngr (sort {
$report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}
|| $report->{$b}->{"sum"} <=> $report->{$a}->{"sum"}
}
keys %$report) {
print "$ngr\t$report->{$ngr}->{untok}\t$report->{$ngr}->{tok}\t$report->{$ngr}->{diff}\n";
}
sub ngrams {
my $n = shift;
my @words = @{shift()};
my $out = shift;
if ($n == 1) {
foreach my $w (@words) {
$out->{$w}++;
}
} else {
while ($#words >= $n-1) {
$out->{join(" ", @words[0..$n-1])}++;
shift @words;
}
}
return $out;
}