2015-05-17 16:04:04 +03:00
|
|
|
#!/usr/bin/env perl
|
2010-02-03 17:23:06 +03:00
|
|
|
# Collects and prints all n-grams that appear in the given corpus both
|
|
|
|
# tokenized as well as untokenized.
|
|
|
|
# Ondrej Bojar
|
2015-05-29 14:30:26 +03:00
|
|
|
#
|
|
|
|
# This file is part of moses. Its use is licensed under the GNU Lesser General
|
|
|
|
# Public License version 2.1 or, at your option, any later version.
|
2010-02-03 17:23:06 +03:00
|
|
|
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
|
|
|
|
use Getopt::Long;
|
|
|
|
|
|
|
|
binmode(STDIN, ":utf8");
|
|
|
|
binmode(STDOUT, ":utf8");
|
|
|
|
binmode(STDERR, ":utf8");
|
|
|
|
|
|
|
|
my $usage = 0;
|
|
|
|
my $lowercase = 0;
|
|
|
|
my $n = 2;
|
|
|
|
GetOptions(
|
|
|
|
"n=i" => \$n, # the n-grams to search for (default: bigrams)
|
|
|
|
"lc|lowercase" => \$lowercase, # ignore case
|
|
|
|
"h|help|usage" => \$usage, # show info
|
|
|
|
) or exit 1;
|
|
|
|
|
|
|
|
my $nl = 0;
|
|
|
|
my $ngrams;
|
|
|
|
my $words;
|
|
|
|
while (<>) {
|
|
|
|
$nl++;
|
|
|
|
print STDERR "." if $nl % 100000 == 0;
|
|
|
|
print STDERR "($nl)" if $nl % 500000 == 0;
|
|
|
|
chomp;
|
|
|
|
$_ = lc($_) if $lowercase;
|
|
|
|
my @words = split /\s+/;
|
|
|
|
foreach my $w (@words) {
|
|
|
|
$words->{$w}++;
|
|
|
|
}
|
|
|
|
$ngrams = ngrams($n, \@words, $ngrams); # add ngram counts from this
|
|
|
|
}
|
|
|
|
print STDERR "Done.\n";
|
|
|
|
|
|
|
|
# Find suspicious
|
|
|
|
my $report;
|
|
|
|
foreach my $ngr (keys %$ngrams) {
|
|
|
|
my $w = $ngr;
|
|
|
|
$w =~ s/ //g;
|
|
|
|
my $untokcnt = $words->{$w};
|
|
|
|
next if ! $untokcnt; # never seen untokenized
|
|
|
|
my $tokcnt = $ngrams->{$ngr};
|
|
|
|
$report->{$ngr}->{"tok"} = $tokcnt;
|
|
|
|
$report->{$ngr}->{"untok"} = $untokcnt;
|
|
|
|
$report->{$ngr}->{"diff"} = abs($untokcnt-$tokcnt);
|
2010-02-03 19:37:08 +03:00
|
|
|
$report->{$ngr}->{"sum"} = $untokcnt+$tokcnt;
|
2010-02-03 17:23:06 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
# Report
|
2010-02-03 19:37:08 +03:00
|
|
|
foreach my $ngr (sort {
|
|
|
|
$report->{$a}->{"diff"} <=> $report->{$b}->{"diff"}
|
|
|
|
|| $report->{$b}->{"sum"} <=> $report->{$a}->{"sum"}
|
|
|
|
}
|
2010-02-03 17:23:06 +03:00
|
|
|
keys %$report) {
|
|
|
|
print "$ngr\t$report->{$ngr}->{untok}\t$report->{$ngr}->{tok}\t$report->{$ngr}->{diff}\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
sub ngrams {
|
|
|
|
my $n = shift;
|
|
|
|
my @words = @{shift()};
|
|
|
|
my $out = shift;
|
|
|
|
if ($n == 1) {
|
|
|
|
foreach my $w (@words) {
|
|
|
|
$out->{$w}++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
while ($#words >= $n-1) {
|
|
|
|
$out->{join(" ", @words[0..$n-1])}++;
|
|
|
|
shift @words;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return $out;
|
|
|
|
}
|