2012-07-06 21:48:05 +04:00
|
|
|
#!/usr/bin/perl
|
|
|
|
|
|
|
|
# given a moses.ini, filter the phrase tables to contain
|
|
|
|
# only ttable-limit options per source phrase
|
|
|
|
#
|
|
|
|
# outputs new phrase tables and updated moses.ini into targetdir
|
|
|
|
#
|
|
|
|
# usage: reduce-topt-count.pl moses.ini targetdir
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
use File::Basename;
|
|
|
|
use File::Path;
|
|
|
|
use POSIX;
|
|
|
|
use List::Util qw( min sum );
|
|
|
|
|
|
|
|
my ($ini_file, $targetdir) = @ARGV;
|
|
|
|
|
|
|
|
if (! defined $targetdir) {
|
|
|
|
die "usage: reduce-topt-count.pl moses.ini targetdir\n"
|
|
|
|
}
|
|
|
|
|
|
|
|
my %ttables;
|
|
|
|
my $ini_hdl = my_open($ini_file);
|
|
|
|
my $outini_hdl = my_save("$targetdir/moses.ini");
|
|
|
|
|
|
|
|
my $section = "";
|
|
|
|
|
|
|
|
my %section_handlers = (
|
|
|
|
'ttable-file' => read_ttable_file(),
|
|
|
|
'ttable-limit' => read_ttable_limit(),
|
|
|
|
'weight-t' => read_weight_t()
|
|
|
|
);
|
|
|
|
|
|
|
|
# print header for updated moses.ini
|
|
|
|
my $timestamp = POSIX::strftime("%m/%d/%Y %H:%M:%S", localtime);
|
|
|
|
print $outini_hdl <<"END";
|
|
|
|
# Generated by reduce-topt-count.pl at $timestamp
|
|
|
|
# Original file: $ini_file
|
|
|
|
|
|
|
|
END
|
|
|
|
|
|
|
|
# load original moses.ini & generate new moses.ini
|
|
|
|
while (<$ini_hdl>) {
|
|
|
|
chomp(my $line = $_);
|
|
|
|
my $do_print = 1;
|
|
|
|
if ($line =~ m/^\s*#/ || $line =~ m/^\s*$/) {
|
|
|
|
#ignore empty and commented lines
|
|
|
|
} elsif ($line =~ m/^\[(.*)\]/) {
|
|
|
|
$section = $1; # start of a new section
|
|
|
|
} else {
|
|
|
|
if (defined $section_handlers{$section}) {
|
|
|
|
# call appropriate section handler;
|
|
|
|
# handlers are also responsible for printing out
|
|
|
|
# (possibly modified) line into new moses.ini
|
|
|
|
$do_print = 0;
|
|
|
|
$section_handlers{$section}->($line, $outini_hdl);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($do_print) {
|
|
|
|
print $outini_hdl "$line\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
close $outini_hdl;
|
|
|
|
|
|
|
|
# write filtered phrase tables
|
|
|
|
for my $ttable (keys %ttables) {
|
|
|
|
filter_table($ttables{$ttable});
|
|
|
|
}
|
|
|
|
|
|
|
|
# filter phrase tables
|
|
|
|
|
|
|
|
## subroutines
|
|
|
|
|
|
|
|
sub read_ttable_file
|
|
|
|
{
|
|
|
|
my $ttable_id = 0;
|
|
|
|
return sub {
|
|
|
|
my ($line, $outhdl) = @_;
|
|
|
|
if ($line !~ m/^(\d+) ([\d\,\-]+) ([\d\,\-]+) (\d+) (\S+)$/) {
|
|
|
|
die "Format not recognized: $line";
|
|
|
|
}
|
|
|
|
my ($type, $srcfacts, $tgtfacts, $numscores, $file) = ($1, $2, $3, $4, $5);
|
|
|
|
if ($type != 0) {
|
|
|
|
die "Cannot work with ttables of type $type";
|
|
|
|
}
|
|
|
|
$ttables{$ttable_id} = {
|
|
|
|
file => $file,
|
|
|
|
scores => $numscores
|
|
|
|
};
|
|
|
|
|
|
|
|
print $outhdl
|
|
|
|
"$type $srcfacts $tgtfacts $numscores $targetdir/", basename($file), "\n";
|
|
|
|
$ttable_id++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
sub read_ttable_limit
|
|
|
|
{
|
|
|
|
my $ttable_id = 0;
|
|
|
|
return sub {
|
|
|
|
my ($line, $outhdl) = @_;
|
|
|
|
$ttables{$ttable_id}->{limit} = $line;
|
|
|
|
print $outhdl "$line\n";
|
|
|
|
$ttable_id++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
sub read_weight_t
|
|
|
|
{
|
|
|
|
my $weight_idx = 0;
|
|
|
|
my $ttable_id = 0;
|
|
|
|
return sub {
|
|
|
|
my ($line, $outhdl) = @_;
|
|
|
|
if ($ttables{$ttable_id}->{scores} == $weight_idx) {
|
|
|
|
$weight_idx = 0;
|
|
|
|
$ttable_id++;
|
|
|
|
}
|
|
|
|
push @{ $ttables{$ttable_id}->{weights} }, $line;
|
|
|
|
print $outhdl "$line\n";
|
|
|
|
$weight_idx++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
sub filter_table
|
|
|
|
{
|
|
|
|
my $ttable = shift;
|
|
|
|
my $in = my_open($ttable->{file});
|
|
|
|
my $out = my_save($targetdir . "/" . basename($ttable->{file}));
|
|
|
|
my $limit = $ttable->{limit};
|
|
|
|
my @weights = @{ $ttable->{weights} };
|
|
|
|
|
|
|
|
print STDERR "Filtering ", $ttable->{file}, ", using limit $limit\n";
|
|
|
|
my $kept = 0;
|
|
|
|
my $total = 0;
|
|
|
|
|
|
|
|
my $src_phrase = "";
|
|
|
|
my @tgt_phrases;
|
|
|
|
while (<$in>) {
|
|
|
|
chomp(my $line = $_);
|
|
|
|
$total++;
|
|
|
|
print STDERR '.' if $total % 1000 == 0;
|
|
|
|
my @cols = split / \|\|\| /, $line;
|
|
|
|
if ($cols[0] ne $src_phrase) {
|
|
|
|
my @sorted = sort { $b->{score} <=> $a->{score} } @tgt_phrases;
|
2012-07-06 22:18:39 +04:00
|
|
|
for my $phrase (@sorted[0 .. min($#sorted, $limit - 1)]) {
|
2012-07-06 21:48:05 +04:00
|
|
|
$kept++;
|
|
|
|
print $out $phrase->{str}, "\n";
|
|
|
|
}
|
|
|
|
$src_phrase = $cols[0];
|
|
|
|
@tgt_phrases = ();
|
|
|
|
}
|
|
|
|
my @scores = split ' ', $cols[2];
|
|
|
|
push @tgt_phrases, {
|
|
|
|
str => $line,
|
|
|
|
score => sum(map { $weights[$_] * log $scores[$_] } (0 .. $#weights))
|
|
|
|
};
|
|
|
|
}
|
|
|
|
printf STDERR "Finished, kept %d%% of phrases\n", $kept / $total * 100;
|
|
|
|
close $in;
|
|
|
|
close $out;
|
|
|
|
}
|
|
|
|
|
|
|
|
sub my_open {
|
|
|
|
my $f = shift;
|
|
|
|
die "Not found: $f" if ! -e $f;
|
|
|
|
|
|
|
|
my $opn;
|
|
|
|
my $hdl;
|
|
|
|
my $ft = `file $f`;
|
|
|
|
# file might not recognize some files!
|
|
|
|
if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
|
|
|
|
$opn = "zcat $f |";
|
|
|
|
} elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
|
|
|
|
$opn = "bzcat $f |";
|
|
|
|
} else {
|
|
|
|
$opn = "$f";
|
|
|
|
}
|
|
|
|
open $hdl, $opn or die "Can't open '$opn': $!";
|
|
|
|
binmode $hdl, ":utf8";
|
|
|
|
return $hdl;
|
|
|
|
}
|
|
|
|
|
|
|
|
sub my_save {
|
|
|
|
my $f = shift;
|
|
|
|
if ($f eq "-") {
|
|
|
|
binmode(STDOUT, ":utf8");
|
|
|
|
return *STDOUT;
|
|
|
|
}
|
|
|
|
|
|
|
|
my $opn;
|
|
|
|
my $hdl;
|
|
|
|
# file might not recognize some files!
|
|
|
|
if ($f =~ /\.gz$/) {
|
|
|
|
$opn = "| gzip -c > '$f'";
|
|
|
|
} elsif ($f =~ /\.bz2$/) {
|
|
|
|
$opn = "| bzip2 > '$f'";
|
|
|
|
} else {
|
|
|
|
$opn = ">$f";
|
|
|
|
}
|
|
|
|
mkpath( dirname($f) );
|
|
|
|
open $hdl, $opn or die "Can't write to '$opn': $!";
|
|
|
|
binmode $hdl, ":utf8";
|
|
|
|
return $hdl;
|
|
|
|
}
|
|
|
|
|