mosesdecoder/scripts/training/reduce-topt-count.pl

208 lines
4.7 KiB
Perl
Raw Normal View History

#!/usr/bin/perl
# given a moses.ini, filter the phrase tables to contain
# only ttable-limit options per source phrase
#
# outputs new phrase tables and updated moses.ini into targetdir
#
# usage: reduce-topt-count.pl moses.ini targetdir
use strict;
use warnings;
use File::Basename;
use File::Path;
use POSIX;
use List::Util qw( min sum );
my ($ini_file, $targetdir) = @ARGV;
if (! defined $targetdir) {
die "usage: reduce-topt-count.pl moses.ini targetdir\n"
}
my %ttables;
my $ini_hdl = my_open($ini_file);
my $outini_hdl = my_save("$targetdir/moses.ini");
my $section = "";
my %section_handlers = (
'ttable-file' => read_ttable_file(),
'ttable-limit' => read_ttable_limit(),
'weight-t' => read_weight_t()
);
# print header for updated moses.ini
my $timestamp = POSIX::strftime("%m/%d/%Y %H:%M:%S", localtime);
print $outini_hdl <<"END";
# Generated by reduce-topt-count.pl at $timestamp
# Original file: $ini_file
END
# load original moses.ini & generate new moses.ini
while (<$ini_hdl>) {
chomp(my $line = $_);
my $do_print = 1;
if ($line =~ m/^\s*#/ || $line =~ m/^\s*$/) {
#ignore empty and commented lines
} elsif ($line =~ m/^\[(.*)\]/) {
$section = $1; # start of a new section
} else {
if (defined $section_handlers{$section}) {
# call appropriate section handler;
# handlers are also responsible for printing out
# (possibly modified) line into new moses.ini
$do_print = 0;
$section_handlers{$section}->($line, $outini_hdl);
}
}
if ($do_print) {
print $outini_hdl "$line\n";
}
}
close $outini_hdl;
# write filtered phrase tables
for my $ttable (keys %ttables) {
filter_table($ttables{$ttable});
}
# filter phrase tables
## subroutines
sub read_ttable_file
{
my $ttable_id = 0;
return sub {
my ($line, $outhdl) = @_;
if ($line !~ m/^(\d+) ([\d\,\-]+) ([\d\,\-]+) (\d+) (\S+)$/) {
die "Format not recognized: $line";
}
my ($type, $srcfacts, $tgtfacts, $numscores, $file) = ($1, $2, $3, $4, $5);
if ($type != 0) {
die "Cannot work with ttables of type $type";
}
$ttables{$ttable_id} = {
file => $file,
scores => $numscores
};
print $outhdl
"$type $srcfacts $tgtfacts $numscores $targetdir/", basename($file), "\n";
$ttable_id++;
}
}
sub read_ttable_limit
{
my $ttable_id = 0;
return sub {
my ($line, $outhdl) = @_;
$ttables{$ttable_id}->{limit} = $line;
print $outhdl "$line\n";
$ttable_id++;
}
}
sub read_weight_t
{
my $weight_idx = 0;
my $ttable_id = 0;
return sub {
my ($line, $outhdl) = @_;
if ($ttables{$ttable_id}->{scores} == $weight_idx) {
$weight_idx = 0;
$ttable_id++;
}
push @{ $ttables{$ttable_id}->{weights} }, $line;
print $outhdl "$line\n";
$weight_idx++;
}
}
sub filter_table
{
my $ttable = shift;
my $in = my_open($ttable->{file});
my $out = my_save($targetdir . "/" . basename($ttable->{file}));
my $limit = $ttable->{limit};
my @weights = @{ $ttable->{weights} };
print STDERR "Filtering ", $ttable->{file}, ", using limit $limit\n";
my $kept = 0;
my $total = 0;
my $src_phrase = "";
my @tgt_phrases;
while (<$in>) {
chomp(my $line = $_);
$total++;
print STDERR '.' if $total % 1000 == 0;
my @cols = split / \|\|\| /, $line;
if ($cols[0] ne $src_phrase) {
my @sorted = sort { $b->{score} <=> $a->{score} } @tgt_phrases;
2012-07-06 22:18:39 +04:00
for my $phrase (@sorted[0 .. min($#sorted, $limit - 1)]) {
$kept++;
print $out $phrase->{str}, "\n";
}
$src_phrase = $cols[0];
@tgt_phrases = ();
}
my @scores = split ' ', $cols[2];
push @tgt_phrases, {
str => $line,
score => sum(map { $weights[$_] * log $scores[$_] } (0 .. $#weights))
};
}
printf STDERR "Finished, kept %d%% of phrases\n", $kept / $total * 100;
close $in;
close $out;
}
sub my_open {
my $f = shift;
die "Not found: $f" if ! -e $f;
my $opn;
my $hdl;
my $ft = `file $f`;
# file might not recognize some files!
if ($f =~ /\.gz$/ || $ft =~ /gzip compressed data/) {
$opn = "zcat $f |";
} elsif ($f =~ /\.bz2$/ || $ft =~ /bzip2 compressed data/) {
$opn = "bzcat $f |";
} else {
$opn = "$f";
}
open $hdl, $opn or die "Can't open '$opn': $!";
binmode $hdl, ":utf8";
return $hdl;
}
sub my_save {
my $f = shift;
if ($f eq "-") {
binmode(STDOUT, ":utf8");
return *STDOUT;
}
my $opn;
my $hdl;
# file might not recognize some files!
if ($f =~ /\.gz$/) {
$opn = "| gzip -c > '$f'";
} elsif ($f =~ /\.bz2$/) {
$opn = "| bzip2 > '$f'";
} else {
$opn = ">$f";
}
mkpath( dirname($f) );
open $hdl, $opn or die "Can't write to '$opn': $!";
binmode $hdl, ":utf8";
return $hdl;
}