mosesdecoder/scripts/ems/support/interpolate-lm.perl
2012-11-20 17:18:57 +00:00

209 lines
5.9 KiB
Perl
Executable File

#!/usr/bin/perl -w
use strict;
use IPC::Open3;
use File::Temp qw/tempdir/;
use File::Path qw/rmtree/;
use Getopt::Long "GetOptions";
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";
my ($TUNING,$LM,$NAME,$GROUP,$CONTINUE);
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\"]")
unless &GetOptions('tuning=s' => => \$TUNING,
'name=s' => \$NAME,
'srilm=s' => \$SRILM,
'tempdir=s' => \$TEMPDIR,
'continue' => \$CONTINUE,
'group=s' => \$GROUP,
'lm=s' => \$LM);
# check and set default to unset parameters
die("ERROR: please specify output language model name --name") unless defined($NAME);
die("ERROR: please specify tuning set with --tuning") unless defined($TUNING);
die("ERROR: please specify language models with --lm") unless defined($LM);
die("ERROR: can't read $TUNING") unless -e $TUNING;
die("ERROR: did not find srilm dir") unless -e $SRILM;
die("ERROR: cannot run ngram") unless -x $SRILM."/ngram";
my @LM = split(/,/,$LM);
# establish order
my $order = 0;
foreach my $lm (@LM) {
my $lm_order;
$lm .= ".gz" if (! -e $lm && -e "$lm.gz");
if ($lm =~ /gz$/) {
open(LM,"zcat $lm|") || die("ERROR: could not find language model file '$lm'");
}
else {
open(LM,$lm) || die("ERROR: could not find language model file '$lm'");
}
while(<LM>) {
$lm_order = $1 if /ngram\s+(\d+)/;
last if /1-grams/;
}
close(LM);
$order = $lm_order if $order == 0;
die("ERROR: language models have different order") if $order != $lm_order;
}
print STDERR "language models have order $order.\n";
# too many language models? group them first
if (!defined($GROUP) && scalar(@LM) > 10) {
print STDERR "more than 10, automatically grouping language models.\n";
my $num_groups = int(scalar(@LM)/10 + 0.99);
my $size_groups = int(scalar(@LM)/$num_groups + 0.99);
$GROUP = "";
for(my $i=0;$i<$num_groups;$i++) {
$GROUP .= " " unless $i==0;
for(my $j=0;$j<$size_groups;$j++) {
my $lm_i = $i*$size_groups+$j;
next if $lm_i >= scalar(@LM);
$GROUP .= "," unless $j==0;
$GROUP .= $lm_i;
}
}
print STDERR "groups: $GROUP\n";
}
# normal interpolation
if (!defined($GROUP)) {
&interpolate($NAME,@LM);
exit;
}
# group language models into sub-interpolated models
my %ALREADY;
my $g = 0;
my @SUB_NAME;
foreach my $subgroup (split(/ /,$GROUP)) {
my @SUB_LM;
foreach my $lm_i (split(/,/,$subgroup)) {
die("ERROR: LM id $lm_i in group definition out of range") if $lm_i >= scalar(@LM);
push @SUB_LM,$LM[$lm_i];
$ALREADY{$lm_i} = 1;
}
#if (scalar @SUB_NAME == 0 && scalar keys %ALREADY == scalar @LM) {
# print STDERR "WARNING: grouped all language models into one, perform normal interpolation\n";
# &interpolate($NAME,@LM);
# exit;
#}
my $name = $NAME.".group-".chr(97+($g++));
push @SUB_NAME,$name;
print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@SUB_LM)."\n===\n\n";
&interpolate($name, @SUB_LM) unless $CONTINUE && -e $name;
}
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
next if defined($ALREADY{$lm_i});
push @SUB_NAME, $LM[$lm_i];
}
print STDERR "\n=== BUILDING FINAL LM ===\n\n";
&interpolate($NAME, @SUB_NAME);
# main interpolation function
sub interpolate {
my ($name,@LM) = @_;
die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
if scalar(@LM) > 10;
my $tmp = tempdir(DIR=>$TEMPDIR);
# compute perplexity
my $i = 0;
foreach my $lm (@LM) {
print STDERR "compute perplexity for $lm\n";
safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $tmp/iplm.$$.$i") or die "Failed to compute perplexity for $lm\n";
print STDERR `tail -n 2 $tmp/iplm.$$.$i`;
$i++;
}
# compute lambdas
print STDERR "computing lambdas...\n";
my $cmd = "$SRILM/compute-best-mix";
for(my $i=0;$i<scalar(@LM);$i++) {
$cmd .= " $tmp/iplm.$$.$i";
}
my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);
# create new language model
print STDERR "creating new language model...\n";
$i = 0;
$cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
foreach my $lm (@LM) {
$cmd .= " -lm " if $i==0;
$cmd .= " -mix-lm " if $i==1;
$cmd .= " -mix-lm$i " if $i>1;
$cmd .= $lm;
$cmd .= " -lambda " if $i==0;
$cmd .= " -mix-lambda$i " if $i>1;
$cmd .= $LAMBDA[$i] if $i!=1;
$i++;
}
safesystem($cmd) or die "Failed.";
rmtree($tmp); # remove the temp dir
print STDERR "done.\n";
}
sub safesystem {
print STDERR "Executing: @_\n";
system(@_);
if ($? == -1) {
print STDERR "Failed to execute: @_\n $!\n";
exit(1);
}
elsif ($? & 127) {
printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
exit(1);
}
else {
my $exitcode = $? >> 8;
print STDERR "Exit code: $exitcode\n" if $exitcode;
return ! $exitcode;
}
}
sub saferun3 {
print STDERR "Executing: @_\n";
my($wtr, $rdr, $err);
my $pid = open3($wtr, $rdr, $err, @_);
close($wtr);
waitpid($pid, 0);
my $gotout = "";
$gotout .= $_ while (<$rdr>);
close $rdr;
my $goterr = "";
if (defined $err) {
$goterr .= $_ while (<$err>);
close $err;
}
if ($? == -1) {
print STDERR "Failed to execute: @_\n $!\n";
exit(1);
}
elsif ($? & 127) {
printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
($? & 127), ($? & 128) ? 'with' : 'without';
exit(1);
}
else {
my $exitcode = $? >> 8;
print STDERR "Exit code: $exitcode\n" if $exitcode;
return ( $gotout, $goterr, $exitcode );
}
}