From 5cd614ecd87894653c3dd1b529c9feff77954c68 Mon Sep 17 00:00:00 2001
From: phikoehn
Date: Tue, 20 Nov 2012 17:18:57 +0000
Subject: [PATCH] adjust to irstlm changes

---
 scripts/ems/support/interpolate-lm.perl |  2 +-
 scripts/ems/support/train-irstlm.perl   | 22 -------
 scripts/generic/trainlm-irst2.perl      | 75 +++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 23 deletions(-)
 delete mode 100644 scripts/ems/support/train-irstlm.perl
 create mode 100755 scripts/generic/trainlm-irst2.perl

diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl
index 39eb1483b..155829556 100755
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@@ -45,7 +45,7 @@ foreach my $lm (@LM) {
     open(LM,$lm) || die("ERROR: could not find language model file '$lm'");
   }
   while(<LM>) {
-    $lm_order = $1 if /ngram (\d+)/;
+    $lm_order = $1 if /ngram\s+(\d+)/;
     last if /1-grams/;
   }
   close(LM);
diff --git a/scripts/ems/support/train-irstlm.perl b/scripts/ems/support/train-irstlm.perl
deleted file mode 100644
index 5d2c05ce2..000000000
--- a/scripts/ems/support/train-irstlm.perl
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-
-# wrapper for irstlm training
-
-my $IRSTLM = shift @ARGV;
-
-my $settings = join(" ",@ARGV);
-$settings =~ s/\-order/\-n/;
-$settings =~ s/\-text/\-i/;
-$settings =~ s/\-lm/\-o/;
-
-if ($settings !~ /\-o +(\S+)/) {
-  die("ERROR: no output file specified");
-}
-my $lm = $1;
-$settings =~ s/(\-o +\S+)/$1.iarpa.gz/;
-
-my $cmd = "IRSTLM=$IRSTLM $IRSTLM/scripts/build-lm.sh $settings ; ~/moses/irstlm/bin/compile-lm --text yes $lm.iarpa.gz $lm";
-print STDERR $cmd."\n";
-print `$cmd`;
diff --git a/scripts/generic/trainlm-irst2.perl b/scripts/generic/trainlm-irst2.perl
new file mode 100755
index 000000000..8ad53e880
--- /dev/null
+++ b/scripts/generic/trainlm-irst2.perl
@@ -0,0 +1,75 @@
+#!/usr/bin/perl -w
+
+# Compatible with sri LM-creating script, eg.
+#    ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
+# To use it in the EMS, add this to the [LM] section
+#    lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irst-dir"
+#    settings = ""
+# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section).
+# It should point to the root of the LM toolkit, eg
+#    irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
+# Set smoothing method in settings, if different from modified Kneser-Ney
+#
+# Every external command is checked: if one exits non-zero the script dies,
+# so a failed build is never reported as "FINISH.".
+
+use strict;
+use FindBin qw($RealBin);
+use Getopt::Long;
+
+my $order = 3; # order of language model (default trigram)
+my $corpusPath; # input text data
+my $lmPath; # generated language model
+my $cores = 2; # number of CPUs used
+my $irstPath; # bin directory of IRSTLM
+my $tempPath = "tmp"; # temp dir
+my $pruneSingletons = 1; # 1 = prune singletons, 0 = keep singletons
+my $smoothing = "msb"; # smoothing method: wb = witten-bell, sb = kneser-ney, msb = modified-kneser-ney
+my $dummy;
+
+GetOptions("order=s" => \$order,
+           "text=s" => \$corpusPath,
+           "lm=s" => \$lmPath,
+           "cores=s" => \$cores,
+           "irst-dir=s" => \$irstPath,
+           "temp-dir=s" => \$tempPath,
+           "p=i" => \$pruneSingletons, # irstlm parameter: prune singletons
+           "s=s" => \$smoothing, # irstlm parameter: smoothing method
+           "interpolate!" => \$dummy, #ignore
+           "kndiscount!" => \$dummy #ignore
+          ) or exit 1;
+
+#die("ERROR: please set order") unless defined($order);
+die("ERROR: please set text") unless defined($corpusPath);
+die("ERROR: please set lm") unless defined($lmPath);
+die("ERROR: please set irst-dir") unless defined($irstPath);
+
+# Echo a shell command to STDERR, run it, and die if it exits non-zero.
+# The command's stdout is captured by the backticks and discarded.
+sub run {
+  my ($command) = @_;
+  print STDERR "EXECUTING $command\n";
+  `$command`;
+  die("ERROR: command failed (wait status $?): $command\n") if $? != 0;
+}
+
+# per-process temp dir ($$ is the PID)
+$tempPath .= "/irstlm-build-tmp.$$";
+run("mkdir -p $tempPath");
+
+# add sentence start and end markers; input redirection (not "cat |") so that
+# an unreadable corpus makes the command itself fail
+run("$irstPath/add-start-end.sh < $corpusPath > $tempPath/setagged");
+
+# collect n-gram counts
+run("$irstPath/ngt -i=$tempPath/setagged -n=$order -b=yes -o=$tempPath/counts");
+
+# build lm
+my $cmd = "$irstPath/tlm -o=$lmPath -lm=$smoothing -bo=yes -n=$order -tr=$tempPath/counts";
+$cmd .= " -ps=no" unless $pruneSingletons;
+run($cmd);
+
+# clean up
+run("rm -rf $tempPath");
+
+print STDERR "FINISH.\n";