mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
adjust to irstlm changes
This commit is contained in:
parent
7d6d91a2e8
commit
5cd614ecd8
@ -45,7 +45,7 @@ foreach my $lm (@LM) {
|
||||
open(LM,$lm) || die("ERROR: could not find language model file '$lm'");
|
||||
}
|
||||
while(<LM>) {
|
||||
$lm_order = $1 if /ngram (\d+)/;
|
||||
$lm_order = $1 if /ngram\s+(\d+)/;
|
||||
last if /1-grams/;
|
||||
}
|
||||
close(LM);
|
||||
|
@ -1,22 +0,0 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
|
||||
# wrapper for irstlm training
|
||||
|
||||
my $IRSTLM = shift @ARGV;
|
||||
|
||||
my $settings = join(" ",@ARGV);
|
||||
$settings =~ s/\-order/\-n/;
|
||||
$settings =~ s/\-text/\-i/;
|
||||
$settings =~ s/\-lm/\-o/;
|
||||
|
||||
if ($settings !~ /\-o +(\S+)/) {
|
||||
die("ERROR: no output file specified");
|
||||
}
|
||||
my $lm = $1;
|
||||
$settings =~ s/(\-o +\S+)/$1.iarpa.gz/;
|
||||
|
||||
my $cmd = "IRSTLM=$IRSTLM $IRSTLM/scripts/build-lm.sh $settings ; ~/moses/irstlm/bin/compile-lm --text yes $lm.iarpa.gz $lm";
|
||||
print STDERR $cmd."\n";
|
||||
print `$cmd`;
|
68
scripts/generic/trainlm-irst2.perl
Executable file
68
scripts/generic/trainlm-irst2.perl
Executable file
@ -0,0 +1,68 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
# Compatible with sri LM-creating script, eg.
|
||||
# ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
|
||||
# To use it in the EMS, add this to the [LM] section
|
||||
# lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irst-dir"
|
||||
# settings = ""
|
||||
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section.
|
||||
# It should point to the root of the LM toolkit, eg
|
||||
# irst-dir = /Users/hieu/workspace/irstlm/trunk/bin
|
||||
# Set smoothing method in settings, if different from modified Kneser-Ney
|
||||
|
||||
use strict;
|
||||
use FindBin qw($RealBin);
|
||||
use Getopt::Long;
|
||||
|
||||
my $order = 3; # order of language model (default trigram)
|
||||
my $corpusPath; # input text data
|
||||
my $lmPath; # generated language model
|
||||
my $cores = 2; # number of CPUs used
|
||||
my $irstPath; # bin directory of IRSTLM
|
||||
my $tempPath = "tmp"; # temp dir
|
||||
my $pruneSingletons = 1; # 1 = prune singletons, 0 = keep singletons
|
||||
my $smoothing = "msb"; # smoothing method: wb = witten-bell, sb = kneser-ney, msb = modified-kneser-ney
|
||||
my $dummy;
|
||||
|
||||
GetOptions("order=s" => \$order,
|
||||
"text=s" => \$corpusPath,
|
||||
"lm=s" => \$lmPath,
|
||||
"cores=s" => \$cores,
|
||||
"irst-dir=s" => \$irstPath,
|
||||
"temp-dir=s" => \$tempPath,
|
||||
"p=i" => \$pruneSingletons, # irstlm parameter: prune singletons
|
||||
"s=s" => \$smoothing, # irstlm parameter: smoothing method
|
||||
"interpolate!" => \$dummy, #ignore
|
||||
"kndiscount!" => \$dummy #ignore
|
||||
) or exit 1;
|
||||
|
||||
#die("ERROR: please set order") unless defined($order);
|
||||
die("ERROR: please set text") unless defined($corpusPath);
|
||||
die("ERROR: please set lm") unless defined($lmPath);
|
||||
die("ERROR: please set irst-dir") unless defined($irstPath);
|
||||
|
||||
|
||||
$tempPath .= "/irstlm-build-tmp.$$";
|
||||
`mkdir -p $tempPath`;
|
||||
|
||||
# add <s> and </s>
|
||||
my $cmd = "cat $corpusPath | $irstPath/add-start-end.sh > $tempPath/setagged";
|
||||
print STDERR "EXECUTING $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
# collect n-gram counts
|
||||
$cmd = "$irstPath/ngt -i=$tempPath/setagged -n=$order -b=yes -o=$tempPath/counts";
|
||||
print STDERR "EXECUTING $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
# build lm
|
||||
$cmd = "$irstPath/tlm -o=$lmPath -lm=$smoothing -bo=yes -n=$order -tr=$tempPath/counts";
|
||||
$cmd .= " -ps=no" unless $pruneSingletons;
|
||||
print STDERR "EXECUTING $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
$cmd = "rm -rf $tempPath";
|
||||
print STDERR "EXECUTING $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
print STDERR "FINISH.\n";
|
Loading…
Reference in New Issue
Block a user