mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2025-01-04 01:45:52 +03:00
79 lines
2.2 KiB
Perl
Executable File
79 lines
2.2 KiB
Perl
Executable File
#!/usr/bin/perl -w

# Train a language model with the IRSTLM toolkit using an SRILM-like
# command line.
#
# Compatible with sri LM-creating script, eg.
#    ngram-count -order 5 -interpolate -wbdiscount -unk -text corpus.txt -lm lm.txt
# NOTE: only -order, -text and -lm (plus -cores, -irst-dir and -temp-dir) are
# declared below; any other switch is rejected by GetOptions and the script
# exits with status 1.
#
# To use it in the EMS, add this to the [LM] section
#    lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irst-dir"
#    settings = ""
# Also, make sure that $irst-dir is defined (in the [LM] or [GENERAL] section).
# It should point to the root of the LM toolkit, eg
#    irst-dir = /Users/hieu/workspace/irstlm/trunk
# NOTE(review): this script runs $irst-dir/add-start-end.sh,
# $irst-dir/build-lm.sh and $irst-dir/compile-lm, and exports
# IRSTLM=$irst-dir/.. -- so $irst-dir must be the directory that directly
# contains those executables. Confirm against your IRSTLM installation.
# And make sure that $cores is defined, eg $cores = 8
#
# Steps performed:
#   1. wrap every corpus line with sentence start/end tags (add-start-end.sh)
#   2. build an intermediate LM in a private temporary directory (build-lm.sh)
#   3. convert it to a text LM at the -lm path (compile-lm), gzipped when the
#      -lm path ends in ".gz"
#   4. remove the temporary directory
# Any failing step aborts the script with a non-zero exit status; in that case
# the temporary directory is left in place for inspection.

use strict;
use warnings;
use FindBin qw($Bin);
use Getopt::Long;
use File::Path qw(mkpath);

my $order;              # n-gram order, forwarded to build-lm.sh -n
my $corpusPath;         # training text, plain or gzipped (.gz)
my $lmPath;             # output language model, plain or gzipped (.gz)
my $cores = 2;          # forwarded to build-lm.sh -k
my $irstPath;           # directory holding the IRSTLM executables
my $tempPath = "tmp";   # parent directory for the per-process work directory

GetOptions("order=s" => \$order,
           "text=s" => \$corpusPath,
           "lm=s" => \$lmPath,
           "cores=s" => \$cores,
           "irst-dir=s" => \$irstPath,
           "temp-dir=s" => \$tempPath
	   ) or exit 1;

die("ERROR: please set order") unless defined($order);
die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);

# Fail early: inside the shell pipeline below a missing corpus would go
# unnoticed, because a pipeline's exit status is that of its last command.
die("ERROR: cannot read text file $corpusPath") unless -r $corpusPath;

my $ext = file_extension($corpusPath);
print "extension is $ext\n";

# The process id keeps concurrent runs from sharing a work directory.
$tempPath .= "/irstlm-build-tmp.$$";
eval { mkpath($tempPath); 1 }
  or die("ERROR: cannot create temporary directory $tempPath: $@");

# Step 1: add sentence boundary tags, decompressing the corpus if needed.
my $cmd;
if ($ext eq "gz")
{
  $cmd = "zcat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
else
{
  $cmd = "cat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
run_or_die($cmd);

# Step 2: build the intermediate LM from the tagged corpus.
$cmd = "IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -p -o $tempPath/iarpa.gz -k $cores";
run_or_die($cmd);

# Step 3: write the final text LM, compressing it when a .gz name was given.
$ext = file_extension($lmPath);
print "extension is $ext\n";

if ($ext eq "gz")
{
  $cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
}
else
{
  $cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes $lmPath";
}
run_or_die($cmd);

# Step 4: clean up. The LM already exists at this point, so a cleanup problem
# is reported but does not turn a successful run into a failure.
$cmd = "rm -rf $tempPath";
print STDERR "EXECUTING $cmd\n";
`$cmd`;
warn("WARNING: could not remove temporary directory $tempPath\n") if $? != 0;

print STDERR "FINISH.\n";

# Return the text after the last "." in $path, or the whole string when it
# contains no ".". Returns "" (never undef) when nothing follows the last ".",
# so callers can print and compare the result without warnings.
sub file_extension
{
  my ($path) = @_;
  return $path =~ m/([^.]+)$/ ? $1 : "";
}

# Log $command to STDERR, run it through the shell, and die unless it exits
# with status 0. The command's standard output is captured and discarded, as
# in the original backtick calls; its standard error is passed through.
sub run_or_die
{
  my ($command) = @_;
  print STDERR "EXECUTING $command\n";
  `$command`;
  my $status = $?;
  if ($status == -1)
  {
    die("ERROR: could not start command ($!): $command\n");
  }
  if ($status & 127)
  {
    die(sprintf("ERROR: command killed by signal %d: %s\n", $status & 127, $command));
  }
  if ($status >> 8)
  {
    die(sprintf("ERROR: command exited with status %d: %s\n", $status >> 8, $command));
  }
  return;
}