mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-29 06:52:34 +03:00
better error message when no corpus defined, better integration of IRSTLM training
This commit is contained in:
parent
b95c372e3a
commit
cdf735b01b
@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
|
||||
[LM]
|
||||
|
||||
### tool to be used for language model training
|
||||
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
|
||||
#
|
||||
# srilm
|
||||
lm-training = $srilm-dir/ngram-count
|
||||
settings = "-interpolate -kndiscount -unk"
|
||||
|
||||
# irstlm
|
||||
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
|
||||
#settings = ""
|
||||
|
||||
# order of the language model
|
||||
order = 5
|
||||
|
||||
### tool to be used for training randomized language model from scratch
|
||||
|
@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
|
||||
[LM]
|
||||
|
||||
### tool to be used for language model training
|
||||
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
|
||||
#
|
||||
# srilm
|
||||
lm-training = $srilm-dir/ngram-count
|
||||
settings = "-interpolate -kndiscount -unk"
|
||||
|
||||
# irstlm
|
||||
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
|
||||
#settings = ""
|
||||
|
||||
# order of the language model
|
||||
order = 5
|
||||
|
||||
### tool to be used for training randomized language model from scratch
|
||||
|
@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
|
||||
[LM]
|
||||
|
||||
### tool to be used for language model training
|
||||
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
|
||||
#
|
||||
# srilm
|
||||
lm-training = $srilm-dir/ngram-count
|
||||
settings = "-interpolate -kndiscount -unk"
|
||||
|
||||
# irstlm
|
||||
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
|
||||
#settings = ""
|
||||
|
||||
# order of the language model
|
||||
order = 5
|
||||
|
||||
### tool to be used for training randomized language model from scratch
|
||||
|
@ -136,10 +136,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
|
||||
[LM]
|
||||
|
||||
### tool to be used for language model training
|
||||
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
|
||||
#
|
||||
# srilm
|
||||
lm-training = $srilm-dir/ngram-count
|
||||
settings = "-interpolate -kndiscount -unk"
|
||||
|
||||
# irstlm
|
||||
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
|
||||
#settings = ""
|
||||
|
||||
# order of the language model
|
||||
order = 5
|
||||
|
||||
### tool to be used for training randomized language model from scratch
|
||||
|
@ -126,10 +126,15 @@ raw-stem = $toy-data/nc-5k
|
||||
[LM]
|
||||
|
||||
### tool to be used for language model training
|
||||
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
|
||||
#
|
||||
# srilm
|
||||
lm-training = $srilm-dir/ngram-count
|
||||
settings = "-interpolate -kndiscount -unk"
|
||||
|
||||
# irstlm
|
||||
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
|
||||
#settings = ""
|
||||
|
||||
# order of the language model
|
||||
order = 5
|
||||
|
||||
### tool to be used for training randomized language model from scratch
|
||||
|
@ -2264,12 +2264,13 @@ sub define_reporting_report {
|
||||
### subs for step definition
|
||||
|
||||
sub get_output_and_input {
|
||||
my ($step_id) = @_;
|
||||
my ($step_id) = @_;
|
||||
|
||||
my $step = $DO_STEP[$step_id];
|
||||
my $output = &get_default_file(&deconstruct_name($step));
|
||||
my $step = $DO_STEP[$step_id];
|
||||
my $output = &get_default_file(&deconstruct_name($step));
|
||||
|
||||
my @INPUT;
|
||||
my @INPUT;
|
||||
if (defined($USES_INPUT{$step_id})) {
|
||||
for(my $i=0; $i<scalar @{$USES_INPUT{$step_id}}; $i++) {
|
||||
# get name of input file needed
|
||||
my $in_file = $USES_INPUT{$step_id}[$i];
|
||||
@ -2301,7 +2302,8 @@ sub get_output_and_input {
|
||||
push @INPUT,&get_specified_or_default_file(&deconstruct_name($in_file),
|
||||
&deconstruct_name($prev_step));
|
||||
}
|
||||
return ($output,@INPUT);
|
||||
}
|
||||
return ($output,@INPUT);
|
||||
}
|
||||
|
||||
sub define_template {
|
||||
@ -2400,6 +2402,9 @@ sub define_template {
|
||||
}
|
||||
# input is defined as IN or IN0, IN1, IN2
|
||||
else {
|
||||
# Refuse to build the command when the template refers to IN (not preceded
# by A, N or S) but no input files have been collected for this step.
if ($cmd =~ /([^ANS])IN/ && scalar(@INPUT) == 0) {
    die("ERROR: Step $step requires input from prior steps, but none defined.");
}
# Replace the numbered form INn first, then bare IN, so that "IN1" is not
# consumed by the bare-IN rule. The leading [^ANS] character is captured
# and re-emitted unchanged.
$cmd =~ s/([^ANS])IN(\d+)/$1$INPUT[$2]/g; # a bit trickier to
$cmd =~ s/([^ANS])IN/$1$INPUT[0]/g;       # avoid matching TRAINING, RECASING
# INn at the very start of the command: this pattern has a single capture
# group, so the index is $1 ($2 is undefined here and would always select
# $INPUT[0]).
$cmd =~ s/^IN(\d+)/$INPUT[$1]/g;
|
||||
|
@ -17,34 +17,42 @@ use Getopt::Long;
|
||||
my $order;
|
||||
my $corpusPath;
|
||||
my $lmPath;
|
||||
my $cores;
|
||||
my $cores = 2;
|
||||
my $irstPath;
|
||||
my $tempPath = "tmp";
|
||||
|
||||
GetOptions("order=s" => \$order,
|
||||
"text=s" => \$corpusPath,
|
||||
"lm=s" => \$lmPath,
|
||||
"cores=s" => \$cores,
|
||||
"irst-dir=s" => \$irstPath,
|
||||
"temp-dir=s" => \$tempPath
|
||||
) or exit 1;
|
||||
|
||||
die("ERROR: please set order") unless defined($order);
|
||||
die("ERROR: please set text") unless defined($corpusPath);
|
||||
die("ERROR: please set lm") unless defined($lmPath);
|
||||
die("ERROR: please set irst-dir") unless defined($irstPath);
|
||||
|
||||
my $ext = ($corpusPath =~ m/([^.]+)$/)[0];
|
||||
print "extension is $ext\n";
|
||||
|
||||
mkdir 'temp';
|
||||
$tempPath .= "/irstlm-build-tmp.$$";
|
||||
`mkdir -p $tempPath`;
|
||||
|
||||
my $cmd;
|
||||
if ($ext eq "gz")
|
||||
{
|
||||
$cmd = "zcat $corpusPath | $irstPath/bin/add-start-end.sh | gzip -c > temp/monolingual.setagged.gz";
|
||||
$cmd = "zcat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
|
||||
}
|
||||
else
|
||||
{
|
||||
$cmd = "cat $corpusPath | $irstPath/bin/add-start-end.sh | gzip -c > temp/monolingual.setagged.gz";
|
||||
$cmd = "cat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
|
||||
}
|
||||
print STDERR "EXECUTING $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
$cmd = "IRSTLM=$irstPath $irstPath/bin/build-lm.sh -t stat4 -i \"gunzip -c temp/monolingual.setagged.gz\" -n $order -p -o temp/iarpa.gz -k $cores";
|
||||
$cmd = "IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -p -o $tempPath/iarpa.gz -k $cores";
|
||||
print STDERR "EXECUTING $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
@ -53,17 +61,17 @@ print "extension is $ext\n";
|
||||
|
||||
if ($ext eq "gz")
|
||||
{
|
||||
$cmd = "$irstPath/bin/compile-lm temp/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
|
||||
$cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
|
||||
}
|
||||
else
|
||||
{
|
||||
$cmd = "$irstPath/bin/compile-lm temp/iarpa.gz --text yes $lmPath";
|
||||
$cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes $lmPath";
|
||||
}
|
||||
|
||||
print STDERR "EXECUTING $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
$cmd = "rm -rf temp stat4";
|
||||
$cmd = "rm -rf $tempPath";
|
||||
print STDERR "EXECUTING $cmd\n";
|
||||
`$cmd`;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user