better error message when no corpus defined, better integration of IRSTLM training

This commit is contained in:
Philipp Koehn 2011-12-21 05:50:59 +00:00
parent b95c372e3a
commit cdf735b01b
7 changed files with 61 additions and 23 deletions

View File

@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
[LM]
### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
# srilm
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
# irstlm
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
#settings = ""
# order of the language model
order = 5
### tool to be used for training randomized language model from scratch

View File

@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
[LM]
### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
# srilm
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
# irstlm
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
#settings = ""
# order of the language model
order = 5
### tool to be used for training randomized language model from scratch

View File

@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
[LM]
### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
# srilm
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
# irstlm
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
#settings = ""
# order of the language model
order = 5
### tool to be used for training randomized language model from scratch

View File

@ -136,10 +136,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
[LM]
### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
# srilm
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
# irstlm
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
#settings = ""
# order of the language model
order = 5
### tool to be used for training randomized language model from scratch

View File

@ -126,10 +126,15 @@ raw-stem = $toy-data/nc-5k
[LM]
### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
# srilm
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
# irstlm
#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
#settings = ""
# order of the language model
order = 5
### tool to be used for training randomized language model from scratch

View File

@ -2264,12 +2264,13 @@ sub define_reporting_report {
### subs for step definition
sub get_output_and_input {
my ($step_id) = @_;
my ($step_id) = @_;
my $step = $DO_STEP[$step_id];
my $output = &get_default_file(&deconstruct_name($step));
my $step = $DO_STEP[$step_id];
my $output = &get_default_file(&deconstruct_name($step));
my @INPUT;
my @INPUT;
if (defined($USES_INPUT{$step_id})) {
for(my $i=0; $i<scalar @{$USES_INPUT{$step_id}}; $i++) {
# get name of input file needed
my $in_file = $USES_INPUT{$step_id}[$i];
@ -2301,7 +2302,8 @@ sub get_output_and_input {
push @INPUT,&get_specified_or_default_file(&deconstruct_name($in_file),
&deconstruct_name($prev_step));
}
return ($output,@INPUT);
}
return ($output,@INPUT);
}
sub define_template {
@ -2400,6 +2402,9 @@ sub define_template {
}
# input is defined as IN or IN0, IN1, IN2
else {
if ($cmd =~ /([^ANS])IN/ && scalar(@INPUT) == 0) {
die("ERROR: Step $step requires input from prior steps, but none defined.");
}
# Replace input placeholders in the command template with the actual
# input files: IN<n> becomes $INPUT[<n>], a bare IN becomes $INPUT[0].
# The [^ANS] guard on the preceding character avoids rewriting the "IN"
# inside words such as TRAINING and RECASING.
$cmd =~ s/([^ANS])IN(\d+)/$1$INPUT[$2]/g;
$cmd =~ s/([^ANS])IN/$1$INPUT[0]/g;
# A placeholder at the very start of the command has no preceding
# character to capture, so the index is capture group 1 here (not 2).
$cmd =~ s/^IN(\d+)/$INPUT[$1]/g;

View File

@ -17,34 +17,42 @@ use Getopt::Long;
my $order;
my $corpusPath;
my $lmPath;
my $cores;
my $cores = 2;
my $irstPath;
my $tempPath = "tmp";
GetOptions("order=s" => \$order,
"text=s" => \$corpusPath,
"lm=s" => \$lmPath,
"cores=s" => \$cores,
"irst-dir=s" => \$irstPath,
"temp-dir=s" => \$tempPath
) or exit 1;
die("ERROR: please set order") unless defined($order);
die("ERROR: please set text") unless defined($corpusPath);
die("ERROR: please set lm") unless defined($lmPath);
die("ERROR: please set irst-dir") unless defined($irstPath);
my $ext = ($corpusPath =~ m/([^.]+)$/)[0];
print "extension is $ext\n";
mkdir 'temp';
$tempPath .= "/irstlm-build-tmp.$$";
`mkdir -p $tempPath`;
my $cmd;
if ($ext eq "gz")
{
$cmd = "zcat $corpusPath | $irstPath/bin/add-start-end.sh | gzip -c > temp/monolingual.setagged.gz";
$cmd = "zcat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
else
{
$cmd = "cat $corpusPath | $irstPath/bin/add-start-end.sh | gzip -c > temp/monolingual.setagged.gz";
$cmd = "cat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
}
print STDERR "EXECUTING $cmd\n";
`$cmd`;
$cmd = "IRSTLM=$irstPath $irstPath/bin/build-lm.sh -t stat4 -i \"gunzip -c temp/monolingual.setagged.gz\" -n $order -p -o temp/iarpa.gz -k $cores";
$cmd = "IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -p -o $tempPath/iarpa.gz -k $cores";
print STDERR "EXECUTING $cmd\n";
`$cmd`;
@ -53,17 +61,17 @@ print "extension is $ext\n";
if ($ext eq "gz")
{
$cmd = "$irstPath/bin/compile-lm temp/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
$cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
}
else
{
$cmd = "$irstPath/bin/compile-lm temp/iarpa.gz --text yes $lmPath";
$cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes $lmPath";
}
print STDERR "EXECUTING $cmd\n";
`$cmd`;
$cmd = "rm -rf temp stat4";
$cmd = "rm -rf $tempPath";
print STDERR "EXECUTING $cmd\n";
`$cmd`;