better error message when no corpus defined, better integration of IRSTLM training

2024-12-29 06:52:34 +03:00 · 2011-12-21 05:50:59 +00:00 · 2011-12-21 05:50:59 +00:00 · cdf735b01b
commit cdf735b01b
parent b95c372e3a
7 changed files with 61 additions and 23 deletions
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]

 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 
-# 
+# SRILM (ngram-count)
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
+
+# IRSTLM alternative (via trainlm-irst.perl), disabled by default
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
+# order of the language model
 order = 5

 ### tool to be used for training randomized language model from scratch
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]

 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 
-# 
+# SRILM (ngram-count)
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
+
+# IRSTLM alternative (via trainlm-irst.perl), disabled by default
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
+# order of the language model
 order = 5

 ### tool to be used for training randomized language model from scratch
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@ -132,10 +132,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]

 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 
-# 
+# SRILM (ngram-count)
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
+
+# IRSTLM alternative (via trainlm-irst.perl), disabled by default
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
+# order of the language model
 order = 5

 ### tool to be used for training randomized language model from scratch
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@ -136,10 +136,15 @@ raw-stem = $wmt10-data/training/undoc.2000.$pair-extension
 [LM]

 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 
-# 
+# SRILM (ngram-count)
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
+
+# IRSTLM alternative (via trainlm-irst.perl), disabled by default
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
+# order of the language model
 order = 5

 ### tool to be used for training randomized language model from scratch
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@ -126,10 +126,15 @@ raw-stem = $toy-data/nc-5k
 [LM]

 ### tool to be used for language model training
-# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh) 
-# 
+# SRILM (ngram-count)
 lm-training = $srilm-dir/ngram-count
 settings = "-interpolate -kndiscount -unk"
+
+# IRSTLM alternative (via trainlm-irst.perl), disabled by default
+#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
+#settings = ""
+
+# order of the language model
 order = 5

 ### tool to be used for training randomized language model from scratch
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@ -2264,12 +2264,13 @@ sub define_reporting_report {
 ### subs for step definition

 sub get_output_and_input {
-    my ($step_id) = @_;
+  my ($step_id) = @_;

-    my $step = $DO_STEP[$step_id];
-    my $output = &get_default_file(&deconstruct_name($step));
+  my $step = $DO_STEP[$step_id];
+  my $output = &get_default_file(&deconstruct_name($step));

-    my @INPUT;
+  my @INPUT;
+  if (defined($USES_INPUT{$step_id})) { 
    for(my $i=0; $i<scalar @{$USES_INPUT{$step_id}}; $i++) {
 	# get name of input file needed
 	my $in_file = $USES_INPUT{$step_id}[$i];
@ -2301,7 +2302,8 @@ sub get_output_and_input {
 	push @INPUT,&get_specified_or_default_file(&deconstruct_name($in_file),
 						   &deconstruct_name($prev_step));
    }
-    return ($output,@INPUT);
+  }
+  return ($output,@INPUT);
 }

 sub define_template {
@ -2400,6 +2402,9 @@ sub define_template {
    }
    # input is defined as IN or IN0, IN1, IN2
    else {
+  if ($cmd =~ /([^ANS])IN/ && scalar(@INPUT) == 0) {
+    die("ERROR: Step $step requires input from prior steps, but none defined.");
+  }
 	$cmd =~ s/([^ANS])IN(\d+)/$1$INPUT[$2]/g;  # a bit trickier to
 	$cmd =~ s/([^ANS])IN/$1$INPUT[0]/g;        # avoid matching TRAINING, RECASING
 	$cmd =~ s/^IN(\d+)/$INPUT[$2]/g;
--- a/scripts/generic/trainlm-irst.perl
+++ b/scripts/generic/trainlm-irst.perl
@ -17,34 +17,42 @@ use Getopt::Long;
 my $order;
 my $corpusPath;
 my $lmPath;
-my $cores;
+my $cores = 2;
 my $irstPath;
+my $tempPath = "tmp";

 GetOptions("order=s"  => \$order,
           "text=s"   => \$corpusPath,
           "lm=s"     => \$lmPath,
           "cores=s"  => \$cores,
           "irst-dir=s"  => \$irstPath,
+           "temp-dir=s"  => \$tempPath
 	   ) or exit 1;

+die("ERROR: please set order") unless defined($order);
+die("ERROR: please set text") unless defined($corpusPath);
+die("ERROR: please set lm") unless defined($lmPath);
+die("ERROR: please set irst-dir") unless defined($irstPath);
+
 my $ext = ($corpusPath =~ m/([^.]+)$/)[0];
 print "extension is $ext\n";

-mkdir 'temp';
+$tempPath .= "/irstlm-build-tmp.$$";
+`mkdir -p $tempPath`;

 my $cmd;
 if ($ext eq "gz")
 {
-    $cmd = "zcat $corpusPath | $irstPath/bin/add-start-end.sh | gzip -c > temp/monolingual.setagged.gz";
+    $cmd = "zcat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
 }
 else
 {
-    $cmd = "cat $corpusPath | $irstPath/bin/add-start-end.sh | gzip -c > temp/monolingual.setagged.gz";
+    $cmd = "cat $corpusPath | $irstPath/add-start-end.sh | gzip -c > $tempPath/monolingual.setagged.gz";
 }
 print STDERR "EXECUTING $cmd\n";
 `$cmd`;

-$cmd = "IRSTLM=$irstPath $irstPath/bin/build-lm.sh -t stat4 -i \"gunzip -c temp/monolingual.setagged.gz\" -n $order -p -o temp/iarpa.gz -k $cores";
+$cmd = "IRSTLM=$irstPath/.. $irstPath/build-lm.sh -t $tempPath/stat4 -i \"gunzip -c $tempPath/monolingual.setagged.gz\" -n $order -p -o $tempPath/iarpa.gz -k $cores";
 print STDERR "EXECUTING $cmd\n";
 `$cmd`;

@ -53,17 +61,17 @@ print "extension is $ext\n";

 if ($ext eq "gz")
 {
-    $cmd = "$irstPath/bin/compile-lm temp/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
+    $cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes /dev/stdout | gzip -c > $lmPath";
 }
 else
 {
-    $cmd = "$irstPath/bin/compile-lm temp/iarpa.gz --text yes $lmPath";
+    $cmd = "$irstPath/compile-lm $tempPath/iarpa.gz --text yes $lmPath";
 }

 print STDERR "EXECUTING $cmd\n";
 `$cmd`;

-$cmd = "rm -rf temp stat4";
+$cmd = "rm -rf $tempPath";
 print STDERR "EXECUTING $cmd\n";
 `$cmd`;