Enabled the --continue option to restart an interrupted MERT run from the last finished step

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@2568 1f5c12ca-751b-0410-a591-d2e778427230
2024-12-28 14:32:38 +03:00 · 2009-10-07 16:35:57 +00:00 · 2009-10-07 16:35:57 +00:00 · 124f88e55a
commit 124f88e55a
parent e25b8c41b7
1 changed files with 86 additions and 27 deletions
--- a/scripts/training/mert-moses-new.pl
+++ b/scripts/training/mert-moses-new.pl
@ -497,38 +497,101 @@ my $weights_out_file = "weights.txt";

 # set start run
 my $start_run = 1;
+my $bestpoint = undef;
+my $devbleu = undef;
+
+my $prev_feature_file = undef;
+my $prev_score_file = undef;

 if ($continue) {
-  die "continue not yet supported by the new mert script\nNeed to load features and scores from last iteration\n";
-  # need to load last best values
+  # getting the last finished step
  print STDERR "Trying to continue an interrupted optimization.\n";
  open IN, "finished_step.txt" or die "Failed to find the step number, failed to read finished_step.txt";
  my $step = <IN>;
  chomp $step;
-  $step++;
  close IN;

-  if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz") {
-    # allow stepping one extra iteration back
-    $step--;
-    die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!"
-      if ! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz";
+  print STDERR "Last finished step is $step\n";
+
+  # getting the first needed step
+  my $firststep;
+  if ($prev_aggregate_nbl_size==-1){
+    $firststep=1;
+  }
+  else{
+    $firststep=$step-$prev_aggregate_nbl_size+1;
+    $firststep=($firststep>0)?$firststep:1;
+  }
+
+  # check whether all needed data are available
+  if ($firststep<=$step){
+    print STDERR "First previous needed data index is $firststep\n";
+    print STDERR "Checking whether all needed data (from step $firststep to step $step) are available\n";
+    
+    for (my $prevstep=$firststep; $prevstep<=$step;$prevstep++){
+      print STDERR "Checking whether data of step $prevstep are available\n";
+      if (! -e "run$prevstep.features.dat"){
+	die "Can't start from step $step, because run$prevstep.features.dat was not found!";
+      }else{
+	if (defined $prev_feature_file){
+	  $prev_feature_file = "${prev_feature_file},run$prevstep.features.dat";
+	}
+	else{
+	  $prev_feature_file = "run$prevstep.features.dat";
+	}
+      }
+      if (! -e "run$prevstep.scores.dat"){
+	die "Can't start from step $step, because run$prevstep.scores.dat was not found!";
+      }else{
+	if (defined $prev_score_file){
+	  $prev_score_file = "${prev_score_file},run$prevstep.scores.dat";
+	}
+	else{
+	  $prev_score_file = "run$prevstep.scores.dat";
+	}
+      }
+    }
+    if (! -e "run$step.weights.txt"){
+      die "Can't start from step $step, because run$step.weights.txt was not found!";
+    }
+    if (! -e "run$step.$mert_logfile"){
+      die "Can't start from step $step, because run$step.$mert_logfile was not found!";
+    }
+    if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz"){
+      die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!";
+    }
+    print STDERR "All needed data are available\n";
+
+    print STDERR "Loading information from last step ($step)\n";
+    open(IN,"run$step.$mert_logfile") or die "Can't open run$step.$mert_logfile";
+    while (<IN>) {
+      if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
+	$bestpoint = $1;
+	$devbleu = $2;
+	last;
+      }
+    }
+    close IN;
+    die "Failed to parse mert.log, missed Best point there."
+      if !defined $bestpoint || !defined $devbleu;
+    print "($step) BEST at $step $bestpoint => $devbleu at ".`date`;
+    
+    my @newweights = split /\s+/, $bestpoint;
+    
+    
+    print STDERR "Reading last cached lambda values (result from step $step)\n";
+    @order_of_lambdas_from_decoder = get_order_of_scores_from_nbestlist("gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
+    
+    
+    # update my cache of lambda values
+    store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);
+    
+  }
+  else{
+    print STDERR "No pevious data are needed\n";
  }

  $start_run = $step +1;
-
-  print STDERR "Reading last cached lambda values (result from step $step)\n";
-  @order_of_lambdas_from_decoder = get_order_of_scores_from_nbestlist("gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
-
-  open IN, "$weights_out_file" or die "Can't read $weights_out_file";
-  my $newweights = <IN>;
-  chomp $newweights;
-  close IN;
-  my @newweights = split /\s+/, $newweights;
-
-  #dump_triples(\%used_triples);
-  store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);
-  #dump_triples(\%used_triples);
 }

 if ($___FILTER_PHRASE_TABLE){
@ -557,8 +620,6 @@ my $PARAMETERS;
 #$PARAMETERS = $___DECODER_FLAGS . " -config $___CONFIG -inputtype $___INPUTTYPE";
 $PARAMETERS = $___DECODER_FLAGS;

-my $devbleu = undef;
-my $bestpoint = undef;
 my $run=$start_run-1;

 my $oldallsorted = undef;
@ -566,8 +627,6 @@ my $allsorted = undef;

 my $cmd;
 # features and scores from the last run.
-my $prev_feature_file=undef;
-my $prev_score_file=undef;
 my $nbest_file=undef;

 while(1) {
@ -770,8 +829,8 @@ while(1) {
      $prev_score_file = "run${i}.${base_score_file}";
    }
  }
-  print "loading data from $prev_feature_file\n";
-  print "loading data from $prev_score_file\n";
+  print "loading data from $prev_feature_file\n" if defined($prev_feature_file);
+  print "loading data from $prev_score_file\n" if defined($prev_score_file);
 }
 print "Training finished at ".`date`;