support for sparse feature functions (mert support only when using PRO)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230
phkoehn 2011-09-07 16:37:33 +00:00
parent 9fee4a97f2
commit 41a1849437
16 changed files with 391 additions and 126 deletions
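This commit teaches mert to read sparse features from the n-best list: a token ending in ':' introduces a dense feature block (e.g. "d:", "lm:"), except that a name containing an underscore marks a sparse feature whose value follows as the next token. A minimal sketch of that classification, assuming a made-up feature string (feature names and values here are hypothetical, not taken from the commit):

// Illustrative sketch only -- mirrors the token checks added in
// Data::loadnbest(); feature names and values are made up.
#include <iostream>
#include <sstream>
#include <string>

int main() {
  // two dense blocks ("d:", "lm:") and one sparse name/value pair
  std::istringstream feats("d: 0.5 -1.2 lm: -20.1 word_pair_house_haus: 1");
  std::string tok;
  while (feats >> tok) {
    std::string::size_type loc = tok.find_last_of(":");
    if (loc != tok.size() - 1) {
      std::cout << "dense value: " << tok << "\n";    // no trailing ':'
    } else if (tok.find("_") != std::string::npos) {
      std::string value;
      feats >> value;                                 // sparse value follows its name
      std::cout << "sparse: " << tok << " " << value << "\n";
    } else {
      std::cout << "dense label: " << tok << "\n";    // e.g. "d:" or "lm:"
    }
  }
  return 0;
}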

View File

@ -13,7 +13,8 @@
Data::Data(Scorer& ptr):
- theScorer(&ptr)
+ theScorer(&ptr),
+   _sparse_flag(false)
{
score_type = (*theScorer).getName();
TRACE_ERR("Data::score_type " << score_type << std::endl);
@ -40,7 +41,6 @@ void Data::loadnbest(const std::string &file)
std::string theSentence;
std::string::size_type loc;
while (getline(inp,stringBuf,'\n')) {
if (stringBuf.empty()) continue;
@ -56,16 +56,15 @@ void Data::loadnbest(const std::string &file)
featentry.reset();
scoreentry.clear();
theScorer->prepareStats(sentence_index, theSentence, scoreentry);
scoredata->add(scoreentry, sentence_index);
getNextPound(stringBuf, substring, "|||"); //third field
// examine first line for name of features
if (!existsFeatureNames()) {
std::string stringsupport=substring;
// adding feature names
std::string features="";
std::string tmpname="";
@ -75,10 +74,17 @@ void Data::loadnbest(const std::string &file)
getNextPound(stringsupport, subsubstring);
// string ending with ":" are skipped, because they are the names of the features
if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
features+=tmpname+"_"+stringify(tmpidx)+" ";
tmpidx++;
- } else {
+ }
+ // ignore sparse feature name
+ else if (subsubstring.find("_") != string::npos) {
+   // also ignore its value
+   getNextPound(stringsupport, subsubstring);
+ }
+ // update current feature name
+ else {
tmpidx=0;
tmpname=subsubstring.substr(0,subsubstring.size() - 1);
}
@ -87,22 +93,36 @@ void Data::loadnbest(const std::string &file)
featdata->setFeatureMap(features);
}
// adding features
while (!substring.empty()) {
// TRACE_ERR("Decompounding: " << substring << std::endl);
getNextPound(substring, subsubstring);
- // string ending with ":" are skipped, because they are the names of the features
- if ((loc = subsubstring.find(":")) != subsubstring.length()-1) {
+ // no ':' -> feature value that needs to be stored
+ if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
featentry.add(ATOFST(subsubstring.c_str()));
}
+ // sparse feature name? store as well
+ else if (subsubstring.find("_") != string::npos) {
+   std::string name = subsubstring;
+   getNextPound(substring, subsubstring);
+   featentry.addSparse( name, atof(subsubstring.c_str()) );
+   _sparse_flag = true;
+ }
}
//cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
featdata->add(featentry,sentence_index);
}
inp.close();
}
// TODO
void Data::mergeSparseFeatures() {
std::cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";
exit(1);
}
// really not the right place...
float sentenceLevelBleuPlusOne( ScoreStats &stats ) {
float logbleu = 0.0;
@ -144,7 +164,7 @@ public:
};
- void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
+ void Data::sampleRankedPairs( const std::string &rankedpairfile ) {
cout << "Sampling ranked pairs." << endl;
ofstream *outFile = new ofstream();
@ -187,20 +207,15 @@ void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
for(unsigned int i=0; i<samples.size() && collected < n_samples; i++) {
if (samples[i]->getDiff() >= min_diff) {
collected++;
- FeatureStats &f1 = featdata->get(S,samples[i]->getTranslation1());
- FeatureStats &f2 = featdata->get(S,samples[i]->getTranslation2());
*out << "1";
- for(unsigned int j=0; j<f1.size(); j++)
-   if (abs(f1.get(j)-f2.get(j)) > 0.00001)
-     *out << " F" << j << " " << (f1.get(j)-f2.get(j));
- *out << endl;
+ outputSample( *out, featdata->get(S,samples[i]->getTranslation1()),
+               featdata->get(S,samples[i]->getTranslation2()) );
+ *out << endl;
*out << "0";
- for(unsigned int j=0; j<f1.size(); j++)
-   if (abs(f1.get(j)-f2.get(j)) > 0.00001)
-     *out << " F" << j << " " << (f2.get(j)-f1.get(j));
- *out << endl;
+ outputSample( *out, featdata->get(S,samples[i]->getTranslation2()),
+               featdata->get(S,samples[i]->getTranslation1()) );
+ *out << endl;
}
delete samples[i];
}
@ -209,3 +224,31 @@ void Data::sample_ranked_pairs( const std::string &rankedpairfile ) {
out->flush();
outFile->close();
}
void Data::outputSample( ostream &out, const FeatureStats &f1, const FeatureStats &f2 )
{
// difference in score in regular features
for(unsigned int j=0; j<f1.size(); j++)
if (abs(f1.get(j)-f2.get(j)) > 0.00001)
out << " F" << j << " " << (f1.get(j)-f2.get(j));
if (!hasSparseFeatures())
return;
// sparse features
const sparse_featstats_t &s1 = f1.getSparse();
const sparse_featstats_t &s2 = f2.getSparse();
for( sparse_featstats_t::const_iterator i=s1.begin(); i!=s1.end(); i++) {
if (s2.find(i->first) == s2.end())
out << " " << i->first << " " << i->second;
else {
float diff = i->second - s2.find(i->first)->second;
if (abs(diff) > 0.00001)
out << " " << i->first << " " << diff;
}
}
for( sparse_featstats_t::const_iterator i=s2.begin(); i!=s2.end(); i++) {
if (s1.find(i->first) == s1.end())
out << " " << i->first << " " << (- i->second);
}
}
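// Illustration only (not in the commit): each sampled pair now yields two
// complementary lines in the ranked-pair file, "1" followed by the feature
// differences f1-f2 and "0" with the signs flipped, sparse features emitted
// by name, e.g.
//   1 F0 0.25 F3 -0.5 word_pair_house_haus 1
//   0 F0 -0.25 F3 0.5 word_pair_house_haus -1
// (feature names and values made up).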

View File

@ -31,10 +31,10 @@ private:
Scorer* theScorer;
std::string score_type;
size_t number_of_scores; //number of scores
bool _sparse_flag;
public:
Data(Scorer& sc);
~Data() {};
inline void clear() {
@ -62,11 +62,16 @@ public:
featdata->Features(f);
}
inline bool hasSparseFeatures() const { return _sparse_flag; }
void mergeSparseFeatures();
void loadnbest(const std::string &file);
void load(const std::string &featfile,const std::string &scorefile) {
featdata->load(featfile);
scoredata->load(scorefile);
if (featdata->hasSparseFeatures())
_sparse_flag = true;
}
void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
@ -90,7 +95,8 @@ public:
return featdata->getFeatureIndex(name);
};
- void sample_ranked_pairs( const std::string &rankedPairFile );
+ void sampleRankedPairs( const std::string &rankedPairFile );
+ void outputSample( std::ostream &out, const FeatureStats &f1, const FeatureStats &f2 );
};

View File

@ -11,7 +11,7 @@
#include "Util.h"
FeatureArray::FeatureArray(): idx("")
FeatureArray::FeatureArray(): idx(""), _sparse_flag(false)
{};
void FeatureArray::savetxt(std::ofstream& outFile)
@ -69,6 +69,8 @@ void FeatureArray::loadtxt(ifstream& inFile, size_t n)
for (size_t i=0 ; i < n; i++) {
entry.loadtxt(inFile);
add(entry);
if (entry.getSparse().size()>0)
_sparse_flag = true;
}
}

View File

@ -30,6 +30,7 @@ protected:
featarray_t array_;
size_t number_of_features;
std::string features;
bool _sparse_flag;
private:
std::string idx; // idx to identify the utterance, it can differ from the index inside the vector
@ -43,6 +44,10 @@ public:
array_.clear();
}
inline bool hasSparseFeatures() const {
return _sparse_flag;
}
inline std::string getIndex() {
return idx;
}

View File

@ -51,9 +51,12 @@ void FeatureData::load(ifstream& inFile)
if (entry.size() == 0)
break;
- if (size() == 0) {
+ if (size() == 0)
setFeatureMap(entry.Features());
- }
+ if (entry.hasSparseFeatures())
+   _sparse_flag = true;
add(entry);
}
}

View File

@ -26,10 +26,10 @@ protected:
idx2name idx2arrayname_; //map from index to name of array
name2idx arrayname2idx_; //map from name to index of array
private:
size_t number_of_features;
std::string features;
bool _sparse_flag;
map<std::string, size_t> featname2idx_; //map from name to index of features
map<size_t, std::string> idx2featname_; //map from index to name of features
@ -43,6 +43,9 @@ public:
array_.clear();
}
inline bool hasSparseFeatures() const {
return _sparse_flag;
}
inline FeatureArray get(const std::string& idx) {
return array_.at(getIndex(idx));
}

View File

@ -30,6 +30,7 @@ FeatureStats::FeatureStats(const FeatureStats &stats)
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
map_ = stats.getSparse();
};
FeatureStats::FeatureStats(const size_t size)
@ -61,6 +62,11 @@ void FeatureStats::add(FeatureStatsType v)
array_[entries_++]=v;
}
void FeatureStats::addSparse(string name, FeatureStatsType v)
{
map_[name]=v;
}
void FeatureStats::set(std::string &theString)
{
std::string substring, stringBuf;
@ -68,7 +74,15 @@ void FeatureStats::set(std::string &theString)
while (!theString.empty()) {
getNextPound(theString, substring);
- add(ATOFST(substring.c_str()));
+ // regular feature
+ if (substring.find(":") == string::npos) {
+   add(ATOFST(substring.c_str()));
+ }
+ // sparse feature
+ else {
+   size_t separator = substring.find_last_of(":");
+   addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) );
+ }
}
}
@ -123,6 +137,7 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
entries_ = stats.size();
array_ = new FeatureStatsType[available_];
memcpy(array_,stats.getArray(),featbytes_);
map_ = stats.getSparse();
return *this;
}
@ -131,7 +146,14 @@ FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
/**write the whole object to a stream*/
ostream& operator<<(ostream& o, const FeatureStats& e)
{
- for (size_t i=0; i< e.size(); i++)
+ // print regular features
+ for (size_t i=0; i< e.size(); i++) {
o << e.get(i) << " ";
+ }
+ // sparse features
+ const sparse_featstats_t &sparse = e.getSparse();
+ for(sparse_featstats_t::const_iterator i = sparse.begin(); i != sparse.end(); i++) {
+   o << i->first << ":" << i->second << " ";
+ }
return o;
}
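// Illustration only (not in the commit): a serialized entry now reads
//   0.5 -1.2 word_pair_house_haus:1
// (values made up) -- dense values first, then name:value pairs; the ':'
// separator is what FeatureStats::set() splits on with find_last_of(":")
// when reading the entry back.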

View File

@ -26,6 +26,7 @@ class FeatureStats
{
private:
featstats_t array_;
sparse_featstats_t map_;
size_t entries_;
size_t available_;
@ -43,9 +44,11 @@ public:
}
void expand();
void add(FeatureStatsType v);
void addSparse(string name, FeatureStatsType v);
inline void clear() {
memset((void*) array_,0,featbytes_);
map_.clear();
}
inline FeatureStatsType get(size_t i) {
@ -57,6 +60,9 @@ public:
inline featstats_t getArray() const {
return array_;
}
inline sparse_featstats_t getSparse() const {
return map_;
}
void set(std::string &theString);

View File

@ -26,6 +26,7 @@ typedef vector<statscore_t> statscores_t;
typedef float FeatureStatsType;
typedef FeatureStatsType* featstats_t;
typedef map<string,FeatureStatsType> sparse_featstats_t;
//typedef vector<FeatureStatsType> featstats_t;
typedef vector<FeatureStats> featarray_t;
typedef vector<FeatureArray> featdata_t;

View File

@ -278,6 +278,12 @@ int main (int argc, char **argv)
PrintUserTime("Data loaded");
// starting point score over latest n-best, accumulative n-best
//vector<unsigned> bests;
//computing bests with sparse features still needs to be implemented
//currently sparse weights are not even loaded
//statscore_t score = TheScorer->score(bests);
if (tooptimizestr.length() > 0) {
cerr << "Weights to optimize: " << tooptimizestr << endl;
@ -305,16 +311,20 @@ int main (int argc, char **argv)
}
if (pairedrankfile.compare("") != 0) {
- D.sample_ranked_pairs(pairedrankfile);
+ D.sampleRankedPairs(pairedrankfile);
PrintUserTime("Stopping...");
exit(0);
}
// treat sparse features just like regular features
if (D.hasSparseFeatures()) {
D.mergeSparseFeatures();
}
Optimizer *O=OptimizerFactory::BuildOptimizer(pdim,tooptimize,start_list[0],type,nrandom);
O->SetScorer(TheScorer);
O->SetFData(D.getFeatureData());
#ifdef WITH_THREADS
cerr << "Creating a pool of " << threads << " threads" << endl;
Moses::ThreadPool pool(threads);

View File

@ -11,6 +11,7 @@
# Excerpts from revision history
# Sept 2011 multi-threaded mert (Barry Haddow)
# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK)
# Jul 2011 simplifications (Ondrej Bojar)
# -- rely on moses' -show-weights instead of parsing moses.ini
# ... so moses is also run once *before* mert starts, checking
@ -287,8 +288,6 @@ $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper
$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl"
if !defined $moses_parallel_cmd;
if (!defined $mertdir) {
$mertdir = "$SCRIPTS_ROOTDIR/../mert";
print STDERR "Assuming --mertdir=$mertdir\n";
@ -357,13 +356,11 @@ die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_par
die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
die "Not executable: $___DECODER" if ! -x $___DECODER;
my $input_abs = ensure_full_path($___DEV_F);
die "File not found: $___DEV_F (interpreted as $input_abs)."
if ! -e $input_abs;
$___DEV_F = $input_abs;
# Option to pass to qsubwrapper and moses-parallel
my $pass_old_sge = $old_sge ? "-old-sge" : "";
@ -372,7 +369,6 @@ die "File not executable: $___DECODER (interpreted as $decoder_abs)."
if ! -x $decoder_abs;
$___DECODER = $decoder_abs;
my $ref_abs = ensure_full_path($___DEV_E);
# check if English dev set (reference translations) exist and store a list of all references
my @references;
@ -409,9 +405,6 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;
#store current directory and create the working directory (if needed)
my $cwd = `pawd 2>/dev/null`;
if(!$cwd){$cwd = `pwd`;}
@ -431,17 +424,16 @@ my $mert_logfile = "mert.log";
my $weights_in_file = "init.opt";
my $weights_out_file = "weights.txt";
# set start run
my $start_run = 1;
my $bestpoint = undef;
my $devbleu = undef;
my $sparse_weights_file = undef;
my $prev_feature_file = undef;
my $prev_score_file = undef;
my $prev_init_file = undef;
if ($___FILTER_PHRASE_TABLE) {
my $outdir = "filtered";
if (-e "$outdir/moses.ini") {
@ -471,7 +463,6 @@ else{
$___CONFIG_ORIG = $___CONFIG;
}
# we run moses to check validity of moses.ini and to obtain all the feature
# names
my $featlist = get_featlist_from_moses($___CONFIG);
@ -579,28 +570,19 @@ if ($continue) {
print STDERR "All needed data are available\n";
print STDERR "Loading information from last step ($step)\n";
open(IN,"run$step.$mert_logfile") or die "Can't open run$step.$mert_logfile";
while (<IN>) {
if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
$bestpoint = $1;
$devbleu = $2;
last;
}
}
close IN;
my %dummy; # sparse features
($bestpoint,$devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile",scalar @{$featlist->{"names"}},\%dummy);
die "Failed to parse mert.log, missed Best point there."
if !defined $bestpoint || !defined $devbleu;
print "($step) BEST at $step $bestpoint => $devbleu at ".`date`;
my @newweights = split /\s+/, $bestpoint;
# Sanity check: order of lambdas must match
sanity_check_order_of_lambdas($featlist,
"gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
# update my cache of lambda values
$featlist->{"values"} = \@newweights;
}
else{
print STDERR "No previous data are needed\n";
@ -630,10 +612,10 @@ while(1) {
print "run $run start at ".`date`;
# In case something dies later, we might wish to have a copy
- create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
+ create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"),$sparse_weights_file);
- # skip if the user wanted
+ # skip running the decoder if the user wanted
if (!$skip_decoder) {
print "($run) run decoder to produce n-best lists\n";
$nbest_file = run_decoder($featlist, $run, $need_to_normalize);
@ -648,8 +630,6 @@ while(1) {
$need_to_normalize = 0;
}
# extract score statistics and features from the nbest lists
print STDERR "Scoring the nbestlist.\n";
@ -740,7 +720,7 @@ while(1) {
if ! -s $weights_out_file;
# backup copies
safesystem ("\\cp -f extract.err run$run.extract.err") or die;
safesystem ("\\cp -f extract.out run$run.extract.out") or die;
if ($___PAIRWISE_RANKED_OPTIMIZER) { safesystem ("\\cp -f pro.data run$run.pro.data") or die; }
@ -751,34 +731,10 @@ while(1) {
print "run $run end at ".`date`;
$bestpoint = undef;
$devbleu = undef;
- if ($___PAIRWISE_RANKED_OPTIMIZER) {
-   open(IN,"run$run.$mert_outfile") or die "Can't open run$run.$mert_outfile";
-   my (@WEIGHT,$sum);
-   foreach (@CURR) { push @WEIGHT, 0; }
-   while(<IN>) {
-     if (/^F(\d+) ([\-\.\de]+)/) {
-       $WEIGHT[$1] = $2;
-       $sum += abs($2);
-     }
-   }
-   $devbleu = "unknown";
-   foreach (@WEIGHT) { $_ /= $sum; }
-   $bestpoint = join(" ",@WEIGHT);
-   close IN;
- }
- else {
-   open(IN,"run$run.$mert_logfile") or die "Can't open run$run.$mert_logfile";
-   while (<IN>) {
-     if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
-       $bestpoint = $1;
-       $devbleu = $2;
-       last;
-     }
-   }
-   close IN;
- }
+ my %sparse_weights; # sparse features
+ ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights);
die "Failed to parse mert.log, missed Best point there."
if !defined $bestpoint || !defined $devbleu;
print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`;
@ -788,6 +744,15 @@ while(1) {
$featlist->{"values"} = \@newweights;
if (scalar keys %sparse_weights) {
$sparse_weights_file = "run".($run+1).".sparse-weights";
open(SPARSE,">".$sparse_weights_file);
foreach my $feature (keys %sparse_weights) {
print SPARSE "$feature $sparse_weights{$feature}\n";
}
close(SPARSE);
}
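# Illustration only (not in the commit): runN.sparse-weights holds one
# "name weight" pair per line, e.g. with a hypothetical feature
#   word_pair_house_haus 0.05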
## additional stopping criterion: weights have not changed
my $shouldstop = 1;
for(my $i=0; $i<@CURR; $i++) {
@ -864,6 +829,43 @@ chdir($cwd);
} # end of local scope
sub get_weights_from_mert {
my ($outfile,$logfile,$weight_count,$sparse_weights) = @_;
my ($bestpoint,$devbleu);
if ($___PAIRWISE_RANKED_OPTIMIZER) {
open(IN,$outfile) or die "Can't open $outfile";
my (@WEIGHT,$sum);
for(my $i=0;$i<$weight_count;$i++) { push @WEIGHT, 0; }
while(<IN>) {
# regular features
if (/^F(\d+) ([\-\.\de]+)/) {
$WEIGHT[$1] = $2;
$sum += abs($2);
}
# sparse features
elsif(/^(.+_.+) ([\-\.\de]+)/) {
$$sparse_weights{$1} = $2;
}
}
$devbleu = "unknown";
foreach (@WEIGHT) { $_ /= $sum; }
$bestpoint = join(" ",@WEIGHT);
close IN;
}
else {
open(IN,$logfile) or die "Can't open $logfile";
while (<IN>) {
if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
$bestpoint = $1;
$devbleu = $2;
last;
}
}
close IN;
}
return ($bestpoint,$devbleu);
}
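# Illustration only (not in the commit): under PRO, the mert output file
# parsed above is expected to hold lines like
#   F0 0.25
#   F1 -0.31
#   word_pair_house_haus 0.05
# (values made up). F<i> lines become dense weights, normalized by the sum
# of their absolute values; any "name weight" line whose name contains an
# underscore goes into %$sparse_weights unnormalized.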
sub run_decoder {
my ($featlist, $run, $need_to_normalize) = @_;
my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
@ -984,6 +986,7 @@ sub get_featlist_from_moses {
$nr++;
chomp;
my ($longname, $feature, $value) = split / /;
next if $value eq "sparse";
push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
if $value !~ /^[+-]?[0-9.e]+$/;
push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
@ -1015,14 +1018,20 @@ sub get_order_of_scores_from_nbestlist {
my @order = ();
my $label = undef;
+ my $sparse = 0; # we ignore sparse features here
foreach my $tok (split /\s+/, $scores) {
- if ($tok =~ /^([a-z][0-9a-z]*):/i) {
+ if ($tok =~ /.+_.+:/) {
+   $sparse = 1;
+ } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) {
$label = $1;
} elsif ($tok =~ /^-?[-0-9.e]+$/) {
- # a score found, remember it
- die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
-   if !defined $label;
- push @order, $label;
+ if (!$sparse) {
+   # a score found, remember it
+   die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
+     if !defined $label;
+   push @order, $label;
+ }
+ $sparse = 0;
} else {
die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
}
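# Illustration only (not in the commit): for a hypothetical scores string
#   "d: 0.3 0.2 lm: -20.1 word_pair_house_haus: 1 tm: 0.5"
# the recorded order is (d, d, lm, tm); the sparse "name: value" pair is
# skipped because its label contains an underscore.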
@ -1037,6 +1046,7 @@ sub create_config {
my $featlist = shift; # the lambdas we should write
my $iteration = shift; # just for verbosity
my $bleu_achieved = shift; # just for verbosity
my $sparse_weights_file = shift; # only defined when optimizing sparse features
my %P; # the hash of all parameters we wish to override
@ -1076,6 +1086,10 @@ sub create_config {
push @{$P{$name}}, $val;
}
if (defined($sparse_weights_file)) {
push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file;
}
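# Illustration only (not in the commit): with a sparse weights file defined,
# the generated moses.ini gains a section like
#   [weights-file]
#   /path/to/working-dir/run2.sparse-weights
# (path hypothetical).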
# create new moses.ini decoder config file by cloning and overriding the original one
open(INI,$infn) or die "Can't read $infn";
delete($P{"config"}); # never output

View File

@ -36,10 +36,15 @@ using namespace std;
bool hierarchicalFlag = false;
bool onlyDirectFlag = false;
bool phraseCountFlag = true;
bool lowCountFlag = false;
bool goodTuringFlag = false;
bool kneserNeyFlag = false;
bool logProbFlag = false;
- char line[LINE_MAX_LENGTH];
- void processFiles( char*, char*, char* );
+ inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
+ char line[LINE_MAX_LENGTH];
+ void processFiles( char*, char*, char*, char* );
+ void loadCountOfCounts( char* );
bool getLine( istream &fileP, vector< string > &item );
vector< string > splitLine();
@ -55,6 +60,7 @@ int main(int argc, char* argv[])
char* &fileNameDirect = argv[1];
char* &fileNameIndirect = argv[2];
char* &fileNameConsolidated = argv[3];
char* fileNameCountOfCounts = NULL;
for(int i=4; i<argc; i++) {
if (strcmp(argv[i],"--Hierarchical") == 0) {
@ -66,6 +72,25 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
phraseCountFlag = false;
cerr << "not including the phrase count feature\n";
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
goodTuringFlag = true;
if (i+1==argc) {
cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
exit(1);
}
fileNameCountOfCounts = argv[++i];
cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
} else if (strcmp(argv[i],"--KneserNey") == 0) {
kneserNeyFlag = true;
if (i+1==argc) {
cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
exit(1);
}
fileNameCountOfCounts = argv[++i];
cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
} else if (strcmp(argv[i],"--LowCountFeature") == 0) {
lowCountFlag = true;
cerr << "including the low count feature\n";
} else if (strcmp(argv[i],"--LogProb") == 0) {
logProbFlag = true;
cerr << "using log-probabilities\n";
@ -75,11 +100,61 @@ int main(int argc, char* argv[])
}
}
- processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated );
+ processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
}
- void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated )
vector< float > countOfCounts;
vector< float > goodTuringDiscount;
float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
void loadCountOfCounts( char* fileNameCountOfCounts )
{
Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
if (fileCountOfCounts.fail()) {
cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl;
exit(1);
}
istream &fileP = fileCountOfCounts;
countOfCounts.push_back(0.0);
while(1) {
if (fileP.eof()) break;
SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
if (fileP.eof()) break;
if (totalCount < 0)
totalCount = atof(line); // total number of distinct phrase pairs
else
countOfCounts.push_back( atof(line) );
}
fileCountOfCounts.Close();
// compute Good Turing discounts
if (goodTuringFlag) {
goodTuringDiscount.push_back(0.01); // floor value
for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
if (goodTuringDiscount[i]>1)
goodTuringDiscount[i] = 1;
if (goodTuringDiscount[i]<goodTuringDiscount[i-1])
goodTuringDiscount[i] = goodTuringDiscount[i-1];
}
}
// compute Kneser Ney coefficients [Chen&Goodman, 1998]
float Y = countOfCounts[1] / (countOfCounts[1] + 2*countOfCounts[2]);
kneserNey_D1 = 1 - 2*Y * countOfCounts[2] / countOfCounts[1];
kneserNey_D2 = 2 - 3*Y * countOfCounts[3] / countOfCounts[2];
kneserNey_D3 = 3 - 4*Y * countOfCounts[4] / countOfCounts[3];
// sanity constraints
if (kneserNey_D1 > 0.9) kneserNey_D1 = 0.9;
if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9;
if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
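// Illustration only (not in the commit): with made-up counts n1=100, n2=40,
// n3=20, n4=10 we get Y = 100/(100+2*40) = 0.56, so D1 = 1-2*0.56*40/100 = 0.56,
// D2 = 2-3*0.56*20/40 = 1.17, D3 = 3-4*0.56*10/20 = 1.89, all within the
// sanity bounds above; the Good Turing discount for count 1 would be
// 2*(40+0.1)/(100+0.1) = 0.80.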
}
+ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
{
if (goodTuringFlag || kneserNeyFlag)
loadCountOfCounts( fileNameCountOfCounts );
// open input files
Moses::InputFileStream fileDirect(fileNameDirect);
Moses::InputFileStream fileIndirect(fileNameIndirect);
@ -134,29 +209,67 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
// output hierarchical phrase pair (with separated labels)
fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
- // probs
- fileConsolidated << " ||| ";
- if (!onlyDirectFlag) {
-   fileConsolidated << itemIndirect[2]; // prob indirect
+ // SCORES ...
+ fileConsolidated << " |||";
vector<string> directCounts = tokenize(itemDirect[4].c_str());
vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
float countF = atof(directCounts[0].c_str());
float countE = atof(indirectCounts[0].c_str());
float countEF = atof(indirectCounts[1].c_str());
float n1_F, n1_E;
if (kneserNeyFlag) {
n1_F = atof(directCounts[2].c_str());
n1_E = atof(indirectCounts[2].c_str());
}
fileConsolidated << " " << itemDirect[2]; // prob direct
// Good Turing discounting
float adjustedCountEF = countEF;
if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
float adjustedCountEF_indirect = adjustedCountEF;
// Kneser Ney discounting [Foster et al, 2006]
if (kneserNeyFlag) {
float D = kneserNey_D3;
if (countEF < 2) D = kneserNey_D1;
if (countEF < 3) D = kneserNey_D2;
if (D > countEF) D = countEF - 0.01; // sanity constraint
float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
float alpha_F = D * n1_F / countF; // available mass
adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
// for indirect
float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
float alpha_E = D * n1_E / countE; // available mass
adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
}
// prob indirect
if (!onlyDirectFlag) {
fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
fileConsolidated << " " << itemIndirect[2];
}
// prob direct
fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
fileConsolidated << " " << itemDirect[2];
// phrase count feature
if (phraseCountFlag) {
fileConsolidated << " " << (logProbFlag ? 1 : 2.718); // phrase count feature
fileConsolidated << " " << maybeLogProb(2.718);
}
// low count feature
if (lowCountFlag) {
fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
}
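// Illustration only (not in the commit): the low count feature decays
// quickly with evidence, e.g. exp(-1/1) = 0.37 for a pair seen once but
// exp(-1/10) = 0.90 for one seen ten times.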
// alignment
fileConsolidated << " ||| " << itemDirect[3];
// counts, for debugging
- vector<string> directCounts = tokenize(itemDirect[4].c_str());
- vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
- fileConsolidated << "||| " << indirectCounts[0] << " " << directCounts[0];
// output rule count if present in either file
if (directCounts.size() > 1) {
fileConsolidated << " " << directCounts[1];
} else if (indirectCounts.size() > 1) {
fileConsolidated << " " << indirectCounts[1];
}
fileConsolidated << "||| " << countE << " " << countF; // << " " << countEF;
fileConsolidated << endl;
}
@ -165,6 +278,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated.close();
}
bool getLine( istream &fileP, vector< string > &item )
{
if (fileP.eof())

View File

@ -45,7 +45,7 @@
#include "tables-core.h"
#include "XmlTree.h"
- #define LINE_MAX_LENGTH 60000
+ #define LINE_MAX_LENGTH 500000
using namespace std;

View File

@ -24,7 +24,7 @@
using namespace std;
- #define LINE_MAX_LENGTH 60000
+ #define LINE_MAX_LENGTH 500000
// HPhraseVertex represents a point in the alignment matrix
typedef pair <int, int> HPhraseVertex;

View File

@ -29,6 +29,10 @@ vector<string> tokenize( const char* input )
return token;
}
bool isNonTerminal( const WORD &symbol ) {
return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
}
WORD_ID Vocabulary::storeIfNew( const WORD& word )
{
map<WORD, WORD_ID>::iterator i = lookup.find( word );

View File

@ -34,9 +34,10 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
$_ADDITIONAL_INI,
$_DICTIONARY, $_EPPEX);
- my $debug = 0; # debug this script, do not delete any files in debug mode
+ my $debug = 1; # debug this script, do not delete any files in debug mode
# the following line is set installation time by 'make release'. BEWARE!
my $BINDIR="/home/pkoehn/statmt/bin";
@ -109,7 +110,7 @@ $_HELP = 1
'memscore:s' => \$_MEMSCORE,
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
'eppex:s' => \$_EPPEX,
'additional-ini=s' => \$_ADDITIONAL_INI
);
if ($_HELP) {
@ -1372,11 +1373,28 @@ sub score_phrase {
sub score_phrase_phrase_extract {
my ($ttable_file,$lexical_file,$extract_file) = @_;
- # remove consolidation options
my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/);
- my $CORE_SCORE_OPTIONS = defined($_SCORE_OPTIONS) ? $_SCORE_OPTIONS : "";
- $CORE_SCORE_OPTIONS =~ s/\-+OnlyDirect//i;
- $CORE_SCORE_OPTIONS =~ s/\-+NoPhraseCount//i;
my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/);
my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/);
my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E);
if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) {
$UNALIGNED_FW_COUNT = 1;
$UNALIGNED_FW_F = $1;
$UNALIGNED_FW_E = $2;
}
my $GOOD_TURING = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /GoodTuring/);
my $KNESER_NEY = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /KneserNey/);
my $LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LogProb/);
my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/);
my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/);
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
my $CORE_SCORE_OPTIONS = "";
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
$CORE_SCORE_OPTIONS .= " --NegLogProb" if $NEG_LOG_PROB;
$CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
my $substep = 1;
for my $direction ("f2e","e2f") {
next if $___CONTINUE && -e "$ttable_file.half.$direction";
@ -1405,6 +1423,11 @@ sub score_phrase_phrase_extract {
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
$cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
$cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING && $inverse eq "";
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
$cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
$cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
print $cmd."\n";
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
@ -1423,8 +1446,13 @@ sub score_phrase_phrase_extract {
return if $___CONTINUE && -e "$ttable_file.gz";
my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --LogProb" if $LOG_PROB;
$cmd .= " --NegLogProb" if $NEG_LOG_PROB;
$cmd .= " --OnlyDirect" if $ONLY_DIRECT;
$cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
$cmd .= " --LowCountFeature" if $LOW_COUNT;
$cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING;
$cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
if (! $___DONT_ZIP) {
@ -1681,6 +1709,13 @@ sub create_ini {
[ttable-file]\n";
my $num_of_ttables = 0;
my @SPECIFIED_TABLE = @_PHRASE_TABLE;
my $basic_weight_count = 4; # both directions, lex and phrase
$basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del
$basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty/;
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
$basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
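# Illustration only (not in the commit): with default score options this
# yields 4 translation model weights plus the phrase count feature = 5;
# adding UnalignedPenalty gives 4+2+1 = 7, while OnlyDirect with NoLex
# leaves (4-2)/2 = 1 plus the phrase count feature = 2.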
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
$num_of_ttables++;
my $ff = $f;
@ -1688,10 +1723,6 @@ sub create_ini {
my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
$file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
my $phrase_table_impl = ($_HIERARCHICAL ? 6 : 0);
- my $basic_weight_count = 4; # both directions, lex and phrase
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
- $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
print INI "$phrase_table_impl $ff $basic_weight_count $file\n";
}
if ($_GLUE_GRAMMAR) {
@ -1783,10 +1814,6 @@ sub create_ini {
print INI "\n\n# translation model weights\n[weight-t]\n";
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
- my $basic_weight_count = 4; # both directions, lex and phrase
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
- $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
- $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
for(1..$basic_weight_count) {
printf INI "%.2f\n", 1/$basic_weight_count;
}
@ -1826,6 +1853,11 @@ sub create_ini {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
if ($_ADDITIONAL_INI) {
print INI "\n# additional settings\n\n";
foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
}
close(INI);
}