Mmsapt/ug_bitext:

- Sampling now uses a static seed for consistency between runs. - (In order to have consistency guarantees, specify workers=1 in the - Mmsapt line in moses.ini; otherwise the randomness of thread scheduling may cause inconsistencies between runs. - Denominator for fwd phrase probabilities can now be chosen via an external argument.
2024-12-25 21:03:22 +03:00 · 2014-05-31 14:29:44 +01:00 · 2014-05-31 14:29:44 +01:00 · c452875343
commit c452875343
parent 0efea15dbe
3 changed files with 22 additions and 15 deletions
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@ -1,4 +1,4 @@
-//-*- c++-mode -*-
+//-*- c++ -*-

 #include "ug_bitext.h"
 #include <algorithm>
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@ -25,6 +25,7 @@
 #include <boost/unordered_map.hpp>
 #include <boost/foreach.hpp>
 #include <boost/thread.hpp>
+#include <boost/random.hpp>

 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
 #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
@ -220,7 +221,7 @@ namespace Moses {
    PScorePfwd : public PhraseScorer<Token>
    {
      float conf;
-      int denom;
+      char denom;
    public:
      PScorePfwd() 
      {
@ -228,7 +229,7 @@ namespace Moses {
      }

      int 
-      init(int const i, float const c, int d=0) 
+      init(int const i, float const c, char d=0) 
      { 
 	conf  = c; 
 	denom = d;
@ -249,13 +250,13 @@ namespace Moses {
 	  }
 	switch (denom)
 	  {
-	  case 0: 
+	  case 'g': 
 	    (*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf)); 
 	    break;
-	  case 1: 
+	  case 's': 
 	    (*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf)); 
 	    break;
-	  case 2:
+	  case 'r':
 	    (*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf)); 
 	  }
      }
@ -281,10 +282,11 @@ namespace Moses {
      }

      void 
-      operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
+      operator()(Bitext<Token> const& bt, PhrasePair& pp, 
+		 vector<float> * dest = NULL) const
      {
 	if (!dest) dest = &pp.fvals;
-	(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
+	(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint),pp.joint,conf));
      }
    };

@ -551,8 +553,10 @@ namespace Moses {
      class job 
      {
 	static ThreadSafeCounter active;
-	boost::mutex      lock; 
+	boost::mutex lock; 
 	friend class agenda;
+	boost::taus88 rnd; // every job has its own pseudo random generator 
+	double rnddenom;   // denominator for scaling random sampling
      public:
 	size_t         workers; // how many workers are working on this job?
 	sptr<TSA<Token> const> root; // root of the underlying suffix array
@ -560,7 +564,8 @@ namespace Moses {
 	char const*       stop; // end of index range
 	size_t     max_samples; // how many samples to extract at most
 	size_t             ctr; /* # of phrase occurrences considered so far
-				 * # of samples chosen is stored in stats->good */
+				 * # of samples chosen is stored in stats->good 
+				 */
 	size_t             len; // phrase length
 	bool               fwd; // if true, source phrase is L1 
 	sptr<pstats>     stats; // stores statistics collected during sampling
@ -622,7 +627,7 @@ namespace Moses {
 	      {
 		boost::lock_guard<boost::mutex> sguard(stats->lock);
 		if (stats->raw_cnt == ctr) ++stats->raw_cnt;
-		size_t rnum = randInt(stats->raw_cnt - ctr++);
+		size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.));
 		if (rnum < max_samples - stats->good)
 		  {
 		    stats->sample_cnt++;
@ -770,7 +775,9 @@ namespace Moses {
    job::
    job(typename TSA<Token>::tree_iterator const& m, 
 	sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
-      : workers(0)
+      : rnd(0)
+      , rnddenom(rnd.max() + 1.)
+      , workers(0)
      , root(r)
      , next(m.lower_bound(-1))
      , stop(m.upper_bound(-1))
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@ -139,15 +139,15 @@ namespace Moses
    size_t num_feats;
    // TO DO: should we use different lbop parameters 
    //        for the relative-frequency based features?
-    num_feats  = calc_pfwd_fix.init(0,m_lbop_parameter);
+    num_feats  = calc_pfwd_fix.init(0,m_lbop_parameter,m_pfwd_denom);
    num_feats  = calc_pbwd_fix.init(num_feats,m_lbop_parameter);
    num_feats  = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
    num_feats  = apply_pp.init(num_feats);
    if (num_feats < this->m_numScoreComponents)
      {
 	poolCounts = false;
-	num_feats  = calc_pfwd_dyn.init(num_feats,m_lbop_parameter);
-	num_feats  = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
+	num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter,m_pfwd_denom);
+	num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
      }

    if (num_feats != this->m_numScoreComponents)