From c4528753433dc6a8be5c50676504e1f68724f275 Mon Sep 17 00:00:00 2001
From: Ulrich Germann <ugermann@inf.ed.ac.uk>
Date: Sat, 31 May 2014 14:29:44 +0100
Subject: [PATCH] Mmsapt/ug_bitext: - Sampling now uses a static seed for
 consistency between runs. (In order to have consistency guarantees, specify
 workers=1 in the Mmsapt line in moses.ini; otherwise the randomness of
 thread scheduling may cause inconsistencies between runs.) - Denominator for
 fwd phrase probabilities can now be chosen via an external argument.

---
 moses/TranslationModel/UG/mm/ug_bitext.cc |  2 +-
 moses/TranslationModel/UG/mm/ug_bitext.h  | 29 ++++++++++++++---------
 moses/TranslationModel/UG/mmsapt.cpp      |  6 ++---
 3 files changed, 22 insertions(+), 15 deletions(-)
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index 9e9cf86b9..e8e72dba8 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -1,4 +1,4 @@
-//-*- c++-mode -*-
+//-*- c++ -*-
 
 #include "ug_bitext.h"
 #include <algorithm>
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 48a84f577..a664c51f7 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -25,6 +25,7 @@
 #include <boost/unordered_map.hpp>
 #include <boost/foreach.hpp>
 #include <boost/thread.hpp>
+#include <boost/random.hpp>
 
 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
 #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
@@ -220,7 +221,7 @@ namespace Moses {
     PScorePfwd : public PhraseScorer<Token>
     {
       float conf;
-      int denom;
+      char denom;
     public:
       PScorePfwd() 
       {
@@ -228,7 +229,7 @@ namespace Moses {
       }
 
       int 
-      init(int const i, float const c, int d=0) 
+      init(int const i, float const c, char d=0) 
       { 
 	conf  = c; 
 	denom = d;
@@ -249,13 +250,13 @@ namespace Moses {
 	  }
 	switch (denom)
 	  {
-	  case 0: 
+	  case 'g': 
 	    (*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf)); 
 	    break;
-	  case 1: 
+	  case 's': 
 	    (*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf)); 
 	    break;
-	  case 2:
+	  case 'r':
 	    (*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf)); 
 	  }
       }
@@ -281,10 +282,11 @@ namespace Moses {
       }
 
       void 
-      operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
+      operator()(Bitext<Token> const& bt, PhrasePair& pp, 
+		 vector<float> * dest = NULL) const
       {
 	if (!dest) dest = &pp.fvals;
-	(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
+	(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint),pp.joint,conf));
       }
     };
 
@@ -551,8 +553,10 @@ namespace Moses {
       class job 
       {
 	static ThreadSafeCounter active;
-	boost::mutex      lock; 
+	boost::mutex lock; 
 	friend class agenda;
+	boost::taus88 rnd; // every job has its own pseudo random generator 
+	double rnddenom;   // denominator for scaling random sampling
       public:
 	size_t         workers; // how many workers are working on this job?
 	sptr<TSA<Token> const> root; // root of the underlying suffix array
@@ -560,7 +564,8 @@ namespace Moses {
 	char const*       stop; // end of index range
 	size_t     max_samples; // how many samples to extract at most
 	size_t             ctr; /* # of phrase occurrences considered so far
-				 * # of samples chosen is stored in stats->good */
+				 * # of samples chosen is stored in stats->good 
+				 */
 	size_t             len; // phrase length
 	bool               fwd; // if true, source phrase is L1 
 	sptr<pstats>     stats; // stores statistics collected during sampling
@@ -622,7 +627,7 @@ namespace Moses {
 	      {
 		boost::lock_guard<boost::mutex> sguard(stats->lock);
 		if (stats->raw_cnt == ctr) ++stats->raw_cnt;
-		size_t rnum = randInt(stats->raw_cnt - ctr++);
+		size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.));
 		if (rnum < max_samples - stats->good)
 		  {
 		    stats->sample_cnt++;
@@ -770,7 +775,9 @@ namespace Moses {
     job::
     job(typename TSA<Token>::tree_iterator const& m, 
 	sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
-      : workers(0)
+      : rnd(0)
+      , rnddenom(rnd.max() + 1.)
+      , workers(0)
       , root(r)
       , next(m.lower_bound(-1))
       , stop(m.upper_bound(-1))
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index 0d951b882..09912b889 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -139,15 +139,15 @@ namespace Moses
     size_t num_feats;
     // TO DO: should we use different lbop parameters 
     //        for the relative-frequency based features?
-    num_feats  = calc_pfwd_fix.init(0,m_lbop_parameter);
+    num_feats  = calc_pfwd_fix.init(0,m_lbop_parameter,m_pfwd_denom);
     num_feats  = calc_pbwd_fix.init(num_feats,m_lbop_parameter);
     num_feats  = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
     num_feats  = apply_pp.init(num_feats);
     if (num_feats < this->m_numScoreComponents)
       {
 	poolCounts = false;
-	num_feats  = calc_pfwd_dyn.init(num_feats,m_lbop_parameter);
-	num_feats  = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
+	num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter,m_pfwd_denom);
+	num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
       }
 
     if (num_feats != this->m_numScoreComponents)