From c4528753433dc6a8be5c50676504e1f68724f275 Mon Sep 17 00:00:00 2001 From: Ulrich Germann Date: Sat, 31 May 2014 14:29:44 +0100 Subject: [PATCH] Mmsapt/ug_bitext: - Sampling now uses a static seed for consistency between runs. (In order to have consistency guarantees, specify workers=1 in the Mmsapt line in moses.ini; otherwise the randomness of thread scheduling may cause inconsistencies between runs.) - Denominator for fwd phrase probabilities can now be chosen via an external argument. --- moses/TranslationModel/UG/mm/ug_bitext.cc | 2 +- moses/TranslationModel/UG/mm/ug_bitext.h | 29 ++++++++++++++--------- moses/TranslationModel/UG/mmsapt.cpp | 6 ++--- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc index 9e9cf86b9..e8e72dba8 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext.cc @@ -1,4 +1,4 @@ -//-*- c++-mode -*- +//-*- c++ -*- #include "ug_bitext.h" #include diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h index 48a84f577..a664c51f7 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext.h +++ b/moses/TranslationModel/UG/mm/ug_bitext.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" #include "moses/TranslationModel/UG/generic/sampling/Sampling.h" @@ -220,7 +221,7 @@ namespace Moses { PScorePfwd : public PhraseScorer { float conf; - int denom; + char denom; public: PScorePfwd() { @@ -228,7 +229,7 @@ namespace Moses { } int - init(int const i, float const c, int d=0) + init(int const i, float const c, char d=0) { conf = c; denom = d; @@ -249,13 +250,13 @@ namespace Moses { } switch (denom) { - case 0: + case 'g': (*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf)); break; - case 1: + case 's': (*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf)); break; - case 2: + case 
'r': (*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf)); } } @@ -281,10 +282,11 @@ namespace Moses { } void - operator()(Bitext const& bt, PhrasePair& pp, vector * dest = NULL) const + operator()(Bitext const& bt, PhrasePair& pp, + vector * dest = NULL) const { if (!dest) dest = &pp.fvals; - (*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf)); + (*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint),pp.joint,conf)); } }; @@ -551,8 +553,10 @@ namespace Moses { class job { static ThreadSafeCounter active; - boost::mutex lock; + boost::mutex lock; friend class agenda; + boost::taus88 rnd; // every job has its own pseudo random generator + double rnddenom; // denominator for scaling random sampling public: size_t workers; // how many workers are working on this job? sptr const> root; // root of the underlying suffix array @@ -560,7 +564,8 @@ namespace Moses { char const* stop; // end of index range size_t max_samples; // how many samples to extract at most size_t ctr; /* # of phrase occurrences considered so far - * # of samples chosen is stored in stats->good */ + * # of samples chosen is stored in stats->good + */ size_t len; // phrase length bool fwd; // if true, source phrase is L1 sptr stats; // stores statistics collected during sampling @@ -622,7 +627,7 @@ namespace Moses { { boost::lock_guard sguard(stats->lock); if (stats->raw_cnt == ctr) ++stats->raw_cnt; - size_t rnum = randInt(stats->raw_cnt - ctr++); + size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.)); if (rnum < max_samples - stats->good) { stats->sample_cnt++; @@ -770,7 +775,9 @@ namespace Moses { job:: job(typename TSA::tree_iterator const& m, sptr > const& r, size_t maxsmpl, bool isfwd) - : workers(0) + : rnd(0) + , rnddenom(rnd.max() + 1.) 
+ , workers(0) , root(r) , next(m.lower_bound(-1)) , stop(m.upper_bound(-1)) diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp index 0d951b882..09912b889 100644 --- a/moses/TranslationModel/UG/mmsapt.cpp +++ b/moses/TranslationModel/UG/mmsapt.cpp @@ -139,15 +139,15 @@ namespace Moses size_t num_feats; // TO DO: should we use different lbop parameters // for the relative-frequency based features? - num_feats = calc_pfwd_fix.init(0,m_lbop_parameter); + num_feats = calc_pfwd_fix.init(0,m_lbop_parameter,m_pfwd_denom); num_feats = calc_pbwd_fix.init(num_feats,m_lbop_parameter); num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex"); num_feats = apply_pp.init(num_feats); if (num_feats < this->m_numScoreComponents) { poolCounts = false; - num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter); - num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter); + num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter,m_pfwd_denom); + num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter); } if (num_feats != this->m_numScoreComponents)