Mmsapt/ug_bitext:

- Sampling now uses a static seed for consistency between runs.
  (In order to have consistency guarantees, specify workers=1 in the
  Mmsapt line in moses.ini; otherwise the randomness of thread scheduling may
  cause inconsistencies between runs.)
- Denominator for fwd phrase probabilities can now be chosen via an
  external argument.
This commit is contained in:
Ulrich Germann 2014-05-31 14:29:44 +01:00
parent 0efea15dbe
commit c452875343
3 changed files with 22 additions and 15 deletions

View File

@ -1,4 +1,4 @@
//-*- c++-mode -*- //-*- c++ -*-
#include "ug_bitext.h" #include "ug_bitext.h"
#include <algorithm> #include <algorithm>

View File

@ -25,6 +25,7 @@
#include <boost/unordered_map.hpp> #include <boost/unordered_map.hpp>
#include <boost/foreach.hpp> #include <boost/foreach.hpp>
#include <boost/thread.hpp> #include <boost/thread.hpp>
#include <boost/random.hpp>
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h" #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h" #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
@ -220,7 +221,7 @@ namespace Moses {
PScorePfwd : public PhraseScorer<Token> PScorePfwd : public PhraseScorer<Token>
{ {
float conf; float conf;
int denom; char denom;
public: public:
PScorePfwd() PScorePfwd()
{ {
@ -228,7 +229,7 @@ namespace Moses {
} }
int int
init(int const i, float const c, int d=0) init(int const i, float const c, char d=0)
{ {
conf = c; conf = c;
denom = d; denom = d;
@ -249,13 +250,13 @@ namespace Moses {
} }
switch (denom) switch (denom)
{ {
case 0: case 'g':
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf)); (*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
break; break;
case 1: case 's':
(*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf)); (*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf));
break; break;
case 2: case 'r':
(*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf)); (*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf));
} }
} }
@ -281,10 +282,11 @@ namespace Moses {
} }
void void
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const operator()(Bitext<Token> const& bt, PhrasePair& pp,
vector<float> * dest = NULL) const
{ {
if (!dest) dest = &pp.fvals; if (!dest) dest = &pp.fvals;
(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf)); (*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint),pp.joint,conf));
} }
}; };
@ -551,8 +553,10 @@ namespace Moses {
class job class job
{ {
static ThreadSafeCounter active; static ThreadSafeCounter active;
boost::mutex lock; boost::mutex lock;
friend class agenda; friend class agenda;
boost::taus88 rnd; // every job has its own pseudo random generator
double rnddenom; // denominator for scaling random sampling
public: public:
size_t workers; // how many workers are working on this job? size_t workers; // how many workers are working on this job?
sptr<TSA<Token> const> root; // root of the underlying suffix array sptr<TSA<Token> const> root; // root of the underlying suffix array
@ -560,7 +564,8 @@ namespace Moses {
char const* stop; // end of index range char const* stop; // end of index range
size_t max_samples; // how many samples to extract at most size_t max_samples; // how many samples to extract at most
size_t ctr; /* # of phrase occurrences considered so far size_t ctr; /* # of phrase occurrences considered so far
* # of samples chosen is stored in stats->good */ * # of samples chosen is stored in stats->good
*/
size_t len; // phrase length size_t len; // phrase length
bool fwd; // if true, source phrase is L1 bool fwd; // if true, source phrase is L1
sptr<pstats> stats; // stores statistics collected during sampling sptr<pstats> stats; // stores statistics collected during sampling
@ -622,7 +627,7 @@ namespace Moses {
{ {
boost::lock_guard<boost::mutex> sguard(stats->lock); boost::lock_guard<boost::mutex> sguard(stats->lock);
if (stats->raw_cnt == ctr) ++stats->raw_cnt; if (stats->raw_cnt == ctr) ++stats->raw_cnt;
size_t rnum = randInt(stats->raw_cnt - ctr++); size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.));
if (rnum < max_samples - stats->good) if (rnum < max_samples - stats->good)
{ {
stats->sample_cnt++; stats->sample_cnt++;
@ -770,7 +775,9 @@ namespace Moses {
job:: job::
job(typename TSA<Token>::tree_iterator const& m, job(typename TSA<Token>::tree_iterator const& m,
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd) sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
: workers(0) : rnd(0)
, rnddenom(rnd.max() + 1.)
, workers(0)
, root(r) , root(r)
, next(m.lower_bound(-1)) , next(m.lower_bound(-1))
, stop(m.upper_bound(-1)) , stop(m.upper_bound(-1))

View File

@ -139,15 +139,15 @@ namespace Moses
size_t num_feats; size_t num_feats;
// TO DO: should we use different lbop parameters // TO DO: should we use different lbop parameters
// for the relative-frequency based features? // for the relative-frequency based features?
num_feats = calc_pfwd_fix.init(0,m_lbop_parameter); num_feats = calc_pfwd_fix.init(0,m_lbop_parameter,m_pfwd_denom);
num_feats = calc_pbwd_fix.init(num_feats,m_lbop_parameter); num_feats = calc_pbwd_fix.init(num_feats,m_lbop_parameter);
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex"); num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
num_feats = apply_pp.init(num_feats); num_feats = apply_pp.init(num_feats);
if (num_feats < this->m_numScoreComponents) if (num_feats < this->m_numScoreComponents)
{ {
poolCounts = false; poolCounts = false;
num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter); num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter,m_pfwd_denom);
num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter); num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
} }
if (num_feats != this->m_numScoreComponents) if (num_feats != this->m_numScoreComponents)