Mmsapt/ug_bitext:

- Sampling now uses a static seed for consistency between runs.
  (In order to have consistency guarantees, specify workers=1 in the
  Mmsapt line in moses.ini; otherwise the randomness of thread scheduling may
  cause inconsistencies between runs.)
- Denominator for fwd phrase probabilities can now be chosen via an
  external argument.
This commit is contained in:
Ulrich Germann 2014-05-31 14:29:44 +01:00
parent 0efea15dbe
commit c452875343
3 changed files with 22 additions and 15 deletions

View File

@ -1,4 +1,4 @@
//-*- c++-mode -*-
//-*- c++ -*-
#include "ug_bitext.h"
#include <algorithm>

View File

@ -25,6 +25,7 @@
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include <boost/thread.hpp>
#include <boost/random.hpp>
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
@ -220,7 +221,7 @@ namespace Moses {
PScorePfwd : public PhraseScorer<Token>
{
float conf;
int denom;
char denom;
public:
PScorePfwd()
{
@ -228,7 +229,7 @@ namespace Moses {
}
int
init(int const i, float const c, int d=0)
init(int const i, float const c, char d=0)
{
conf = c;
denom = d;
@ -249,13 +250,13 @@ namespace Moses {
}
switch (denom)
{
case 0:
case 'g':
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
break;
case 1:
case 's':
(*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf));
break;
case 2:
case 'r':
(*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf));
}
}
@ -281,10 +282,11 @@ namespace Moses {
}
void
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
operator()(Bitext<Token> const& bt, PhrasePair& pp,
vector<float> * dest = NULL) const
{
if (!dest) dest = &pp.fvals;
(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint),pp.joint,conf));
}
};
@ -551,8 +553,10 @@ namespace Moses {
class job
{
static ThreadSafeCounter active;
boost::mutex lock;
boost::mutex lock;
friend class agenda;
boost::taus88 rnd; // every job has its own pseudo random generator
double rnddenom; // denominator for scaling random sampling
public:
size_t workers; // how many workers are working on this job?
sptr<TSA<Token> const> root; // root of the underlying suffix array
@ -560,7 +564,8 @@ namespace Moses {
char const* stop; // end of index range
size_t max_samples; // how many samples to extract at most
size_t ctr; /* # of phrase occurrences considered so far
* # of samples chosen is stored in stats->good */
* # of samples chosen is stored in stats->good
*/
size_t len; // phrase length
bool fwd; // if true, source phrase is L1
sptr<pstats> stats; // stores statistics collected during sampling
@ -622,7 +627,7 @@ namespace Moses {
{
boost::lock_guard<boost::mutex> sguard(stats->lock);
if (stats->raw_cnt == ctr) ++stats->raw_cnt;
size_t rnum = randInt(stats->raw_cnt - ctr++);
size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.));
if (rnum < max_samples - stats->good)
{
stats->sample_cnt++;
@ -770,7 +775,9 @@ namespace Moses {
job::
job(typename TSA<Token>::tree_iterator const& m,
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
: workers(0)
: rnd(0)
, rnddenom(rnd.max() + 1.)
, workers(0)
, root(r)
, next(m.lower_bound(-1))
, stop(m.upper_bound(-1))

View File

@ -139,15 +139,15 @@ namespace Moses
size_t num_feats;
// TO DO: should we use different lbop parameters
// for the relative-frequency based features?
num_feats = calc_pfwd_fix.init(0,m_lbop_parameter);
num_feats = calc_pfwd_fix.init(0,m_lbop_parameter,m_pfwd_denom);
num_feats = calc_pbwd_fix.init(num_feats,m_lbop_parameter);
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
num_feats = apply_pp.init(num_feats);
if (num_feats < this->m_numScoreComponents)
{
poolCounts = false;
num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter);
num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter,m_pfwd_denom);
num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
}
if (num_feats != this->m_numScoreComponents)