mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 21:42:19 +03:00
Mmsapt/ug_bitext:
- Sampling now uses a static seed for consistency between runs. (In order to have consistency guarantees, specify workers=1 in the Mmsapt line in moses.ini; otherwise the randomness of thread scheduling may cause inconsistencies between runs.)
- Denominator for fwd phrase probabilities can now be chosen via an external argument.
This commit is contained in:
parent
0efea15dbe
commit
c452875343
@ -1,4 +1,4 @@
|
|||||||
//-*- c++-mode -*-
|
//-*- c++ -*-
|
||||||
|
|
||||||
#include "ug_bitext.h"
|
#include "ug_bitext.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
#include <boost/unordered_map.hpp>
|
#include <boost/unordered_map.hpp>
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <boost/thread.hpp>
|
#include <boost/thread.hpp>
|
||||||
|
#include <boost/random.hpp>
|
||||||
|
|
||||||
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
|
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
|
||||||
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
|
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
|
||||||
@ -220,7 +221,7 @@ namespace Moses {
|
|||||||
PScorePfwd : public PhraseScorer<Token>
|
PScorePfwd : public PhraseScorer<Token>
|
||||||
{
|
{
|
||||||
float conf;
|
float conf;
|
||||||
int denom;
|
char denom;
|
||||||
public:
|
public:
|
||||||
PScorePfwd()
|
PScorePfwd()
|
||||||
{
|
{
|
||||||
@ -228,7 +229,7 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
init(int const i, float const c, int d=0)
|
init(int const i, float const c, char d=0)
|
||||||
{
|
{
|
||||||
conf = c;
|
conf = c;
|
||||||
denom = d;
|
denom = d;
|
||||||
@ -249,13 +250,13 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
switch (denom)
|
switch (denom)
|
||||||
{
|
{
|
||||||
case 0:
|
case 'g':
|
||||||
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
|
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 's':
|
||||||
(*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf));
|
(*dest)[this->index] = log(lbop(pp.sample1, pp.joint, conf));
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 'r':
|
||||||
(*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf));
|
(*dest)[this->index] = log(lbop(pp.raw1, pp.joint, conf));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -281,7 +282,8 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
|
operator()(Bitext<Token> const& bt, PhrasePair& pp,
|
||||||
|
vector<float> * dest = NULL) const
|
||||||
{
|
{
|
||||||
if (!dest) dest = &pp.fvals;
|
if (!dest) dest = &pp.fvals;
|
||||||
(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint),pp.joint,conf));
|
(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint),pp.joint,conf));
|
||||||
@ -553,6 +555,8 @@ namespace Moses {
|
|||||||
static ThreadSafeCounter active;
|
static ThreadSafeCounter active;
|
||||||
boost::mutex lock;
|
boost::mutex lock;
|
||||||
friend class agenda;
|
friend class agenda;
|
||||||
|
boost::taus88 rnd; // every job has its own pseudo random generator
|
||||||
|
double rnddenom; // denominator for scaling random sampling
|
||||||
public:
|
public:
|
||||||
size_t workers; // how many workers are working on this job?
|
size_t workers; // how many workers are working on this job?
|
||||||
sptr<TSA<Token> const> root; // root of the underlying suffix array
|
sptr<TSA<Token> const> root; // root of the underlying suffix array
|
||||||
@ -560,7 +564,8 @@ namespace Moses {
|
|||||||
char const* stop; // end of index range
|
char const* stop; // end of index range
|
||||||
size_t max_samples; // how many samples to extract at most
|
size_t max_samples; // how many samples to extract at most
|
||||||
size_t ctr; /* # of phrase occurrences considered so far
|
size_t ctr; /* # of phrase occurrences considered so far
|
||||||
* # of samples chosen is stored in stats->good */
|
* # of samples chosen is stored in stats->good
|
||||||
|
*/
|
||||||
size_t len; // phrase length
|
size_t len; // phrase length
|
||||||
bool fwd; // if true, source phrase is L1
|
bool fwd; // if true, source phrase is L1
|
||||||
sptr<pstats> stats; // stores statistics collected during sampling
|
sptr<pstats> stats; // stores statistics collected during sampling
|
||||||
@ -622,7 +627,7 @@ namespace Moses {
|
|||||||
{
|
{
|
||||||
boost::lock_guard<boost::mutex> sguard(stats->lock);
|
boost::lock_guard<boost::mutex> sguard(stats->lock);
|
||||||
if (stats->raw_cnt == ctr) ++stats->raw_cnt;
|
if (stats->raw_cnt == ctr) ++stats->raw_cnt;
|
||||||
size_t rnum = randInt(stats->raw_cnt - ctr++);
|
size_t rnum = (stats->raw_cnt - ctr++)*(rnd()/(rnd.max()+1.));
|
||||||
if (rnum < max_samples - stats->good)
|
if (rnum < max_samples - stats->good)
|
||||||
{
|
{
|
||||||
stats->sample_cnt++;
|
stats->sample_cnt++;
|
||||||
@ -770,7 +775,9 @@ namespace Moses {
|
|||||||
job::
|
job::
|
||||||
job(typename TSA<Token>::tree_iterator const& m,
|
job(typename TSA<Token>::tree_iterator const& m,
|
||||||
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
|
sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd)
|
||||||
: workers(0)
|
: rnd(0)
|
||||||
|
, rnddenom(rnd.max() + 1.)
|
||||||
|
, workers(0)
|
||||||
, root(r)
|
, root(r)
|
||||||
, next(m.lower_bound(-1))
|
, next(m.lower_bound(-1))
|
||||||
, stop(m.upper_bound(-1))
|
, stop(m.upper_bound(-1))
|
||||||
|
@ -139,14 +139,14 @@ namespace Moses
|
|||||||
size_t num_feats;
|
size_t num_feats;
|
||||||
// TO DO: should we use different lbop parameters
|
// TO DO: should we use different lbop parameters
|
||||||
// for the relative-frequency based features?
|
// for the relative-frequency based features?
|
||||||
num_feats = calc_pfwd_fix.init(0,m_lbop_parameter);
|
num_feats = calc_pfwd_fix.init(0,m_lbop_parameter,m_pfwd_denom);
|
||||||
num_feats = calc_pbwd_fix.init(num_feats,m_lbop_parameter);
|
num_feats = calc_pbwd_fix.init(num_feats,m_lbop_parameter);
|
||||||
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
|
num_feats = calc_lex.init(num_feats, bname + L1 + "-" + L2 + ".lex");
|
||||||
num_feats = apply_pp.init(num_feats);
|
num_feats = apply_pp.init(num_feats);
|
||||||
if (num_feats < this->m_numScoreComponents)
|
if (num_feats < this->m_numScoreComponents)
|
||||||
{
|
{
|
||||||
poolCounts = false;
|
poolCounts = false;
|
||||||
num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter);
|
num_feats = calc_pfwd_dyn.init(num_feats,m_lbop_parameter,m_pfwd_denom);
|
||||||
num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
|
num_feats = calc_pbwd_dyn.init(num_feats,m_lbop_parameter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user