diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc index 4b4e3ed7c..2fa9a49f5 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc @@ -32,8 +32,8 @@ namespace sapt indoc = other.indoc; for (int i = 0; i <= LRModel::NONE; i++) { - ofwd[i] = other.ofwd[i]; - obwd[i] = other.obwd[i]; + ofwd[i] = other.ofwd[i]; + obwd[i] = other.obwd[i]; } } @@ -53,7 +53,7 @@ namespace sapt return obwd[idx]; } - void + size_t jstats:: add(float w, float b, std::vector const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient, int const docid) @@ -65,24 +65,25 @@ namespace sapt my_bcnt += b; if (a.size()) { - size_t i = 0; - while (i < my_aln.size() && my_aln[i].second != a) ++i; - if (i == my_aln.size()) - my_aln.push_back(std::pair >(1,a)); - else - my_aln[i].first++; - if (my_aln[i].first > my_aln[i/2].first) - push_heap(my_aln.begin(),my_aln.begin()+i+1); + size_t i = 0; + while (i < my_aln.size() && my_aln[i].second != a) ++i; + if (i == my_aln.size()) + my_aln.push_back(std::pair >(1,a)); + else + my_aln[i].first++; + if (my_aln[i].first > my_aln[i/2].first) + push_heap(my_aln.begin(),my_aln.begin()+i+1); } ++ofwd[fwd_orient]; ++obwd[bwd_orient]; if (docid >= 0) { - // while (int(indoc.size()) <= docid) indoc.push_back(0); - ++indoc[docid]; + // while (int(indoc.size()) <= docid) indoc.push_back(0); + ++indoc[docid]; } + return my_rcnt; } - + std::vector > > const& jstats:: aln() const diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h index dd82a79e3..d8e0bb18a 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h @@ -39,7 +39,7 @@ namespace sapt std::vector > > const & aln() const; - void + size_t add(float w, float b, std::vector const& a, uint32_t const cnt2, uint32_t fwd_orient, uint32_t bwd_orient, int const docid); 
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc index 998932023..f1602ab96 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc @@ -63,7 +63,7 @@ namespace sapt } } - bool + size_t pstats:: add(uint64_t pid, float const w, float const b, std::vector const& a, @@ -73,13 +73,13 @@ namespace sapt { boost::lock_guard guard(this->lock); jstats& entry = this->trg[pid]; - entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid); + size_t ret = entry.add(w, b, a, cnt2, fwd_o, bwd_o, docid); if (this->good < entry.rcnt()) { - UTIL_THROW(util::Exception, "more joint counts than good counts:" - << entry.rcnt() << "/" << this->good << "!"); + UTIL_THROW(util::Exception, "more joint counts than good counts:" + << entry.rcnt() << "/" << this->good << "!"); } - return true; + return ret; } void diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h index 0bc31ad1c..cdc4f0c3d 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h @@ -41,7 +41,7 @@ namespace sapt void register_worker(); size_t count_workers() { return in_progress; } - bool + size_t add(uint64_t const pid, // target phrase id float const w, // sample weight (1./(# of phrases extractable)) float const b, // sample bias score diff --git a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h index fd70cafff..8e7399c6c 100644 --- a/moses/TranslationModel/UG/mm/ug_bitext_sampler.h +++ b/moses/TranslationModel/UG/mm/ug_bitext_sampler.h @@ -45,7 +45,7 @@ BitextSampler : public Moses::reference_counter { typedef Bitext bitext; typedef TSA tsa; - typedef SamplingBias bias; + typedef SamplingBias bias_t; typedef typename Bitext::iter tsa_iter; mutable boost::condition_variable m_ready; mutable boost::mutex m_lock; @@ -59,7 +59,7 @@ BitextSampler : 
public Moses::reference_counter char const* m_next; // current position char const* m_stop; // end of search range sampling_method const m_method; // look at all/random/ranked samples - SPTR const m_bias; // bias over candidates + SPTR const m_bias; // bias over candidates size_t const m_samples; // how many samples at most size_t const m_min_samples; // non-const members @@ -67,20 +67,20 @@ BitextSampler : public Moses::reference_counter size_t m_ctr; // number of samples considered float m_total_bias; // for random sampling with bias bool m_finished; - + size_t m_num_occurrences; // estimated number of phrase occurrences in corpus boost::taus88 m_rnd; // every job has its own pseudo random generator - // double m_rnd_denom; // denominator for scaling random sampling double m_bias_total; - bool consider_sample(TokenPosition const& p); + size_t consider_sample(TokenPosition const& p); size_t perform_random_sampling(); + size_t perform_full_phrase_extraction(); int check_sample_distribution(uint64_t const& sid, uint64_t const& offset); - bool flip_coin(id_type & sid, ushort & offset); + bool flip_coin(id_type const& sid, ushort const& offset, SamplingBias const* bias); public: BitextSampler(BitextSampler const& other); - BitextSampler const& operator=(BitextSampler const& other); + // BitextSampler const& operator=(BitextSampler const& other); BitextSampler(SPTR const& bitext, typename bitext::iter const& phrase, SPTR const& bias, @@ -159,9 +159,9 @@ check_sample_distribution(uint64_t const& sid, uint64_t const& offset) template bool BitextSampler:: -flip_coin(id_type & sid, ushort & offset) +flip_coin(id_type const& sid, ushort const& offset, bias_t const* bias) { - int no_maybe_yes = m_bias ? check_sample_distribution(sid, offset) : 1; + int no_maybe_yes = bias ? check_sample_distribution(sid, offset) : 1; if (no_maybe_yes == 0) return false; // no if (no_maybe_yes > 1) return true; // yes // ... 
maybe: flip a coin @@ -170,8 +170,8 @@ flip_coin(id_type & sid, ushort & offset) size_t options_left = (options_total - m_ctr); size_t random_number = options_left * (m_rnd()/(m_rnd.max()+1.)); size_t threshold; - if (m_bias_total) // we have a bias and there are candidates with non-zero prob - threshold = ((*m_bias)[sid]/m_bias_total * options_total * m_samples); + if (bias && m_bias_total > 0) // we have a bias and there are candidates with non-zero prob + threshold = ((*bias)[sid]/m_bias_total * options_total * m_samples); else // no bias, or all have prob 0 (can happen with a very opinionated bias) threshold = m_samples; return random_number + options_chosen < threshold; @@ -199,13 +199,12 @@ BitextSampler(SPTR const> const& bitext, , m_ctr(0) , m_total_bias(0) , m_finished(false) + , m_num_occurrences(phrase.ca()) , m_rnd(0) - // , m_rnd_denom(m_rnd.max() + 1) { m_stats.reset(new pstats); m_stats->raw_cnt = phrase.ca(); m_stats->register_worker(); - // cerr << phrase.str(bitext->V1.get()) << " [" << HERE << "]" << endl; } template @@ -221,8 +220,8 @@ BitextSampler(BitextSampler const& other) , m_bias(other.m_bias) , m_samples(other.m_samples) , m_min_samples(other.m_min_samples) + , m_num_occurrences(other.m_num_occurrences) , m_rnd(0) - // , m_rnd_denom(m_rnd.max() + 1) { // lock both instances boost::unique_lock mylock(m_lock); @@ -235,6 +234,23 @@ BitextSampler(BitextSampler const& other) m_finished = other.m_finished; } +// Full coverage: consider every occurrence of the phrase (no sampling) +template +size_t +BitextSampler:: +perform_full_phrase_extraction() +{ + if (m_next == m_stop) return m_ctr; + for (sapt::tsa::ArrayEntry I(m_next); I.next < m_stop; ) + { + ++m_ctr; // count each occurrence exactly once + m_root->readEntry(I.next, I); + consider_sample(I); + } + return m_ctr; +} + + // Uniform sampling template size_t @@ -260,14 +276,14 @@ perform_random_sampling() { ++m_ctr; m_root->readEntry(I.next,I); - if (!flip_coin(I.sid, I.offset)) continue; + if (!flip_coin(I.sid, I.offset, m_bias.get())) continue; consider_sample(I); } 
return m_ctr; } template -bool +size_t BitextSampler:: consider_sample(TokenPosition const& p) { @@ -279,7 +295,7 @@ consider_sample(TokenPosition const& p) if (!m_bitext->find_trg_phr_bounds(rec)) { // no good, probably because phrase is not coherent m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd); - return false; + return 0; } // all good: register this sample as valid @@ -300,6 +316,7 @@ consider_sample(TokenPosition const& p) // pair once per source phrase occurrence, or else run the risk of // having more joint counts than marginal counts. + size_t max_evidence = 0; for (size_t s = rec.s1; s <= rec.s2; ++s) { TSA const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1; @@ -313,8 +330,10 @@ consider_sample(TokenPosition const& p) continue; // don't over-count seen.push_back(tpid); size_t raw2 = b->approxOccurrenceCount(); - m_stats->add(tpid, sample_weight, m_bias ? (*m_bias)[p.sid] : 1, - aln, raw2, rec.po_fwd, rec.po_bwd, docid); + size_t evid = m_stats->add(tpid, sample_weight, + m_bias ? 
(*m_bias)[p.sid] : 1, + aln, raw2, rec.po_fwd, rec.po_bwd, docid); + max_evidence = std::max(max_evidence, evid); bool ok = (i == rec.e2) || b->extend(o[i].id()); UTIL_THROW_IF2(!ok, "Could not extend target phrase."); } @@ -322,7 +341,7 @@ consider_sample(TokenPosition const& p) for (size_t k = 1; k < aln.size(); k += 2) --aln[k]; } - return true; + return max_evidence; } #ifndef MMT @@ -333,7 +352,9 @@ operator()() { if (m_finished) return true; boost::unique_lock lock(m_lock); - if (m_method == random_sampling) + if (m_method == full_coverage) + perform_full_phrase_extraction(); // consider all occurrences + else if (m_method == random_sampling) perform_random_sampling(); else UTIL_THROW2("Unsupported sampling method."); m_finished = true; diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h index 91f0aaad6..35c7e1fa9 100644 --- a/moses/TranslationModel/UG/sapt_pscore_pbwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h @@ -26,7 +26,7 @@ namespace sapt BOOST_FOREACH(char const& x, denom) { if (x == '+') { --checksum; continue; } - if (x != 'g' && x != 's' && x != 'r') continue; + if (x != 'g' && x != 's' && x != 'r' && x != 'b') continue; std::string s = (boost::format("pbwd-%c%.3f") % x % c).str(); this->m_feature_names.push_back(s); } @@ -48,9 +48,12 @@ namespace sapt BOOST_FOREACH(char const& x, denom) { uint32_t m2 = pp.raw2; - if (x == 'g') m2 = round(m2 * float(pp.good1) / pp.raw1); + if (x == 'g' || x == 'b') m2 = round(m2 * float(pp.good1) / pp.raw1); else if (x == 's') m2 = round(m2 * float(pp.sample1) / pp.raw1); - (*dest)[i++] = log(lbop(std::max(m2, pp.joint), pp.joint,conf)); + + (*dest)[i] = log(lbop(std::max(m2, pp.joint), pp.joint,conf)); + if (x == 'b') (*dest)[i] += log(pp.cum_bias) - log(pp.joint); + ++i; } } }; diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h index b42bb464e..bfa8027d1 100644 --- 
a/moses/TranslationModel/UG/sapt_pscore_pfwd.h +++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h @@ -27,7 +27,7 @@ namespace sapt BOOST_FOREACH(char const& x, denom) { if (x == '+') { --checksum; continue; } - if (x != 'g' && x != 's' && x != 'r') continue; + if (x != 'g' && x != 's' && x != 'r' && x != 'b') continue; std::string s = (boost::format("pfwd-%c%.3f") % x % c).str(); this->m_feature_names.push_back(s); } @@ -49,12 +49,16 @@ namespace sapt // cerr<m_index; + float g = log(lbop(pp.good1, pp.joint, conf)); BOOST_FOREACH(char const& c, this->denom) { switch (c) { + case 'b': + (*dest)[i++] = g + log(pp.cum_bias) - log(pp.joint); + break; case 'g': - (*dest)[i++] = log(lbop(pp.good1, pp.joint, conf)); + (*dest)[i++] = g; break; case 's': (*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf)); diff --git a/regtest b/regtest index f69e79f5f..e07a00c97 160000 --- a/regtest +++ b/regtest @@ -1 +1 @@ -Subproject commit f69e79f5fc92d993354fa775de197b029d321175 +Subproject commit e07a00c9733e0fecb8433f1c9d5805d3f0b35c6f