mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-24 20:32:59 +03:00
Initial check-in.
This commit is contained in:
parent
8f4b2afe26
commit
e8ee56876e
@ -0,0 +1,25 @@
|
||||
// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*-
|
||||
#include <vector>
|
||||
#include "ug_typedefs.h"
|
||||
|
||||
namespace Moses
|
||||
{
|
||||
namespace bitext
|
||||
{
|
||||
struct PhraseExtractionRecord
|
||||
{
|
||||
size_t const sid, start, stop;
|
||||
bool const flip; // 'backward' lookup from L2
|
||||
size_t s1, s2, e1, e2; // soft and hard boundaries of target phrase
|
||||
int po_fwd, po_bwd; // fwd and bwd phrase orientation
|
||||
std::vector<uchar>* aln; // local alignments
|
||||
bitvector* full_aln; // full word alignment for sentence
|
||||
|
||||
PhraseExtractionRecord(size_t const xsid, size_t const xstart,
|
||||
size_t const xstop, bool const xflip,
|
||||
std::vector<uchar>* xaln, bitvector* xfull_aln = NULL)
|
||||
: sid(xsid), start(xstart), stop(xstop), flip(xflip)
|
||||
, aln(xaln), full_aln(xfull_aln) { }
|
||||
};
|
||||
}
|
||||
}
|
223
moses/TranslationModel/UG/mm/ug_bitext_sampler.h
Normal file
223
moses/TranslationModel/UG/mm/ug_bitext_sampler.h
Normal file
@ -0,0 +1,223 @@
|
||||
// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*-
|
||||
#pragma once
|
||||
#include <boost/thread.hpp>
|
||||
#include <boost/thread/locks.hpp>
|
||||
#include <boost/intrusive_ptr.hpp>
|
||||
#include "ug_bitext.h"
|
||||
#include "ug_bitext_pstats.h"
|
||||
#include "ug_sampling_bias.h"
|
||||
#include "ug_tsa_array_entry.h"
|
||||
#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
|
||||
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
|
||||
#include "moses/TranslationModel/UG/generic/sorting/NBestList.h"
|
||||
namespace Moses
|
||||
{
|
||||
namespace bitext
|
||||
{
|
||||
|
||||
enum sampling_method { full_coverage, random_sampling, ranked_sampling };
|
||||
|
||||
typedef ugdiss::ttrack::Position TokenPosition;
|
||||
class CandidateSorter
|
||||
{
|
||||
SamplingBias const& score;
|
||||
public:
|
||||
CandidateSorter(SamplingBias const& s) : score(s) {}
|
||||
bool operator()(TokenPosition const& a, TokenPosition const& b) const
|
||||
{ return score[a.sid] > score[b.sid]; }
|
||||
};
|
||||
|
||||
template<typename Token>
|
||||
class
|
||||
BitextSampler : public reference_counter
|
||||
{
|
||||
typedef Bitext<Token> bitext;
|
||||
typedef TSA<Token> tsa;
|
||||
typedef SamplingBias bias;
|
||||
typedef typename Bitext<Token>::iter tsa_iter;
|
||||
mutable boost::condition_variable m_ready;
|
||||
mutable boost::mutex m_lock;
|
||||
// const members
|
||||
// sptr<bitext const> const m_bitext; // keep bitext alive while I am
|
||||
// should be an
|
||||
iptr<bitext> const m_bitext; // keep bitext alive as long as I am
|
||||
size_t const m_plen; // length of lookup phrase
|
||||
bool const m_fwd; // forward or backward direction?
|
||||
sptr<tsa const> const m_root; // root of suffix array
|
||||
char const* m_next; // current position
|
||||
char const* m_stop; // end of search range
|
||||
sampling_method const m_method; /* look at all / random sample /
|
||||
* ranked samples */
|
||||
sptr<bias const> const m_bias; // bias over candidates
|
||||
size_t const m_samples; // how many samples at most
|
||||
// non-const members
|
||||
sptr<pstats> m_stats; // destination for phrase stats
|
||||
size_t m_ctr; // number of samples considered
|
||||
float m_total_bias; // for random sampling with bias
|
||||
bool m_finished;
|
||||
void consider_sample(TokenPosition const& p);
|
||||
size_t perform_ranked_sampling();
|
||||
|
||||
public:
|
||||
BitextSampler(bitext* const bitext, typename bitext::iter const& phrase,
|
||||
sptr<SamplingBias const> const& bias, size_t const max_samples,
|
||||
sampling_method const method);
|
||||
~BitextSampler();
|
||||
bool operator()(); // run sampling
|
||||
sptr<pstats> stats();
|
||||
bool done() const;
|
||||
};
|
||||
|
||||
template<typename Token>
|
||||
BitextSampler<Token>::
|
||||
BitextSampler(Bitext<Token>* const bitext,
|
||||
typename bitext::iter const& phrase,
|
||||
sptr<SamplingBias const> const& bias, size_t const max_samples,
|
||||
sampling_method const method)
|
||||
: m_bitext(bitext)
|
||||
, m_plen(phrase.size())
|
||||
, m_fwd(phrase.root == bitext->I1.get())
|
||||
, m_root(m_fwd ? bitext->I1 : bitext->I2)
|
||||
, m_next(phrase.lower_bound(-1))
|
||||
, m_stop(phrase.upper_bound(-1))
|
||||
, m_method(method)
|
||||
, m_bias(bias)
|
||||
, m_samples(max_samples)
|
||||
, m_ctr(0)
|
||||
, m_total_bias(0)
|
||||
, m_finished(false)
|
||||
{
|
||||
m_stats.reset(new pstats);
|
||||
m_stats->raw_cnt = phrase.ca();
|
||||
m_stats->register_worker();
|
||||
}
|
||||
|
||||
// Ranked sampling sorts all samples by score and then considers the top-ranked
|
||||
// candidates for phrase extraction.
|
||||
template<typename Token>
|
||||
size_t
|
||||
BitextSampler<Token>::
|
||||
perform_ranked_sampling()
|
||||
{
|
||||
if (m_next == m_stop) return m_ctr;
|
||||
CandidateSorter sorter(*m_bias);
|
||||
NBestList<TokenPosition, CandidateSorter> nbest(m_samples,sorter);
|
||||
ugdiss::tsa::ArrayEntry I(m_next);
|
||||
while (I.next < m_stop)
|
||||
{
|
||||
++m_ctr;
|
||||
nbest.add(m_root->readEntry(I.next,I));
|
||||
}
|
||||
for (size_t i = 0; i < nbest.size(); ++i)
|
||||
consider_sample(nbest.get_unsorted(i));
|
||||
cerr << m_ctr << " samples considered at "
|
||||
<< __FILE__ << ":" << __LINE__ << endl;
|
||||
return m_ctr;
|
||||
}
|
||||
|
||||
template<typename Token>
|
||||
void
|
||||
BitextSampler<Token>::
|
||||
consider_sample(TokenPosition const& p)
|
||||
{
|
||||
vector<uchar> aln;
|
||||
bitvector full_aln(100*100);
|
||||
PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen,
|
||||
!m_fwd, &aln, &full_aln);
|
||||
int docid = m_bias ? m_bias->GetClass(p.sid) : -1;
|
||||
bool good = m_bitext->find_trg_phr_bounds(rec);
|
||||
if (!good)
|
||||
{ // no good, probably because phrase is not coherent
|
||||
m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd);
|
||||
return;
|
||||
}
|
||||
|
||||
// all good: register this sample as valid
|
||||
size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1);
|
||||
m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd);
|
||||
|
||||
float sample_weight = 1./num_pairs;
|
||||
Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid);
|
||||
|
||||
// adjust offsets in phrase-internal aligment
|
||||
for (size_t k = 1; k < aln.size(); k += 2) aln[k] += rec.s2 - rec.s1;
|
||||
|
||||
vector<uint64_t> seen; seen.reserve(10);
|
||||
// It is possible that the phrase extraction extracts the same
|
||||
// phrase twice, e.g., when word a co-occurs with sequence b b b
|
||||
// but is aligned only to the middle word. We can only count
|
||||
// each phrase pair once per source phrase occurrence, or else
|
||||
// run the risk of having more joint counts than marginal
|
||||
// counts.
|
||||
|
||||
for (size_t s = rec.s1; s <= rec.s2; ++s)
|
||||
{
|
||||
TSA<Token> const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1;
|
||||
sptr<tsa_iter> b = I.find(o + s, rec.e1 - s);
|
||||
UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found");
|
||||
|
||||
for (size_t i = rec.e1; i <= rec.e2; ++i)
|
||||
{
|
||||
uint64_t tpid = b->getPid();
|
||||
|
||||
// poor man's protection against over-counting
|
||||
size_t s = 0;
|
||||
while (s < seen.size() && seen[s] != tpid) ++s;
|
||||
if (s < seen.size()) continue;
|
||||
seen.push_back(tpid);
|
||||
|
||||
size_t raw2 = b->approxOccurrenceCount();
|
||||
m_stats->add(tpid, sample_weight, aln, raw2,
|
||||
rec.po_fwd, rec.po_bwd, docid);
|
||||
bool ok = (i == rec.e2) || b->extend(o[i].id());
|
||||
UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
|
||||
}
|
||||
if (s < rec.s2) // shift phrase-internal alignments
|
||||
for (size_t k = 1; k < aln.size(); k += 2)
|
||||
--aln[k];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Token>
|
||||
bool
|
||||
BitextSampler<Token>::
|
||||
operator()()
|
||||
{
|
||||
if (m_finished) return true;
|
||||
boost::unique_lock<boost::mutex> lock(m_lock);
|
||||
perform_ranked_sampling();
|
||||
m_finished = true;
|
||||
m_ready.notify_all();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
template<typename Token>
|
||||
bool
|
||||
BitextSampler<Token>::
|
||||
done() const
|
||||
{
|
||||
return m_next == m_stop;
|
||||
}
|
||||
|
||||
template<typename Token>
|
||||
sptr<pstats>
|
||||
BitextSampler<Token>::
|
||||
stats()
|
||||
{
|
||||
if (m_ctr == 0) (*this)();
|
||||
boost::unique_lock<boost::mutex> lock(m_lock);
|
||||
while (!m_finished)
|
||||
m_ready.wait(lock);
|
||||
return m_stats;
|
||||
}
|
||||
|
||||
template<typename Token>
|
||||
BitextSampler<Token>::
|
||||
~BitextSampler()
|
||||
{
|
||||
cerr << "bye" << endl;
|
||||
}
|
||||
|
||||
} // end of namespace bitext
|
||||
} // end of namespace Moses
|
Loading…
Reference in New Issue
Block a user