Initial check-in.

This commit is contained in:
Ulrich Germann 2015-06-05 17:24:53 +01:00
parent 8f4b2afe26
commit e8ee56876e
2 changed files with 248 additions and 0 deletions

View File

@ -0,0 +1,25 @@
// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*-
#include <vector>
#include "ug_typedefs.h"
namespace Moses
{
namespace bitext
{
// Bookkeeping record for a single phrase extraction request.
//
// The constructor arguments identify the phrase occurrence to extract from
// (sentence id and token span [start, stop)) and the lookup direction. The
// remaining fields are result slots, presumably filled in by the routine
// the record is passed to.
//
// The record only stores the two alignment pointers; it does not own or
// free the objects they point to.
struct PhraseExtractionRecord
{
  size_t const sid, start, stop; // sentence id; token span of the phrase
  bool const flip; // 'backward' lookup from L2
  size_t s1, s2, e1, e2; // soft and hard boundaries of target phrase
  int po_fwd, po_bwd; // fwd and bwd phrase orientation
  std::vector<uchar>* aln; // local alignments
  bitvector* full_aln; // full word alignment for sentence

  // xsid, xstart, xstop: sentence id and token span of the phrase occurrence
  // xflip:               true for a 'backward' lookup from L2
  // xaln:                destination for the local alignments
  // xfull_aln:           optional destination for the full word alignment
  //
  // All result fields start out as zero so that reading them before (or
  // after a failed) extraction is well-defined instead of yielding
  // indeterminate values.
  PhraseExtractionRecord(size_t const xsid, size_t const xstart,
                         size_t const xstop, bool const xflip,
                         std::vector<uchar>* xaln, bitvector* xfull_aln = NULL)
    : sid(xsid), start(xstart), stop(xstop), flip(xflip)
    , s1(0), s2(0), e1(0), e2(0)
    , po_fwd(0), po_bwd(0)
    , aln(xaln), full_aln(xfull_aln)
  { }
};
}
}

View File

@ -0,0 +1,223 @@
// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*-
#pragma once
#include <algorithm>
#include <iostream>
#include <vector>
#include <boost/thread.hpp>
#include <boost/thread/locks.hpp>
#include <boost/intrusive_ptr.hpp>
#include "ug_bitext.h"
#include "ug_bitext_pstats.h"
#include "ug_sampling_bias.h"
#include "ug_tsa_array_entry.h"
#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
#include "moses/TranslationModel/UG/generic/sorting/NBestList.h"
namespace Moses
{
namespace bitext
{
// Strategy for selecting phrase occurrences for phrase extraction:
// look at all occurrences, draw a random sample, or take ranked samples.
enum sampling_method { full_coverage, random_sampling, ranked_sampling };
// Shorthand for ugdiss::ttrack::Position. The code in this file reads its
// sid and offset members.
typedef ugdiss::ttrack::Position TokenPosition;
class CandidateSorter
{
SamplingBias const& score;
public:
CandidateSorter(SamplingBias const& s) : score(s) {}
bool operator()(TokenPosition const& a, TokenPosition const& b) const
{ return score[a.sid] > score[b.sid]; }
};
// Collects phrase statistics (pstats) for one lookup phrase in a bitext.
//
// Of the declared sampling methods, only ranked sampling is wired up in
// this header: operator()() calls perform_ranked_sampling(), which in turn
// calls consider_sample() for each retained candidate.
//
// Template parameter Token: the token type of the underlying Bitext / TSA.
template<typename Token>
class
BitextSampler : public reference_counter
{
typedef Bitext<Token> bitext;
typedef TSA<Token> tsa;
typedef SamplingBias bias;
typedef typename Bitext<Token>::iter tsa_iter;
// notified by operator()() once sampling has finished; stats() waits on it
mutable boost::condition_variable m_ready;
// held by operator()() while sampling and by stats() while waiting
mutable boost::mutex m_lock;
// const members
// Earlier declaration, kept for reference:
// sptr<bitext const> const m_bitext; // keep bitext alive while I am
iptr<bitext> const m_bitext; // keep bitext alive as long as I am
size_t const m_plen; // length of lookup phrase
bool const m_fwd; // forward or backward direction?
sptr<tsa const> const m_root; // root of suffix array
// NOTE: m_next and m_stop are pointers to const data, not const pointers;
// the code in this header sets them in the constructor only.
char const* m_next; // current position
char const* m_stop; // end of search range
sampling_method const m_method; /* look at all / random sample /
* ranked samples */
sptr<bias const> const m_bias; // bias over candidates
size_t const m_samples; // how many samples at most
// non-const members
sptr<pstats> m_stats; // destination for phrase stats
size_t m_ctr; // number of samples considered
float m_total_bias; // for random sampling with bias
bool m_finished; // set by operator()() after sampling has run
// run phrase extraction for the occurrence at p and record the outcome
void consider_sample(TokenPosition const& p);
// rank the candidates by bias score and extract from the retained ones;
// returns m_ctr
size_t perform_ranked_sampling();
public:
// bitext:      the bitext to sample from
// phrase:      suffix array iterator for the lookup phrase; supplies the
//              search range and the phrase length
// bias:        bias over candidates (consider_sample() tolerates a null
//              pointer here; perform_ranked_sampling() dereferences it)
// max_samples: how many samples at most
// method:      requested sampling method (stored in m_method)
BitextSampler(bitext* const bitext, typename bitext::iter const& phrase,
sptr<SamplingBias const> const& bias, size_t const max_samples,
sampling_method const method);
~BitextSampler();
bool operator()(); // run sampling
// runs the sampling itself if m_ctr is still 0, then blocks until
// m_finished is set and returns the statistics object
sptr<pstats> stats();
// true iff m_next == m_stop
bool done() const;
};
// Sets up a sampler for the phrase that the iterator `phrase` points to.
//
// The lookup direction is forward iff the iterator's root is the bitext's
// I1 index; the matching index (I1 or I2) becomes the sampler's root. The
// search range [m_next, m_stop) is taken from the iterator's lower and
// upper bounds. A fresh pstats object is created, its raw_cnt is set from
// phrase.ca(), and register_worker() is called on it.
template<typename Token>
BitextSampler<Token>::
BitextSampler(Bitext<Token>* const bitext,
              typename Bitext<Token>::iter const& phrase,
              sptr<SamplingBias const> const& bias,
              size_t const max_samples,
              sampling_method const method)
  : m_bitext(bitext),
    m_plen(phrase.size()),
    m_fwd(phrase.root == bitext->I1.get()),
    m_root(m_fwd ? bitext->I1 : bitext->I2),
    m_next(phrase.lower_bound(-1)),
    m_stop(phrase.upper_bound(-1)),
    m_method(method),
    m_bias(bias),
    m_samples(max_samples),
    m_ctr(0),
    m_total_bias(0),
    m_finished(false)
{
  m_stats.reset(new pstats);
  pstats& fresh_stats = *m_stats;
  fresh_stats.raw_cnt = phrase.ca();
  fresh_stats.register_worker();
}
// Ranked sampling sorts all samples by score and then considers the
// top-ranked candidates for phrase extraction.
//
// Every suffix array entry in [m_next, m_stop) is read and offered to an
// n-best list that is constructed with m_samples and a CandidateSorter
// (i.e., candidates are compared by the bias score of their sentence).
// consider_sample() is then called for each entry the list has retained.
//
// Returns m_ctr, which is incremented once per entry read. If the search
// range is empty, m_ctr is returned unchanged and nothing else happens.
//
// Throws (via UTIL_THROW_IF2) if the range is non-empty but no sampling
// bias was supplied: ranking needs the bias scores, and dereferencing a
// null bias pointer would be undefined behaviour.
template<typename Token>
size_t
BitextSampler<Token>::
perform_ranked_sampling()
{
  if (m_next == m_stop) return m_ctr; // empty range: nothing to sample
  UTIL_THROW_IF2(!m_bias, "Ranked sampling requires a sampling bias.");
  CandidateSorter sorter(*m_bias);
  NBestList<TokenPosition, CandidateSorter> nbest(m_samples,sorter);
  ugdiss::tsa::ArrayEntry I(m_next);
  while (I.next < m_stop)
    {
      ++m_ctr;
      // readEntry() is handed the current read position and the entry to
      // fill; its result is what gets offered to the n-best list
      nbest.add(m_root->readEntry(I.next,I));
    }
  for (size_t i = 0; i < nbest.size(); ++i)
    consider_sample(nbest.get_unsorted(i));
  std::cerr << m_ctr << " samples considered at "
            << __FILE__ << ":" << __LINE__ << std::endl;
  return m_ctr;
}
// Runs phrase extraction for the phrase occurrence at position p and
// records the outcome in m_stats.
//
// If find_trg_phr_bounds() reports failure, the sample is counted with zero
// phrase pairs and nothing else is recorded. Otherwise the sample is
// counted with the number of (start, end) combinations inside the returned
// boundaries, and every distinct target phrase among those combinations is
// added to m_stats with weight 1/num_pairs.
//
// p: position (sentence id and offset) of the phrase occurrence.
//
// Throws (via UTIL_THROW_IF2) if a target phrase cannot be found in, or
// extended within, the index of the other language side.
template<typename Token>
void
BitextSampler<Token>::
consider_sample(TokenPosition const& p)
{
  std::vector<uchar> aln;
  // NOTE(review): fixed-size alignment matrix; presumably assumes sentences
  // of at most 100 tokens on either side -- confirm.
  bitvector full_aln(100*100);
  PhraseExtractionRecord rec(p.sid, p.offset, p.offset + m_plen,
                             !m_fwd, &aln, &full_aln);
  int docid = m_bias ? m_bias->GetClass(p.sid) : -1;
  bool good = m_bitext->find_trg_phr_bounds(rec);
  if (!good)
    { // no good, probably because phrase is not coherent
      m_stats->count_sample(docid, 0, rec.po_fwd, rec.po_bwd);
      return;
    }

  // all good: register this sample as valid
  size_t num_pairs = (rec.s2 - rec.s1 + 1) * (rec.e2 - rec.e1 + 1);
  m_stats->count_sample(docid, num_pairs, rec.po_fwd, rec.po_bwd);

  float sample_weight = 1./num_pairs;
  Token const* o = (m_fwd ? m_bitext->T2 : m_bitext->T1)->sntStart(rec.sid);

  // adjust offsets in phrase-internal alignment
  for (size_t k = 1; k < aln.size(); k += 2) aln[k] += rec.s2 - rec.s1;

  // It is possible that the phrase extraction extracts the same
  // phrase twice, e.g., when word a co-occurs with sequence b b b
  // but is aligned only to the middle word. We can only count
  // each phrase pair once per source phrase occurrence, or else
  // run the risk of having more joint counts than marginal
  // counts.
  std::vector<uint64_t> seen; seen.reserve(10);

  // index of the target side; the same for every start position
  TSA<Token> const& I = m_fwd ? *m_bitext->I2 : *m_bitext->I1;
  for (size_t s = rec.s1; s <= rec.s2; ++s)
    {
      sptr<tsa_iter> b = I.find(o + s, rec.e1 - s);
      UTIL_THROW_IF2(!b || b->size() < rec.e1 - s, "target phrase not found");
      for (size_t i = rec.e1; i <= rec.e2; ++i)
        {
          uint64_t const tpid = b->getPid();
          // poor man's protection against over-counting
          if (std::find(seen.begin(), seen.end(), tpid) == seen.end())
            {
              seen.push_back(tpid);
              size_t raw2 = b->approxOccurrenceCount();
              m_stats->add(tpid, sample_weight, aln, raw2,
                           rec.po_fwd, rec.po_bwd, docid);
            }
          // Always move on to the next longer target phrase, even when the
          // current one was a duplicate. Skipping the extension would leave
          // the iterator on the duplicate, so that all longer phrases for
          // this start position would be skipped as duplicates as well.
          bool ok = (i == rec.e2) || b->extend(o[i].id());
          UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
        }
      if (s < rec.s2) // shift phrase-internal alignments
        for (size_t k = 1; k < aln.size(); k += 2)
          --aln[k];
    }
}
// Runs the sampling (at most once per sampler) and wakes up everybody who
// is waiting in stats(). Always returns true.
//
// The m_finished flag is tested only after m_lock has been acquired. If it
// were tested before, a second caller could pass the test while the first
// one is still sampling, block on the lock, and then run the complete
// sampling a second time, so that every sample would be counted twice in
// m_stats.
template<typename Token>
bool
BitextSampler<Token>::
operator()()
{
  boost::unique_lock<boost::mutex> lock(m_lock);
  if (m_finished) return true; // somebody else has done the work already
  perform_ranked_sampling();
  m_finished = true;
  m_ready.notify_all();
  return true;
}
// Reports whether the current position has reached the end of the search
// range, i.e., whether m_next equals m_stop.
template<typename Token>
bool
BitextSampler<Token>::
done() const
{
  bool const at_end_of_range = (m_next == m_stop);
  return at_end_of_range;
}
// Returns the statistics object once sampling has finished.
//
// If no sample has been considered yet (m_ctr is 0), the sampling is run
// right here in the calling thread. After that, the call blocks on m_ready
// until m_finished is set.
//
// NOTE(review): m_ctr is read before m_lock is acquired -- confirm that
// this is acceptable for the way samplers are shared between threads.
template<typename Token>
sptr<pstats>
BitextSampler<Token>::
stats()
{
  if (m_ctr == 0)
    {
      (*this)();
    }
  boost::unique_lock<boost::mutex> guard(m_lock);
  for (;;)
    {
      if (m_finished) break;
      m_ready.wait(guard);
    }
  return m_stats;
}
// Destructor; writes a short trace message to stderr.
template<typename Token>
BitextSampler<Token>::
~BitextSampler()
{
  std::cerr << "bye" << std::endl;
}
} // end of namespace bitext
} // end of namespace Moses