Added member function find_trg_phr_bounds(PhraseExtractionRecord& rec) to Bitext class.

This commit is contained in:
Ulrich Germann 2015-06-05 22:50:17 +01:00
parent 8ae2894107
commit f87f123366

View File

@ -35,6 +35,7 @@
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h" #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h" #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h" #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
// #include "moses/FF/LexicalReordering/LexicalReorderingState.h" // #include "moses/FF/LexicalReordering/LexicalReorderingState.h"
#include "moses/Util.h" #include "moses/Util.h"
@ -97,6 +98,7 @@ namespace Moses {
class Bitext : public reference_counter class Bitext : public reference_counter
{ {
public: public:
template<typename Token> friend class BitextSampler;
typedef TKN Token; typedef TKN Token;
typedef typename TSA<Token>::tree_iterator iter; typedef typename TSA<Token>::tree_iterator iter;
typedef typename std::vector<PhrasePair<Token> > vec_ppair; typedef typename std::vector<PhrasePair<Token> > vec_ppair;
@ -136,6 +138,7 @@ namespace Moses {
// points of the target phrase; if non-NULL, store word // points of the target phrase; if non-NULL, store word
// alignments in *core_alignment. If /flip/, source phrase is // alignments in *core_alignment. If /flip/, source phrase is
// L2. // L2.
bool find_trg_phr_bounds(PhraseExtractionRecord& rec);
bool find_trg_phr_bounds bool find_trg_phr_bounds
( size_t const sid, // sentence to investigate ( size_t const sid, // sentence to investigate
size_t const start, // start of source phrase size_t const start, // start of source phrase
@ -147,8 +150,6 @@ namespace Moses {
bitvector* full_alignment, // stores full word alignment for this sent. bitvector* full_alignment, // stores full word alignment for this sent.
bool const flip) const; // flip source and target (reverse lookup) bool const flip) const; // flip source and target (reverse lookup)
bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const;
// prep2 launches sampling and returns immediately. // prep2 launches sampling and returns immediately.
// lookup (below) waits for the job to finish before it returns // lookup (below) waits for the job to finish before it returns
sptr<pstats> sptr<pstats>
@ -182,7 +183,6 @@ namespace Moses {
void prep(ttasksptr const& ttask, iter const& phrase) const; void prep(ttasksptr const& ttask, iter const& phrase) const;
#endif #endif
void setDefaultSampleSize(size_t const max_samples); void setDefaultSampleSize(size_t const max_samples);
size_t getDefaultSampleSize() const; size_t getDefaultSampleSize() const;
@ -337,21 +337,24 @@ namespace Moses {
bool bool
Bitext<Token>:: Bitext<Token>::
find_trg_phr_bounds find_trg_phr_bounds
(size_t const sid, size_t const start, size_t const stop, ( size_t const sid, // sentence to investigate
size_t & s1, size_t & s2, size_t & e1, size_t & e2, size_t const start, // start of source phrase
int & po_fwd, int & po_bwd, size_t const stop, // last position of source phrase
std::vector<uchar>* core_alignment, bitvector* full_alignment, size_t & s1, size_t & s2, // beginning and end of target start
bool const flip) const size_t & e1, size_t & e2, // beginning and end of target end
int& po_fwd, int& po_bwd, // phrase orientations
std::vector<uchar> * core_alignment, // stores the core alignment
bitvector* full_alignment, // stores full word alignment for this sent.
bool const flip) const // flip source and target (reverse lookup)
{ {
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl; // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
// a word on the core_alignment (core_alignment):
// a word on the core_alignment:
// //
// since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1 // Since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1
// < e2, respectively) are by definition unaligned, we store // < e2, respectively) are by definition unaligned, we store
// only the core alignment in *core_alignment; it is up to the // only the core alignment in *core_alignment. It is up to the calling
// calling function to shift alignment points over for start // function to shift alignment points over for start positions
// positions of extracted phrases that start with a fringe word // of extracted phrases that start with a fringe word
assert(T1); assert(T1);
assert(T2); assert(T2);
assert(Tx); assert(Tx);
@ -378,18 +381,28 @@ namespace Moses {
size_t lft = forbidden.size(); size_t lft = forbidden.size();
size_t rgt = 0; size_t rgt = 0;
std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2); std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
// process word alignment for this sentence
char const* p = Tx->sntStart(sid); char const* p = Tx->sntStart(sid);
char const* x = Tx->sntEnd(sid); char const* x = Tx->sntEnd(sid);
while (p < x) while (p < x)
{ {
if (flip) { p = binread(p,trg); assert(p<x); p = binread(p,src); } if (flip)
else { p = binread(p,src); assert(p<x); p = binread(p,trg); } {
p = binread(p,trg);
assert(p<x);
p = binread(p,src);
}
else
{
p = binread(p,src);
assert(p<x);
p = binread(p,trg);
}
UTIL_THROW_IF2((src >= slen1 || trg >= slen2), UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
"Alignment range error at sentence " << sid << "!\n" "Alignment range error at sentence " << sid << "!\n"
<< src << "/" << slen1 << " " << << src << "/" << slen1 << " " << trg << "/" << slen2);
trg << "/" << slen2);
if (src < start || src >= stop) if (src < start || src >= stop)
forbidden.set(trg); forbidden.set(trg);