mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 05:14:36 +03:00
Added capability to add sentence pairs to imBitext. Various minor fixes.
This commit is contained in:
parent
e089c7463d
commit
e81e1772f8
@ -38,7 +38,7 @@ namespace Moses
|
|||||||
this->lock.unlock();
|
this->lock.unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
bool
|
||||||
pstats::
|
pstats::
|
||||||
add(uint64_t pid, float const w,
|
add(uint64_t pid, float const w,
|
||||||
vector<uchar> const& a,
|
vector<uchar> const& a,
|
||||||
@ -51,9 +51,11 @@ namespace Moses
|
|||||||
if (this->good < entry.rcnt())
|
if (this->good < entry.rcnt())
|
||||||
{
|
{
|
||||||
this->lock.lock();
|
this->lock.lock();
|
||||||
UTIL_THROW(util::Exception, "more joint counts than good counts!"
|
return false;
|
||||||
<< entry.rcnt() << "/" << this->good);
|
// UTIL_THROW(util::Exception, "more joint counts than good counts!"
|
||||||
|
// << entry.rcnt() << "/" << this->good);
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
jstats::
|
jstats::
|
||||||
@ -112,6 +114,20 @@ namespace Moses
|
|||||||
aln() const
|
aln() const
|
||||||
{ return my_aln; }
|
{ return my_aln; }
|
||||||
|
|
||||||
|
void
|
||||||
|
jstats::
|
||||||
|
invalidate()
|
||||||
|
{
|
||||||
|
my_rcnt = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
jstats::
|
||||||
|
valid()
|
||||||
|
{
|
||||||
|
return my_rcnt != 0;
|
||||||
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
PhrasePair::
|
PhrasePair::
|
||||||
operator<(PhrasePair const& other) const
|
operator<(PhrasePair const& other) const
|
||||||
@ -140,6 +156,22 @@ namespace Moses
|
|||||||
good2 = 0;
|
good2 = 0;
|
||||||
fvals.resize(numfeats);
|
fvals.resize(numfeats);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
PhrasePair::
|
||||||
|
init(uint64_t const pid1,
|
||||||
|
pstats const& ps1,
|
||||||
|
pstats const& ps2,
|
||||||
|
size_t const numfeats)
|
||||||
|
{
|
||||||
|
p1 = pid1;
|
||||||
|
raw1 = ps1.raw_cnt + ps2.raw_cnt;
|
||||||
|
sample1 = ps1.sample_cnt + ps2.sample_cnt;
|
||||||
|
sample2 = 0;
|
||||||
|
good1 = ps1.good + ps2.good;
|
||||||
|
good2 = 0;
|
||||||
|
fvals.resize(numfeats);
|
||||||
|
}
|
||||||
|
|
||||||
float
|
float
|
||||||
lbop(size_t const tries, size_t const succ, float const confidence)
|
lbop(size_t const tries, size_t const succ, float const confidence)
|
||||||
@ -149,7 +181,7 @@ namespace Moses
|
|||||||
find_lower_bound_on_p(tries, succ, confidence);
|
find_lower_bound_on_p(tries, succ, confidence);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
PhrasePair const&
|
||||||
PhrasePair::
|
PhrasePair::
|
||||||
update(uint64_t const pid2, jstats const& js)
|
update(uint64_t const pid2, jstats const& js)
|
||||||
{
|
{
|
||||||
@ -159,8 +191,39 @@ namespace Moses
|
|||||||
assert(js.aln().size());
|
assert(js.aln().size());
|
||||||
if (js.aln().size())
|
if (js.aln().size())
|
||||||
aln = js.aln()[0].second;
|
aln = js.aln()[0].second;
|
||||||
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fill in the target-side fields for target phrase pid2 from two joint
// statistics records: raw2 and joint are the sums of cnt2() and rcnt()
// of js1 and js2. The alignment is taken from the first alignment entry
// of js1 if it has any, otherwise from js2. Returns *this.
PhrasePair const&
PhrasePair::
update(uint64_t const pid2, jstats const& js1, jstats const& js2)
{
  this->p2    = pid2;
  this->raw2  = js1.cnt2() + js2.cnt2();
  this->joint = js1.rcnt() + js2.rcnt();

  // at least one of the two records is expected to carry an alignment
  assert(js1.aln().size() || js2.aln().size());

  // prefer js1's alignment, fall back to js2's
  jstats const* donor = NULL;
  if (!js1.aln().empty())
    donor = &js1;
  else if (!js2.aln().empty())
    donor = &js2;

  if (donor)
    this->aln = donor->aln()[0].second;

  return *this;
}
|
||||||
|
|
||||||
|
// Fill in the target-side fields for target phrase pid2 from a single
// joint statistics record, adding raw2extra on top of js.cnt2() for raw2.
// The alignment is taken from the first alignment entry of js, if any.
// Returns *this.
PhrasePair const&
PhrasePair::
update(uint64_t const pid2,
       size_t   const raw2extra,
       jstats   const& js)
{
  this->p2    = pid2;
  this->raw2  = js.cnt2() + raw2extra;
  this->joint = js.rcnt();

  // js is expected to carry at least one alignment
  assert(js.aln().size());
  if (!js.aln().empty())
    this->aln = js.aln().front().second;

  return *this;
}
|
||||||
|
|
||||||
float
|
float
|
||||||
PhrasePair::
|
PhrasePair::
|
||||||
eval(vector<float> const& w)
|
eval(vector<float> const& w)
|
||||||
@ -172,5 +235,81 @@ namespace Moses
|
|||||||
return this->score;
|
return this->score;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<>
|
||||||
|
sptr<imBitext<L2R_Token<SimpleWordId> > >
|
||||||
|
imBitext<L2R_Token<SimpleWordId> >::
|
||||||
|
add(vector<string> const& s1,
|
||||||
|
vector<string> const& s2,
|
||||||
|
vector<string> const& aln) const
|
||||||
|
{
|
||||||
|
typedef L2R_Token<SimpleWordId> TKN;
|
||||||
|
assert(s1.size() == s2.size() && s1.size() == aln.size());
|
||||||
|
|
||||||
|
sptr<imBitext<TKN> > ret;
|
||||||
|
{
|
||||||
|
lock_guard<mutex> guard(this->lock);
|
||||||
|
ret.reset(new imBitext<TKN>(*this));
|
||||||
|
}
|
||||||
|
|
||||||
|
// we add the sentences in separate threads (so it's faster)
|
||||||
|
boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
|
||||||
|
thread1.join(); // for debugging
|
||||||
|
boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
|
||||||
|
BOOST_FOREACH(string const& a, aln)
|
||||||
|
{
|
||||||
|
istringstream ibuf(a);
|
||||||
|
ostringstream obuf;
|
||||||
|
uint32_t row,col; char c;
|
||||||
|
while (ibuf>>row>>c>>col)
|
||||||
|
{
|
||||||
|
assert(c == '-');
|
||||||
|
binwrite(obuf,row);
|
||||||
|
binwrite(obuf,col);
|
||||||
|
}
|
||||||
|
char const* x = obuf.str().c_str();
|
||||||
|
vector<char> v(x,x+obuf.str().size());
|
||||||
|
ret->myTx = append(ret->myTx, v);
|
||||||
|
}
|
||||||
|
thread1.join();
|
||||||
|
thread2.join();
|
||||||
|
ret->Tx = ret->myTx;
|
||||||
|
ret->T1 = ret->myT1;
|
||||||
|
ret->T2 = ret->myT2;
|
||||||
|
ret->I1 = ret->myI1;
|
||||||
|
ret->I2 = ret->myI2;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// template<>
|
||||||
|
void
|
||||||
|
snt_adder<L2R_Token<SimpleWordId> >::
|
||||||
|
operator()()
|
||||||
|
{
|
||||||
|
vector<id_type> sids;
|
||||||
|
sids.reserve(snt.size());
|
||||||
|
BOOST_FOREACH(string const& s, snt)
|
||||||
|
{
|
||||||
|
sids.push_back(track ? track->size() : 0);
|
||||||
|
istringstream buf(s);
|
||||||
|
string w;
|
||||||
|
vector<L2R_Token<SimpleWordId > > s;
|
||||||
|
s.reserve(100);
|
||||||
|
while (buf >> w)
|
||||||
|
s.push_back(L2R_Token<SimpleWordId>(V[w]));
|
||||||
|
track = append(track,s);
|
||||||
|
}
|
||||||
|
if (index)
|
||||||
|
index.reset(new imTSA<L2R_Token<SimpleWordId> >(*index,track,sids,V.tsize()));
|
||||||
|
else
|
||||||
|
index.reset(new imTSA<L2R_Token<SimpleWordId> >(track,NULL,NULL));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bind the functor to the sentences to add (s), the vocabulary (v), and
// the caller's track (t) and index (i) pointers. All four are stored as
// references, so they must stay alive while the functor is in use.
snt_adder<L2R_Token<SimpleWordId> >::
snt_adder(vector<string> const& s, TokenIndex& v,
          sptr<imTtrack<L2R_Token<SimpleWordId> > >& t,
          sptr<imTSA<L2R_Token<SimpleWordId> > >& i)
  : snt(s)
  , V(v)
  , track(t)
  , index(i)
{
  // nothing to do beyond binding the references
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -88,6 +88,8 @@ namespace Moses {
|
|||||||
|
|
||||||
vector<pair<size_t, vector<uchar> > > const & aln() const;
|
vector<pair<size_t, vector<uchar> > > const & aln() const;
|
||||||
void add(float w, vector<uchar> const& a, uint32_t const cnt2);
|
void add(float w, vector<uchar> const& a, uint32_t const cnt2);
|
||||||
|
void invalidate();
|
||||||
|
bool valid();
|
||||||
};
|
};
|
||||||
|
|
||||||
struct
|
struct
|
||||||
@ -107,7 +109,7 @@ namespace Moses {
|
|||||||
void register_worker();
|
void register_worker();
|
||||||
size_t count_workers() { return in_progress; }
|
size_t count_workers() { return in_progress; }
|
||||||
|
|
||||||
void add(uint64_t const pid, float const w,
|
bool add(uint64_t const pid, float const w,
|
||||||
vector<uchar> const& a, uint32_t const cnt2);
|
vector<uchar> const& a, uint32_t const cnt2);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -127,9 +129,18 @@ namespace Moses {
|
|||||||
PhrasePair();
|
PhrasePair();
|
||||||
bool operator<(PhrasePair const& other) const;
|
bool operator<(PhrasePair const& other) const;
|
||||||
bool operator>(PhrasePair const& other) const;
|
bool operator>(PhrasePair const& other) const;
|
||||||
void init(uint64_t const pid1, pstats const& ps,
|
|
||||||
|
void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
|
||||||
|
void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
|
||||||
size_t const numfeats);
|
size_t const numfeats);
|
||||||
void update(uint64_t const pid2, jstats const& js);
|
|
||||||
|
PhrasePair const& update(uint64_t const pid2, jstats const& js);
|
||||||
|
PhrasePair const& update(uint64_t const pid2,
|
||||||
|
jstats const& js1,
|
||||||
|
jstats const& js2);
|
||||||
|
PhrasePair const& update(uint64_t const pid2,
|
||||||
|
size_t const raw2extra,
|
||||||
|
jstats const& js);
|
||||||
float eval(vector<float> const& w);
|
float eval(vector<float> const& w);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -144,10 +155,16 @@ namespace Moses {
|
|||||||
virtual
|
virtual
|
||||||
|
|
||||||
void
|
void
|
||||||
operator()(Bitext<Token> const& pt, PhrasePair& pp) const = 0;
|
operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest)
|
||||||
|
const = 0;
|
||||||
|
|
||||||
int
|
int
|
||||||
fcnt() const { return num_feats; }
|
fcnt() const
|
||||||
|
{ return num_feats; }
|
||||||
|
|
||||||
|
int
|
||||||
|
getIndex() const
|
||||||
|
{ return index; }
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename Token>
|
template<typename Token>
|
||||||
@ -170,14 +187,15 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
|
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
|
||||||
{
|
{
|
||||||
|
if (!dest) dest = &pp.fvals;
|
||||||
if (pp.joint > pp.good1)
|
if (pp.joint > pp.good1)
|
||||||
{
|
{
|
||||||
cerr << bt.toString(pp.p1,0) << " ::: " << bt.toString(pp.p2,1) << endl;
|
cerr << bt.toString(pp.p1,0) << " ::: " << bt.toString(pp.p2,1) << endl;
|
||||||
cerr << pp.joint << "/" << pp.good1 << "/" << pp.raw2 << endl;
|
cerr << pp.joint << "/" << pp.good1 << "/" << pp.raw2 << endl;
|
||||||
}
|
}
|
||||||
pp.fvals[this->index] = log(lbop(pp.good1, pp.joint, conf));
|
(*dest)[this->index] = log(lbop(pp.good1, pp.joint, conf));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -201,9 +219,10 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
operator()(Bitext<Token> const& pt, PhrasePair& pp) const
|
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
|
||||||
{
|
{
|
||||||
pp.fvals[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
|
if (!dest) dest = &pp.fvals;
|
||||||
|
(*dest)[this->index] = log(lbop(max(pp.raw2,pp.joint), pp.joint, conf));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -225,8 +244,9 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
|
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
|
||||||
{
|
{
|
||||||
|
if (!dest) dest = &pp.fvals;
|
||||||
uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
|
uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
|
||||||
parse_pid(pp.p1, sid1, off1, len1);
|
parse_pid(pp.p1, sid1, off1, len1);
|
||||||
parse_pid(pp.p2, sid2, off2, len2);
|
parse_pid(pp.p2, sid2, off2, len2);
|
||||||
@ -248,8 +268,8 @@ namespace Moses {
|
|||||||
#endif
|
#endif
|
||||||
scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
|
scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
|
||||||
bt.T2->sntStart(sid2)+off2,0,len2,
|
bt.T2->sntStart(sid2)+off2,0,len2,
|
||||||
pp.aln, pp.fvals[this->index],
|
pp.aln, (*dest)[this->index],
|
||||||
pp.fvals[this->index+1]);
|
(*dest)[this->index+1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -271,11 +291,12 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
|
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
|
||||||
{
|
{
|
||||||
|
if (!dest) dest = &pp.fvals;
|
||||||
uint32_t sid2=0,off2=0,len2=0;
|
uint32_t sid2=0,off2=0,len2=0;
|
||||||
parse_pid(pp.p2, sid2, off2, len2);
|
parse_pid(pp.p2, sid2, off2, len2);
|
||||||
pp.fvals[this->index] = len2;
|
(*dest)[this->index] = len2;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -297,9 +318,10 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
operator()(Bitext<Token> const& bt, PhrasePair& pp) const
|
operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
|
||||||
{
|
{
|
||||||
pp.fvals[this->index] = 1;
|
if (!dest) dest = &pp.fvals;
|
||||||
|
(*dest)[this->index] = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
@ -307,6 +329,7 @@ namespace Moses {
|
|||||||
template<typename TKN>
|
template<typename TKN>
|
||||||
class Bitext
|
class Bitext
|
||||||
{
|
{
|
||||||
|
protected:
|
||||||
mutable boost::mutex lock;
|
mutable boost::mutex lock;
|
||||||
public:
|
public:
|
||||||
typedef TKN Token;
|
typedef TKN Token;
|
||||||
@ -322,13 +345,13 @@ namespace Moses {
|
|||||||
// each other's way.
|
// each other's way.
|
||||||
mutable sptr<agenda> ag;
|
mutable sptr<agenda> ag;
|
||||||
|
|
||||||
sptr<Ttrack<char> > const Tx; // word alignments
|
sptr<Ttrack<char> > Tx; // word alignments
|
||||||
sptr<Ttrack<Token> > const T1; // token track
|
sptr<Ttrack<Token> > T1; // token track
|
||||||
sptr<Ttrack<Token> > const T2; // token track
|
sptr<Ttrack<Token> > T2; // token track
|
||||||
sptr<TokenIndex> const V1; // vocab
|
sptr<TokenIndex> V1; // vocab
|
||||||
sptr<TokenIndex> const V2; // vocab
|
sptr<TokenIndex> V2; // vocab
|
||||||
sptr<TSA<Token> > const I1; // indices
|
sptr<TSA<Token> > I1; // indices
|
||||||
sptr<TSA<Token> > const I2; // indices
|
sptr<TSA<Token> > I2; // indices
|
||||||
|
|
||||||
/// given the source phrase sid[start:stop]
|
/// given the source phrase sid[start:stop]
|
||||||
// find the possible start (s1 .. s2) and end (e1 .. e2)
|
// find the possible start (s1 .. s2) and end (e1 .. e2)
|
||||||
@ -339,14 +362,19 @@ namespace Moses {
|
|||||||
find_trg_phr_bounds
|
find_trg_phr_bounds
|
||||||
(size_t const sid, size_t const start, size_t const stop,
|
(size_t const sid, size_t const start, size_t const stop,
|
||||||
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
|
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
|
||||||
vector<uchar> * core_alignment, bool const flip) const;
|
vector<uchar> * core_alignment,
|
||||||
|
bitvector* full_alignment,
|
||||||
|
bool const flip) const;
|
||||||
|
|
||||||
mutable boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2;
|
mutable boost::unordered_map<uint64_t,sptr<pstats> > cache1,cache2;
|
||||||
private:
|
protected:
|
||||||
size_t default_sample_size;
|
size_t default_sample_size;
|
||||||
|
private:
|
||||||
sptr<pstats>
|
sptr<pstats>
|
||||||
prep2(iter const& phrase, size_t const max_sample) const;
|
prep2(iter const& phrase, size_t const max_sample) const;
|
||||||
public:
|
public:
|
||||||
|
Bitext(size_t const max_sample=5000);
|
||||||
|
|
||||||
Bitext(Ttrack<Token>* const t1,
|
Bitext(Ttrack<Token>* const t1,
|
||||||
Ttrack<Token>* const t2,
|
Ttrack<Token>* const t2,
|
||||||
Ttrack<char>* const tx,
|
Ttrack<char>* const tx,
|
||||||
@ -358,6 +386,7 @@ namespace Moses {
|
|||||||
|
|
||||||
virtual void open(string const base, string const L1, string const L2) = 0;
|
virtual void open(string const base, string const L1, string const L2) = 0;
|
||||||
|
|
||||||
|
// sptr<pstats> lookup(Phrase const& phrase, size_t factor) const;
|
||||||
sptr<pstats> lookup(iter const& phrase) const;
|
sptr<pstats> lookup(iter const& phrase) const;
|
||||||
sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
|
sptr<pstats> lookup(iter const& phrase, size_t const max_sample) const;
|
||||||
void prep(iter const& phrase) const;
|
void prep(iter const& phrase) const;
|
||||||
@ -407,6 +436,12 @@ namespace Moses {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Token>
|
||||||
|
Bitext<Token>::
|
||||||
|
Bitext(size_t const max_sample)
|
||||||
|
: default_sample_size(max_sample)
|
||||||
|
{ }
|
||||||
|
|
||||||
template<typename Token>
|
template<typename Token>
|
||||||
Bitext<Token>::
|
Bitext<Token>::
|
||||||
Bitext(Ttrack<Token>* const t1,
|
Bitext(Ttrack<Token>* const t1,
|
||||||
@ -557,12 +592,20 @@ namespace Moses {
|
|||||||
{
|
{
|
||||||
j->stats->register_worker();
|
j->stats->register_worker();
|
||||||
vector<uchar> aln;
|
vector<uchar> aln;
|
||||||
|
bitvector full_alignment(100*100);
|
||||||
while (j->step(sid,offset))
|
while (j->step(sid,offset))
|
||||||
{
|
{
|
||||||
aln.clear();
|
aln.clear();
|
||||||
if (!ag.bt.find_trg_phr_bounds
|
if (j->fwd)
|
||||||
(sid, offset, offset + j->len, s1, s2, e1, e2,
|
{
|
||||||
j->fwd?&aln:NULL, !j->fwd))
|
if (!ag.bt.find_trg_phr_bounds
|
||||||
|
(sid,offset,offset+j->len,s1,s2,e1,e2,
|
||||||
|
&aln,&full_alignment,false))
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if (!ag.bt.find_trg_phr_bounds
|
||||||
|
(sid,offset,offset+j->len,s1,s2,e1,e2,
|
||||||
|
NULL,NULL,true))
|
||||||
continue;
|
continue;
|
||||||
j->stats->lock.lock();
|
j->stats->lock.lock();
|
||||||
j->stats->good += 1;
|
j->stats->good += 1;
|
||||||
@ -580,8 +623,16 @@ namespace Moses {
|
|||||||
// assert(b);
|
// assert(b);
|
||||||
for (size_t i = e1; i <= e2; ++i)
|
for (size_t i = e1; i <= e2; ++i)
|
||||||
{
|
{
|
||||||
|
if (!j->stats->add(b->getPid(),sample_weight,aln,b->approxOccurrenceCount()))
|
||||||
j->stats->add(b->getPid(),sample_weight,aln,b->approxOccurrenceCount());
|
{
|
||||||
|
for (size_t z = 0; z < j->len; ++z)
|
||||||
|
cout << (*ag.bt.V1)[ag.bt.T1->sntStart(sid)[offset+z].id()] << " ";
|
||||||
|
cout << endl;
|
||||||
|
for (size_t z = s; z <= i; ++z)
|
||||||
|
cout << (*ag.bt.V2)[(o+z)->id()] << " ";
|
||||||
|
cout << endl;
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
if (i < e2)
|
if (i < e2)
|
||||||
{
|
{
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
@ -742,51 +793,230 @@ namespace Moses {
|
|||||||
template<typename TKN>
|
template<typename TKN>
|
||||||
class imBitext : public Bitext<TKN>
|
class imBitext : public Bitext<TKN>
|
||||||
{
|
{
|
||||||
|
sptr<imTtrack<char> > myTx;
|
||||||
|
sptr<imTtrack<TKN> > myT1;
|
||||||
|
sptr<imTtrack<TKN> > myT2;
|
||||||
|
sptr<imTSA<TKN> > myI1;
|
||||||
|
sptr<imTSA<TKN> > myI2;
|
||||||
public:
|
public:
|
||||||
void open(string const base, string const L1, string L2);
|
void open(string const base, string const L1, string L2);
|
||||||
imBitext();
|
imBitext(sptr<TokenIndex> const& V1,
|
||||||
|
sptr<TokenIndex> const& V2,
|
||||||
|
size_t max_sample = 5000);
|
||||||
|
imBitext(size_t max_sample = 5000);
|
||||||
|
imBitext(imBitext const& other);
|
||||||
|
|
||||||
|
// sptr<imBitext<TKN> >
|
||||||
|
// add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
|
||||||
|
|
||||||
|
sptr<imBitext<TKN> >
|
||||||
|
add(vector<string> const& s1,
|
||||||
|
vector<string> const& s2,
|
||||||
|
vector<string> const& a) const;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename TKN>
|
template<typename TKN>
|
||||||
imBitext<TKN>::
|
imBitext<TKN>::
|
||||||
imBitext()
|
imBitext(size_t max_sample)
|
||||||
: Bitext<TKN>(new imTtrack<TKN>(),
|
{
|
||||||
new imTtrack<TKN>(),
|
this->default_sample_size = max_sample;
|
||||||
new imTtrack<char>(),
|
this->V1.reset(new TokenIndex());
|
||||||
new TokenIndex(),
|
this->V2.reset(new TokenIndex());
|
||||||
new TokenIndex(),
|
this->V1->setDynamic(true);
|
||||||
new imTSA<TKN>(),
|
this->V2->setDynamic(true);
|
||||||
new imTSA<TKN>())
|
}
|
||||||
{}
|
|
||||||
|
template<typename TKN>
|
||||||
|
imBitext<TKN>::
|
||||||
|
imBitext(sptr<TokenIndex> const& v1,
|
||||||
|
sptr<TokenIndex> const& v2,
|
||||||
|
size_t max_sample)
|
||||||
|
{
|
||||||
|
this->default_sample_size = max_sample;
|
||||||
|
this->V1 = v1;
|
||||||
|
this->V2 = v2;
|
||||||
|
this->V1->setDynamic(true);
|
||||||
|
this->V2->setDynamic(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename TKN>
|
||||||
|
imBitext<TKN>::
|
||||||
|
imBitext(imBitext<TKN> const& other)
|
||||||
|
{
|
||||||
|
this->myTx = other.myTx;
|
||||||
|
this->myT1 = other.myT1;
|
||||||
|
this->myT2 = other.myT2;
|
||||||
|
this->myI1 = other.myI1;
|
||||||
|
this->myI2 = other.myI2;
|
||||||
|
this->Tx = this->myTx;
|
||||||
|
this->T1 = this->myT1;
|
||||||
|
this->T2 = this->myT2;
|
||||||
|
this->I1 = this->myI1;
|
||||||
|
this->I2 = this->myI2;
|
||||||
|
this->V1 = other.V1;
|
||||||
|
this->V2 = other.V2;
|
||||||
|
this->default_sample_size = other.default_sample_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename TKN> class snt_adder;
|
||||||
|
template<> class snt_adder<L2R_Token<SimpleWordId> >;
|
||||||
|
|
||||||
|
template<>
|
||||||
|
class snt_adder<L2R_Token<SimpleWordId> >
|
||||||
|
{
|
||||||
|
typedef L2R_Token<SimpleWordId> TKN;
|
||||||
|
vector<string> const & snt;
|
||||||
|
TokenIndex & V;
|
||||||
|
sptr<imTtrack<TKN> > & track;
|
||||||
|
sptr<imTSA<TKN > > & index;
|
||||||
|
public:
|
||||||
|
snt_adder(vector<string> const& s, TokenIndex& v,
|
||||||
|
sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
|
||||||
|
|
||||||
|
void operator()();
|
||||||
|
};
|
||||||
|
|
||||||
// template<typename TKN>
|
// template<typename TKN>
|
||||||
// void
|
// class snt_adder
|
||||||
// imBitext<TKN>::
|
|
||||||
// open(string const base, string const L1, string L2)
|
|
||||||
// {
|
// {
|
||||||
// mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtracuk<TKN>*>(this->T1.get());
|
// vector<string> const & snt;
|
||||||
// mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
|
// TokenIndex & V;
|
||||||
// mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
|
// sptr<imTtrack<TKN> > & track;
|
||||||
// t1.open(base+L1+".mct");
|
// sptr<imTSA<TKN > > & index;
|
||||||
// t2.open(base+L2+".mct");
|
// public:
|
||||||
// tx.open(base+L1+"-"+L2+".mam");
|
// snt_adder(vector<string> const& s, TokenIndex& v,
|
||||||
// cerr << "DADA" << endl;
|
// sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
|
||||||
// this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
|
|
||||||
// this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
|
// template<typename T>
|
||||||
// mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
|
// void operator()();
|
||||||
// mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
|
// };
|
||||||
// i1.open(base+L1+".sfa", this->T1.get());
|
|
||||||
// i2.open(base+L2+".sfa", this->T2.get());
|
// // template<>
|
||||||
// assert(this->T1->size() == this->T2->size());
|
// void
|
||||||
|
// snt_adder<L2R_Token<SimpleWordId> >::
|
||||||
|
// operator()();
|
||||||
|
|
||||||
|
// template<>
|
||||||
|
// void
|
||||||
|
// snt_adder<char>::
|
||||||
|
// operator()()
|
||||||
|
// {
|
||||||
|
// vector<id_type> sids;
|
||||||
|
// sids.reserve(snt.size());
|
||||||
|
// BOOST_FOREACH(string const& s, snt)
|
||||||
|
// {
|
||||||
|
// sids.push_back(track ? track->size() : 0);
|
||||||
|
// istringstream buf(s);
|
||||||
|
// string w;
|
||||||
|
// vector<char> s;
|
||||||
|
// s.reserve(100);
|
||||||
|
// while (buf >> w)
|
||||||
|
// s.push_back(vector<char>(V[w]));
|
||||||
|
// track = append(track,s);
|
||||||
|
// }
|
||||||
|
// index.reset(new imTSA<char>(*index,track,sids,V.tsize()));
|
||||||
// }
|
// }
|
||||||
|
|
||||||
|
// template<typename TKN>
|
||||||
|
// snt_adder<TKN>::
|
||||||
|
// snt_adder(vector<string> const& s, TokenIndex& v,
|
||||||
|
// sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i)
|
||||||
|
// : snt(s), V(v), track(t), index(i)
|
||||||
|
// {
|
||||||
|
// throw "Not implemented yet.";
|
||||||
|
// }
|
||||||
|
|
||||||
|
template<>
|
||||||
|
sptr<imBitext<L2R_Token<SimpleWordId> > >
|
||||||
|
imBitext<L2R_Token<SimpleWordId> >::
|
||||||
|
add(vector<string> const& s1,
|
||||||
|
vector<string> const& s2,
|
||||||
|
vector<string> const& aln) const;
|
||||||
|
|
||||||
|
template<typename TKN>
|
||||||
|
sptr<imBitext<TKN> >
|
||||||
|
imBitext<TKN>::
|
||||||
|
add(vector<string> const& s1,
|
||||||
|
vector<string> const& s2,
|
||||||
|
vector<string> const& aln) const
|
||||||
|
{
|
||||||
|
throw "Not yet implemented";
|
||||||
|
}
|
||||||
|
// template<typename TKN>
|
||||||
|
// sptr<imBitext<TKN> >
|
||||||
|
// imBitext<TKN>::
|
||||||
|
// add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a)
|
||||||
|
// {
|
||||||
|
// boost::lock_guard<boost::mutex> guard(this->lock);
|
||||||
|
// sptr<imBitext<TKN> > ret(new imBitext<TKN>());
|
||||||
|
// vector<id_type> sids(1,this->myT1.size()-1);
|
||||||
|
// ret->myT1 = add(this->myT1,s1);
|
||||||
|
// ret->myT2 = add(this->myT2,s2);
|
||||||
|
// size_t v1size = this->V1.tsize();
|
||||||
|
// size_t v2size = this->V2.tsize();
|
||||||
|
// BOOST_FOREACH(TKN const& t, s1) { if (t->id() >= v1size) v1size = t->id() + 1; }
|
||||||
|
// BOOST_FOREACH(TKN const& t, s2) { if (t->id() >= v2size) v2size = t->id() + 1; }
|
||||||
|
// ret->myI1.reset(new imTSA<TKN>(*this->I1,ret->myT1,sids,v1size));
|
||||||
|
// ret->myI2.reset(new imTSA<TKN>(*this->I2,ret->myT2,sids,v2size));
|
||||||
|
// ostringstream abuf;
|
||||||
|
// BOOST_FOREACH(ushort x, a) binwrite(abuf,x);
|
||||||
|
// vector<char> foo(abuf.str().begin(),abuf.str().end());
|
||||||
|
// ret->myTx = add(this->myTx,foo);
|
||||||
|
// ret->T1 = ret->myT1;
|
||||||
|
// ret->T2 = ret->myT2;
|
||||||
|
// ret->Tx = ret->myTx;
|
||||||
|
// ret->I1 = ret->myI1;
|
||||||
|
// ret->I2 = ret->myI2;
|
||||||
|
// ret->V1 = this->V1;
|
||||||
|
// ret->V2 = this->V2;
|
||||||
|
// return ret;
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
|
// template<typename TKN>
|
||||||
|
// imBitext<TKN>::
|
||||||
|
// imBitext()
|
||||||
|
// : Bitext<TKN>(new imTtrack<TKN>(),
|
||||||
|
// new imTtrack<TKN>(),
|
||||||
|
// new imTtrack<char>(),
|
||||||
|
// new TokenIndex(),
|
||||||
|
// new TokenIndex(),
|
||||||
|
// new imTSA<TKN>(),
|
||||||
|
// new imTSA<TKN>())
|
||||||
|
// {}
|
||||||
|
|
||||||
|
|
||||||
|
template<typename TKN>
|
||||||
|
void
|
||||||
|
imBitext<TKN>::
|
||||||
|
open(string const base, string const L1, string L2)
|
||||||
|
{
|
||||||
|
mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
|
||||||
|
mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
|
||||||
|
mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
|
||||||
|
t1.open(base+L1+".mct");
|
||||||
|
t2.open(base+L2+".mct");
|
||||||
|
tx.open(base+L1+"-"+L2+".mam");
|
||||||
|
cerr << "DADA" << endl;
|
||||||
|
this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
|
||||||
|
this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
|
||||||
|
mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
|
||||||
|
mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
|
||||||
|
i1.open(base+L1+".sfa", this->T1);
|
||||||
|
i2.open(base+L2+".sfa", this->T2);
|
||||||
|
assert(this->T1->size() == this->T2->size());
|
||||||
|
}
|
||||||
|
|
||||||
template<typename Token>
|
template<typename Token>
|
||||||
bool
|
bool
|
||||||
Bitext<Token>::
|
Bitext<Token>::
|
||||||
find_trg_phr_bounds(size_t const sid, size_t const start, size_t const stop,
|
find_trg_phr_bounds(size_t const sid, size_t const start, size_t const stop,
|
||||||
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
|
size_t & s1, size_t & s2, size_t & e1, size_t & e2,
|
||||||
vector<uchar>* core_alignment, bool const flip) const
|
vector<uchar>* core_alignment,
|
||||||
|
bitvector* full_alignment,
|
||||||
|
bool const flip) const
|
||||||
{
|
{
|
||||||
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
|
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
|
||||||
// a word on the core_alignment:
|
// a word on the core_alignment:
|
||||||
@ -795,10 +1025,18 @@ namespace Moses {
|
|||||||
// it is up to the calling function to shift alignment points over for start positions
|
// it is up to the calling function to shift alignment points over for start positions
|
||||||
// of extracted phrases that start with a fringe word
|
// of extracted phrases that start with a fringe word
|
||||||
bitvector forbidden((flip ? T1 : T2)->sntLen(sid));
|
bitvector forbidden((flip ? T1 : T2)->sntLen(sid));
|
||||||
|
size_t slen1 = (*T1).sntLen(sid);
|
||||||
|
size_t slen2 = (*T2).sntLen(sid);
|
||||||
|
if (full_alignment)
|
||||||
|
{
|
||||||
|
if (slen1*slen2 > full_alignment->size())
|
||||||
|
full_alignment->resize(slen1*slen2*2);
|
||||||
|
full_alignment->reset();
|
||||||
|
}
|
||||||
size_t src,trg;
|
size_t src,trg;
|
||||||
size_t lft = forbidden.size();
|
size_t lft = forbidden.size();
|
||||||
size_t rgt = 0;
|
size_t rgt = 0;
|
||||||
vector<vector<ushort> > aln((*T1).sntLen(sid));
|
vector<vector<ushort> > aln(slen1);
|
||||||
char const* p = Tx->sntStart(sid);
|
char const* p = Tx->sntStart(sid);
|
||||||
char const* x = Tx->sntEnd(sid);
|
char const* x = Tx->sntEnd(sid);
|
||||||
|
|
||||||
@ -819,6 +1057,11 @@ namespace Moses {
|
|||||||
if (flip) aln[trg].push_back(src);
|
if (flip) aln[trg].push_back(src);
|
||||||
else aln[src].push_back(trg);
|
else aln[src].push_back(trg);
|
||||||
}
|
}
|
||||||
|
if (full_alignment)
|
||||||
|
{
|
||||||
|
if (flip) full_alignment->set(trg*slen2 + src);
|
||||||
|
else full_alignment->set(src*slen2 + trg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -928,7 +1171,7 @@ namespace Moses {
|
|||||||
else ret = ag->add_job(phrase, max_sample);
|
else ret = ag->add_job(phrase, max_sample);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Token>
|
template<typename Token>
|
||||||
sptr<pstats>
|
sptr<pstats>
|
||||||
Bitext<Token>::
|
Bitext<Token>::
|
||||||
|
Loading…
Reference in New Issue
Block a user