mirror of
https://github.com/moses-smt/mosesdecoder.git
synced 2024-12-26 13:23:25 +03:00
almost no modification to existing system, only additions:
- prefix tree structured phrase table - binary phrase table format with on-demand loading git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@101 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
parent
6e8e6a9ed2
commit
45588167bd
68
moses/src/CountedPointer.h
Normal file
68
moses/src/CountedPointer.h
Normal file
@ -0,0 +1,68 @@
|
||||
// $Id$
|
||||
|
||||
#ifndef COUNTEDPOINTER_H_
|
||||
#define COUNTEDPOINTER_H_
|
||||
|
||||
// see http://ootips.org/yonat/4dev/counted_ptr.h
|
||||
// Reference-counted owning pointer. All copies of a CountedPointer share one
// heap-allocated PointerAndCounter record; the pointee is deleted when the
// last copy releases it. An empty CountedPointer has no record at all.
// Not thread-safe: the counter is a plain unsigned.
template <class T> class CountedPointer
{
public:
  // Takes ownership of p. A null p yields an empty pointer (no record).
  explicit CountedPointer(T *p = 0) : pointerInfo_(0)
  {
    if (p)
      pointerInfo_ = new PointerAndCounter(p);
  }
  // Shares ownership with p.
  CountedPointer(const CountedPointer &p) : pointerInfo_(0) { acquire(p.pointerInfo_); }
  ~CountedPointer() { release(); }
  // Drops the current pointee (deleting it if this was the last owner) and
  // shares ownership with p.
  CountedPointer &operator=(const CountedPointer &p)
  {
    if (this != &p) {
      release();
      acquire(p.pointerInfo_);
    }
    return *this;
  }
  // Drops the current pointee and takes ownership of p.
  // Assigning null leaves the pointer empty, exactly like the constructor.
  CountedPointer &operator=(T *p)
  {
    // Re-assigning the pointer we already own must not free it (or start a
    // second, independent counter for the same object).
    if (pointerInfo_ && pointerInfo_->pointer == p)
      return *this;
    // Allocate first so a failing 'new' leaves this object untouched.
    PointerAndCounter *fresh = p ? new PointerAndCounter(p) : 0;
    release();
    pointerInfo_ = fresh;
    return *this;
  }

  // True when a pointee is held.
  operator bool() const { return pointerInfo_ != 0; }
  // True when a non-null pointee is held; safe to call on an empty pointer.
  bool hasP() const { return pointerInfo_ != 0 && pointerInfo_->pointer != 0; }

  // Precondition: the pointer is not empty.
  T& operator*() const { return *pointerInfo_->pointer; }
  // Returns null for an empty pointer.
  T* operator->() const { return pointerInfo_ ? pointerInfo_->pointer : 0; }
  // True when empty or when this is the only owner.
  bool unique() const { return (!pointerInfo_ || pointerInfo_->counter == 1); }
  // Gives up this owner's share; the pointer is empty afterwards.
  void destroy() { release(); }

  //void operator delete(void *p) { release(); };

private:
  // Shared bookkeeping record: the pointee and the number of owners.
  struct PointerAndCounter
  {
    T *pointer;
    unsigned counter;
    PointerAndCounter(T* p = 0, unsigned c = 1) : pointer(p), counter(c) {}
  } *pointerInfo_;

  // Attach to record c (may be null) and count this owner.
  void acquire(PointerAndCounter *c)
  {
    pointerInfo_ = c;
    if (pointerInfo_)
      (pointerInfo_->counter)++;
  }

  // Detach from the record; the last owner deletes pointee and record.
  void release()
  {
    if (pointerInfo_) {
      if (--(pointerInfo_->counter) == 0) {
        delete pointerInfo_->pointer;
        delete pointerInfo_;
      }
      pointerInfo_ = 0;
    }
  }
};
|
||||
#endif
|
@ -53,32 +53,9 @@ const Factor *FactorCollection::AddFactor(FactorDirection direction
|
||||
, LmId lmId)
|
||||
{
|
||||
// find string id
|
||||
const string *ptr;
|
||||
StringSet::const_iterator iterString = m_factorStringCollection.find(factorString);
|
||||
if (iterString == m_factorStringCollection.end())
|
||||
{
|
||||
const pair< StringSet::iterator, bool > &pairRet = m_factorStringCollection.insert(factorString);
|
||||
const string &str = *pairRet.first;
|
||||
ptr = &str;
|
||||
}
|
||||
else
|
||||
{
|
||||
const string &str = *iterString;
|
||||
ptr = &str;
|
||||
}
|
||||
|
||||
Factor findFactor(direction, factorType, ptr, lmId);
|
||||
FactorSet::const_iterator iter = m_collection.find(findFactor);
|
||||
|
||||
if (iter == m_collection.end())
|
||||
{ // new factor
|
||||
pair< FactorSet::iterator, bool > pairRet = m_collection.insert(findFactor);
|
||||
return &(*pairRet.first);
|
||||
}
|
||||
else
|
||||
{
|
||||
return &(*iter);
|
||||
}
|
||||
const string *ptr=&(*m_factorStringCollection.insert(factorString).first);
|
||||
// Factor findFactor(direction, factorType, ptr, lmId);
|
||||
return &(*m_collection.insert(Factor(direction, factorType, ptr, lmId)).first);
|
||||
}
|
||||
|
||||
const Factor *FactorCollection::AddFactor(FactorDirection direction
|
||||
|
68
moses/src/File.h
Normal file
68
moses/src/File.h
Normal file
@ -0,0 +1,68 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
|
||||
/* Richard Zens */
|
||||
/* ---------------------------------------------------------------- */
|
||||
#ifndef FILE_H_
|
||||
#define FILE_H_
|
||||
#include <sys/types.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <vector>
|
||||
|
||||
// Sentinel file offset meaning "no position".
static const off_t InvalidOffT=-1;

// WARNING:
// these functions work only for bitwise read/write-able types
//
// All helpers print a message to std::cerr and abort() on I/O failure.
// Sequences are stored as an 'unsigned' element count followed by the raw
// bytes of the elements.

// Write the raw bytes of t; returns the number of bytes written.
template<typename T> inline size_t fWrite(FILE* f,const T& t) {
  if(fwrite(&t,sizeof(t),1,f)!=1) {
    std::cerr<<"ERROR:: fwrite!\n";abort();}
  return sizeof(t);
}

// Read the raw bytes of t.
template<typename T> inline void fRead(FILE* f,T& t) {
  if(fread(&t,sizeof(t),1,f)!=1) {std::cerr<<"ERROR: fread!\n";abort();}
}

// Write the array [b,e) as count + elements; returns the bytes written.
template<typename T> inline size_t fWrite(FILE* f,const T* b,const T* e) {
  unsigned s=e-b;size_t rv=fWrite(f,s);
  // nothing to write for an empty range (b may be null then)
  if(s && fwrite(b,sizeof(T),s,f)!=s) {std::cerr<<"ERROR: fwrite!\n";abort();}
  return rv+sizeof(T)*s;
}

// Write the range [b,e) as count + elements; returns the bytes written.
// T is an iterator type; the elements must be stored contiguously.
template<typename T> inline size_t fWrite(FILE* f,const T b,const T e) {
  // size of the element, not of the iterator object itself
  typedef typename std::iterator_traits<T>::value_type V;
  unsigned s=std::distance(b,e);size_t rv=fWrite(f,s);
  // never dereference b when the range is empty
  if(s && fwrite(&(*b),sizeof(V),s,f)!=s) {std::cerr<<"ERROR: fwrite!\n";abort();}
  return rv+sizeof(V)*s;
}

// Write a vector-like container as count + elements; returns the bytes written.
template<typename C> inline size_t fWriteVector(FILE* f,const C& v) {
  unsigned s=v.size();
  size_t rv=fWrite(f,s);
  // v[0] does not exist for an empty container
  if(s && fwrite(&v[0],sizeof(typename C::value_type),s,f)!=s) {std::cerr<<"ERROR: fwrite!\n";abort();}
  return rv+sizeof(typename C::value_type)*s;
}

// Read a container written by fWriteVector; v is resized to the stored count.
template<typename C> inline void fReadVector(FILE* f, C& v) {
  unsigned s;fRead(f,s);v.resize(s);
  if(s==0) return; // no elements follow, and v[0] does not exist
  size_t r=fread(&v[0],sizeof(typename C::value_type),s,f);
  if(r!=s) {
    std::cerr<<"ERROR: freadVec! "<<r<<" "<<s<<"\n";abort();}
}

// Current position of f as reported by ftello.
inline off_t fTell(FILE* f) {return ftello(f);}

// Move f to the absolute offset o.
inline void fSeek(FILE* f,off_t o) {
  if(fseeko(f,o,SEEK_SET)<0) {
    std::cerr<<"ERROR: could not fseeko position "<<o<<"\n";
    if(o==InvalidOffT) std::cerr<<"You tried to seek for 'InvalidOffT'!\n";
    abort();
  }
}

// Open file fn with mode m; never returns null.
inline FILE* fOpen(const char* fn,const char* m) {
  FILE* f=fopen(fn,m);
  if(!f) {
    std::cerr<<"ERROR: could not open file "<<fn<<" with mode "<<m<<"\n";
    abort();}
  return f;
}
|
||||
|
||||
#endif
|
||||
|
56
moses/src/FilePtr.h
Normal file
56
moses/src/FilePtr.h
Normal file
@ -0,0 +1,56 @@
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
|
||||
/* Richard Zens */
|
||||
/* ---------------------------------------------------------------- */
|
||||
#ifndef FILEPTR_H_
|
||||
#define FILEPTR_H_
|
||||
#include "File.h"
|
||||
#ifdef USECPFP
|
||||
#include "CountedPointer.h"
|
||||
#endif
|
||||
|
||||
template<typename T> class FilePtr {
|
||||
public:
|
||||
#ifdef USECPFP
|
||||
typedef CountedPointer<T> Ptr;
|
||||
#else
|
||||
typedef T* Ptr;
|
||||
#endif
|
||||
|
||||
private:
|
||||
FILE* f;
|
||||
off_t pos;
|
||||
mutable Ptr t;
|
||||
public:
|
||||
FilePtr(FILE* f_=0,off_t p=0) : f(f_),pos(p),t(0) {}
|
||||
~FilePtr() {}
|
||||
|
||||
void set(FILE* f_,off_t p) {f=f_;pos=p;}
|
||||
|
||||
void free() {
|
||||
#ifdef USECPFP
|
||||
t.destroy();
|
||||
#else
|
||||
delete t; t=0;
|
||||
#endif
|
||||
}
|
||||
|
||||
T& operator* () {load();return *t;}
|
||||
Ptr operator->() {load();return t;}
|
||||
operator Ptr () {load();return t;}
|
||||
|
||||
const T& operator* () const {load();return *t;}
|
||||
const Ptr operator->() const {load();return t;}
|
||||
operator const Ptr () const {load();return t;}
|
||||
|
||||
Ptr getPtr() {return t;}
|
||||
const Ptr getPtr() const {return t;}
|
||||
|
||||
operator bool() const {return (f && pos!=InvalidOffT);}
|
||||
|
||||
void load() const {
|
||||
if(t) return;
|
||||
if(f && pos!=InvalidOffT) {fSeek(f,pos); t=new T(f);}
|
||||
}
|
||||
};
|
||||
#endif
|
347
moses/src/PhraseDictionaryTree.cpp
Normal file
347
moses/src/PhraseDictionaryTree.cpp
Normal file
@ -0,0 +1,347 @@
|
||||
#include "PhraseDictionaryTree.h"
|
||||
#include <map>
|
||||
#include <cassert>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include "PrefixTree.h"
|
||||
#include "File.h"
|
||||
#include "FactorCollection.h"
|
||||
|
||||
// Stream a vector as its element count followed by every element, each
// item terminated by a single blank.
template<class T>
std::ostream& operator<<(std::ostream& out,const std::vector<T>& x) {
  const size_t n=x.size();
  out<<n<<" ";
  for(size_t k=0;k<n;++k)
    out<<x[k]<<' ';
  return out;
}
|
||||
|
||||
|
||||
|
||||
typedef unsigned LabelId;
|
||||
LabelId InvalidLabelId=std::numeric_limits<LabelId>::max();
|
||||
LabelId Epsilon=InvalidLabelId-1;
|
||||
|
||||
typedef std::vector<LabelId> IPhrase;
|
||||
typedef std::vector<float> Scores;
|
||||
|
||||
typedef PrefixTreeF<LabelId,off_t> PTF;
|
||||
|
||||
template<class A,class B=std::map<A,LabelId> >
|
||||
class LVoc {
|
||||
typedef A Key;
|
||||
typedef B M;
|
||||
typedef std::vector<Key> V;
|
||||
M m;
|
||||
V data;
|
||||
public:
|
||||
LVoc() {}
|
||||
|
||||
bool isKnown(const Key& k) const {return m.find(k)!=m.end();}
|
||||
LabelId index(const Key& k) const {
|
||||
typename M::const_iterator i=m.find(k);
|
||||
return i!=m.end()? i->second : InvalidLabelId;}
|
||||
LabelId add(const Key& k) {
|
||||
std::pair<typename M::iterator,bool> p=m.insert(std::make_pair(k,data.size()));
|
||||
if(p.second) data.push_back(k);
|
||||
assert(p.first->second>=0 && static_cast<size_t>(p.first->second)<data.size());
|
||||
return p.first->second;
|
||||
}
|
||||
const Key& symbol(LabelId i) const {
|
||||
assert(i>=0);assert(static_cast<size_t>(i)<data.size());
|
||||
return data[i];}
|
||||
|
||||
typedef typename V::const_iterator const_iterator;
|
||||
const_iterator begin() const {return data.begin();}
|
||||
const_iterator end() const {return data.end();}
|
||||
|
||||
void Write(const std::string& fname) const {
|
||||
std::ofstream out(fname.c_str()); Write(out);}
|
||||
void Write(std::ostream& out) const {
|
||||
for(int i=data.size()-1;i>=0;--i)
|
||||
out<<i<<' '<<data[i]<<'\n';
|
||||
}
|
||||
void Read(const std::string& fname) {
|
||||
std::ifstream in(fname.c_str());Read(in);}
|
||||
void Read(std::istream& in) {
|
||||
Key k;size_t i;std::string line;
|
||||
while(getline(in,line)) {
|
||||
std::istringstream is(line);
|
||||
if(is>>i>>k) {
|
||||
if(i>=data.size()) data.resize(i+1);
|
||||
data[i]=k;
|
||||
m[k]=i;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class TgtCand {
|
||||
IPhrase e;
|
||||
Scores sc;
|
||||
public:
|
||||
TgtCand() {}
|
||||
TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
|
||||
TgtCand(FILE* f) {readBin(f);}
|
||||
|
||||
const IPhrase& GetPhrase() const {return e;}
|
||||
const Scores& GetScores() const {return sc;}
|
||||
|
||||
void writeBin(FILE* f) const {
|
||||
fWriteVector(f,e);fWriteVector(f,sc);}
|
||||
void readBin(FILE* f) {fReadVector(f,e);fReadVector(f,sc);}
|
||||
};
|
||||
|
||||
|
||||
class TgtCands : public std::vector<TgtCand> {
|
||||
typedef std::vector<TgtCand> MyBase;
|
||||
public:
|
||||
TgtCands() : MyBase() {}
|
||||
|
||||
void writeBin(FILE* f) const {
|
||||
unsigned s=size();fWrite(f,s);for(size_t i=0;i<s;++i) this->operator [](i).writeBin(f);
|
||||
}
|
||||
void readBin(FILE* f) {
|
||||
unsigned s;fRead(f,s);resize(s);for(size_t i=0;i<s;++i) this->operator [](i).readBin(f);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct PDTimp {
|
||||
typedef PrefixTreeF<LabelId,off_t> PTF;
|
||||
typedef FilePtr<PTF> CPT;
|
||||
typedef std::vector<CPT> Data;
|
||||
typedef LVoc<std::string> WordVoc;
|
||||
|
||||
Data data;
|
||||
std::vector<off_t> srcOffsets;
|
||||
|
||||
FILE *os,*ot;
|
||||
WordVoc sv,tv;
|
||||
|
||||
FactorCollection *m_factorCollection;
|
||||
FactorType m_factorType;
|
||||
|
||||
PDTimp() : os(0),ot(0),m_factorCollection(0),m_factorType(Surface) {}
|
||||
~PDTimp() {if(os) fclose(os);if(ot) fclose(ot);}
|
||||
|
||||
int ReadBinary(const std::string& fn) {
|
||||
std::string ifs(fn+".binphr.srctree"),
|
||||
ift(fn+".binphr.tgtdata"),
|
||||
ifi(fn+".binphr.idx"),
|
||||
ifsv(fn+".binphr.srcvoc"),
|
||||
iftv(fn+".binphr.tgtvoc");
|
||||
|
||||
FILE *ii=fOpen(ifi.c_str(),"rb");
|
||||
fReadVector(ii,srcOffsets);
|
||||
fclose(ii);
|
||||
|
||||
os=fOpen(ifs.c_str(),"rb");
|
||||
ot=fOpen(ift.c_str(),"rb");
|
||||
|
||||
// std::cerr<<"the load offsets are "<<vo<<"\n";
|
||||
data.resize(srcOffsets.size());
|
||||
for(size_t i=0;i<data.size();++i)
|
||||
data[i]=CPT(os,srcOffsets[i]);
|
||||
|
||||
sv.Read(ifsv);
|
||||
tv.Read(iftv);
|
||||
|
||||
std::cerr<<"binary phrasefile loaded, default off_t: "<<PTF::getDefault()<<"\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
off_t FindOffT(const IPhrase& f) const {
|
||||
if(f.empty()) return InvalidOffT;
|
||||
if(f[0]>=data.size()) return InvalidOffT;
|
||||
if(data[f[0]]) return data[f[0]]->find(f); else return InvalidOffT;
|
||||
}
|
||||
|
||||
void GetTargetCandidates(const IPhrase& f,TgtCands& tgtCands)
|
||||
{
|
||||
off_t tCandOffset=FindOffT(f);
|
||||
if(tCandOffset==InvalidOffT) return;
|
||||
fSeek(ot,tCandOffset);
|
||||
tgtCands.readBin(ot);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
// Creates an empty dictionary; fc and ft are stored in the implementation
// object and used when target candidates are converted to factors.
PhraseDictionaryTree::PhraseDictionaryTree(size_t noScoreComponent,FactorCollection *fc,FactorType ft)
  : Dictionary(noScoreComponent),
    imp(new PDTimp)
{
  PDTimp& impl=*imp;
  impl.m_factorType=ft;
  impl.m_factorCollection=fc;
}
|
||||
|
||||
// Destroys the implementation object allocated by the constructor.
PhraseDictionaryTree::~PhraseDictionaryTree() {delete imp;}
|
||||
|
||||
// Appends to rv every target candidate stored for the source phrase src.
// Nothing is appended when a source word is missing from the source
// vocabulary. The factor collection is dereferenced without a null check
// as soon as there is a target word to convert.
void PhraseDictionaryTree::GetTargetCandidates(const std::vector<const Factor*>& src,std::vector<FactorTgtCand>& rv) const
{
  // translate the source factors into source vocabulary ids
  IPhrase srcIds(src.size());
  for(size_t w=0;w<src.size();++w)
    {
      srcIds[w]=imp->sv.index(src[w]->GetString());
      if(srcIds[w]==InvalidLabelId) return;
    }

  TgtCands found;
  imp->GetTargetCandidates(srcIds,found);

  // turn every candidate's word ids back into factors
  for(TgtCands::const_iterator c=found.begin();c!=found.end();++c)
    {
      const IPhrase& ids=c->GetPhrase();
      std::vector<const Factor*> factors;
      factors.reserve(ids.size());
      for(IPhrase::const_iterator id=ids.begin();id!=ids.end();++id)
        factors.push_back(imp->m_factorCollection->AddFactor(Output,imp->m_factorType,imp->tv.symbol(*id)));
      rv.push_back(FactorTgtCand(factors,c->GetScores()));
    }
}
|
||||
|
||||
void PhraseDictionaryTree::PrintTargetCandidates(const std::vector<std::string>& src,std::ostream& out) const
|
||||
{
|
||||
IPhrase f(src.size());
|
||||
for(size_t i=0;i<src.size();++i)
|
||||
{
|
||||
f[i]=imp->sv.index(src[i]);
|
||||
if(f[i]==InvalidLabelId) return;
|
||||
}
|
||||
|
||||
TgtCands tcand;
|
||||
imp->GetTargetCandidates(f,tcand);
|
||||
|
||||
out<<"there are "<<tcand.size()<<" target candidates for source phrase "<<src<<":\n";
|
||||
|
||||
for(size_t i=0;i<tcand.size();++i)
|
||||
{
|
||||
out<<i<<" -- "<<tcand[i].GetScores()<<" -- ";
|
||||
const IPhrase& iphr=tcand[i].GetPhrase();
|
||||
for(size_t j=0;j<iphr.size();++j)
|
||||
out<<imp->tv.symbol(iphr[j])<<" ";
|
||||
out<<'\n';
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// for mert
|
||||
// Deliberately a no-op: the weight vector passed in is ignored.
void PhraseDictionaryTree::SetWeightTransModel(const std::vector<float> &) {}
|
||||
|
||||
int PhraseDictionaryTree::CreateBinaryFileFromAsciiPhraseTable(std::istream& inFile,const std::string& out) {
|
||||
std::string line;
|
||||
size_t count = 0;
|
||||
|
||||
std::string ofn(out+".binphr.srctree"),
|
||||
oft(out+".binphr.tgtdata"),
|
||||
ofi(out+".binphr.idx"),
|
||||
ofsv(out+".binphr.srcvoc"),
|
||||
oftv(out+".binphr.tgtvoc");
|
||||
|
||||
FILE *os=fOpen(ofn.c_str(),"wb"),
|
||||
*ot=fOpen(oft.c_str(),"wb");
|
||||
|
||||
typedef PrefixTreeSA<LabelId,off_t> PSA;
|
||||
PSA *psa=new PSA;PSA::setDefault(InvalidOffT);
|
||||
|
||||
LabelId currFirstWord=InvalidLabelId;
|
||||
IPhrase currF;
|
||||
TgtCands tgtCands;
|
||||
std::vector<off_t> vo;
|
||||
size_t lnc=0;
|
||||
while(getline(inFile, line)) {
|
||||
++lnc;
|
||||
std::istringstream is(line);std::string w;
|
||||
IPhrase f,e;Scores sc;
|
||||
|
||||
while(is>>w && w!="|||") f.push_back(imp->sv.add(w));
|
||||
while(is>>w && w!="|||") e.push_back(imp->tv.add(w));
|
||||
while(is>>w && w!="|||") sc.push_back(atof(w.c_str()));
|
||||
|
||||
|
||||
if(f.empty()) {
|
||||
std::cerr<<"WARNING: empty source phrase in line '"<<line<<"'\n";
|
||||
continue;}
|
||||
|
||||
if(currFirstWord==InvalidLabelId) currFirstWord=f[0];
|
||||
if(currF.empty()) {
|
||||
currF=f;
|
||||
// insert src phrase in prefix tree
|
||||
assert(psa);
|
||||
PSA::Data& d=psa->insert(f);
|
||||
if(d==InvalidOffT) d=fTell(ot);
|
||||
else {
|
||||
std::cerr<<"ERROR: source phrase already inserted (A)!\nline: '"<<line<<"'\nf: "<<f<<"\n";;abort();}
|
||||
}
|
||||
|
||||
if(currF!=f) {
|
||||
// new src phrase
|
||||
currF=f;
|
||||
// write tgt cand to disk
|
||||
tgtCands.writeBin(ot);tgtCands.clear();
|
||||
|
||||
if(++count%10000==0) {std::cerr<<".";if(count%500000==0)std::cerr<<"[phrase:"<<count<<"]\n";}
|
||||
|
||||
if(f[0]!=currFirstWord) {
|
||||
// write src prefix tree to file and clear
|
||||
PTF pf;
|
||||
if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT);
|
||||
vo[currFirstWord]=fTell(os);
|
||||
pf.create(*psa,os);
|
||||
// clear
|
||||
delete psa;psa=new PSA;
|
||||
currFirstWord=f[0];
|
||||
}
|
||||
|
||||
// insert src phrase in prefix tree
|
||||
assert(psa);
|
||||
PSA::Data& d=psa->insert(f);
|
||||
if(d==InvalidOffT) d=fTell(ot);
|
||||
else {
|
||||
std::cerr<<"ERROR: source phrase already inserted (B)!\nline: '"<<line<<"'\nf: "<<f<<"\n";;abort();}
|
||||
}
|
||||
tgtCands.push_back(TgtCand(e,sc));
|
||||
assert(currFirstWord!=InvalidLabelId);
|
||||
}
|
||||
tgtCands.writeBin(ot);tgtCands.clear();
|
||||
|
||||
std::cerr<<"total word count: "<<count<<" -- "<<vo.size()<<" line count: "<<lnc<<" -- "<<currFirstWord<<"\n";
|
||||
|
||||
PTF pf;
|
||||
if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT);
|
||||
vo[currFirstWord]=fTell(os);
|
||||
pf.create(*psa,os);
|
||||
delete psa;psa=0;
|
||||
|
||||
fclose(os);
|
||||
fclose(ot);
|
||||
|
||||
std::vector<size_t> inv;
|
||||
for(size_t i=0;i<vo.size();++i)
|
||||
if(vo[i]==InvalidOffT) inv.push_back(i);
|
||||
|
||||
if(inv.size()) {
|
||||
std::cerr<<"WARNING: there are src voc entries with no phrase translation: count "<<inv.size()<<"\n"
|
||||
"There exists phrase translations for "<<vo.size()-inv.size()<<" entries\n";
|
||||
}
|
||||
|
||||
FILE *oi=fOpen(ofi.c_str(),"wb");
|
||||
size_t vob=fWriteVector(oi,vo);
|
||||
fclose(oi);
|
||||
std::cerr<<"written "<<vob<<" bytes for offset vector\n";
|
||||
|
||||
imp->sv.Write(ofsv);
|
||||
imp->tv.Write(oftv);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
int PhraseDictionaryTree::ReadBinary(const std::string& fn) {
|
||||
std::cerr<<"size of off_t "<<sizeof(off_t)<<"\n";
|
||||
return imp->ReadBinary(fn);
|
||||
}
|
47
moses/src/PhraseDictionaryTree.h
Normal file
47
moses/src/PhraseDictionaryTree.h
Normal file
@ -0,0 +1,47 @@
|
||||
#ifndef PHRASEDICTIONARYTREE_H_
|
||||
#define PHRASEDICTIONARYTREE_H_
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include "TypeDef.h"
|
||||
#include "Dictionary.h"
|
||||
#include "PhraseDictionary.h"
|
||||
|
||||
class Phrase;
|
||||
class PDTimp;
|
||||
class FactorCollection;
|
||||
|
||||
|
||||
typedef std::pair<std::vector<const Factor*>,std::vector<float> > FactorTgtCand;
|
||||
|
||||
|
||||
class PhraseDictionaryTree : public Dictionary {
|
||||
PDTimp *imp; //implementation
|
||||
public:
|
||||
PhraseDictionaryTree(size_t noScoreComponent,FactorCollection* factorCollection=0,FactorType factorType=Surface);
|
||||
virtual ~PhraseDictionaryTree();
|
||||
|
||||
DecodeType GetDecodeType() const
|
||||
{
|
||||
return Translate;
|
||||
}
|
||||
|
||||
int CreateBinaryFileFromAsciiPhraseTable(std::istream& In,const std::string& OutputFileNamePrefix);
|
||||
int ReadBinary(const std::string& FileNamePrefix);
|
||||
|
||||
|
||||
size_t GetSize() const
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
// const TargetPhraseCollection *FindEquivPhrase(const Phrase &source) const;
|
||||
|
||||
void GetTargetCandidates(const std::vector<const Factor*>& src,std::vector<FactorTgtCand>& rv) const;
|
||||
void PrintTargetCandidates(const std::vector<std::string>& src,std::ostream& out) const;
|
||||
|
||||
// for mert
|
||||
void SetWeightTransModel(const std::vector<float> &weightT);
|
||||
|
||||
};
|
||||
#endif /*PHRASEDICTIONARYTREE_H_*/
|
281
moses/src/PrefixTree.h
Normal file
281
moses/src/PrefixTree.h
Normal file
@ -0,0 +1,281 @@
|
||||
|
||||
/* ---------------------------------------------------------------- */
|
||||
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
|
||||
/* Richard Zens */
|
||||
/* ---------------------------------------------------------------- */
|
||||
#ifndef PREFIXTREE_H_
|
||||
#define PREFIXTREE_H_
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <deque>
|
||||
#include "Util.h"
|
||||
#include "FilePtr.h"
|
||||
#include "File.h"
|
||||
#ifdef DEBUG
|
||||
#include "CountObjects.h"
|
||||
#endif
|
||||
|
||||
// In-memory prefix tree (trie) mapping sequences of keys T to data D.
// Each node keeps its keys sorted, with parallel vectors holding the data
// stored at each key and the child node (or 0) below it. Keys that exist
// only as an inner step of a longer sequence carry the default value 'def'.
template<typename T,typename D>
class PrefixTreeSA {
public:
  typedef T Key;
  typedef D Data;

  typedef PrefixTreeSA<T,D> Self;
  typedef std::vector<T> VT;
  typedef std::vector<Self*> VP;
  typedef std::vector<D> VD;

  VT keys;  // sorted keys of this node
  VP ptr;   // ptr[i]: child below keys[i], 0 if none (owned)
  VD data;  // data[i]: value stored at keys[i]

  static Data def; // value of entries without data; returned by failed finds

public:
  PrefixTreeSA() {}

  ~PrefixTreeSA() {for(size_t i=0;i<ptr.size();++i) delete ptr[i];}

  static const Data& getDefault() {return def;}
  static void setDefault(const Data& x) {def=x;}


  // insert sequence
  // Returns a reference to the data slot of the sequence [b,e), creating
  // all missing nodes; new slots hold 'def'. The sequence must not be
  // empty. The reference is invalidated by later insertions.
  template<typename fwiter> Data& insert(fwiter b,fwiter e) {
    assert(b!=e); // an empty sequence has no slot to return
    typename VT::iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
    typename VT::iterator kb=keys.begin();
    size_t pos=std::distance(kb,i);

    if(i==keys.end() || *i!=*b) {
      keys.insert(i,*b);
      data.insert(data.begin()+pos,def);
      ptr.insert(ptr.begin()+pos,0);
    }
    if(++b!=e) {
      if(!ptr[pos]) ptr[pos]=new Self;
      return ptr[pos]->insert(b,e);
    }
    else return data[pos];
  }
  // insert container
  template<typename cont> Data& insert(const cont& c) {
    return insert(c.begin(),c.end());}

  size_t size() const {return keys.size();}
  const Key& getKey(size_t i) const {return keys[i];}
  const Data& getData(size_t i) const {return data[i];}
  const Self* getPtr(size_t i) const {return ptr[i];}

  // Index of k in this node, or size() when k is absent.
  size_t findKey(const Key& k) const {
    typename VT::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
    if(i==keys.end() || *i!=k) return keys.size();
    return std::distance(keys.begin(),i);
  }

  // find sequence
  // Pointer to the data slot of [b,e), or 0 when the sequence is not in
  // the tree. An empty sequence is never in the tree.
  template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
    if(b==e) return 0;
    size_t pos=findKey(*b);
    if(pos==keys.size()) return 0;
    if(++b==e) return &data[pos];
    if(ptr[pos]) return ptr[pos]->findPtr(b,e); else return 0;
  }
  // find container
  template<typename cont> const Data* findPtr(const cont& c) const {
    return findPtr(c.begin(),c.end());}


  // find sequence
  // Data of [b,e), or 'def' when the sequence is not in the tree.
  template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
    if(const Data* p=findPtr(b,e)) return *p; else return def;
  }

  // find container
  template<typename cont> const Data& find(const cont& c) const {
    return find(c.begin(),c.end());}

  // Trim the excess capacity of this node's vectors.
  void shrink() {
    ShrinkToFit(keys); ShrinkToFit(ptr); ShrinkToFit(data);}

private:
  // The children are owned through raw pointers, so a member-wise copy
  // would delete them twice. Copying is disabled (declared, not defined).
  PrefixTreeSA(const Self&);
  Self& operator=(const Self&);
};
template<typename T,typename D> D PrefixTreeSA<T,D>::def;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<typename T,typename D>
|
||||
class PrefixTreeF {
|
||||
public:
|
||||
typedef T Key;
|
||||
typedef D Data;
|
||||
private:
|
||||
typedef PrefixTreeF<Key,Data> Self;
|
||||
public:
|
||||
typedef FilePtr<Self> Ptr;
|
||||
private:
|
||||
typedef std::vector<Key> VK;
|
||||
typedef std::vector<Data> VD;
|
||||
typedef std::vector<Ptr> VP;
|
||||
|
||||
VK keys;
|
||||
VD data;
|
||||
VP ptr;
|
||||
|
||||
static Data def;
|
||||
|
||||
off_t startPos;
|
||||
FILE* f;
|
||||
public:
|
||||
#ifdef DEBUG
|
||||
DECLAREMEMSTAT(Self);
|
||||
#endif
|
||||
|
||||
PrefixTreeF(FILE* f_=0) : f(f_) {if(f) read();}
|
||||
|
||||
~PrefixTreeF() {free();}
|
||||
|
||||
void read() {
|
||||
startPos=fTell(f);
|
||||
fReadVector(f,keys);
|
||||
fReadVector(f,data);
|
||||
ptr.clear();ptr.resize(keys.size());
|
||||
for(size_t i=0;i<ptr.size();++i) {
|
||||
off_t pos;
|
||||
fRead(f,pos);
|
||||
if(pos) ptr[i].set(f,pos);
|
||||
}
|
||||
}
|
||||
|
||||
void free() {
|
||||
for(typename VP::iterator i=ptr.begin();i!=ptr.end();++i) i->free();}
|
||||
|
||||
void reserve(size_t s) {
|
||||
keys.reserve(s);data.reserve(s);ptr.reserve(s);}
|
||||
|
||||
template<typename fwiter>
|
||||
void changeData(fwiter b,fwiter e,const Data& d) {
|
||||
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
|
||||
if(i==keys.end() || *i!=*b) {
|
||||
std::cerr<<"ERROR: key not found in changeData!\n"; return;}
|
||||
typename VK::const_iterator kb=keys.begin();
|
||||
size_t pos=std::distance(kb,i);
|
||||
if(++b==e) {
|
||||
off_t p=startPos+keys.size()*sizeof(Key)+2*sizeof(unsigned)+pos*sizeof(Data);
|
||||
std::cerr<<"elem found at pos "<<p<<" old val: "<<data[pos]<<" startpos: "<<startPos<<"\n";
|
||||
if(data[pos]!=d) {
|
||||
data[pos]=d;fSeek(f,p);fWrite(f,d);}
|
||||
return;
|
||||
}
|
||||
if(ptr[pos]) ptr[pos]->changeData(b,e,d); else {
|
||||
std::cerr<<"ERROR: seg not found!in changeData\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void create(const PrefixTreeSA<Key,Data>& psa,const std::string& fname) {
|
||||
FILE* f=fOpen(fname.c_str(),"wb");
|
||||
create(psa,f);
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
void create(const PrefixTreeSA<Key,Data>& psa,FILE* f,int verbose=0) {
|
||||
setDefault(psa.getDefault());
|
||||
|
||||
typedef std::pair<const PrefixTreeSA<Key,Data>*,off_t> P;
|
||||
typedef std::deque<P> Next;
|
||||
|
||||
Next next;
|
||||
|
||||
next.push_back(P(&psa,fTell(f)));
|
||||
bool isFirst=1;
|
||||
size_t ns=1;
|
||||
while(next.size()) {
|
||||
if(verbose && next.size()>ns) {
|
||||
std::cerr<<"stack size in PF create: "<<next.size()<<"\n";
|
||||
while(ns<next.size()) ns*=2;}
|
||||
const P& pp=next.back();
|
||||
const PrefixTreeSA<Key,Data>& p=*pp.first;
|
||||
off_t pos=pp.second;
|
||||
next.pop_back();
|
||||
|
||||
if(!isFirst) {
|
||||
off_t curr=fTell(f);
|
||||
fSeek(f,pos);
|
||||
fWrite(f,curr);
|
||||
fSeek(f,curr);
|
||||
} else isFirst=0;
|
||||
|
||||
size_t s=0;
|
||||
s+=fWriteVector(f,p.keys);
|
||||
s+=fWriteVector(f,p.data);
|
||||
|
||||
for(size_t i=0;i<p.ptr.size();++i) {
|
||||
if(p.ptr[i])
|
||||
next.push_back(P(p.ptr[i],fTell(f)));
|
||||
off_t ppos=0;
|
||||
s+=fWrite(f,ppos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t size() const {return keys.size();}
|
||||
const Key& getKey(size_t i) const {return keys[i];}
|
||||
const Data& getData(size_t i) const {return data[i];}
|
||||
const Self* getPtr(size_t i) const {return ptr[i];}
|
||||
|
||||
size_t findKey(const Key& k) const {
|
||||
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
|
||||
if(i==keys.end() || *i!=k) return keys.size();
|
||||
return std::distance(keys.begin(),i);
|
||||
}
|
||||
|
||||
Ptr const* findKeyPtr(const Key& k) const {
|
||||
size_t pos=findKey(k);
|
||||
if(pos<keys.size()) {return &ptr[pos];} else {return 0;}
|
||||
}
|
||||
|
||||
// find sequence
|
||||
template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
|
||||
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
|
||||
if(i==keys.end() || *i!=*b) return 0;
|
||||
size_t pos=std::distance(keys.begin(),i);
|
||||
if(++b==e) return &data[pos];
|
||||
if(ptr[pos]) return ptr[pos]->findPtr(b,e); else return 0;
|
||||
}
|
||||
// find container
|
||||
template<typename cont> const Data* findPtr(const cont& c) const {
|
||||
return findPtr(c.begin(),c.end());}
|
||||
|
||||
|
||||
// find sequence
|
||||
template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
|
||||
if(const Data* p=findPtr(b,e)) return *p; else return def;} //return (p?*p:def);}
|
||||
|
||||
// find container
|
||||
template<typename cont> const Data& find(const cont& c) const {
|
||||
return find(c.begin(),c.end());}
|
||||
|
||||
|
||||
|
||||
static void setDefault(const Data& d) {def=d;}
|
||||
static const Data& getDefault() {return def;}
|
||||
|
||||
|
||||
void print(std::ostream& out,const std::string s="") const {
|
||||
|
||||
out<<s<<"startpos: "<<startPos<<" size: "<<keys.size()<<"\n";
|
||||
for(size_t i=0;i<keys.size();++i) {
|
||||
out<<s<<i<<" - "<<keys[i]<<" "<<data[i]<<"\n";
|
||||
}
|
||||
for(size_t i=0;i<ptr.size();++i)
|
||||
if(ptr[i])
|
||||
ptr[i]->print(out,s+" ");
|
||||
}
|
||||
|
||||
|
||||
};
|
||||
template<typename T,typename D> D PrefixTreeF<T,D>::def;
|
||||
#ifdef DEBUG
|
||||
template<typename T,typename D> MemoryStatsPrinter< PrefixTreeF<T,D> > PrefixTreeF<T,D>::memStat("PrefixTreeF<T,D>",0);
|
||||
#endif
|
||||
#endif
|
@ -27,8 +27,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#include "FactorCollection.h"
|
||||
#include "HypothesisCollection.h"
|
||||
#include "Timer.h"
|
||||
|
||||
#include "PhraseDictionaryTree.h"
|
||||
#include "boost/filesystem/operations.hpp" // boost::filesystem::exists
|
||||
#include "InputFileStream.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
@ -335,7 +336,6 @@ void StaticData::LoadPhraseTables(bool filter
|
||||
weight[currScore] = weightAll[totalPrevNoScoreComponent + currScore];
|
||||
}
|
||||
totalPrevNoScoreComponent += noScoreComponent;
|
||||
|
||||
string phraseTableHash = GetMD5Hash(filePath);
|
||||
string hashFilePath = GetCachePath()
|
||||
+ PROJECT_NAME + "--"
|
||||
@ -374,7 +374,7 @@ void StaticData::LoadPhraseTables(bool filter
|
||||
, inputPhraseList
|
||||
, this->GetLanguageModel(Initial)
|
||||
, this->GetWeightWordPenalty());
|
||||
|
||||
|
||||
timer.check("Finished loading PhraseTable");
|
||||
}
|
||||
}
|
||||
|
@ -58,12 +58,15 @@ const size_t DEFAULT_VERBOSE_LEVEL = 1;
|
||||
|
||||
#ifdef LM_SRI
|
||||
typedef unsigned int LmId;
|
||||
#endif
|
||||
#else
|
||||
#ifdef LM_INTERNAL
|
||||
class NGramNode;
|
||||
typedef const NGramNode* LmId;
|
||||
#else
|
||||
// if nothing is defined:
|
||||
typedef unsigned int LmId;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// enums.
|
||||
// must be 0, 1, 2, ..., unless otherwise stated
|
||||
|
||||
|
@ -188,3 +188,9 @@ void RemoveAllInColl(COLL &coll)
|
||||
std::string GetTempFolder();
|
||||
void CreateTempFile(std::ofstream &fileStream, std::string &filePath);
|
||||
std::string GetMD5Hash(const std::string &filePath);
|
||||
|
||||
// Trim the excess capacity of a vector-like container with the
// copy-and-swap idiom. The request is best effort: the standard does not
// promise that a copy's capacity equals its size (a short std::string keeps
// its small-buffer capacity, for example), so the result is not asserted.
template<typename T> inline void ShrinkToFit(T& v) {
  if(v.capacity()>v.size()) T(v).swap(v);
}
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user