almost no modification to existing system, only additions:

- prefix tree structured phrase table
 - binary phrase table format with on-demand loading


git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@101 1f5c12ca-751b-0410-a591-d2e778427230
This commit is contained in:
zens 2006-07-14 02:06:09 +00:00
parent 6e8e6a9ed2
commit 45588167bd
10 changed files with 884 additions and 31 deletions

View File

@ -0,0 +1,68 @@
// $Id$
#ifndef COUNTEDPOINTER_H_
#define COUNTEDPOINTER_H_
// see http://ootips.org/yonat/4dev/counted_ptr.h
// Simple non-intrusive reference-counted owning pointer.
//
// All copies of a CountedPointer share one heap-allocated PointerAndCounter
// record; the pointee is deleted when the last copy releases it.
// An "empty" CountedPointer has no record at all (pointerInfo_ == 0).
// Note that assigning a raw pointer via operator=(T*) always creates a
// record, even for a null pointer; hasP() distinguishes that case.
// The counter is a plain unsigned, so no synchronization is performed.
template <class T> class CountedPointer
{
public:
  // Takes ownership of p. A null p yields an empty pointer (no record).
  explicit CountedPointer(T *p = 0) : pointerInfo_(0)
  {
    if (p)
      pointerInfo_ = new PointerAndCounter(p);
  }
  // Shares ownership with p.
  CountedPointer(const CountedPointer &p) : pointerInfo_(0) { acquire(p.pointerInfo_); }
  ~CountedPointer() { release(); }
  // Drops the current pointee (deleting it if this was the last owner)
  // and shares ownership with p.
  CountedPointer &operator=(const CountedPointer &p)
  {
    if (this != &p) {
      release();
      acquire(p.pointerInfo_);
    }
    return *this;
  }
  // Drops the current pointee and takes sole ownership of p.
  CountedPointer &operator=(T *p)
  {
    release();
    pointerInfo_ = new PointerAndCounter(p);
    return *this;
  }
  // True when a shared record exists (the stored pointer may still be null).
  operator bool() const { return pointerInfo_ != 0; }
  // True when a non-null object is held. Safe to call on an empty pointer
  // (previously this dereferenced a null record).
  bool hasP() const { return pointerInfo_ != 0 && pointerInfo_->pointer != 0; }
  // Precondition for both accessors: the pointer is not empty.
  T& operator*() const { return *pointerInfo_->pointer; }
  T* operator->() const { return pointerInfo_->pointer; }
  // True when empty or when this is the only owner.
  bool unique() const { return (!pointerInfo_ || pointerInfo_->counter == 1); }
  // Gives up ownership; the pointee is deleted if this was the last owner.
  void destroy() { release(); }
  //void operator delete(void *p) { release(); };
private:
  // Shared record: the owned object and the number of CountedPointers using it.
  struct PointerAndCounter
  {
    T *pointer;
    unsigned counter;
    PointerAndCounter(T* p = 0, unsigned c = 1) : pointer(p), counter(c) {}
  } *pointerInfo_;
  // Attach to record c (may be null) and count ourselves as an owner.
  void acquire(PointerAndCounter *c)
  {
    pointerInfo_ = c;
    if (pointerInfo_)
      (pointerInfo_->counter)++;
  }
  // Detach from the record, deleting pointee and record on last release.
  void release()
  {
    if (pointerInfo_) {
      if (--(pointerInfo_->counter) == 0) {
        delete pointerInfo_->pointer;
        delete pointerInfo_;
      }
      pointerInfo_ = 0;
    }
  }
};
#endif

View File

@ -53,32 +53,9 @@ const Factor *FactorCollection::AddFactor(FactorDirection direction
, LmId lmId)
{
// find string id
const string *ptr;
StringSet::const_iterator iterString = m_factorStringCollection.find(factorString);
if (iterString == m_factorStringCollection.end())
{
const pair< StringSet::iterator, bool > &pairRet = m_factorStringCollection.insert(factorString);
const string &str = *pairRet.first;
ptr = &str;
}
else
{
const string &str = *iterString;
ptr = &str;
}
Factor findFactor(direction, factorType, ptr, lmId);
FactorSet::const_iterator iter = m_collection.find(findFactor);
if (iter == m_collection.end())
{ // new factor
pair< FactorSet::iterator, bool > pairRet = m_collection.insert(findFactor);
return &(*pairRet.first);
}
else
{
return &(*iter);
}
const string *ptr=&(*m_factorStringCollection.insert(factorString).first);
// Factor findFactor(direction, factorType, ptr, lmId);
return &(*m_collection.insert(Factor(direction, factorType, ptr, lmId)).first);
}
const Factor *FactorCollection::AddFactor(FactorDirection direction

68
moses/src/File.h Normal file
View File

@ -0,0 +1,68 @@
/* ---------------------------------------------------------------- */
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
/* Richard Zens */
/* ---------------------------------------------------------------- */
#ifndef FILE_H_
#define FILE_H_
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <iterator>
#include <vector>
// Sentinel file offset meaning "no valid position" (-1).
// fSeek() prints an extra diagnostic when asked to seek to this value.
static const off_t InvalidOffT=-1;
// WARNING:
// these functions work only for bitwise read/write-able types
// Writes the raw bytes of a single object `t` to `f`.
// Returns the number of bytes written (sizeof(T)).
// If the write is short, prints an error to std::cerr and aborts.
template<typename T> inline size_t fWrite(FILE* f,const T& t) {
  const size_t itemsWritten=fwrite(&t,sizeof(T),1,f);
  if(itemsWritten==1) return sizeof(T);
  std::cerr<<"ERROR:: fwrite!\n";
  abort();
}
// Reads sizeof(T) raw bytes from `f` into `t`.
// If the read is short, prints an error to std::cerr and aborts.
template<typename T> inline void fRead(FILE* f,T& t) {
  const size_t itemsRead=fread(&t,sizeof(T),1,f);
  if(itemsRead==1) return;
  std::cerr<<"ERROR: fread!\n";
  abort();
}
template<typename T> inline size_t fWrite(FILE* f,const T* b,const T* e) {
unsigned s=e-b;size_t rv=fWrite(f,s);
if(fwrite(b,sizeof(T),s,f)!=s) {std::cerr<<"ERROR: fwrite!\n";abort();}
return rv+sizeof(T)*s;
}
template<typename T> inline size_t fWrite(FILE* f,const T b,const T e) {
unsigned s=std::distance(b,e);size_t rv=fWrite(f,s);
if(fwrite(&(*b),sizeof(T),s,f)!=s) {std::cerr<<"ERROR: fwrite!\n";abort();}
return rv+sizeof(T)*s;
}
// Writes a contiguous container (e.g. std::vector) to `f`: first the element
// count as an `unsigned`, then the raw element bytes.
// Returns the total number of bytes written; aborts on a short write.
// Only valid for bitwise-copyable element types.
template<typename C> inline size_t fWriteVector(FILE* f,const C& v) {
  typedef typename C::value_type Value;
  const unsigned s=v.size();
  size_t rv=fWrite(f,s);
  // v[0] does not exist for an empty container, so only touch it when s>0.
  if(s>0 && fwrite(&v[0],sizeof(Value),s,f)!=s) {std::cerr<<"ERROR: fwrite!\n";abort();}
  return rv+sizeof(Value)*s;
}
// Reads a container previously written by fWriteVector: an `unsigned`
// element count followed by the raw element bytes. `v` is resized to the
// stored count. Aborts if fewer elements than announced can be read.
// Only valid for bitwise-copyable element types.
template<typename C> inline void fReadVector(FILE* f, C& v) {
  unsigned s;fRead(f,s);v.resize(s);
  // Nothing to read, and an empty container has no first element to address.
  if(s==0) return;
  const size_t r=fread(&v[0],sizeof(typename C::value_type),s,f);
  if(r!=s) {
    std::cerr<<"ERROR: freadVec! "<<r<<" "<<s<<"\n";abort();}
}
// Returns the current position of `f` as reported by ftello().
inline off_t fTell(FILE* f) {
  const off_t here=ftello(f);
  return here;
}
// Positions `f` at absolute offset `o` (SEEK_SET).
// On failure prints the offending offset, adds a hint when the offset is the
// InvalidOffT sentinel, and aborts.
inline void fSeek(FILE* f,off_t o) {
  if(fseeko(f,o,SEEK_SET)>=0) return;
  std::cerr<<"ERROR: could not fseeko position "<<o<<"\n";
  if(o==InvalidOffT) std::cerr<<"You tried to seek for 'InvalidOffT'!\n";
  abort();
}
// Opens file `fn` with fopen() mode `m` and returns the stream.
// If the file cannot be opened, prints an error and aborts, so the returned
// pointer is never null.
inline FILE* fOpen(const char* fn,const char* m) {
  FILE* const handle=fopen(fn,m);
  if(!handle) {
    std::cerr<<"ERROR: could not open file "<<fn<<" with mode "<<m<<"\n";
    abort();
  }
  return handle;
}
#endif

56
moses/src/FilePtr.h Normal file
View File

@ -0,0 +1,56 @@
/* ---------------------------------------------------------------- */
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
/* Richard Zens */
/* ---------------------------------------------------------------- */
#ifndef FILEPTR_H_
#define FILEPTR_H_
#include "File.h"
#ifdef USECPFP
#include "CountedPointer.h"
#endif
// Lazily loaded pointer to an object of type T stored in a file.
//
// A FilePtr remembers a stream and an offset. The first dereference seeks to
// that offset and constructs the object with `new T(FILE*)`; later accesses
// reuse the cached object. The destructor does NOT release the cached
// object; call free() for that.
// With USECPFP defined the cache is a CountedPointer<T>, otherwise a raw T*.
template<typename T> class FilePtr {
public:
#ifdef USECPFP
  typedef CountedPointer<T> Ptr;
#else
  typedef T* Ptr;
#endif
private:
  FILE* f;        // stream the object is read from (0 = none)
  off_t pos;      // offset of the object inside f
  mutable Ptr t;  // cached object; filled in by load(), hence mutable
public:
  FilePtr(FILE* f_=0,off_t p=0) : f(f_),pos(p),t(0) {}
  ~FilePtr() {}

  // Re-targets this pointer; an already cached object is left untouched.
  void set(FILE* f_,off_t p) {
    f=f_;
    pos=p;
  }

  // Releases the cached object (if any).
  void free() {
#ifdef USECPFP
    t.destroy();
#else
    delete t;
    t=0;
#endif
  }

  // Accessors: each one triggers load() before handing out the object.
  T& operator* () {
    load();
    return *t;
  }
  Ptr operator->() {
    load();
    return t;
  }
  operator Ptr () {
    load();
    return t;
  }
  const T& operator* () const {
    load();
    return *t;
  }
  const Ptr operator->() const {
    load();
    return t;
  }
  operator const Ptr () const {
    load();
    return t;
  }

  // Returns the cached pointer without loading.
  Ptr getPtr() {return t;}
  const Ptr getPtr() const {return t;}

  // True when this pointer refers to a loadable location.
  operator bool() const {return (f && pos!=InvalidOffT);}

  // Loads the object on first use; does nothing when it is already cached
  // or when there is no valid stream/offset.
  void load() const {
    if(!t && f && pos!=InvalidOffT) {
      fSeek(f,pos);
      t=new T(f);
    }
  }
};
#endif

View File

@ -0,0 +1,347 @@
#include "PhraseDictionaryTree.h"
#include <map>
#include <cassert>
#include <sstream>
#include <iostream>
#include <fstream>
#include "PrefixTree.h"
#include "File.h"
#include "FactorCollection.h"
// Debug printer for vectors: writes the element count, a space, and then
// every element followed by a single space.
template<class T>
std::ostream& operator<<(std::ostream& out,const std::vector<T>& x) {
  out<<x.size()<<" ";
  for(size_t k=0;k<x.size();++k) {
    out<<x[k]<<' ';
  }
  return out;
}
// Integer id assigned to a vocabulary entry by LVoc.
typedef unsigned LabelId;
// Id returned by LVoc::index() for keys that are not in the vocabulary.
LabelId InvalidLabelId=std::numeric_limits<LabelId>::max();
// NOTE(review): not referenced in the visible code; presumably reserved for
// an "empty" label -- confirm before relying on it.
LabelId Epsilon=InvalidLabelId-1;
// A phrase represented as a sequence of label ids.
typedef std::vector<LabelId> IPhrase;
// Score components attached to one target candidate.
typedef std::vector<float> Scores;
// File-backed prefix tree mapping label-id sequences to file offsets.
typedef PrefixTreeF<LabelId,off_t> PTF;
// Vocabulary: a bidirectional mapping between keys (e.g. words) and dense
// LabelIds. `m` maps key -> id, `data` maps id -> key.
// The text format used by Write()/Read() is one "<id> <key>" pair per line.
template<class A,class B=std::map<A,LabelId> >
class LVoc {
  typedef A Key;
  typedef B M;
  typedef std::vector<Key> V;
  M m;
  V data;
public:
  LVoc() {}

  // True when k already has an id.
  bool isKnown(const Key& k) const {
    typename M::const_iterator hit=m.find(k);
    return hit!=m.end();
  }

  // Id of k, or InvalidLabelId when k is unknown.
  LabelId index(const Key& k) const {
    typename M::const_iterator hit=m.find(k);
    if(hit==m.end()) return InvalidLabelId;
    return hit->second;
  }

  // Returns the id of k, assigning the next free id when k is new.
  LabelId add(const Key& k) {
    std::pair<typename M::iterator,bool> res=m.insert(std::make_pair(k,data.size()));
    const bool isNew=res.second;
    if(isNew) data.push_back(k);
    assert(res.first->second>=0 && static_cast<size_t>(res.first->second)<data.size());
    return res.first->second;
  }

  // Key stored for id i; i must be a valid id.
  const Key& symbol(LabelId i) const {
    assert(i>=0);
    assert(static_cast<size_t>(i)<data.size());
    return data[i];
  }

  typedef typename V::const_iterator const_iterator;
  const_iterator begin() const {return data.begin();}
  const_iterator end() const {return data.end();}

  // Writes the vocabulary to the named file.
  void Write(const std::string& fname) const {
    std::ofstream out(fname.c_str());
    Write(out);
  }

  // Writes "<id> <key>" lines, highest id first.
  void Write(std::ostream& out) const {
    int i=data.size()-1;
    while(i>=0) {
      out<<i<<' '<<data[i]<<'\n';
      --i;
    }
  }

  // Reads the vocabulary from the named file.
  void Read(const std::string& fname) {
    std::ifstream in(fname.c_str());
    Read(in);
  }

  // Reads "<id> <key>" lines; lines that do not parse are skipped.
  void Read(std::istream& in) {
    Key key;
    size_t id;
    std::string line;
    while(std::getline(in,line)) {
      std::istringstream fields(line);
      if(!(fields>>id>>key)) continue;
      if(id>=data.size()) data.resize(id+1);
      data[id]=key;
      m[key]=id;
    }
  }
};
// One target-side candidate: the target phrase (as label ids) together with
// its score vector. Serialized as two length-prefixed vectors.
class TgtCand {
  IPhrase e;   // target phrase
  Scores sc;   // scores of this candidate
public:
  TgtCand() {}
  TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
  // Constructs the candidate by reading it from the current position of f.
  TgtCand(FILE* f) {readBin(f);}

  const IPhrase& GetPhrase() const {return e;}
  const Scores& GetScores() const {return sc;}

  // Writes phrase, then scores.
  void writeBin(FILE* f) const {
    fWriteVector(f,e);
    fWriteVector(f,sc);
  }
  // Reads phrase, then scores (same order as writeBin).
  void readBin(FILE* f) {
    fReadVector(f,e);
    fReadVector(f,sc);
  }
};
class TgtCands : public std::vector<TgtCand> {
typedef std::vector<TgtCand> MyBase;
public:
TgtCands() : MyBase() {}
void writeBin(FILE* f) const {
unsigned s=size();fWrite(f,s);for(size_t i=0;i<s;++i) this->operator [](i).writeBin(f);
}
void readBin(FILE* f) {
unsigned s;fRead(f,s);resize(s);for(size_t i=0;i<s;++i) this->operator [](i).readBin(f);
}
};
// Private implementation of PhraseDictionaryTree.
//
// Holds the open binary phrase-table files, the per-first-word source prefix
// trees (loaded on demand through FilePtr) and the source/target
// vocabularies.
struct PDTimp {
  typedef PrefixTreeF<LabelId,off_t> PTF;
  typedef FilePtr<PTF> CPT;
  typedef std::vector<CPT> Data;
  typedef LVoc<std::string> WordVoc;

  Data data;                      // data[w]: prefix tree of source phrases starting with word id w
  std::vector<off_t> srcOffsets;  // srcOffsets[w]: offset of that tree in the source-tree file
  FILE *os,*ot;                   // source-tree file and target-data file
  WordVoc sv,tv;                  // source and target vocabularies
  FactorCollection *m_factorCollection;
  FactorType m_factorType;

  PDTimp() : os(0),ot(0),m_factorCollection(0),m_factorType(Surface) {}

  ~PDTimp() {
    // FilePtr's destructor does not release the object it loaded, so the
    // top-level trees have to be freed explicitly or they leak.
    for(size_t i=0;i<data.size();++i) data[i].free();
    if(os) fclose(os);
    if(ot) fclose(ot);
  }

  // Opens the binary phrase table with file name prefix `fn`.
  // Reads the offset index and both vocabularies eagerly; the prefix trees
  // themselves are only registered (file + offset) and loaded on first use.
  // Always returns 1; failures to open a file abort inside fOpen.
  int ReadBinary(const std::string& fn) {
    std::string ifs(fn+".binphr.srctree"),
      ift(fn+".binphr.tgtdata"),
      ifi(fn+".binphr.idx"),
      ifsv(fn+".binphr.srcvoc"),
      iftv(fn+".binphr.tgtvoc");
    FILE *ii=fOpen(ifi.c_str(),"rb");
    fReadVector(ii,srcOffsets);
    fclose(ii);
    os=fOpen(ifs.c_str(),"rb");
    ot=fOpen(ift.c_str(),"rb");
    // std::cerr<<"the load offsets are "<<vo<<"\n";
    data.resize(srcOffsets.size());
    for(size_t i=0;i<data.size();++i)
      data[i]=CPT(os,srcOffsets[i]);
    sv.Read(ifsv);
    tv.Read(iftv);
    std::cerr<<"binary phrasefile loaded, default off_t: "<<PTF::getDefault()<<"\n";
    return 1;
  }

  // Looks up source phrase f and returns the offset of its candidate list in
  // the target-data file, or InvalidOffT when f is empty, its first word has
  // no tree, or the phrase is not stored.
  off_t FindOffT(const IPhrase& f) const {
    if(f.empty()) return InvalidOffT;
    if(f[0]>=data.size()) return InvalidOffT;
    if(data[f[0]]) return data[f[0]]->find(f); else return InvalidOffT;
  }

  // Reads the candidates for source phrase f into tgtCands.
  // tgtCands is left untouched when the phrase is unknown.
  void GetTargetCandidates(const IPhrase& f,TgtCands& tgtCands)
  {
    off_t tCandOffset=FindOffT(f);
    if(tCandOffset==InvalidOffT) return;
    fSeek(ot,tCandOffset);
    tgtCands.readBin(ot);
  }
};
// Creates an empty dictionary. The factor collection and factor type are
// stored in the implementation object for later use when target candidates
// are converted to factors.
PhraseDictionaryTree::PhraseDictionaryTree(size_t noScoreComponent,FactorCollection *fc,FactorType ft)
  : Dictionary(noScoreComponent),imp(new PDTimp)
{
  PDTimp& impl=*imp;
  impl.m_factorCollection=fc;
  impl.m_factorType=ft;
}
// Destroys the implementation object created by the constructor.
PhraseDictionaryTree::~PhraseDictionaryTree()
{
  delete imp;
}
// Looks up the source phrase `src` and appends one FactorTgtCand (target
// factors + scores) per stored target candidate to `rv`.
// Nothing is appended when any source word is missing from the source
// vocabulary or when the phrase has no entry.
// Requires a factor collection: the dictionary must have been constructed
// with a non-null FactorCollection, otherwise converting the first target
// word aborts with an error message instead of dereferencing a null pointer.
void PhraseDictionaryTree::GetTargetCandidates(const std::vector<const Factor*>& src,std::vector<FactorTgtCand>& rv) const
{
  // Map the source words to label ids; give up on the first unknown word.
  IPhrase f(src.size());
  for(size_t i=0;i<src.size();++i)
  {
    f[i]=imp->sv.index(src[i]->GetString());
    if(f[i]==InvalidLabelId) return;
  }
  TgtCands tgtCands;
  imp->GetTargetCandidates(f,tgtCands);
  if(tgtCands.empty()) return;

  FactorCollection* const fc=imp->m_factorCollection;
  rv.reserve(rv.size()+tgtCands.size());
  for(size_t i=0;i<tgtCands.size();++i)
  {
    const IPhrase& iphrase=tgtCands[i].GetPhrase();
    std::vector<const Factor*> vf;
    vf.reserve(iphrase.size());
    for(size_t j=0;j<iphrase.size();++j)
    {
      // The constructor's default argument allows a null collection.
      if(!fc) {
        std::cerr<<"ERROR: PhraseDictionaryTree::GetTargetCandidates called without a FactorCollection!\n";
        abort();
      }
      vf.push_back(fc->AddFactor(Output,imp->m_factorType,imp->tv.symbol(iphrase[j])));
    }
    rv.push_back(FactorTgtCand(vf,tgtCands[i].GetScores()));
  }
}
void PhraseDictionaryTree::PrintTargetCandidates(const std::vector<std::string>& src,std::ostream& out) const
{
IPhrase f(src.size());
for(size_t i=0;i<src.size();++i)
{
f[i]=imp->sv.index(src[i]);
if(f[i]==InvalidLabelId) return;
}
TgtCands tcand;
imp->GetTargetCandidates(f,tcand);
out<<"there are "<<tcand.size()<<" target candidates for source phrase "<<src<<":\n";
for(size_t i=0;i<tcand.size();++i)
{
out<<i<<" -- "<<tcand[i].GetScores()<<" -- ";
const IPhrase& iphr=tcand[i].GetPhrase();
for(size_t j=0;j<iphr.size();++j)
out<<imp->tv.symbol(iphr[j])<<" ";
out<<'\n';
}
}
// for mert
// Currently a no-op: the weight vector passed in is ignored.
void PhraseDictionaryTree::SetWeightTransModel(const std::vector<float> &)
{
}
// Converts an ASCII phrase table read from `inFile` into the binary format
// and writes five files named `out` + ".binphr.{srctree,tgtdata,idx,srcvoc,tgtvoc}".
//
// Each input line is parsed as  "src words ||| tgt words ||| scores".
// All lines with the same source phrase must be adjacent: a source phrase
// that shows up again later aborts with "source phrase already inserted".
// Source phrases are collected in one in-memory prefix tree per first word;
// whenever the first word changes, the tree is written to the source-tree
// file and its offset is recorded in the index under that word's id.
// Lines with an empty source phrase are skipped with a warning.
// Always returns 1.
//
// An input without any usable line produces an empty index. Previously this
// case computed InvalidLabelId+1 (which wraps to 0) as the index size and
// then wrote out of bounds.
int PhraseDictionaryTree::CreateBinaryFileFromAsciiPhraseTable(std::istream& inFile,const std::string& out) {
  std::string line;
  size_t count = 0;
  std::string ofn(out+".binphr.srctree"),
    oft(out+".binphr.tgtdata"),
    ofi(out+".binphr.idx"),
    ofsv(out+".binphr.srcvoc"),
    oftv(out+".binphr.tgtvoc");
  FILE *os=fOpen(ofn.c_str(),"wb"),
    *ot=fOpen(oft.c_str(),"wb");
  typedef PrefixTreeSA<LabelId,off_t> PSA;
  PSA *psa=new PSA;PSA::setDefault(InvalidOffT);
  LabelId currFirstWord=InvalidLabelId;  // first word of the tree being built
  IPhrase currF;                         // source phrase whose candidates are being collected
  TgtCands tgtCands;                     // candidates collected for currF
  std::vector<off_t> vo;                 // vo[w]: offset of the tree for first word w
  size_t lnc=0;
  while(getline(inFile, line)) {
    ++lnc;
    std::istringstream is(line);std::string w;
    IPhrase f,e;Scores sc;
    while(is>>w && w!="|||") f.push_back(imp->sv.add(w));
    while(is>>w && w!="|||") e.push_back(imp->tv.add(w));
    while(is>>w && w!="|||") sc.push_back(atof(w.c_str()));
    if(f.empty()) {
      std::cerr<<"WARNING: empty source phrase in line '"<<line<<"'\n";
      continue;}
    if(currFirstWord==InvalidLabelId) currFirstWord=f[0];
    if(currF.empty()) {
      // very first source phrase
      currF=f;
      // insert src phrase in prefix tree
      assert(psa);
      PSA::Data& d=psa->insert(f);
      if(d==InvalidOffT) d=fTell(ot);
      else {
        std::cerr<<"ERROR: source phrase already inserted (A)!\nline: '"<<line<<"'\nf: "<<f<<"\n";;abort();}
    }
    if(currF!=f) {
      // new src phrase
      currF=f;
      // write tgt cand to disk
      tgtCands.writeBin(ot);tgtCands.clear();
      if(++count%10000==0) {std::cerr<<".";if(count%500000==0)std::cerr<<"[phrase:"<<count<<"]\n";}
      if(f[0]!=currFirstWord) {
        // write src prefix tree to file and clear
        PTF pf;
        if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT);
        vo[currFirstWord]=fTell(os);
        pf.create(*psa,os);
        // clear
        delete psa;psa=new PSA;
        currFirstWord=f[0];
      }
      // insert src phrase in prefix tree; its data is the offset at which
      // its candidate list will start in the target-data file
      assert(psa);
      PSA::Data& d=psa->insert(f);
      if(d==InvalidOffT) d=fTell(ot);
      else {
        std::cerr<<"ERROR: source phrase already inserted (B)!\nline: '"<<line<<"'\nf: "<<f<<"\n";;abort();}
    }
    tgtCands.push_back(TgtCand(e,sc));
    assert(currFirstWord!=InvalidLabelId);
  }
  // flush the candidates of the last source phrase
  tgtCands.writeBin(ot);tgtCands.clear();
  std::cerr<<"total word count: "<<count<<" -- "<<vo.size()<<" line count: "<<lnc<<" -- "<<currFirstWord<<"\n";
  if(currFirstWord!=InvalidLabelId) {
    // write the tree of the last first word
    PTF pf;
    if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT);
    vo[currFirstWord]=fTell(os);
    pf.create(*psa,os);
  } else {
    // no usable line was read: there is no tree to write and no index slot
    std::cerr<<"WARNING: no source phrases read, writing an empty phrase table\n";
  }
  delete psa;psa=0;
  fclose(os);
  fclose(ot);
  // report source words that never start a phrase
  std::vector<size_t> inv;
  for(size_t i=0;i<vo.size();++i)
    if(vo[i]==InvalidOffT) inv.push_back(i);
  if(inv.size()) {
    std::cerr<<"WARNING: there are src voc entries with no phrase translation: count "<<inv.size()<<"\n"
      "There exists phrase translations for "<<vo.size()-inv.size()<<" entries\n";
  }
  FILE *oi=fOpen(ofi.c_str(),"wb");
  size_t vob=fWriteVector(oi,vo);
  fclose(oi);
  std::cerr<<"written "<<vob<<" bytes for offset vector\n";
  imp->sv.Write(ofsv);
  imp->tv.Write(oftv);
  return 1;
}
// Opens the binary phrase table with file name prefix `fn`.
// Logs the size of off_t on this platform and forwards to the implementation
// object, whose result is returned.
int PhraseDictionaryTree::ReadBinary(const std::string& fn) {
  std::cerr<<"size of off_t "<<sizeof(off_t)<<"\n";
  const int status=imp->ReadBinary(fn);
  return status;
}

View File

@ -0,0 +1,47 @@
#ifndef PHRASEDICTIONARYTREE_H_
#define PHRASEDICTIONARYTREE_H_
#include <string>
#include <vector>
#include <iostream>
#include "TypeDef.h"
#include "Dictionary.h"
#include "PhraseDictionary.h"
// Forward declarations of types that are only used through pointers and
// references in this header.
class Phrase;
// PDTimp is defined with the `struct` keyword; the forward declaration uses
// the same class-key to avoid mismatched-tag warnings.
struct PDTimp;
class FactorCollection;
// One target candidate as seen by callers: the target-side factors and the
// score components that belong to them.
typedef std::pair<std::vector<const Factor*>,std::vector<float> > FactorTgtCand;
// Phrase dictionary backed by a binary, prefix-tree structured phrase table
// whose parts are loaded on demand.
// All state lives in a privately owned implementation object (`imp`).
class PhraseDictionaryTree : public Dictionary {
  PDTimp *imp; //implementation
  // `imp` is owned and deleted in the destructor, so a copy would delete it
  // twice. Copying is therefore disabled (declared, never defined).
  PhraseDictionaryTree(const PhraseDictionaryTree&);
  PhraseDictionaryTree& operator=(const PhraseDictionaryTree&);
public:
  // factorCollection/factorType are used to create the target factors
  // returned by GetTargetCandidates().
  PhraseDictionaryTree(size_t noScoreComponent,FactorCollection* factorCollection=0,FactorType factorType=Surface);
  virtual ~PhraseDictionaryTree();
  DecodeType GetDecodeType() const
  {
    return Translate;
  }
  // Converts the ASCII phrase table read from `In` into binary files whose
  // names start with OutputFileNamePrefix.
  int CreateBinaryFileFromAsciiPhraseTable(std::istream& In,const std::string& OutputFileNamePrefix);
  // Opens the binary files whose names start with FileNamePrefix.
  int ReadBinary(const std::string& FileNamePrefix);
  // Always reports 0.
  size_t GetSize() const
  {
    return 0;
  }
  // const TargetPhraseCollection *FindEquivPhrase(const Phrase &source) const;
  // Appends the target candidates for source phrase `src` to `rv`.
  void GetTargetCandidates(const std::vector<const Factor*>& src,std::vector<FactorTgtCand>& rv) const;
  // Writes a human-readable list of the target candidates for `src` to `out`.
  void PrintTargetCandidates(const std::vector<std::string>& src,std::ostream& out) const;
  // for mert
  void SetWeightTransModel(const std::vector<float> &weightT);
};
#endif /*PHRASEDICTIONARYTREE_H_*/

281
moses/src/PrefixTree.h Normal file
View File

@ -0,0 +1,281 @@
/* ---------------------------------------------------------------- */
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
/* Richard Zens */
/* ---------------------------------------------------------------- */
#ifndef PREFIXTREE_H_
#define PREFIXTREE_H_
#include <vector>
#include <algorithm>
#include <cassert>
#include <deque>
#include "Util.h"
#include "FilePtr.h"
#include "File.h"
#ifdef DEBUG
#include "CountObjects.h"
#endif
// In-memory prefix tree ("sorted array" variant).
//
// Every node keeps three parallel vectors sorted by key:
//   keys[i]  - the key of the i-th edge,
//   data[i]  - the data stored for the sequence ending with that key,
//   ptr[i]   - the child node for longer sequences (0 if there is none).
// Sequences that were only inserted as a prefix of a longer one carry the
// class-wide default value `def`.
// A node owns its children and deletes them in its destructor; copying is
// disabled because a copy would delete the same children again.
template<typename T,typename D>
class PrefixTreeSA {
public:
  typedef T Key;
  typedef D Data;
  typedef PrefixTreeSA<T,D> Self;
  typedef std::vector<T> VT;
  typedef std::vector<Self*> VP;
  typedef std::vector<D> VD;
  VT keys;
  VP ptr;
  VD data;
  static Data def;
public:
  PrefixTreeSA() {}
  ~PrefixTreeSA() {for(size_t i=0;i<ptr.size();++i) delete ptr[i];}
  static const Data& getDefault() {return def;}
  static void setDefault(const Data& x) {def=x;}
  // insert sequence
  // Returns a reference to the data slot of the sequence [b,e), creating the
  // path (with default data) as needed. The sequence must not be empty: an
  // empty sequence has no slot to return.
  template<typename fwiter> Data& insert(fwiter b,fwiter e) {
    assert(b!=e);
    typename VT::iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
    typename VT::iterator kb=keys.begin();
    // remember the index: inserting below invalidates the iterator
    size_t pos=std::distance(kb,i);
    if(i==keys.end() || *i!=*b) {
      keys.insert(i,*b);
      data.insert(data.begin()+pos,def);
      ptr.insert(ptr.begin()+pos,static_cast<Self*>(0));
    }
    if(++b!=e) {
      if(!ptr[pos]) ptr[pos]=new Self;
      return ptr[pos]->insert(b,e);
    }
    else return data[pos];
  }
  // insert container
  template<typename cont> Data& insert(const cont& c) {
    return insert(c.begin(),c.end());}
  size_t size() const {return keys.size();}
  const Key& getKey(size_t i) const {return keys[i];}
  const Data& getData(size_t i) const {return data[i];}
  const Self* getPtr(size_t i) const {return ptr[i];}
  // Index of key k in this node, or keys.size() when it is absent.
  size_t findKey(const Key& k) const {
    typename VT::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
    if(i==keys.end() || *i!=k) return keys.size();
    return std::distance(keys.begin(),i);
  }
  // find sequence
  // Returns a pointer to the data of [b,e), or 0 when the sequence is not in
  // the tree. An empty sequence is never in the tree.
  template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
    if(b==e) return 0;
    size_t pos=findKey(*b);
    if(pos==keys.size()) return 0;
    if(++b==e) return &data[pos];
    if(ptr[pos]) return ptr[pos]->findPtr(b,e); else return 0;
  }
  // find container
  template<typename cont> const Data* findPtr(const cont& c) const {
    return findPtr(c.begin(),c.end());}
  // find sequence
  // Like findPtr, but returns the default value for unknown sequences.
  template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
    if(const Data* p=findPtr(b,e)) return *p; else return def;
  }
  // find container
  template<typename cont> const Data& find(const cont& c) const {
    return find(c.begin(),c.end());}
  // Releases unused capacity of this node's vectors (children are not visited).
  void shrink() {
    ShrinkToFit(keys); ShrinkToFit(ptr); ShrinkToFit(data);}
private:
  // not copyable (declared, never defined)
  PrefixTreeSA(const Self&);
  Self& operator=(const Self&);
};
template<typename T,typename D> D PrefixTreeSA<T,D>::def;
/////////////////////////////////////////////////////////////////////////////
// File-backed prefix tree.
//
// On disk a node consists of: the key vector, the data vector (both with an
// `unsigned` length prefix, see fWriteVector) and one off_t per key holding
// the file offset of the child node, or 0 when there is no child.
// In memory a node holds its keys and data plus one FilePtr per key, so
// child nodes are only read from the file when they are first accessed.
template<typename T,typename D>
class PrefixTreeF {
public:
  typedef T Key;
  typedef D Data;
private:
  typedef PrefixTreeF<Key,Data> Self;
public:
  typedef FilePtr<Self> Ptr;
private:
  typedef std::vector<Key> VK;
  typedef std::vector<Data> VD;
  typedef std::vector<Ptr> VP;
  VK keys;
  VD data;
  VP ptr;
  static Data def;
  off_t startPos;  // offset of this node in f (0 until read() has run)
  FILE* f;
public:
#ifdef DEBUG
  DECLAREMEMSTAT(Self);
#endif
  // With a file, reads the node at the file's current position.
  // startPos is initialised so that print() on a node that was never read
  // does not output an indeterminate value.
  PrefixTreeF(FILE* f_=0) : startPos(0),f(f_) {if(f) read();}
  ~PrefixTreeF() {free();}
  // Reads this node from the current position of f. Children are only
  // registered (file + offset), not loaded.
  void read() {
    startPos=fTell(f);
    fReadVector(f,keys);
    fReadVector(f,data);
    ptr.clear();ptr.resize(keys.size());
    for(size_t i=0;i<ptr.size();++i) {
      off_t pos;
      fRead(f,pos);
      // offset 0 marks "no child"
      if(pos) ptr[i].set(f,pos);
    }
  }
  // Releases all child nodes that have been loaded so far.
  void free() {
    for(typename VP::iterator i=ptr.begin();i!=ptr.end();++i) i->free();}
  void reserve(size_t s) {
    keys.reserve(s);data.reserve(s);ptr.reserve(s);}
  // Replaces the data stored for the sequence [b,e) by d, both in memory and
  // in the file. Prints an error (and changes nothing) when the sequence is
  // empty or not in the tree.
  template<typename fwiter>
  void changeData(fwiter b,fwiter e,const Data& d) {
    if(b==e) {
      std::cerr<<"ERROR: key not found in changeData!\n"; return;}
    typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
    if(i==keys.end() || *i!=*b) {
      std::cerr<<"ERROR: key not found in changeData!\n"; return;}
    typename VK::const_iterator kb=keys.begin();
    size_t pos=std::distance(kb,i);
    if(++b==e) {
      // file position of data[pos]: node start, the key vector with its
      // length prefix, the length prefix of the data vector, pos entries
      off_t p=startPos+keys.size()*sizeof(Key)+2*sizeof(unsigned)+pos*sizeof(Data);
      std::cerr<<"elem found at pos "<<p<<" old val: "<<data[pos]<<" startpos: "<<startPos<<"\n";
      if(data[pos]!=d) {
        data[pos]=d;fSeek(f,p);fWrite(f,d);}
      return;
    }
    if(ptr[pos]) ptr[pos]->changeData(b,e,d); else {
      std::cerr<<"ERROR: seg not found!in changeData\n";
    }
  }
  // Writes psa to a new file called fname.
  void create(const PrefixTreeSA<Key,Data>& psa,const std::string& fname) {
    FILE* f=fOpen(fname.c_str(),"wb");
    create(psa,f);
    fclose(f);
  }
  // Writes the in-memory tree psa to f, starting at f's current position.
  // Nodes are processed with an explicit stack. Each node is written with a
  // zero placeholder for every child offset; when a child is written later,
  // the placeholder that belongs to it is patched with the child's offset.
  void create(const PrefixTreeSA<Key,Data>& psa,FILE* f,int verbose=0) {
    setDefault(psa.getDefault());
    // (node still to write, file position of the placeholder pointing to it)
    typedef std::pair<const PrefixTreeSA<Key,Data>*,off_t> P;
    typedef std::deque<P> Next;
    Next next;
    next.push_back(P(&psa,fTell(f)));
    bool isFirst=1;
    size_t ns=1;
    while(next.size()) {
      if(verbose && next.size()>ns) {
        std::cerr<<"stack size in PF create: "<<next.size()<<"\n";
        while(ns<next.size()) ns*=2;}
      // copy the entry before popping it
      const P pp=next.back();
      const PrefixTreeSA<Key,Data>& p=*pp.first;
      off_t pos=pp.second;
      next.pop_back();
      if(!isFirst) {
        // patch the parent's placeholder with the position of this node
        off_t curr=fTell(f);
        fSeek(f,pos);
        fWrite(f,curr);
        fSeek(f,curr);
      } else isFirst=0;  // the root has no placeholder
      size_t s=0;
      s+=fWriteVector(f,p.keys);
      s+=fWriteVector(f,p.data);
      for(size_t i=0;i<p.ptr.size();++i) {
        if(p.ptr[i])
          next.push_back(P(p.ptr[i],fTell(f)));
        off_t ppos=0;
        s+=fWrite(f,ppos);
      }
    }
  }
  size_t size() const {return keys.size();}
  const Key& getKey(size_t i) const {return keys[i];}
  const Data& getData(size_t i) const {return data[i];}
  // Converting the FilePtr to a plain pointer loads the child if necessary.
  const Self* getPtr(size_t i) const {return ptr[i];}
  // Index of key k in this node, or keys.size() when it is absent.
  size_t findKey(const Key& k) const {
    typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
    if(i==keys.end() || *i!=k) return keys.size();
    return std::distance(keys.begin(),i);
  }
  // FilePtr of the child reached via key k, or 0 when k is not in this node.
  Ptr const* findKeyPtr(const Key& k) const {
    size_t pos=findKey(k);
    if(pos<keys.size()) {return &ptr[pos];} else {return 0;}
  }
  // find sequence
  // Returns a pointer to the data of [b,e), or 0 when the sequence is not in
  // the tree. An empty sequence is never in the tree.
  template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
    if(b==e) return 0;
    typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
    if(i==keys.end() || *i!=*b) return 0;
    size_t pos=std::distance(keys.begin(),i);
    if(++b==e) return &data[pos];
    if(ptr[pos]) return ptr[pos]->findPtr(b,e); else return 0;
  }
  // find container
  template<typename cont> const Data* findPtr(const cont& c) const {
    return findPtr(c.begin(),c.end());}
  // find sequence
  // Like findPtr, but returns the default value for unknown sequences.
  template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
    if(const Data* p=findPtr(b,e)) return *p; else return def;} //return (p?*p:def);}
  // find container
  template<typename cont> const Data& find(const cont& c) const {
    return find(c.begin(),c.end());}
  static void setDefault(const Data& d) {def=d;}
  static const Data& getDefault() {return def;}
  // Dumps this node and, recursively, all of its children to out; s is the
  // indentation prefix. Children are loaded as a side effect.
  void print(std::ostream& out,const std::string s="") const {
    out<<s<<"startpos: "<<startPos<<" size: "<<keys.size()<<"\n";
    for(size_t i=0;i<keys.size();++i) {
      out<<s<<i<<" - "<<keys[i]<<" "<<data[i]<<"\n";
    }
    for(size_t i=0;i<ptr.size();++i)
      if(ptr[i])
        ptr[i]->print(out,s+" ");
  }
};
template<typename T,typename D> D PrefixTreeF<T,D>::def;
#ifdef DEBUG
template<typename T,typename D> MemoryStatsPrinter< PrefixTreeF<T,D> > PrefixTreeF<T,D>::memStat("PrefixTreeF<T,D>",0);
#endif
#endif

View File

@ -27,8 +27,9 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FactorCollection.h"
#include "HypothesisCollection.h"
#include "Timer.h"
#include "PhraseDictionaryTree.h"
#include "boost/filesystem/operations.hpp" // boost::filesystem::exists
#include "InputFileStream.h"
using namespace std;
@ -335,7 +336,6 @@ void StaticData::LoadPhraseTables(bool filter
weight[currScore] = weightAll[totalPrevNoScoreComponent + currScore];
}
totalPrevNoScoreComponent += noScoreComponent;
string phraseTableHash = GetMD5Hash(filePath);
string hashFilePath = GetCachePath()
+ PROJECT_NAME + "--"
@ -374,7 +374,7 @@ void StaticData::LoadPhraseTables(bool filter
, inputPhraseList
, this->GetLanguageModel(Initial)
, this->GetWeightWordPenalty());
timer.check("Finished loading PhraseTable");
}
}

View File

@ -58,12 +58,15 @@ const size_t DEFAULT_VERBOSE_LEVEL = 1;
#ifdef LM_SRI
typedef unsigned int LmId;
#endif
#else
#ifdef LM_INTERNAL
class NGramNode;
typedef const NGramNode* LmId;
#else
// if nothing is defined:
typedef unsigned int LmId;
#endif
#endif
// enums.
// must be 0, 1, 2, ..., unless otherwise stated

View File

@ -188,3 +188,9 @@ void RemoveAllInColl(COLL &coll)
std::string GetTempFolder();
void CreateTempFile(std::ofstream &fileStream, std::string &filePath);
std::string GetMD5Hash(const std::string &filePath);
// Releases unused capacity of the vector-like container v by swapping it
// with a freshly made copy of itself. Afterwards capacity is asserted to be
// equal to size.
template<typename T> inline void ShrinkToFit(T& v) {
  if(v.capacity()>v.size()) {
    T exactCopy(v);
    exactCopy.swap(v);
  }
  assert(v.capacity()==v.size());
}