2008-06-11 14:52:57 +04:00
|
|
|
// $Id$
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------- */
|
|
|
|
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
|
|
|
|
/* Richard Zens */
|
|
|
|
/* ---------------------------------------------------------------- */
|
2010-02-24 14:15:44 +03:00
|
|
|
#ifndef moses_PrefixTree_h
|
|
|
|
#define moses_PrefixTree_h
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
#include <vector>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <deque>
|
|
|
|
#include "Util.h"
|
|
|
|
#include "FilePtr.h"
|
|
|
|
#include "File.h"
|
|
|
|
|
2008-10-09 03:51:26 +04:00
|
|
|
namespace Moses
|
|
|
|
{
|
|
|
|
|
2012-06-29 02:29:46 +04:00
|
|
|
/** In-memory prefix tree (trie) over sequences of T, holding one D per node entry.
 *
 *  Every node stores three parallel vectors of equal length:
 *    - keys[i] : the key at this level (insert() keeps them in ascending order),
 *    - data[i] : the value attached to the sequence that ends at keys[i],
 *    - ptr[i]  : the child node for sequences continuing past keys[i],
 *                or NULL when no longer sequence has been inserted.
 *
 *  A node owns its children: the destructor deletes them, and copying a node
 *  deep-copies the whole subtree so that two trees never share child nodes.
 *
 *  The "default" value `def` is a static shared by all trees of the same
 *  <T,D> instantiation; it is stored for freshly created entries and returned
 *  by find() when a sequence is absent.
 *
 *  @todo How is this used in the pb binary phrase table?
 */
template<typename T,typename D>
class PrefixTreeSA
{
public:
  typedef T Key;
  typedef D Data;

  typedef PrefixTreeSA<T,D> Self;
  typedef std::vector<T> VT;
  typedef std::vector<Self*> VP;
  typedef std::vector<D> VD;

  VT keys;
  VP ptr;
  VD data;

  static Data def;

public:
  PrefixTreeSA() {}

  /** Deep copy. The subtree is assembled in a temporary first, so if an
   *  allocation throws, the temporary's destructor releases the children
   *  that were already cloned.
   */
  PrefixTreeSA(const Self& other) {
    Self tmp;
    tmp.keys=other.keys;
    tmp.data=other.data;
    tmp.ptr.resize(other.ptr.size());   // value-initialised: all NULL
    for(size_t i=0; i<other.ptr.size(); ++i)
      if(other.ptr[i]) tmp.ptr[i]=new Self(*other.ptr[i]);
    swap(tmp);
  }

  /** Copy-and-swap assignment; the previous children are deleted when the
   *  temporary goes out of scope.
   */
  Self& operator=(const Self& other) {
    if(this!=&other) {
      Self tmp(other);
      swap(tmp);
    }
    return *this;
  }

  ~PrefixTreeSA() {
    for(size_t i=0; i<ptr.size(); ++i) delete ptr[i];
  }

  /** Exchange the contents (keys, data and owned children) of two nodes. */
  void swap(Self& other) {
    keys.swap(other.keys);
    ptr.swap(other.ptr);
    data.swap(other.data);
  }

  static const Data& getDefault() {
    return def;
  }
  static void setDefault(const Data& x) {
    def=x;
  }

  /** Insert the sequence [b,e) and return a reference to its data slot.
   *
   *  Entries created on the way (including the final one, if new) are
   *  initialised with the current default value. The returned reference
   *  points into a vector and may be invalidated by a later insert().
   *
   *  Precondition: b != e. The first element is dereferenced unconditionally
   *  because there is no slot that could represent the empty sequence.
   */
  template<typename fwiter> Data& insert(fwiter b,fwiter e) {
    typename VT::iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
    const size_t pos=std::distance(keys.begin(),i);

    if(i==keys.end() || *i!=*b) {
      // New key at this level: keep the three parallel vectors aligned.
      keys.insert(i,*b);
      data.insert(data.begin()+pos,def);
      // A typed null avoids handing the NULL macro to vector::insert directly.
      Self *noChild = NULL;
      ptr.insert(ptr.begin()+pos, noChild);
    }
    if(++b!=e) {
      // More elements follow: descend, creating the child on demand.
      if(!ptr[pos]) ptr[pos]=new Self;
      return ptr[pos]->insert(b,e);
    }
    return data[pos];
  }

  /** Insert every element of container c as one sequence (c must be non-empty). */
  template<typename cont> Data& insert(const cont& c) {
    return insert(c.begin(),c.end());
  }

  /** Number of keys stored directly in this node. */
  size_t size() const {
    return keys.size();
  }
  const Key& getKey(size_t i) const {
    return keys[i];
  }
  const Data& getData(size_t i) const {
    return data[i];
  }
  /** Child node below entry i, or NULL if there is none. */
  const Self* getPtr(size_t i) const {
    return ptr[i];
  }

  /** Index of key k in this node, or size() if k is not present. */
  size_t findKey(const Key& k) const {
    typename VT::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
    if(i==keys.end() || *i!=k) return keys.size();
    return std::distance(keys.begin(),i);
  }

  /** Look up the sequence [b,e).
   *  @return pointer to its data, or 0 if the sequence is empty or absent.
   */
  template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
    if(b==e) return 0;   // an empty sequence has no entry; never dereference e
    const size_t pos=findKey(*b);
    if(pos==keys.size()) return 0;
    if(++b==e) return &data[pos];
    if(ptr[pos]) return ptr[pos]->findPtr(b,e);
    return 0;
  }

  /** Container overload of findPtr(b,e). */
  template<typename cont> const Data* findPtr(const cont& c) const {
    return findPtr(c.begin(),c.end());
  }

  /** Like findPtr(b,e) but returns the default value when nothing is found. */
  template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
    if(const Data* p=findPtr(b,e)) return *p;
    return def;
  }

  /** Container overload of find(b,e). */
  template<typename cont> const Data& find(const cont& c) const {
    return find(c.begin(),c.end());
  }

  /** Pass this node's three vectors to ShrinkToFit (children are not visited). */
  void shrink() {
    ShrinkToFit(keys);
    ShrinkToFit(ptr);
    ShrinkToFit(data);
  }

};

template<typename T,typename D> D PrefixTreeSA<T,D>::def;
|
|
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
2012-06-29 02:29:46 +04:00
|
|
|
/** @todo How is this used in the pb binary phrase table?
|
|
|
|
*/
|
2008-06-11 14:52:57 +04:00
|
|
|
template<typename T,typename D>
|
2011-02-24 16:14:42 +03:00
|
|
|
class PrefixTreeF
|
|
|
|
{
|
2008-06-11 14:52:57 +04:00
|
|
|
public:
|
|
|
|
typedef T Key;
|
|
|
|
typedef D Data;
|
|
|
|
private:
|
|
|
|
typedef PrefixTreeF<Key,Data> Self;
|
|
|
|
public:
|
|
|
|
typedef FilePtr<Self> Ptr;
|
|
|
|
private:
|
|
|
|
typedef std::vector<Key> VK;
|
|
|
|
typedef std::vector<Data> VD;
|
|
|
|
typedef std::vector<Ptr> VP;
|
|
|
|
|
|
|
|
VK keys;
|
|
|
|
VD data;
|
|
|
|
VP ptr;
|
|
|
|
|
|
|
|
static Data def;
|
|
|
|
|
|
|
|
OFF_T startPos;
|
|
|
|
FILE* f;
|
|
|
|
public:
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
PrefixTreeF(FILE* f_=0) : f(f_) {
|
|
|
|
if(f) read();
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
~PrefixTreeF() {
|
|
|
|
free();
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
void read() {
|
|
|
|
startPos=fTell(f);
|
|
|
|
fReadVector(f,keys);
|
|
|
|
fReadVector(f,data);
|
2011-02-24 16:14:42 +03:00
|
|
|
ptr.clear();
|
|
|
|
ptr.resize(keys.size());
|
2008-06-11 14:52:57 +04:00
|
|
|
std::vector<OFF_T> rawOffs(keys.size());
|
|
|
|
fread(&rawOffs[0], sizeof(OFF_T), keys.size(), f);
|
2011-02-24 16:14:42 +03:00
|
|
|
for(size_t i=0; i<ptr.size(); ++i)
|
2008-06-11 14:52:57 +04:00
|
|
|
if (rawOffs[i]) ptr[i].set(f, rawOffs[i]);
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
void free() {
|
2011-02-24 16:14:42 +03:00
|
|
|
for(typename VP::iterator i=ptr.begin(); i!=ptr.end(); ++i) i->free();
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
void reserve(size_t s) {
|
2011-02-24 16:14:42 +03:00
|
|
|
keys.reserve(s);
|
|
|
|
data.reserve(s);
|
|
|
|
ptr.reserve(s);
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
template<typename fwiter>
|
|
|
|
void changeData(fwiter b,fwiter e,const Data& d) {
|
|
|
|
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
|
|
|
|
if(i==keys.end() || *i!=*b) {
|
2011-02-24 16:14:42 +03:00
|
|
|
TRACE_ERR("ERROR: key not found in changeData!\n");
|
|
|
|
return;
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
typename VK::const_iterator kb=keys.begin();
|
|
|
|
size_t pos=std::distance(kb,i);
|
|
|
|
if(++b==e) {
|
|
|
|
OFF_T p=startPos+keys.size()*sizeof(Key)+2*sizeof(unsigned)+pos*sizeof(Data);
|
|
|
|
TRACE_ERR("elem found at pos "<<p<<" old val: "<<data[pos]<<" startpos: "<<startPos<<"\n");
|
|
|
|
if(data[pos]!=d) {
|
2011-02-24 16:14:42 +03:00
|
|
|
data[pos]=d;
|
|
|
|
fSeek(f,p);
|
|
|
|
fWrite(f,d);
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
return;
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
if(ptr[pos]) ptr[pos]->changeData(b,e,d);
|
|
|
|
else {
|
2008-06-11 14:52:57 +04:00
|
|
|
TRACE_ERR("ERROR: seg not found!in changeData\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void create(const PrefixTreeSA<Key,Data>& psa,const std::string& fname) {
|
|
|
|
FILE* f=fOpen(fname.c_str(),"wb");
|
|
|
|
create(psa,f);
|
|
|
|
fclose(f);
|
|
|
|
}
|
|
|
|
|
|
|
|
void create(const PrefixTreeSA<Key,Data>& psa,FILE* f,int verbose=0) {
|
|
|
|
setDefault(psa.getDefault());
|
|
|
|
|
|
|
|
typedef std::pair<const PrefixTreeSA<Key,Data>*,OFF_T> P;
|
|
|
|
typedef std::deque<P> Queue;
|
|
|
|
|
|
|
|
Queue queue;
|
|
|
|
|
|
|
|
queue.push_back(P(&psa,fTell(f)));
|
|
|
|
bool isFirst=1;
|
|
|
|
size_t ns=1;
|
|
|
|
while(queue.size()) {
|
|
|
|
if(verbose && queue.size()>ns) {
|
|
|
|
TRACE_ERR("stack size in PF create: "<<queue.size()<<"\n");
|
2011-02-24 16:14:42 +03:00
|
|
|
while(ns<queue.size()) ns*=2;
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
const P& pp=queue.back();
|
|
|
|
const PrefixTreeSA<Key,Data>& p=*pp.first;
|
|
|
|
OFF_T pos=pp.second;
|
|
|
|
queue.pop_back();
|
|
|
|
|
|
|
|
if(!isFirst) {
|
|
|
|
OFF_T curr=fTell(f);
|
|
|
|
fSeek(f,pos);
|
|
|
|
fWrite(f,curr);
|
|
|
|
fSeek(f,curr);
|
|
|
|
} else isFirst=0;
|
|
|
|
|
|
|
|
size_t s=0;
|
|
|
|
s+=fWriteVector(f,p.keys);
|
|
|
|
s+=fWriteVector(f,p.data);
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
for(size_t i=0; i<p.ptr.size(); ++i) {
|
2008-06-11 14:52:57 +04:00
|
|
|
if(p.ptr[i])
|
|
|
|
queue.push_back(P(p.ptr[i],fTell(f)));
|
|
|
|
OFF_T ppos=0;
|
|
|
|
s+=fWrite(f,ppos);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
size_t size() const {
|
|
|
|
return keys.size();
|
|
|
|
}
|
|
|
|
const Key& getKey(size_t i) const {
|
|
|
|
return keys[i];
|
|
|
|
}
|
|
|
|
const Data& getData(size_t i) const {
|
|
|
|
return data[i];
|
|
|
|
}
|
|
|
|
const Self* getPtr(size_t i) const {
|
|
|
|
return ptr[i];
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
size_t findKey(const Key& k) const {
|
|
|
|
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
|
|
|
|
if(i==keys.end() || *i!=k) return keys.size();
|
|
|
|
return std::distance(keys.begin(),i);
|
|
|
|
}
|
|
|
|
|
|
|
|
Ptr const* findKeyPtr(const Key& k) const {
|
|
|
|
size_t pos=findKey(k);
|
2011-02-24 16:14:42 +03:00
|
|
|
return (pos<keys.size() ? &ptr[pos] : 0);
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// find sequence
|
|
|
|
template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
|
|
|
|
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
|
|
|
|
if(i==keys.end() || *i!=*b) return 0;
|
|
|
|
size_t pos=std::distance(keys.begin(),i);
|
|
|
|
if(++b==e) return &data[pos];
|
2011-02-24 16:14:42 +03:00
|
|
|
if(ptr[pos]) return ptr[pos]->findPtr(b,e);
|
|
|
|
else return 0;
|
2008-06-11 14:52:57 +04:00
|
|
|
}
|
|
|
|
// find container
|
|
|
|
template<typename cont> const Data* findPtr(const cont& c) const {
|
2011-02-24 16:14:42 +03:00
|
|
|
return findPtr(c.begin(),c.end());
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
|
|
|
|
// find sequence
|
|
|
|
template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
|
2011-02-24 16:14:42 +03:00
|
|
|
if(const Data* p=findPtr(b,e)) return *p;
|
|
|
|
else return def;
|
|
|
|
} //return (p?*p:def);}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
// find container
|
|
|
|
template<typename cont> const Data& find(const cont& c) const {
|
2011-02-24 16:14:42 +03:00
|
|
|
return find(c.begin(),c.end());
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
2011-02-24 16:14:42 +03:00
|
|
|
static void setDefault(const Data& d) {
|
|
|
|
def=d;
|
|
|
|
}
|
|
|
|
static const Data& getDefault() {
|
|
|
|
return def;
|
|
|
|
}
|
2008-06-11 14:52:57 +04:00
|
|
|
|
|
|
|
|
|
|
|
void print(std::ostream& out,const std::string s="") const {
|
|
|
|
|
|
|
|
out<<s<<"startpos: "<<startPos<<" size: "<<keys.size()<<"\n";
|
2011-02-24 16:14:42 +03:00
|
|
|
for(size_t i=0; i<keys.size(); ++i) {
|
2008-06-11 14:52:57 +04:00
|
|
|
out<<s<<i<<" - "<<keys[i]<<" "<<data[i]<<"\n";
|
|
|
|
}
|
2011-02-24 16:14:42 +03:00
|
|
|
for(size_t i=0; i<ptr.size(); ++i)
|
2008-06-11 14:52:57 +04:00
|
|
|
if(ptr[i])
|
|
|
|
ptr[i]->print(out,s+" ");
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
};
|
|
|
|
template<typename T,typename D> D PrefixTreeF<T,D>::def;
|
2008-10-09 03:51:26 +04:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2008-06-11 14:52:57 +04:00
|
|
|
#endif
|