mosesdecoder/moses/src/PrefixTree.h

277 lines
7.4 KiB
C
Raw Normal View History

// $Id$
/* ---------------------------------------------------------------- */
/* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
/* Richard Zens */
/* ---------------------------------------------------------------- */
#ifndef PREFIXTREE_H_
#define PREFIXTREE_H_
#include <vector>
#include <algorithm>
#include <cassert>
#include <deque>
#include "Util.h"
#include "FilePtr.h"
#include "File.h"
namespace Moses
{
template<typename T,typename D>
class PrefixTreeSA {
public:
typedef T Key;
typedef D Data;
typedef PrefixTreeSA<T,D> Self;
typedef std::vector<T> VT;
typedef std::vector<Self*> VP;
typedef std::vector<D> VD;
VT keys;
VP ptr;
VD data;
static Data def;
public:
PrefixTreeSA() {}
~PrefixTreeSA() {for(size_t i=0;i<ptr.size();++i) delete ptr[i];}
static const Data& getDefault() {return def;}
static void setDefault(const Data& x) {def=x;}
// insert sequence
template<typename fwiter> Data& insert(fwiter b,fwiter e) {
typename VT::iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
typename VT::iterator kb=keys.begin();
size_t pos=std::distance(kb,i);
if(i==keys.end() || *i!=*b) {
keys.insert(i,*b);
data.insert(data.begin()+pos,def);
ptr.insert(ptr.begin()+pos,0);
}
if(++b!=e) {
if(!ptr[pos]) ptr[pos]=new Self;
return ptr[pos]->insert(b,e);
}
else return data[pos];
}
// insert container
template<typename cont> Data& insert(const cont& c) {
return insert(c.begin(),c.end());}
size_t size() const {return keys.size();}
const Key& getKey(size_t i) const {return keys[i];}
const Data& getData(size_t i) const {return data[i];}
const Self* getPtr(size_t i) const {return ptr[i];}
size_t findKey(const Key& k) const {
typename VT::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
if(i==keys.end() || *i!=k) return keys.size();
return std::distance(keys.begin(),i);
}
// find sequence
template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
size_t pos=findKey(*b);
if(pos==keys.size()) return 0;
if(++b==e) return &data[pos];
if(ptr[pos]) return ptr[pos]->findPtr(b,e); else return 0;
}
// find container
template<typename cont> const Data* findPtr(const cont& c) const {
return findPtr(c.begin(),c.end());}
// find sequence
template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
if(const Data* p=findPtr(b,e)) return *p; else return def;
}
// find container
template<typename cont> const Data& find(const cont& c) const {
return find(c.begin(),c.end());}
void shrink() {
ShrinkToFit(keys); ShrinkToFit(ptr); ShrinkToFit(data);}
};
template<typename T,typename D> D PrefixTreeSA<T,D>::def;
/////////////////////////////////////////////////////////////////////////////
template<typename T,typename D>
class PrefixTreeF {
public:
typedef T Key;
typedef D Data;
private:
typedef PrefixTreeF<Key,Data> Self;
public:
typedef FilePtr<Self> Ptr;
private:
typedef std::vector<Key> VK;
typedef std::vector<Data> VD;
typedef std::vector<Ptr> VP;
VK keys;
VD data;
VP ptr;
static Data def;
OFF_T startPos;
FILE* f;
public:
PrefixTreeF(FILE* f_=0) : f(f_) {if(f) read();}
~PrefixTreeF() {free();}
void read() {
startPos=fTell(f);
fReadVector(f,keys);
fReadVector(f,data);
ptr.clear();ptr.resize(keys.size());
std::vector<OFF_T> rawOffs(keys.size());
fread(&rawOffs[0], sizeof(OFF_T), keys.size(), f);
for(size_t i=0;i<ptr.size();++i)
if (rawOffs[i]) ptr[i].set(f, rawOffs[i]);
}
void free() {
for(typename VP::iterator i=ptr.begin();i!=ptr.end();++i) i->free();}
void reserve(size_t s) {
keys.reserve(s);data.reserve(s);ptr.reserve(s);}
template<typename fwiter>
void changeData(fwiter b,fwiter e,const Data& d) {
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
if(i==keys.end() || *i!=*b) {
TRACE_ERR("ERROR: key not found in changeData!\n"); return;}
typename VK::const_iterator kb=keys.begin();
size_t pos=std::distance(kb,i);
if(++b==e) {
OFF_T p=startPos+keys.size()*sizeof(Key)+2*sizeof(unsigned)+pos*sizeof(Data);
TRACE_ERR("elem found at pos "<<p<<" old val: "<<data[pos]<<" startpos: "<<startPos<<"\n");
if(data[pos]!=d) {
data[pos]=d;fSeek(f,p);fWrite(f,d);}
return;
}
if(ptr[pos]) ptr[pos]->changeData(b,e,d); else {
TRACE_ERR("ERROR: seg not found!in changeData\n");
}
}
void create(const PrefixTreeSA<Key,Data>& psa,const std::string& fname) {
FILE* f=fOpen(fname.c_str(),"wb");
create(psa,f);
fclose(f);
}
void create(const PrefixTreeSA<Key,Data>& psa,FILE* f,int verbose=0) {
setDefault(psa.getDefault());
typedef std::pair<const PrefixTreeSA<Key,Data>*,OFF_T> P;
typedef std::deque<P> Queue;
Queue queue;
queue.push_back(P(&psa,fTell(f)));
bool isFirst=1;
size_t ns=1;
while(queue.size()) {
if(verbose && queue.size()>ns) {
TRACE_ERR("stack size in PF create: "<<queue.size()<<"\n");
while(ns<queue.size()) ns*=2;}
const P& pp=queue.back();
const PrefixTreeSA<Key,Data>& p=*pp.first;
OFF_T pos=pp.second;
queue.pop_back();
if(!isFirst) {
OFF_T curr=fTell(f);
fSeek(f,pos);
fWrite(f,curr);
fSeek(f,curr);
} else isFirst=0;
size_t s=0;
s+=fWriteVector(f,p.keys);
s+=fWriteVector(f,p.data);
for(size_t i=0;i<p.ptr.size();++i) {
if(p.ptr[i])
queue.push_back(P(p.ptr[i],fTell(f)));
OFF_T ppos=0;
s+=fWrite(f,ppos);
}
}
}
size_t size() const {return keys.size();}
const Key& getKey(size_t i) const {return keys[i];}
const Data& getData(size_t i) const {return data[i];}
const Self* getPtr(size_t i) const {return ptr[i];}
size_t findKey(const Key& k) const {
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
if(i==keys.end() || *i!=k) return keys.size();
return std::distance(keys.begin(),i);
}
Ptr const* findKeyPtr(const Key& k) const {
size_t pos=findKey(k);
return (pos<keys.size() ? &ptr[pos] : 0);
}
// find sequence
template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
if(i==keys.end() || *i!=*b) return 0;
size_t pos=std::distance(keys.begin(),i);
if(++b==e) return &data[pos];
if(ptr[pos]) return ptr[pos]->findPtr(b,e); else return 0;
}
// find container
template<typename cont> const Data* findPtr(const cont& c) const {
return findPtr(c.begin(),c.end());}
// find sequence
template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
if(const Data* p=findPtr(b,e)) return *p; else return def;} //return (p?*p:def);}
// find container
template<typename cont> const Data& find(const cont& c) const {
return find(c.begin(),c.end());}
static void setDefault(const Data& d) {def=d;}
static const Data& getDefault() {return def;}
void print(std::ostream& out,const std::string s="") const {
out<<s<<"startpos: "<<startPos<<" size: "<<keys.size()<<"\n";
for(size_t i=0;i<keys.size();++i) {
out<<s<<i<<" - "<<keys[i]<<" "<<data[i]<<"\n";
}
for(size_t i=0;i<ptr.size();++i)
if(ptr[i])
ptr[i]->print(out,s+" ");
}
};
template<typename T,typename D> D PrefixTreeF<T,D>::def;
}
#endif