// -*- c++ -*-
// (c) 2007-2009 Ulrich Germann. All rights reserved.
#ifndef _ug_im_tsa_h
#define _ug_im_tsa_h

// TO DO:
// - multi-threaded sorting during TSA construction (currently painfully slow!)

#include <iostream>

#include <boost/dynamic_bitset.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/foreach.hpp>
#include <boost/iostreams/device/mapped_file.hpp>

#include "tpt_tightindex.h"
#include "tpt_tokenindex.h"
#include "ug_tsa_base.h"
#include "tpt_pickler.h"

namespace ugdiss
{
  using namespace std;
  using namespace boost;
  namespace bio=boost::iostreams;

  // template<typename TOKEN> class imBitext;

  //-----------------------------------------------------------------------
  template<typename TOKEN>
  class imTSA : public TSA<TOKEN>
  {
    typedef typename Ttrack<TOKEN>::Position cpos;

    // friend class imBitext<TOKEN>;
  public:
    class tree_iterator;
    friend class tree_iterator;

  private:
    vector<cpos> sufa;          // stores the actual array
    vector<filepos_type> index; /* top-level index into regions in sufa
                                 * (for faster access) */

  private:
    char const*
    index_jump(char const* a, char const* z, float ratio) const;

    char const*
    getLowerBound(id_type id) const;

    char const*
    getUpperBound(id_type id) const;

  public:
    imTSA();

    imTSA(shared_ptr<Ttrack<TOKEN> const> c,
          bdBitset const* filt,
          ostream* log = NULL);

    imTSA(imTSA<TOKEN> const& prior,
          shared_ptr<imTtrack<TOKEN> const> const& crp,
          vector<id_type> const& newsids,
          size_t const vsize);

    count_type
    sntCnt(char const* p, char const* const q) const;

    count_type
    rawCnt(char const* p, char const* const q) const;

    void
    getCounts(char const* p, char const* const q,
              count_type& sids, count_type& raw) const;

    char const*
    readSid(char const* p, char const* q, id_type& sid) const;

    char const*
    readSid(char const* p, char const* q, uint64_t& sid) const;

    char const*
    readOffset(char const* p, char const* q, uint16_t& offset) const;

    char const*
    readOffset(char const* p, char const* q, uint64_t& offset) const;

    void
    sanityCheck() const;

    void
    save_as_mm_tsa(string fname) const;

    /// add a sentence to the database
    // shared_ptr<imTSA<TOKEN> > add(vector<TOKEN> const& snt) const;
  };

  template<typename TOKEN>
  class imTSA<TOKEN>::tree_iterator
    : public TSA<TOKEN>::tree_iterator
  {
  public:
    tree_iterator(imTSA<TOKEN> const* s);
  };

  template<typename TOKEN>
  imTSA<TOKEN>::
  tree_iterator::
  tree_iterator(imTSA<TOKEN> const* s)
    : TSA<TOKEN>::tree_iterator::tree_iterator(reinterpret_cast<TSA<TOKEN> const*>(s))
  {};

  /** jump to the entry roughly ratio * (z-a) records into the range [a,z);
   *  in-memory entries are fixed-size records, so this is plain pointer
   *  arithmetic (unlike the tightly packed on-disk index, where keys are
   *  flagged with '1' and values with '0')
   */
  template<typename TOKEN>
  char const*
  imTSA<TOKEN>::
  index_jump(char const* a, char const* z, float ratio) const
  {
    assert(ratio >= 0 && ratio < 1);
    cpos const* xa = reinterpret_cast<cpos const*>(a);
    cpos const* xz = reinterpret_cast<cpos const*>(z);
    return reinterpret_cast<char const*>(xa + int(ratio*(xz-xa)));
  }

  template<typename TOKEN>
  imTSA<TOKEN>::
  imTSA()
  {
    this->indexSize = 0;
    // this->data = NULL;
    this->startArray = NULL;
    this->endArray = NULL;
    this->corpusSize = 0;
    this->BitSetCachingThreshold = 4096;
  };
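
  // Illustrative sketch (not part of the original interface): because the
  // in-memory array stores fixed-size Position records, index_jump above is
  // plain pointer arithmetic. The standalone toy below shows the same
  // interpolation step with a hypothetical record type 'Rec'.
#if 0
  struct Rec { unsigned sid; unsigned short offset; }; // stand-in for Ttrack<TOKEN>::Position
  inline Rec const*
  jump(Rec const* a, Rec const* z, float ratio)
  {
    // with fixed-size records, the element ratio*(z-a) records past a
    // can be addressed directly, without decoding anything
    return a + static_cast<ptrdiff_t>(ratio * (z - a));
  }
#endif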
  // build an array from all the tokens in the sentences in *c that are
  // specified in *filter
  template<typename TOKEN>
  imTSA<TOKEN>::
  imTSA(shared_ptr<Ttrack<TOKEN> const> c, bdBitset const* filter, ostream* log)
  {
    assert(c);
    this->corpus = c;
    bdBitset filter2;
    if (!filter)
      {
        filter2.resize(c->size());
        filter2.set();
        filter = &filter2;
      }
    assert(filter);

    // In the first iteration over the corpus, we obtain word counts.
    // They allow us to
    // a. allocate the exact amount of memory we need
    // b. place tokens into the right 'section' in the array, based on
    //    the ID of the first token in the sequence. We can then sort
    //    each section separately.
    if (log) *log << "counting tokens ... ";
    int slimit = 65536;
    // slimit = 65536 is the upper bound of what we can fit into the ushort
    // we currently use for the offset. Due to word alignment in memory,
    // using a ushort instead of a uint32_t might not even make a difference.
    vector<count_type> wcnt; // word counts
    sufa.resize(c->count_tokens(wcnt, filter, slimit, log));
    if (log) *log << sufa.size() << "." << endl;

    // Use a second vector to keep track, for each ID, of the current
    // insertion position in the array.
    vector<count_type> tmp(wcnt.size(), 0);
    for (size_t i = 1; i < wcnt.size(); ++i)
      tmp[i] = tmp[i-1] + wcnt[i-1];

    // Now dump all token positions into the right place in sufa.
    this->corpusSize = 0;
    for (id_type sid = filter->find_first();
         sid < filter->size();
         sid = filter->find_next(sid))
      {
        TOKEN const* k = c->sntStart(sid);
        TOKEN const* const stop = c->sntEnd(sid);
        if (stop - k >= slimit) continue;
        this->corpusSize++;
        for (ushort p = 0; k < stop; ++p, ++k)
          {
            id_type wid = k->id();
            cpos& entry = sufa[tmp[wid]++];
            entry.sid = sid;
            entry.offset = p;
            assert(p < c->sntLen(sid));
          }
      }

    // Now sort the array.
    if (log) *log << "sorting ...." << endl;
    index.resize(wcnt.size()+1, 0);
    typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(c.get());
    for (size_t i = 0; i < wcnt.size(); i++)
      {
        if (log && wcnt[i] > 5000)
          *log << "sorting " << wcnt[i]
               << " entries starting with id " << i << "." << endl;
        index[i+1] = index[i] + wcnt[i];
        assert(index[i+1] == tmp[i]); // sanity check
        if (wcnt[i] > 1)
          sort(sufa.begin()+index[i], sufa.begin()+index[i+1], sorter);
      }
    this->startArray = reinterpret_cast<char const*>(&(*sufa.begin()));
    this->endArray   = reinterpret_cast<char const*>(&(*sufa.end()));
    this->numTokens  = sufa.size();
    this->indexSize  = this->index.size();
#if 1
    // Sanity check during code development. Can be removed once the
    // code is stable.
    typename vector<cpos>::iterator m = sufa.begin();
    for (size_t i = 0; i < wcnt.size(); i++)
      for (size_t k = 0; k < wcnt[i]; ++k, ++m)
        {
          assert(c->getToken(*m)->id() == i);
          assert(m->offset < c->sntLen(m->sid));
        }
#endif
  } // end of imTSA constructor (corpus, filter, log)

  // ----------------------------------------------------------------------

  template<typename TOKEN>
  char const*
  imTSA<TOKEN>::
  getLowerBound(id_type id) const
  {
    if (id >= this->index.size())
      return NULL;
    assert(index[id] <= this->sufa.size());
    return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
  }

  template<typename TOKEN>
  char const*
  imTSA<TOKEN>::
  getUpperBound(id_type id) const
  {
    if (++id >= this->index.size())
      return NULL;
    assert(index[id] <= this->sufa.size());
    return reinterpret_cast<char const*>(&(this->sufa.front()) + index[id]);
  }

  template<typename TOKEN>
  char const*
  imTSA<TOKEN>::
  readSid(char const* p, char const* q, id_type& sid) const
  {
    assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
    assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
    sid = reinterpret_cast<cpos const*>(p)->sid;
    return p;
  }

  template<typename TOKEN>
  char const*
  imTSA<TOKEN>::
  readSid(char const* p, char const* q, uint64_t& sid) const
  {
    assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
    assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
    sid = reinterpret_cast<cpos const*>(p)->sid;
    return p;
  }

  template<typename TOKEN>
  char const*
  imTSA<TOKEN>::
  readOffset(char const* p, char const* q, uint16_t& offset) const
  {
    assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
    assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
    offset = reinterpret_cast<cpos const*>(p)->offset;
    return p + sizeof(cpos);
  }

  template<typename TOKEN>
  char const*
  imTSA<TOKEN>::
  readOffset(char const* p, char const* q, uint64_t& offset) const
  {
    assert(reinterpret_cast<cpos const*>(p) >= &(this->sufa.front()));
    assert(reinterpret_cast<cpos const*>(p) <= &(this->sufa.back()));
    offset = reinterpret_cast<cpos const*>(p)->offset;
    return p + sizeof(cpos);
  }
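
  // Usage note (illustration, not original code): readSid deliberately does
  // not advance the read cursor, while readOffset advances it past the whole
  // fixed-size record, so a (sid, offset) pair is consumed as sketched below.
  // 'scan_range' is hypothetical; p and q would come from a TSA range lookup.
#if 0
  template<typename TOKEN>
  void
  scan_range(imTSA<TOKEN> const& tsa, char const* p, char const* q)
  {
    id_type sid;
    uint16_t off;
    while (p < q)
      {
        p = tsa.readSid(p, q, sid);    // reads sid, leaves p in place
        p = tsa.readOffset(p, q, off); // reads offset, advances p past the record
        // ... use (sid, off) ...
      }
  }
#endif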
  template<typename TOKEN>
  count_type
  imTSA<TOKEN>::
  rawCnt(char const* p, char const* const q) const
  {
    cpos const* xp = reinterpret_cast<cpos const*>(p);
    cpos const* xq = reinterpret_cast<cpos const*>(q);
    return xq - xp;
  }

  template<typename TOKEN>
  void
  imTSA<TOKEN>::
  getCounts(char const* p, char const* const q,
            count_type& sids, count_type& raw) const
  {
    id_type sid;
    // uint16_t off;
    bdBitset check(this->corpus->size());
    cpos const* xp = reinterpret_cast<cpos const*>(p);
    cpos const* xq = reinterpret_cast<cpos const*>(q);
    raw = xq - xp;
    for (; xp < xq; xp++)
      {
        sid = xp->sid;
        // off = xp->offset;
        check.set(sid);
      }
    sids = check.count();
  }

  template<typename TOKEN>
  void
  imTSA<TOKEN>::
  save_as_mm_tsa(string fname) const
  {
    ofstream out(fname.c_str());
    filepos_type idxStart(0);
    id_type idxSize(index.size());
    numwrite(out, idxStart); // place holder, patched below once we know idxStart
    numwrite(out, idxSize);
    vector<filepos_type> mmIndex;
    for (size_t i = 1; i < this->index.size(); i++)
      {
        mmIndex.push_back(out.tellp());
        for (size_t k = this->index[i-1]; k < this->index[i]; ++k)
          {
            tightwrite(out, sufa[k].sid, 0);
            tightwrite(out, sufa[k].offset, 1);
          }
      }
    mmIndex.push_back(out.tellp());
    idxStart = out.tellp();
    for (size_t i = 0; i < mmIndex.size(); i++)
      numwrite(out, mmIndex[i] - mmIndex[0]);
    out.seekp(0);
    numwrite(out, idxStart); // fill in the actual index position
    out.close();
  }
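
  // Layout written by save_as_mm_tsa above (derived from the code; the
  // reader lives in the memory-mapped TSA implementation):
  //   [idxStart : filepos_type][idxSize : id_type]
  //   [tightly packed (sid,offset) pairs, one block per word ID]
  //   [block start offsets, relative to the start of the first block]
  // idxStart is written as 0 first and patched once the data blocks have
  // been written. A hypothetical call ('Token' and 'corpus' are assumptions,
  // not names from this header):
#if 0
  imTSA<Token> tsa(corpus, NULL, &cerr);
  tsa.save_as_mm_tsa("corpus.tsa");
#endif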
  template<typename TOKEN>
  imTSA<TOKEN>::
  imTSA(imTSA<TOKEN> const& prior,
        shared_ptr<imTtrack<TOKEN> const> const& crp,
        vector<id_type> const& newsids, size_t const vsize)
  {
    typename ttrack::Position::LESS<Ttrack<TOKEN> > sorter(crp.get());

    // count how many tokens will be added to the TSA
    // and index the new additions to the corpus
    size_t newToks = 0;
    BOOST_FOREACH(id_type sid, newsids)
      newToks += crp->sntLen(sid);
    vector<cpos> nidx(newToks); // new array entries

    size_t n = 0;
    BOOST_FOREACH(id_type sid, newsids)
      {
        assert(sid < crp->size());
        for (size_t o = 0; o < (*crp)[sid].size(); ++o, ++n)
          {
            nidx[n].offset = o;
            nidx[n].sid = sid;
          }
      }
    sort(nidx.begin(), nidx.end(), sorter);

    // create the new suffix array
    this->numTokens = newToks + prior.sufa.size();
    this->sufa.resize(this->numTokens);
    this->startArray = reinterpret_cast<char const*>(&(*this->sufa.begin()));
    this->endArray   = reinterpret_cast<char const*>(&(*this->sufa.end()));
    this->corpusSize = crp->size();
    this->corpus = crp;
    this->index.resize(vsize+1);

    size_t i = 0;
    typename vector<cpos>::iterator k = this->sufa.begin();
    // cerr << newToks << " new items at "
    //      << __FILE__ << ":" << __LINE__ << endl;
    for (size_t n = 0; n < nidx.size();)
      {
        id_type nid = crp->getToken(nidx[n])->id();
        assert(nid >= i);
        while (i < nid)
          {
            this->index[i] = k - this->sufa.begin();
            if (++i < prior.index.size() && prior.index[i-1] < prior.index[i])
              {
                k = copy(prior.sufa.begin() + prior.index[i-1],
                         prior.sufa.begin() + prior.index[i], k);
              }
          }
        this->index[i] = k - this->sufa.begin();
        if (++i < prior.index.size() && prior.index[i] > prior.index[i-1])
          {
            size_t j = prior.index[i-1];
            while (j < prior.index[i] && n < nidx.size()
                   && crp->getToken(nidx[n])->id() < i)
              {
                assert(k < this->sufa.end());
                if (sorter(prior.sufa[j], nidx[n]))
                  *k++ = prior.sufa[j++];
                else
                  *k++ = nidx[n++];
              }
            while (j < prior.index[i])
              {
                assert(k < this->sufa.end());
                *k++ = prior.sufa[j++];
              }
          }
        while (n < nidx.size() && this->corpus->getToken(nidx[n])->id() < i)
          {
            assert(k < this->sufa.end());
            *k++ = nidx[n++];
          }
        this->index[i] = k - this->sufa.begin();
      }
    this->index[i] = k - this->sufa.begin();
    while (++i < this->index.size())
      {
        if (i < prior.index.size() && prior.index[i-1] < prior.index[i])
          k = copy(prior.sufa.begin() + prior.index[i-1],
                   prior.sufa.begin() + prior.index[i], k);
        this->index[i] = k - this->sufa.begin();
      }
#if 0
    // sanity checks
    assert(this->sufa.size() == this->index.back());
    BOOST_FOREACH(cpos const& x, this->sufa)
      {
        assert(x.sid < this->corpusSize);
        assert(x.offset < this->corpus->sntLen(x.sid));
      }
    for (size_t i = 1; i < index.size(); ++i)
      {
        assert(index[i-1] <= index[i]);
        assert(index[i] <= sufa.size());
        for (size_t k = index[i-1]; k < index[i]; ++k)
          assert(this->corpus->getToken(sufa[k])->id() == i-1);
      }
    assert(index[0] == 0);
    assert(this->startArray == reinterpret_cast<char const*>(&(*this->sufa.begin())));
    assert(this->endArray == reinterpret_cast<char const*>(&(*this->sufa.end())));
#endif
  }
} // namespace ugdiss
#endif
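
// Hypothetical end-to-end sketch (the token type 'Token' and the corpus
// objects are assumptions, not names from this header): extend an existing
// imTSA with new sentences via the merging constructor above, then persist
// the result in the memory-mapped format.
#if 0
using namespace ugdiss;
void
example(boost::shared_ptr<imTtrack<Token> const> const& extended,
        imTSA<Token> const& prior,
        vector<id_type> const& newsids, size_t vocabSize)
{
  // 'extended' is assumed to be prior's corpus plus the sentences listed
  // in newsids; vocabSize is the size of the (possibly grown) vocabulary
  imTSA<Token> merged(prior, extended, newsids, vocabSize);
  merged.save_as_mm_tsa("extended.tsa"); // hypothetical file name
}
#endif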