mosesdecoder/moses/TranslationModel/UG/mm/ug_tsa_base.h
Ulrich Germann 9f317f4849 Minor fix.
2014-02-08 17:58:05 +00:00

828 lines
23 KiB
C++

// -*- c++ -*-
// Base class for Token Sequence Arrays
// (c) 2007-2010 Ulrich Germann. All rights reserved.
#ifndef _ug_tsa_base_h
#define _ug_tsa_base_h
#include <iostream>
#include <string>
#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/shared_ptr.hpp>
#include "tpt_tokenindex.h"
#include "ug_ttrack_base.h"
#include "ug_im_ttrack.h"
#include "ug_corpus_token.h"
#include "ug_tsa_tree_iterator.h"
#include "ug_tsa_array_entry.h"
#include "ug_tsa_bitset_cache.h"
#include "ug_typedefs.h"
namespace ugdiss
{
using namespace std;
using namespace boost;
namespace bio=boost::iostreams;
/** Null-safe successor lookup.
 *  @param x a token, or NULL
 *  @return whatever x->next() reports, cast to TKN const*; NULL if /x/
 *          itself is NULL
 */
template<typename TKN>
TKN const*
next(TKN const* x)
{
  if (!x) return NULL;
  return static_cast<TKN const*>(x->next());
}
/** Base class for [T]oken [S]equence [A]rrays, a generalization of
* suffix arrays.
*
* Token types (TKN) must provide a number of functions; see the
* class SimpleWordId (a simple example of a "core token base
* class") and the template class L2R_Token (a class derived from
* its template parameter, e.g. SimpleWordId, that handles the
* ordering of sequences). Both are declared/defined in
* ug_corpus_token.{h|cc}
*/
template<typename TKN>
class TSA
{
public:
virtual ~TSA() {};
// tree_iterator: allows iteration over the array as if it were a trie
typedef TSA_tree_iterator<TKN> tree_iterator;
// ArrayEntry: an entry in the array, for iteration over all occurrences
// of a particular sequence
typedef tsa::ArrayEntry ArrayEntry;
// typedef boost::dynamic_bitset<uint64_t> bitset;
typedef shared_ptr<bitvector> bitset_pointer;
typedef TKN Token;
// BSC_t: cache type that allows caching of bit vectors that are expensive
// to create on the fly
typedef BitSetCache<TSA<TKN> > BSC_t;
friend class TSA_tree_iterator<TKN>;
protected:
shared_ptr<Ttrack<TKN> const> corpus; // pointer to the underlying corpus
char const* startArray; // beginning ...
char const* endArray; // ... and end ...
// of memory block storing the actual TSA
// corpusSize:
// size (in number of sentences) of the corpus underlying the sequence
// array.
//
// ATTENTION: This number may differ from corpus->size(), namely when the
// suffix array is based on a subset of the sentences of
// /corpus/.
size_t corpusSize;
// numTokens:
// size (in number of tokens) of the corpus underlying the sequence array.
//
// ATTENTION: This number may differ from corpus->numTokens(), namely when
// the suffix array is based on a subset of the sentences of
// /corpus/.
id_type numTokens;
// indexSize: (number of entries +1) in the index of root-level nodes
id_type indexSize;
// NOTE(review): BitSetCachingThreshold is not used anywhere in this header;
// presumably consulted by subclasses or by the bit set cache -- confirm.
size_t BitSetCachingThreshold;
////////////////////////////////////////////////////////////////
// protected helper functions (for TSA, its subclasses and its friend
// TSA_tree_iterator):
/** @return an index position approximately /fraction/ of the way between
* /startRange/ and /stopRange/.
*
* Implemented by the subclasses; used by the binary searches below to
* pick the next entry to probe.
*/
virtual
char const*
index_jump(char const* startRange,
char const* stopRange,
float fraction) const = 0;
/** return the index position of the first item in [lo,upX) that
* is equal to or includes [refStart,refStart+refLen) as a prefix;
* NULL if there is no such item (or if the range is empty)
*/
char const*
find_start(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
size_t d) const;
/** return the index position of the first item in [lo,upX) that is
* greater than [refStart,refStart+refLen) and does not include it as a
* prefix; /upX/ if there is no such item, NULL if the range is empty
*/
char const*
find_end(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
size_t d) const;
/** return the index position of the first item in [lo,upX) that is
* longer than [refStart,refStart+refLen) and includes it as a prefix;
* NULL if there is no such item (or if the range is empty)
*/
char const*
find_longer(char const* lo, char const* const upX,
TKN const* const refStart, int refLen,
size_t d) const;
/** Returns a char const* pointing to the position in the data block
* where the first item starting with token /id/ is located.
*/
virtual
char const*
getLowerBound(id_type id) const = 0;
/** Counterpart of getLowerBound(): lower_bound()/upper_bound() use the
* pair as the [lower,upper) search range for keys starting with /id/.
*/
virtual
char const*
getUpperBound(id_type id) const = 0;
public:
// optional cache of bit sets; getBitSet() consults it when it is non-NULL
shared_ptr<BSC_t> bsc;
char const* arrayStart() const { return startArray; }
char const* arrayEnd() const { return endArray; }
/** @return a pointer to the beginning of the index entry range covering
* [keyStart,keyStop); NULL if the key does not occur in the array
*/
char const*
lower_bound(typename vector<TKN>::const_iterator const& keyStart,
typename vector<TKN>::const_iterator const& keyStop) const;
char const*
lower_bound(TKN const* keyStart, TKN const* keyStop) const;
char const*
lower_bound(TKN const* keyStart, int keyLen) const;
/** @return a pointer to the end point of the index entry range covering
* [keyStart,keyStop)
*/
char const*
upper_bound(typename vector<TKN>::const_iterator const& keyStart,
typename vector<TKN>::const_iterator const& keyStop) const;
char const*
upper_bound(TKN const* keyStart, int keyLength) const;
/** dump all suffixes in order to /out/ */
void dump(ostream& out, TokenIndex const& T) const;
/** fill the dynamic bit set with true for all sentences that contain
* /phrase/.
* @return the raw number of occurrences.
*/
count_type
fillBitSet(vector<TKN> const& phrase, bdBitset& dest) const;
count_type
fillBitSet(TKN const* key, size_t keyLen, bdBitset& dest) const;
/** set the bit of every sentence ID stored in the index range
* [startRange,endRange) in /bs/.
* @return the number of index entries in the range.
*/
count_type
setBits(char const* startRange, char const* endRange,
boost::dynamic_bitset<uint64_t>& bs) const;
/** for every index entry in [startRange,endRange), set the bit in /bs/
* that corresponds to the corpus position of the /len/-th token of the
* suffix the entry points to (positions are counted from
* corpus->sntStart(0)).
*/
void
setTokenBits(char const* startRange, char const* endRange, size_t len,
bitvector& bs) const;
/** read the sentence ID into /sid/
* @return position of associated offset.
*
* The function provides an abstraction that uses the right
* interpretation of the position based on the subclass
* (memory-mapped or in-memory).
*/
virtual
char const*
readSid(char const* p, char const* q, id_type& sid) const = 0;
virtual
char const*
readSid(char const* p, char const* q, uint64_t& sid) const = 0;
/** read the offset part of the index entry into /offset/
* @return position of the next entry in the index.
*
* The function provides an abstraction that uses the right
* interpretation of the position based on the subclass
* (memory-mapped or in-memory).
*/
virtual
char const*
readOffset(char const* p, char const* q, uint16_t& offset) const = 0;
virtual
char const*
readOffset(char const* p, char const* q, uint64_t& offset) const = 0;
/** @return sentence count, i.e., the number of distinct sentence IDs
* among the index entries in [p,q)
*/
count_type
sntCnt(char const* p, char const* const q) const;
/** @return raw occurrence count of the phrase
* [keyStart,keyStart+keyLen): looks up the matching index range with
* lower_bound()/upper_bound() and passes it to rawCnt().
*/
count_type
rawCnt2(TKN const* keyStart, size_t keyLen) const;
/** @return raw occurrence count
*
* depending on the subclass, this is constant time (imTSA) or
* linear in the number of occurrences (mmTSA).
*/
virtual
count_type
rawCnt(char const* p, char const* const q) const = 0;
/** get both sentence and word counts.
*
* Avoids having to go over the byte range representing the range
* of suffixes in question twice when dealing with memory-mapped
* suffix arrays.
*/
virtual
void
getCounts(char const* p, char const* const q,
count_type& sids, count_type& raw) const = 0;
string
suffixAt(char const* p, TokenIndex const* V=NULL, size_t maxlen=0)
const;
string
suffixAt(ArrayEntry const& I, TokenIndex const* V=NULL, size_t maxlen=0)
const;
/** decode the index entry at /p/ into /I/: I.pos is set to /p/, I.sid and
* I.offset to the values stored there, and I.next to the position right
* after the entry.
* @return /I/
*/
tsa::ArrayEntry& readEntry(char const* p, tsa::ArrayEntry& I) const;
/** return pointer to the end of the data block */
char const* dataEnd() const;
bool sanityCheck1() const;
/** Return an ID that represents a given phrase.
A valid phrase ID should NEVER be 0; the value 0 is returned when
the phrase is not found.
Structure of a phrase ID:
leftmost 32 bits: sentence ID in the corpus
next 16 bits: offset from the start of the sentence
next 16 bits: length of the phrase
*/
uint64_t
getSequenceId(typename vector<TKN>::const_iterator const& pstart,
typename vector<TKN>::const_iterator const& pstop) const;
uint64_t
getSequenceId(TKN const* t, ushort plen) const;
/** Return the phrase represented by phrase ID /pid/ as a string of
* space-separated tokens, looked up in /V/ */
string
getSequence(uint64_t pid, TokenIndex const& V) const;
/** Return the phrase represented by phrase ID /pid/ as a token vector */
vector<TKN>
getSequence(uint64_t pid) const;
/** Return a pointer to the first corpus token of the phrase with the
* given phrase ID */
TKN const*
getSequenceStart(uint64_t) const;
/** Return the length field (lowest 16 bits) of the given phrase ID */
ushort
getSequenceLength(uint64_t) const;
/** Return /corpusSize/ (see the ATTENTION note on that member) */
size_t
getCorpusSize() const;
/** Return a raw, non-owning pointer to the underlying corpus */
Ttrack<TKN> const*
getCorpus() const;
/** Return the bit set of sentences containing the phrase
* [startKey,startKey+keyLen): taken from /bsc/ if a cache is set,
* otherwise computed on the fly with fillBitSet().
*/
bitset_pointer
getBitSet(TKN const* startKey, size_t keyLen) const;
// find all instances of the tree described by [treeStart, treeEnd)
// NOTE(review): findTree is not defined in this header; the description
// above is taken from a comment elsewhere in this file -- confirm.
shared_ptr<bitvector>
findTree(TKN const* treeStart, TKN const* treeEnd,
bitvector const* filter) const;
/** for every index entry in [lo,up), mark in /bitset/ either only the
* corpus position where the occurrence starts (markOnlyStartPosition) or
* the positions of all /len/ tokens of the occurrence.
* @return the number of index entries in the range.
*/
size_t markOccurrences(char const* lo, char const* up, size_t len,
bitvector& bitset,
bool markOnlyStartPosition) const;
/** for every bit i set in /terminals/, build a tree iterator in /dest/ by
* extending it with the tokens starting at base+i; /dest/ is sorted by
* approximate count at the end.
* @return false as soon as one of the extensions fails, true otherwise.
*/
bool
findBranches(TKN const* base, bitvector const& terminals,
vector<tree_iterator>& dest) const;
/** average number of bytes per index entry: size of the array's memory
* block divided by /numTokens/ */
double aveIndexEntrySize() const
{
return (endArray-startArray)/double(numTokens);
}
public:
// virtual
/** Look up the token sequence [start,start+len) by walking a tree
* iterator down from the root, one token at a time.
* @return the iterator positioned at the sequence, or a NULL pointer if
* not all /len/ tokens could be matched.
*/
sptr<TSA_tree_iterator<TKN> >
find(TKN const* start, size_t len) const
{
typedef TSA_tree_iterator<TKN> iter;
sptr<iter> ret(new iter(this));
size_t i = 0;
// stop at the first token that cannot extend the current match
while (i < len && ret->extend(start[i])) ++i;
// partial match only: report failure with a NULL pointer
if (i < len) ret.reset();
return ret;
}
};
// ======================================================================
// template<typename TOKEN>
// sptr<TSA_tree_iterator<TOKEN> >
// TSA<TOKEN>::
// find(TOKEN const* start, size_t len) const
// {
// typedef TSA_tree_iterator<TOKEN> iter;
// sptr<iter> ret(new iter(this));
// size_t i = 0;
// while (i < len && ret->extend(start[i])) ++i;
// if (i < len) ret.reset();
// return ret;
// }
// ---------------------------------------------------------------------------
/** Fill /bitset/ with information as to which sentences the phrase /key/
 *  occurs in. Convenience overload that forwards to the pointer/length
 *  overload.
 *  @param key    the phrase to look up
 *  @param bitset destination bit vector
 *  @return number of total occurrences of the phrase in the corpus;
 *          0 for an empty phrase, in which case /bitset/ is left untouched
 */
template<typename TKN>
count_type
TSA<TKN>::
fillBitSet(vector<TKN> const& key,
           bitvector& bitset) const
{
  if (key.empty()) return 0;
  // Fixed: this used to call 'fillBitset' (lower-case 's'), a name that
  // does not exist, so the template broke as soon as it was instantiated.
  return fillBitSet(&(key[0]),key.size(),bitset);
}
// ---------------------------------------------------------------------------
/** Fill /bitset/ with information as to which sentences the phrase
 *  [key,key+keyLen) occurs in.
 *
 *  /bitset/ is always resized to corpus->size() and cleared first, so it
 *  is in a well-defined (all-zero) state even when the phrase is unknown.
 *  @param key    first token of the phrase
 *  @param keyLen number of tokens in the phrase
 *  @param bitset destination bit vector, indexed by sentence ID
 *  @return number of total occurrences of the phrase in the corpus
 *          (0 if the phrase does not occur)
 */
template<typename TKN>
count_type
TSA<TKN>::
fillBitSet(TKN const* key, size_t keyLen,
           bitvector& bitset) const
{
  char const* const lo = lower_bound(key,keyLen);
  bitset.resize(corpus->size());
  bitset.reset();
  // lower_bound() reports "not found" as NULL (cf. getSequenceId()), while
  // upper_bound() may still return a valid position in that case. Scanning
  // [NULL,up) would read from a NULL pointer, so bail out here.
  if (!lo) return 0;
  char const* const up = upper_bound(key,keyLen);
  return setBits(lo,up,bitset);
}
// ---------------------------------------------------------------------------
/** Walk over the index entries in [startRange,endRange) and set the bit
 *  of each entry's sentence ID in /bs/.
 *  @return the number of index entries visited
 */
template<typename TKN>
count_type
TSA<TKN>::
setBits(char const* startRange, char const* endRange,
        bitvector& bs) const
{
  count_type numEntries = 0;
  id_type sentenceId;
  ushort offset; // decoded only to advance the cursor; value is not used
  for (char const* cursor = startRange; cursor < endRange; ++numEntries)
    {
      cursor = readSid(cursor,endRange,sentenceId);
      cursor = readOffset(cursor,endRange,offset);
      bs.set(sentenceId);
    }
  return numEntries;
}
// ---------------------------------------------------------------------------
/** For every index entry in [startRange,endRange), set the bit in /bs/
 *  that corresponds to the /len/-th token of the suffix the entry points
 *  to. Bit positions are token positions relative to corpus->sntStart(0).
 *
 *  An empty range is a no-op. (The loop used to be a do/while testing
 *  'I.next != endRange', which decoded one entry even for an empty range
 *  and then never met its termination condition.)
 *  @param startRange first index entry to process
 *  @param endRange   position just beyond the last index entry
 *  @param len        1-based position, within each suffix, of the token to
 *                    mark
 *  @param bs         destination bit vector
 */
template<typename TKN>
void
TSA<TKN>::
setTokenBits(char const* startRange, char const* endRange, size_t len,
             bitvector& bs) const
{
  ArrayEntry I;
  I.next = startRange;
  while (I.next < endRange)
    {
      readEntry(I.next,I);
      Token const* t = corpus->getToken(I);
      Token const* stop = t->stop(*corpus,I.sid);
      // advance len-1 tokens; the suffix must not end before that
      for (size_t i = 1; i < len; ++i)
        {
          assert(t != stop);
          t = t->next();
        }
      assert(t != stop);
      bs.set(t - corpus->sntStart(0));
    }
}
// ---------------------------------------------------------------------------
/** Count the distinct sentences referenced by the index entries in [p,q).
 *  Uses a scratch bit vector with one bit per sentence of the corpus.
 *  @return number of distinct sentence IDs in the range
 */
template<typename TKN>
count_type
TSA<TKN>::
sntCnt(char const* p, char const* const q) const
{
  bitvector seen(corpus->size());
  id_type sentenceId;
  uint16_t offset; // decoded only to advance the cursor
  for (char const* cursor = p; cursor < q; )
    {
      cursor = readOffset(readSid(cursor,q,sentenceId),q,offset);
      seen.set(sentenceId);
    }
  return seen.count();
}
//---------------------------------------------------------------------------
/** Binary search for the lower bound (first matching entry) of the key
 *  [refStart,refStart+refLen) within the index range [lo,upX).
 *
 *  Entries are probed at the midpoint of the remaining range; an entry for
 *  which corpus->cmp() is negative sorts before the key, everything else
 *  is at or after the lower bound.
 *  @param d passed through to corpus->cmp() unchanged
 *  @return position of the first entry for which cmp() yields 0 or 1
 *          (equal to the key, or having it as a prefix); NULL if there is
 *          none or if the range is empty
 */
template<typename TKN>
char const*
TSA<TKN>::
find_start(char const* lo, char const* const upX,
           TKN const* const refStart, int refLen,
           size_t d) const
{
  if (lo >= upX) return NULL;
  char const* hi = upX;
  ArrayEntry probe;
  int order; // always assigned: the loop below runs at least once
  while (lo < hi)
    {
      readEntry(index_jump(lo,hi,.5),probe);
      order = corpus->cmp(probe,refStart,refLen,d);
      if (order < 0) lo = probe.next;
      else           hi = probe.pos;
    }
  assert(lo==hi);
  if (lo < upX)
    {
      // re-read: the last probe need not be the entry we converged on
      readEntry(lo,probe);
      order = corpus->cmp(probe,refStart,refLen,d);
    }
  // return (order >= 0) ? lo : NULL;
  if (order == 0 || order == 1) return lo;
  return NULL;
}
//---------------------------------------------------------------------------
/** Search for the upper bound (first entry beyond the match) of the key
 *  [refStart,refStart+refLen) within the index range [lo,upX).
 *
 *  Entries are probed at 10% into the remaining range; an entry counts as
 *  "beyond" exactly when corpus->cmp() yields 2.
 *  @param d passed through to corpus->cmp() unchanged
 *  @return position of the first entry for which cmp() yields 2; /upX/ if
 *          there is none; NULL if the range is empty
 */
template<typename TKN>
char const*
TSA<TKN>::
find_end(char const* lo, char const* const upX,
         TKN const* const refStart, int refLen,
         size_t d) const
{
  if (lo >= upX) return NULL;
  char const* hi = upX;
  ArrayEntry probe;
  int order; // always assigned: the loop below runs at least once
  while (lo < hi)
    {
      readEntry(index_jump(lo,hi,.1),probe);
      order = corpus->cmp(probe,refStart,refLen,d);
      if (order != 2) lo = probe.next;
      else            hi = probe.pos;
    }
  assert(lo==hi);
  if (lo < upX)
    {
      // re-read: the last probe need not be the entry we converged on
      readEntry(lo,probe);
      order = corpus->cmp(probe,refStart,refLen,d);
    }
  if (order == 2) return hi;
  return upX;
}
//---------------------------------------------------------------------------
/** Binary search, within the index range [lo,upX), for the first entry
 *  that has [refStart,refStart+refLen) as a prefix but continues on.
 *
 *  @param d passed through to corpus->cmp() unchanged
 *  @return position of the first entry for which corpus->cmp() yields 1;
 *          NULL if there is none or if the range is empty
 */
template<typename TKN>
char const*
TSA<TKN>::
find_longer(char const* lo, char const* const upX,
            TKN const* const refStart, int refLen,
            size_t d) const
{
  char const* up = upX;
  if (lo >= up) return NULL;
  int x; // always assigned: the loop below runs at least once
  ArrayEntry I;
  while (lo < up)
    {
      readEntry(index_jump(lo,up,.5),I);
      x = corpus->cmp(I,refStart,refLen,d);
      if (x > 0) up = I.pos;
      else       lo = I.next;
    }
  assert(lo==up);
  if (lo < upX)
    {
      // Read the entry we converged on directly, as find_start() and
      // find_end() do. This used to go through index_jump(lo,up,.5) with
      // lo == up, i.e. asked for a position inside an empty range.
      readEntry(lo,I);
      x = corpus->cmp(I,refStart,refLen,d);
    }
  return (x == 1) ? up : NULL;
}
//---------------------------------------------------------------------------
/** returns the start position in the byte array representing
 *  the tightly packed sorted list of corpus positions for the
 *  given search phrase [keyStart,keyStop)
 *
 *  Iterator front end for the pointer/length overload.
 *  @return start of the matching index range; /startArray/ for an empty
 *          phrase; NULL if the phrase does not occur
 */
template<typename TKN>
char const*
TSA<TKN>::
lower_bound(typename vector<TKN>::const_iterator const& keyStart,
            typename vector<TKN>::const_iterator const& keyStop) const
{
  // keyStop may be the past-the-end iterator and must not be dereferenced;
  // for an empty phrase the same holds for keyStart. The pointer/length
  // overload returns startArray for length 0, so do the same here.
  if (keyStart == keyStop) return startArray;
  TKN const* const a = &(*keyStart);
  return lower_bound(a,static_cast<int>(keyStop-keyStart));
}
//---------------------------------------------------------------------------
/** Pointer-range front end of lower_bound(): turns [keyStart,keyStop)
 *  into a start pointer plus token count and forwards to the
 *  pointer/length overload.
 */
template<typename TKN>
char const*
TSA<TKN>::
lower_bound(TKN const* const keyStart,
            TKN const* const keyStop) const
{
  int const keyLen = keyStop-keyStart;
  return lower_bound(keyStart,keyLen);
}
/** Find the start of the index range matching the phrase
 *  [keyStart,keyStart+keyLen).
 *
 *  The search is confined to the root-level range of the phrase's first
 *  token, as given by getLowerBound()/getUpperBound().
 *  @return start of the matching range; /startArray/ if keyLen is 0;
 *          whatever find_start() reports (NULL if nothing matches) otherwise
 */
template<typename TKN>
char const*
TSA<TKN>::
lower_bound(TKN const* const keyStart, int keyLen) const
{
  if (keyLen == 0) return startArray;
  id_type const firstId = keyStart->id();
  char const* const rangeLo = getLowerBound(firstId);
  char const* const rangeHi = getUpperBound(firstId);
  return find_start(rangeLo,rangeHi,keyStart,keyLen,0);
}
//---------------------------------------------------------------------------
/** returns the upper bound in the byte array representing
 *  the tightly packed sorted list of corpus positions for the
 *  given search phrase [keyStart,keyStop) (i.e., points just beyond
 *  the range)
 *
 *  Iterator front end for the pointer/length overload.
 *  @return end of the matching index range; arrayEnd() for an empty phrase
 */
template<typename TKN>
char const*
TSA<TKN>::
upper_bound(typename vector<TKN>::const_iterator const& keyStart,
            typename vector<TKN>::const_iterator const& keyStop) const
{
  // Fixed: this used to take '&((TKN)*keyStart)' and '&((TKN)*keyStop)',
  // i.e. the addresses of two temporary copies, and pass their difference
  // on as the key length. Point at the real first token and take the
  // length from the iterators. keyStop may be past-the-end and is never
  // dereferenced; an empty phrase yields arrayEnd(), as in the
  // pointer/length overload.
  if (keyStart == keyStop) return arrayEnd();
  TKN const* const a = &(*keyStart);
  return upper_bound(a,static_cast<int>(keyStop-keyStart));
}
//---------------------------------------------------------------------------
/** Find the end (one past the last entry) of the index range matching the
 *  phrase [keyStart,keyStart+keyLength).
 *
 *  The search is confined to the root-level range of the phrase's first
 *  token, as given by getLowerBound()/getUpperBound().
 *  @return arrayEnd() if keyLength is 0; whatever find_end() reports
 *          otherwise
 */
template<typename TKN>
char const*
TSA<TKN>::
upper_bound(TKN const* keyStart, int keyLength) const
{
  if (keyLength == 0) return arrayEnd();
  id_type const firstId = keyStart->id();
  char const* const rangeLo = getLowerBound(firstId);
  char const* const rangeHi = getUpperBound(firstId);
  return find_end(rangeLo,rangeHi,keyStart,keyLength,0);
}
//---------------------------------------------------------------------------
/** Raw occurrence count of the phrase [keyStart,keyStart+keyLen).
 *
 *  Looks up the matching index range and hands it to rawCnt().
 *  @return number of occurrences; 0 if the phrase does not occur
 */
template<typename TKN>
count_type
TSA<TKN>::
rawCnt2(TKN const* keyStart, size_t keyLen) const
{
  char const* const lo = lower_bound(keyStart,keyLen);
  // lower_bound() reports "not found" as NULL (cf. getSequenceId()), while
  // upper_bound() may still return a valid position; [NULL,up) is not a
  // range rawCnt() can meaningfully count.
  if (!lo) return 0;
  char const* const up = upper_bound(keyStart,keyLen);
  return rawCnt(lo,up);
}
//---------------------------------------------------------------------------
/** Iterator front end of getSequenceId(): forwards the address of the
 *  first token and the phrase length (pstop-pstart, converted to ushort)
 *  to the pointer/length overload.
 */
template<typename TKN>
uint64_t
TSA<TKN>::
getSequenceId(typename vector<TKN>::const_iterator const& pstart,
              typename vector<TKN>::const_iterator const& pstop) const
{
  TKN const* const first = &(*pstart);
  ushort const plen = pstop-pstart;
  return getSequenceId(first,plen);
}
//---------------------------------------------------------------------------
/** Compute the phrase ID of [pstart,pstart+plen).
 *
 *  The ID is built from the first index entry matching the phrase:
 *  sentence ID, then 16 bits of offset, then 16 bits of phrase length.
 *  @return the phrase ID, or 0 if the phrase is not found
 */
template<typename TKN>
uint64_t
TSA<TKN>::
getSequenceId(TKN const* pstart, ushort plen) const
{
  char const* const entryPos = lower_bound(pstart,plen);
  if (entryPos == NULL) return 0; // not found!
  ArrayEntry entry;
  readEntry(entryPos,entry);
  uint64_t id = entry.sid;
  id = (id << 16) + entry.offset;
  id = (id << 16) + plen;
  return id;
}
//---------------------------------------------------------------------------
/** Decode phrase ID /pid/ into the tokens it stands for.
 *
 *  The lowest 16 bits of /pid/ hold the phrase length, the next 16 bits
 *  the offset within the sentence, and the bits above that the sentence ID.
 *  @return a copy of the phrase's tokens, read from the corpus
 */
template<typename TKN>
vector<TKN>
TSA<TKN>::
getSequence(uint64_t pid) const
{
  size_t const phraseLen = pid & 0xFFFF;
  size_t const offset = (pid >> 16) & 0xFFFF;
  TKN const* tok = corpus->sntStart(pid >> 32)+offset;
  vector<TKN> phrase(phraseLen);
  size_t filled = 0;
  while (filled < phraseLen)
    {
      assert(tok);
      phrase[filled] = *tok;
      ++filled;
      tok = tok->next();
    }
  return phrase;
}
/** Render the phrase represented by phrase ID /pid/ as a string.
 *
 *  @param pid phrase ID as produced by getSequenceId()
 *  @param V   vocabulary used to map token IDs to strings
 *  @return the phrase's tokens, separated by single spaces; the empty
 *          string if the length field of /pid/ is 0
 */
template<typename TKN>
string
TSA<TKN>::
getSequence(uint64_t pid, TokenIndex const& V) const
{
  size_t const len = getSequenceLength(pid);
  // A zero length used to print the first token anyway and then wrap the
  // unsigned counter in '--len>0', running far past the phrase.
  if (len == 0) return string();
  ostringstream buf;
  TKN const* a = getSequenceStart(pid);
  buf << V[a->id()];
  // advance only when another token is actually needed
  for (size_t i = 1; i < len; ++i)
    {
      a = a->next();
      buf << " " << V[a->id()];
    }
  return buf.str();
}
//---------------------------------------------------------------------------
/** Locate the first token of the phrase with ID /pid/: the start of the
 *  sentence given by the bits above the lowest 32, advanced by the offset
 *  stored in bits 16-31.
 */
template<typename TKN>
TKN const*
TSA<TKN>::
getSequenceStart(uint64_t pid) const
{
  size_t const offset = (pid >> 16) & 0xFFFF;
  TKN const* const sentenceStart = corpus->sntStart(pid >> 32);
  return sentenceStart+offset;
}
//---------------------------------------------------------------------------
/** Extract the phrase length from phrase ID /pid/ (its lowest 16 bits). */
template<typename TKN>
ushort
TSA<TKN>::
getSequenceLength(uint64_t pid) const
{
  return static_cast<ushort>(pid & 0xFFFF);
}
//---------------------------------------------------------------------------
/** Accessor for the /corpusSize/ member. */
template<typename TKN>
size_t
TSA<TKN>::
getCorpusSize() const
{
  return this->corpusSize;
}
//---------------------------------------------------------------------------
/** Accessor for the underlying corpus.
 *  @return the raw pointer held by the /corpus/ shared pointer; ownership
 *          stays with this object
 */
template<typename TKN>
Ttrack<TKN> const*
TSA<TKN>::
getCorpus() const
{
  Ttrack<TKN> const* const raw = corpus.get();
  return raw;
}
//---------------------------------------------------------------------------
/** Decode the index entry at position /p/ into /I/.
 *
 *  Sets I.pos to /p/, I.sid and I.offset to the decoded sentence ID and
 *  offset, and I.next to the position right after the entry. In debug
 *  builds, the decoded values are checked against the corpus.
 *  @return /I/, to allow chaining
 */
template<typename TKN>
tsa::ArrayEntry &
TSA<TKN>::
readEntry(char const* p, tsa::ArrayEntry& I) const
{
  I.pos = p;
  p = readSid(p,endArray,I.sid);
  I.next = readOffset(p,endArray,I.offset);
  assert(I.sid < corpus->size());
  assert(I.offset < corpus->sntLen(I.sid));
  return I;
}
//---------------------------------------------------------------------------
/// Return the bit set of sentences that contain the phrase
/// [startKey,startKey+keyLen): served from the cache /bsc/ when one is
/// set, otherwise computed on the fly with fillBitSet().
template<typename TKN>
typename TSA<TKN>::bitset_pointer
TSA<TKN>::
getBitSet(TKN const* startKey, size_t keyLen) const
{
  bitset_pointer result;
  if (bsc == NULL)
    {
      // no cache: build a fresh bit vector for this request
      result.reset(new bitvector(corpus->size()));
      fillBitSet(startKey,keyLen,*result);
    }
  else
    result = bsc->get(startKey,keyLen);
  return result;
}
//---------------------------------------------------------------------------
/** Mark the corpus positions of the occurrences listed in the index range
 *  [lo,up) in /bitset/.
 *
 *  Bit positions are token positions relative to corpus->sntStart(0).
 *  @param len                   number of tokens per occurrence
 *  @param markOnlyStartPosition if true, mark only the first token of each
 *                               occurrence; otherwise mark all /len/ tokens
 *  @return the number of index entries visited
 */
template<typename TKN>
size_t
TSA<TKN>::
markOccurrences(char const* lo, char const* up, size_t len,
                bitvector& bitset, bool markOnlyStartPosition) const
{
  TKN const* const corpusStart = corpus->sntStart(0);
  count_type numEntries = 0;
  id_type sentenceId;
  ushort offset;
  for (char const* cursor = lo; cursor < up; ++numEntries)
    {
      cursor = readSid(cursor,up,sentenceId);
      cursor = readOffset(cursor,up,offset);
      TKN const* tok = corpus->sntStart(sentenceId)+offset;
      if (!markOnlyStartPosition)
        {
          for (size_t remaining = len; remaining > 0; --remaining)
            {
              bitset.set(tok-corpusStart);
              tok = tok->next();
            }
        }
      else
        bitset.set(tok-corpusStart);
    }
  return numEntries;
}
#if 1
/** Build one tree iterator per bit set in /terminals/.
 *
 *  For the k-th set bit i, dest[k] is extended with the IDs of the tokens
 *  starting at base+i, following next() until a NULL token or a token with
 *  ID 0 is reached. On success, /dest/ is sorted with the iterator's
 *  SortByApproximateCount ordering.
 *  @return false as soon as an iterator cannot be extended (in which case
 *          /dest/ is left partially filled and unsorted), true otherwise
 */
template<typename TKN>
bool
TSA<TKN>::
findBranches(TKN const* base, bitvector const& terminals,
             vector<tree_iterator>& dest) const
{
  dest.assign(terminals.count(),tree_iterator(this));
  size_t slot = 0;
  size_t pos = terminals.find_first();
  while (pos < terminals.size())
    {
      TKN const* tok = base+pos;
      while (tok && tok->id())
        {
          if (!dest[slot].extend(tok->id()))
            return false;
          tok = tok->next();
        }
      pos = terminals.find_next(pos);
      ++slot;
    }
  typename tree_iterator::SortByApproximateCount sorter;
  sort(dest.begin(),dest.end(),sorter);
  return true;
}
#endif
}
#endif