open-source-search-engine/Posdb.h
Matt Wells 8a49e87a61 got code with shard rebalancing compiling.
now we store a "sharded by termid" bit in posdb
key for checksums, etc keys that are not sharded
by docid. save having to do disk seeks on every
host in the cluster to do a dup check, etc.
2014-01-11 16:08:42 -08:00

848 lines
24 KiB
C++

// Matt Wells, Copyright May 2012
// . format of an 18-byte posdb key
// tttttttt tttttttt tttttttt tttttttt t = termId (48bits)
// tttttttt tttttttt dddddddd dddddddd d = docId (38 bits)
// dddddddd dddddddd dddddd0r rrrggggg r = siterank, g = langid
// wwwwwwww wwwwwwww wwGGGGss ssvvvvFF w = word position , s = wordspamrank
// pppppb1N MMMMLZZD v = diversityrank, p = densityrank
// M = multiplier, b = in outlink text
// L = langIdShiftBit (upper bit)
// G: 0 = body
// 1 = intitletag
// 2 = inheading
// 3 = inlist
// 4 = inmetatag
// 5 = inlinktext
// 6 = tag
// 7 = inneighborhood
// 8 = internalinlinktext
// 9 = inurl
//
// F: 0 = original term
// 1 = conjugate/sing/plural
// 2 = synonym
// 3 = hyponym
// NOTE: N bit is 1 if the shard of the record is determined by the
// termid (t bits) and NOT the docid (d bits). N stands for "nosplit"
// and you can find that logic in XmlDoc.cpp and Msg4.cpp. We store
// the hash of the content like this so we can see if it is a dup.
// NOTE: M bits hold scaling factor (logarithmic) for link text voting
// so we do not need to repeat the same link text over and over again.
// Use M bits to hold # of inlinks the page has for other terms.
// NOTE: for inlinktext terms the pattern rank is the siterank of the
// inlinker!
// NOTE: densityrank for title is based on # of title words only. same goes
// for incoming inlink text.
// NOTE: now we can b-step into the termlist looking for a docid match
// and not worry about misalignment from the double compression scheme
// because if the 6th byte's low bit is clear that means its a docid
// 12-byte key, otherwise its the word position 6-byte key since the delbit
// can't be clear for those!
// THEN we can play with a tuner for how these various things affect
// the search results ranking.
#ifndef _POSDB_H_
#define _POSDB_H_
#include "Rdb.h"
#include "Conf.h"
//#include "Indexdb.h"
#include "Titledb.h" // DOCID_MASK
#include "HashTableX.h"
#include "Sections.h"
// field limits for the components packed into the posdb key
// (widths match the key layout diagram at the top of this file)
#define MAXSITERANK 0x0f // 4 bits
#define MAXLANGID 0x3f // 6 bits (5 bits go in 'g' the other in 'L')
#define MAXWORDPOS 0x0003ffff // 18 bits
#define MAXDENSITYRANK 0x1f // 5 bits
#define MAXWORDSPAMRANK 0x0f // 4 bits
#define MAXDIVERSITYRANK 0x0f // 4 bits
#define MAXHASHGROUP 0x0f // 4 bits
#define MAXMULTIPLIER 0x0f // 4 bits
#define MAXISSYNONYM 0x03 // 2 bits
// values for G bits in the posdb key: which part of the document
// (or which external source) the term was hashed from
#define HASHGROUP_BODY 0 // body implied
#define HASHGROUP_TITLE 1
#define HASHGROUP_HEADING 2 // body implied
#define HASHGROUP_INLIST 3 // body implied
#define HASHGROUP_INMETATAG 4
#define HASHGROUP_INLINKTEXT 5
#define HASHGROUP_INTAG 6
#define HASHGROUP_NEIGHBORHOOD 7
#define HASHGROUP_INTERNALINLINKTEXT 8
#define HASHGROUP_INURL 9
#define HASHGROUP_INMENU 10 // body implied
#define HASHGROUP_END 11
// map a rank value (0..MAX*RANK above) to its scoring weight
float getDiversityWeight ( unsigned char diversityRank );
float getDensityWeight ( unsigned char densityRank );
float getWordSpamWeight ( unsigned char wordSpamRank );
// for inlink text the wordSpamRank field holds the inlinker's siterank
// (see the NOTE in the key diagram above)
float getLinkerWeight ( unsigned char wordSpamRank );
// human-readable name of a HASHGROUP_* value, for debug output
char *getHashGroupString ( unsigned char hg );
float getHashGroupWeight ( unsigned char hg );
// weight a term by its rarity in the collection
float getTermFreqWeight ( long long termFreq , long long numDocsInColl );
#define SYNONYM_WEIGHT 0.90
#define WIKI_WEIGHT 0.10 // was 0.20
#define SITERANKDIVISOR 3.0
#define SITERANKMULTIPLIER 0.33333333
#define SAMELANGMULT 20.0 // FOREIGNLANGDIVISOR 2.0
// a full posdb key is 144 bits = 18 bytes
#define POSDBKEY key144_t
// per-query-term bit flags (stored in PosdbTable::m_bflags)
#define BF_HALFSTOPWIKIBIGRAM 0x01 // "to be" in "to be or not to be"
#define BF_PIPED 0x02 // before a query pipe operator
#define BF_SYNONYM 0x04
#define BF_NEGATIVE 0x08 // query word has a negative sign before it
#define BF_BIGRAM 0x10 // query term is a bigram of two adjacent words
#define BF_NUMBER 0x20 // is it like gbsortby:price? numeric?
// debug-print the posdb keys in "list"
void printTermList ( long i, char *list, long listSize ) ;
// if query is 'the tigers' we weight bigram "the tigers" x 1.20 because
// its in wikipedia.
// up this to 1.40 for 'the time machine' query
#define WIKI_BIGRAM_WEIGHT 1.40
// . Posdb: the positional term index (an Rdb)
// . each full 18-byte key encodes (termid,docid,wordpos) plus the ranking
//   attributes diagrammed at the top of this file
// . within a termlist, keys are compressed to 12 bytes (termid implied)
//   or 6 bytes (termid and docid implied) -- see getKeySize()
class Posdb {
public:
// resets rdb
void reset();
// sets up our m_rdb from g_conf (global conf class)
bool init ( );
// init the rebuild/secondary rdb, used by PageRepair.cpp
bool init2 ( long treeMem );
// sanity-check the posdb records stored for collection "coll"
bool verify ( char *coll );
// register collection "coll" with our rdb
bool addColl ( char *coll, bool doVerify = true );
// . xmldoc.cpp should call this
// . store all posdb keys from revdbList into one hashtable
// and only add to new list if not in there
//bool makeList ( class RdbList *revdbList ,
// long long docId ,
// class Words *words );
// . make a full 18-byte key from all these components
// . NOTE(review): this comment used to say "16-byte"; POSDBKEY is
//   key144_t (144 bits = 18 bytes) and getKeySize() returns 18 for an
//   uncompressed key
void makeKey ( void *kp ,
long long termId ,
unsigned long long docId ,
long wordPos ,
char densityRank ,
char diversityRank ,
char wordSpamRank ,
char siteRank ,
char hashGroup ,
char langId ,
// multiplier: we convert into 7 bits in this function
long multiplier ,
bool isSynonym ,
bool isDelKey ,
bool shardByTermId );
// make just the 6 byte key (wordpos + ranks; termid/docid implied by
// the preceding 12-byte key in the list)
void makeKey48 ( char *kp ,
long wordPos ,
char densityRank ,
char diversityRank ,
char wordSpamRank ,
char hashGroup ,
char langId ,
bool isSynonym ,
bool isDelKey );
// . overwrite the 4 multiplier (M) bits in the key
// . NOTE(review): the 0xfc0f mask clears bits 4-9 of n0, but mbits only
//   restores bits 4-7; per the key diagram, bits 8 and 9 are the
//   shard-by-termid (N) and alignment bits, which this silently clears.
//   0xff0f would clear only the M bits -- confirm whether callers set
//   those bits after calling this.
void setMultiplierBits ( void *vkp , unsigned char mbits ) {
key144_t *kp = (key144_t *)vkp;
// deliberate crash if mbits exceeds the 4-bit field
if ( mbits > MAXMULTIPLIER ) { char *xx=NULL;*xx=0; }
kp->n0 &= 0xfc0f;
// map score to bits
kp->n0 |= ((unsigned short)mbits) << 4;
}
// overwrite the docid (d) bits; the docid straddles the n1/n2 boundary
void setDocIdBits ( void *vkp , unsigned long long docId ) {
key144_t *kp = (key144_t *)vkp;
// clear the top 22 bits of n1 ...
kp->n1 &= 0x000003ffffffffffLL;
// ... and store the low 22 docid bits there
kp->n1 |= (docId<<(32+10));
// clear the low 16 bits of n2 ...
kp->n2 &= 0xffffffffffff0000LL;
// ... and store the remaining high docid bits there
kp->n2 |= docId>>22;
}
// overwrite the 4 siterank (r) bits (bits 37-40 of n1)
void setSiteRankBits ( void *vkp , char siteRank ) {
key144_t *kp = (key144_t *)vkp;
// deliberate crash if siteRank exceeds the 4-bit field
if ( siteRank > MAXSITERANK ) { char *xx=NULL;*xx=0; }
kp->n1 &= 0xfffffe1fffffffffLL;
kp->n1 |= ((unsigned long long)siteRank)<<(32+5);
}
// overwrite the 6 langid bits; 5 live in n1, the 6th in n0 (the L bit)
void setLangIdBits ( void *vkp , char langId ) {
key144_t *kp = (key144_t *)vkp;
// deliberate crash if langId exceeds the 6-bit field
if ( langId > MAXLANGID ) { char *xx=NULL;*xx=0; }
kp->n1 &= 0xffffffe0ffffffffLL;
// put the lower 5 bits here
kp->n1 |= ((unsigned long long)(langId&0x1f))<<(32);
// and the upper 6th bit here. n0 is a short.
// 0011 1111
if ( langId & 0x20 ) kp->n0 |= 0x08;
}
// . set the word position bits et al to this float
// . HACK: reinterprets bytes 2-5 of the key as a raw float --
//   presumably for numeric terms like gbsortby:price (see BF_NUMBER);
//   confirm against the hashing code in XmlDoc.cpp
void setFloat ( void *vkp , float f ) {
*(float *)(((char *)vkp) + 2) = f; };
// and read the float as well
float getFloat ( void *vkp ) {
return *(float *)(((char *)vkp) + 2); };
// set/clear bit 0x02 of byte 1, used to tell 6-byte keys apart from
// the tail half of 12-byte keys when b-stepping (see getWordPosList())
void setAlignmentBit ( void *vkp , char val ) {
char *p = (char *)vkp;
if ( val ) p[1] = p[1] | 0x02;
else p[1] = p[1] & 0xfd;
};
bool isAlignmentBitClear ( void *vkp ) {
return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 );
};
// . make the smallest key for this termid (and optionally docid)
// . use as the start of a termlist read range
void makeStartKey ( void *kp, long long termId ,
long long docId=0LL){
return makeKey ( kp,
termId ,
docId,
0, // wordpos
0, // density
0, // diversity
0, // wordspam
0, // siterank
0, // hashgroup
0, // langid
0, // multiplier
0, // issynonym/etc.
true , // isdelkey
false ); // shardbytermid?
};
// . make the largest key for this termid (and optionally docid)
// . use as the end of a termlist read range; all fields are maxed so
//   the range covers every key for the termid
void makeEndKey ( void *kp,long long termId,
long long docId = MAX_DOCID ) {
return makeKey ( kp,
termId ,
docId,
MAXWORDPOS,
MAXDENSITYRANK,
MAXDIVERSITYRANK,
MAXWORDSPAMRANK,
MAXSITERANK,
MAXHASHGROUP,
MAXLANGID,
MAXMULTIPLIER,
MAXISSYNONYM, // issynonym/etc.
false, // isdelkey
true);// shard by termid?
};
// we got two compression bits!
// byte 0: bit 0x04 set -> 6-byte key, bit 0x02 set -> 12-byte key,
// neither -> full 18-byte key
unsigned char getKeySize ( void *key ) {
if ( (((char *)key)[0])&0x04 ) return 6;
if ( (((char *)key)[0])&0x02 ) return 12;
return 18;
};
// PosdbTable uses this to skip from one docid to the next docid
// in a posdblist
char *getNextDocIdSublist ( char *p , char *listEnd ) {
// key must be 12
//if ( getKeySize(p) != 12 ) { char *xx=NULL;*xx=0; }
// skip that first key
p += 12;
// skip the 6 byte keys
for ( ; p < listEnd && getKeySize(p) == 6 ; p += 6 );
// done
return p;
}
// extract the 48 termid (t) bits from a full 18-byte key
long long getTermId ( void *key ) {
return ((key144_t *)key)->n2 >> 16;
};
// extract the docid (d) bits; reads bytes 7-11 directly so it works on
// 12-byte keys too, where n1/n2 offsets would not line up
long long getDocId ( void *key ) {
unsigned long long d = 0LL;
d = ((unsigned char *)key)[11];
d <<= 32;
d |= *(unsigned long *)(((unsigned char *)key)+7);
d >>= 2;
return d;
//long long d = ((key144_t *)key)->n2 & 0xffff;
//d <<= 22;
//d |= ((key144_t *)key)->n1 >> (32+8+2);
//return d;
};
// extract the 4 siterank (r) bits
unsigned char getSiteRank ( void *key ) {
return (((key144_t *)key)->n1 >> 37) & MAXSITERANK;
};
// extract the 6 langid bits (5 from n1 plus the L bit in byte 0)
unsigned char getLangId ( void *key ) {
if ( ((char *)key)[0] & 0x08 )
return ((((key144_t *)key)->n1 >> 32) & 0x1f) | 0x20;
else
return ((((key144_t *)key)->n1 >> 32) & 0x1f) ;
};
// extract the 4 hashgroup (G) bits (a HASHGROUP_* value)
unsigned char getHashGroup ( void *key ) {
//return (((key144_t *)key)->n1 >> 10) & MAXHASHGROUP;
return ((((unsigned char *)key)[3]) >>2) & MAXHASHGROUP;
};
// extract the 18 word position (w) bits
long getWordPos ( void *key ) {
//return (((key144_t *)key)->n1 >> 14) & MAXWORDPOS;
return (*((unsigned long *)((unsigned char *)key+2))) >> (8+6);
};
// overwrite the 18 word position bits: low 2 bits go into the top of
// byte 3, the remaining 16 into bytes 4 and 5
inline void setWordPos ( char *key , unsigned long wpos ) {
// truncate
wpos &= MAXWORDPOS;
if ( wpos & 0x01 ) key[3] |= 0x40;
else key[3] &= ~((unsigned char)0x40);
if ( wpos & 0x02 ) key[3] |= 0x80;
else key[3] &= ~((unsigned char)0x80);
wpos >>= 2;
key[4] = ((char *)&wpos)[0];
key[5] = ((char *)&wpos)[1];
};
// extract the 4 wordspamrank (s) bits (the inlinker's siterank for
// inlink text -- see the NOTE at the top of this file)
unsigned char getWordSpamRank ( void *key ) {
//return (((key144_t *)key)->n1 >> 6) & MAXWORDSPAMRANK;
return ((((unsigned short *)key)[1]) >>6) & MAXWORDSPAMRANK;
};
// extract the 4 diversityrank (v) bits
unsigned char getDiversityRank ( void *key ) {
//return (((key144_t *)key)->n1 >> 2) & MAXDIVERSITYRANK;
return ((((unsigned char *)key)[2]) >>2) & MAXDIVERSITYRANK;
};
// extract the 2 synonym (F) bits (0=original, see diagram above)
unsigned char getIsSynonym ( void *key ) {
return (((key144_t *)key)->n1 ) & 0x03;
};
// extract the b bit (see BF_HALFSTOPWIKIBIGRAM)
unsigned char getIsHalfStopWikiBigram ( void *key ) {
return ((char *)key)[2] & 0x01;
};
// extract the 5 densityrank (p) bits
unsigned char getDensityRank ( void *key ) {
return ((*(unsigned short *)key) >> 11) & MAXDENSITYRANK;
};
// overwrite the 5 densityrank bits in the top of byte 1
inline void setDensityRank ( char *key , unsigned char dr ) {
// shift up
dr <<= 3;
// clear out
key[1] &= 0x07;
// or in
key[1] |= dr;
};
// the N bit: record is sharded by termid, not docid (see NOTE above)
char isShardedByTermId ( void *key ){return ((char *)key)[1] & 0x01; };
void setShardedByTermIdBit ( void *key ) {
char *k = (char *)key;
k[1] |= 0x01;
};
// extract the 4 multiplier (M) bits
unsigned char getMultiplier ( void *key ) {
return ((*(unsigned short *)key) >> 4) & MAXMULTIPLIER; };
// . HACK: for sectionhash:xxxxx posdb keys
// . we use the w,G,s,v and F bits
unsigned long getSectionSiteHash32 ( void *key ) {
return *(unsigned long *)(((char *)key)+2); };
void setSectionSiteHash32 ( void *key , long siteHash32 ) {
*(unsigned long *)(((char *)key)+2) = siteHash32; };
// how many docs in "coll" contain this termid
long long getTermFreq ( char *coll , long long termId ) ;
//RdbCache *getCache ( ) { return &m_rdb.m_cache; };
Rdb *getRdb ( ) { return &m_rdb; };
// the underlying rdb that stores and merges the posdb keys
Rdb m_rdb;
DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
// page cache for posdb disk reads
DiskPageCache m_pc;
};
/*
#include "RdbList.h"
class PosdbList : public RdbList {
public:
// why do i have to repeat this for LinkInfo::set() calling our set()??
void set ( char *list , long listSize , bool ownData ) {
RdbList::set ( list ,
listSize ,
list , // alloc
listSize , // alloc size
0 , // fixed data size
ownData ,
true , // use half keys?
sizeof(key_t));// 12 bytes per key
};
// clear the low bits on the keys so terms are DELETED
void clearDelBits ( );
void print();
// . these are made for special IndexLists, too
// . getTermId() assumes as 12 byte key
long long getCurrentTermId12 ( ) {
return getTermId12 ( m_listPtr ); };
long long getTermId12 ( char *rec ) {
return (*(unsigned long long *)(&rec[4])) >> 16 ;
};
long long getTermId16 ( char *rec ) {
return (*(unsigned long long *)(&rec[8])) >> 16 ;
};
// these 2 assume 12 and 6 byte keys respectively
long long getCurrentDocId () {
if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
else return getDocId12(m_listPtr);
};
long long getDocId ( char *rec ) {
if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
else return getDocId12(rec);
};
long long getCurrentDocId12 ( ) {
return getDocId12 ( m_listPtr ); };
long long getDocId12 ( char *rec ) {
return ((*(unsigned long long *)(rec)) >> 2) & DOCID_MASK; };
long long getDocId6 ( char *rec ) {
long long docid;
*(long *)(&docid) = *(long *)rec;
((char *)&docid)[4] = rec[4];
docid >>= 2;
return docid & DOCID_MASK;
};
// this works with either 12 or 6 byte keys
unsigned char getCurrentScore ( ) {
return getScore(m_listPtr); };
unsigned char getScore ( char *rec ) { return ~rec[5]; };
// uncomplemented...
void setScore ( char *rec , char score ) { rec[5] = score; };
// for date lists only...
long getCurrentDate ( ) { return ~*(long *)(m_listPtr+6); };
};
*/
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
// max # search results that can be viewed without using TopTree
//#define MAX_RESULTS 1000
// PosdbTable: intersects the posdb termlists for a query and scores
// each docid, filling a TopTree with the winners. the heavy lifting is
// intersectLists10_r(), which runs in a thread.
class PosdbTable {
public:
// . returns false on error and sets errno
// . "termFreqs" are 1-1 with q->m_qterms[]
// . sets m_q to point to q
void init (Query *q ,
char debug ,
void *logstate ,
class TopTree *topTree ,
char *coll ,
//IndexList *lists ,
//long numLists ,
class Msg2 *msg2,
class Msg39Request *r );
// pre-allocate m_whiteListTable
bool allocWhiteListTable ( ) ;
// pre-allocate memory since intersection runs in a thread
bool allocTopTree ( );
// . returns false on error and sets errno
// . we assume there are "m_numTerms" lists passed in (see set() above)
//void intersectLists_r ( );
//void intersectLists9_r ( );
// score the proximity of query terms i and j when at least one of them
// is outside the document body (title, link text, etc.)
void getTermPairScoreForNonBody ( long i, long j,
char *wpi, char *wpj,
char *endi, char *endj,
long qdist ,
float *retMax );
// score a single query term's occurrences in the doc; appends
// transparent scoring info to "pdcs" if non-NULL
float getSingleTermScore ( long i, char *wpi , char *endi,
class DocIdScore *pdcs,
char **bestPos );
// slide a window over the term positions to find the best-scoring
// cluster of query terms in the body
void evalSlidingWindow ( char **ptrs ,
long nr ,
char **bestPos ,
float *scoreMatrix ,
long advancedTermNum );
// pair score when the distance is fixed (see FIXED_DISTANCE below)
float getTermPairScoreForWindow ( long i, long j,
char *wpi,
char *wpj,
long fixedDistance
);
// pair score over all occurrences of terms i and j in the doc
float getTermPairScoreForAny ( long i, long j,
char *wpi, char *wpj,
char *endi, char *endj,
class DocIdScore *pdcs );
// some generic stuff
PosdbTable();
~PosdbTable();
void reset();
// Msg39 needs to call these
void freeMem ( ) ;
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
// docid currently being scored
unsigned long long m_docId;
unsigned long long m_docIdHack;
// hack for seo.cpp:
float m_finalScore;
float m_preFinalScore;
// how long to add the last batch of lists
long long m_addListsTime;
long long m_t1 ;
long long m_t2 ;
long long m_estimatedTotalHits;
long m_errno;
long m_numSlots;
long m_maxScores;
char *m_coll;
collnum_t m_collnum;
// per-query-term arrays, 1-1 with Query::m_qterms[]:
// position of each term in the query
long *m_qpos;
// wikipedia phrase each term belongs to, if any
long *m_wikiPhraseIds;
// start of the quoted phrase each term belongs to, if any
long *m_quotedStartIds;
//class DocIdScore *m_ds;
long m_qdist;
// term frequency weights (see getTermFreqWeight())
float *m_freqWeights;
//long long *m_freqs;
// BF_* flags per query term
char *m_bflags;
long *m_qtermNums;
float m_bestWindowScore;
//char **m_finalWinners1;
//char **m_finalWinners2;
//float *m_finalScores;
char **m_windowTermPtrs;
// how many docs in the collection?
long long m_docsInColl;
SectionStats m_sectionStats;
SafeBuf m_siteHashList;
HashTableX m_dt;
// fetched the termlists we intersect
class Msg2 *m_msg2;
// if getting more than MAX_RESULTS results, use this top tree to hold
// them rather than the m_top*[] arrays above
class TopTree *m_topTree;
//HashTableX m_docIdTable;
// transparent per-docid scoring info (see DocIdScore below)
SafeBuf m_scoreInfoBuf;
SafeBuf m_pairScoreBuf;
SafeBuf m_singleScoreBuf;
//SafeBuf m_mergeBuf;
// a reference to the query
Query *m_q;
// these are NOT in imap space, but in query term space, 1-1 with
// Query::m_qterms[]
//IndexList *m_lists;
//long m_numLists;
// has init() been called?
bool m_initialized;
// are we in debug mode?
char m_debug;
// for debug msgs
long m_logstate;
//long long m_numDocsInColl;
class Msg39Request *m_r;
// query term # of a gbsortby: style term, if any (see BF_NUMBER)
long m_sortByTermNum;
// the new intersection/scoring algo
void intersectLists10_r ( );
// restrict results to these sites when non-empty
HashTableX m_whiteListTable;
bool m_useWhiteTable;
bool m_addedSites;
// sets stuff used by intersect10_r()
bool setQueryTermInfo ( );
void shrinkSubLists ( class QueryTermInfo *qti );
// for intersecting docids
void addDocIdVotes ( class QueryTermInfo *qti , long listGroupNum );
// for negative query terms...
void rmDocIdVotes ( class QueryTermInfo *qti );
// upper score bound, used to bail early on docids that cannot win
float getMaxPossibleScore ( class QueryTermInfo *qti ,
long bestDist ,
long qdist ,
class QueryTermInfo *qtm ) ;
// stuff set in setQueryTermInf() function:
SafeBuf m_qiBuf;
long m_numQueryTermInfos;
// the size of the smallest set of sublists. each sublists is
// the main term or a synonym, etc. of the main term.
long m_minListSize;
// which query term info has the smallest set of sublists
long m_minListi;
// intersect docids from each QueryTermInfo into here
SafeBuf m_docIdVoteBuf;
// are all positive query terms in same wikipedia phrase like
// 'time enough for love'?
bool m_allInSameWikiPhrase;
long m_realMaxTop;
};
// NOTE(review): MAXDST dimensioned the commented-out fixed-size score
// arrays in DocIdScore below -- confirm it is still referenced elsewhere
#define MAXDST 10
// distance used when measuring word from title/linktext/etc to word in body
#define FIXED_DISTANCE 400
// . score contribution of one (term i, term j) proximity pair for a docid
// . records are serialized back-to-back into PosdbTable::m_pairScoreBuf,
//   so do not reorder or resize these members without updating the
//   serialization consumers (see DocIdScore below)
class PairScore {
public:
float m_finalScore;
// per-term attributes pulled from the two posdb keys of the pair
char m_isSynonym1;
char m_isSynonym2;
char m_isHalfStopWikiBigram1;
char m_isHalfStopWikiBigram2;
char m_diversityRank1;
char m_diversityRank2;
char m_densityRank1;
char m_densityRank2;
char m_wordSpamRank1;
char m_wordSpamRank2;
char m_hashGroup1;
char m_hashGroup2;
// both terms were in the same wikipedia phrase
char m_inSameWikiPhrase;
// pair was scored at FIXED_DISTANCE (one term outside the body)
char m_fixedDistance;
long m_wordPos1;
long m_wordPos2;
long long m_termFreq1;
long long m_termFreq2;
// term frequency weights (see getTermFreqWeight())
float m_tfWeight1;
float m_tfWeight2;
// query term #s of the two terms in the pair
long m_qtermNum1;
long m_qtermNum2;
// BF_* flags of each term
char m_bflags1;
char m_bflags2;
// distance between the two terms in the query itself
long m_qdist;
};
// . score contribution of a single query term occurrence for a docid
// . serialized back-to-back into PosdbTable::m_singleScoreBuf, so do not
//   reorder or resize these members without updating the consumers
class SingleScore {
public:
float m_finalScore;
// attributes pulled from the term's posdb key
char m_isSynonym;
char m_isHalfStopWikiBigram;
char m_diversityRank;
char m_densityRank;
char m_wordSpamRank;
char m_hashGroup;
long m_wordPos;
long long m_termFreq; // float m_termFreqWeight;
// term frequency weight (see getTermFreqWeight())
float m_tfWeight;
// query term # this score belongs to
long m_qtermNum;
// BF_* flags of the term
char m_bflags;
};
// we add up the pair scores of this many of the top-scoring pairs
// for inlink text only, so it is accumulative. but now we also
// have a parm "m_realMaxTop" which is <= MAX_TOP and can be used to
// tune this down.
#define MAX_TOP 10
// transparent query scoring info per docid
// transparent query scoring info per docid: the final score plus offsets
// into the pair/single score buffers explaining how it was computed
class DocIdScore {
public:
DocIdScore ( ) { reset(); }
// reset counts and detach from any score buffers
void reset ( ) {
m_numPairs = m_numSingles = 0;
m_pairsOffset = m_singlesOffset = -1;
m_pairScores = NULL;
m_singleScores = NULL;
};
// we use QueryChange::getDebugDocIdScore() to "deserialize" per se
bool serialize ( class SafeBuf *sb );
long long m_docId;
float m_finalScore;
char m_siteRank;
long m_docLang; // langId
long m_numRequiredTerms;
// how many PairScore / SingleScore records belong to this docid
long m_numPairs;
long m_numSingles;
// . m_pairScores is just all the term pairs serialized
// . they contain their query term #1 of each term in the pair and
// they have the match number for each pair, since now each
// pair of query terms can have up to MAX_TOP associated pairs
// whose scores we add together to get the final score for that pair
// . record offset into PosdbTable::m_pairScoreBuf
// . Msg39Reply::ptr_pairScoreBuf will be this
long m_pairsOffset;
// . record offset into PosdbTable.m_singleScoreBuf
// . Msg39Reply::ptr_singleScoreBuf will be this
long m_singlesOffset;
//PairScore m_pairScores [MAXDST][MAXDST][MAX_TOP];
//SingleScore m_singleScores[MAXDST] [MAX_TOP];
// Msg3a.cpp::mergeLists() should set these ptrs after it
// copies over a top DocIdScore for storing the final results array
class PairScore *m_pairScores;
class SingleScore *m_singleScores;
};
// the global posdb instance
extern Posdb g_posdb;
// secondary instance -- presumably the rebuild rdb set up by init2()
// for PageRepair.cpp; confirm in Posdb.cpp
extern Posdb g_posdb2;
// cache for term frequencies -- presumably backs Posdb::getTermFreq();
// verify against Posdb.cpp
extern RdbCache g_termFreqCache;
// . b-step (binary-search-like, stepping in 6-byte units) into list
//   looking for docid "docId"
// . assume p is start of list, excluding 6 byte of termid
// . returns a ptr to the positive 12-byte key for "docId", the negative
//   key if only that exists, or NULL if the docid is not in the list
inline char *getWordPosList ( long long docId , char *list , long listSize ) {
// make step divisible by 6 initially
long step = (listSize / 12) * 6;
// shortcut
char *listEnd = list + listSize;
// divide in half
char *p = list + step;
// for detecting not founds
char count = 0;
loop:
// save it
char *origp = p;
// scan up to docid. we use this special bit to distinguish between
// 6-byte and 12-byte posdb keys
for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
// ok, we hit a 12 byte key i guess, so backup 6 more
// NOTE(review): if the scan above stopped because p reached "list"
// itself, this backup steps before the start of the list and the
// getDocId() below reads out of bounds -- confirm the key layout or
// callers make that case impossible
p -= 6;
// ok, we got a 12-byte key then i guess
long long d = g_posdb.getDocId ( p );
// we got a match, but it might be a NEGATIVE key so
// we have to try to find the positive keys in that case
if ( d == docId ) {
// if its positive, no need to do anything else
if ( (p[0] & 0x01) == 0x01 ) return p;
// ok, it's negative, try to see if the positive is
// in here, if not then return NULL.
// save current pos
char *current = p;
// back up to 6 byte key before this 12 byte key
p -= 6;
// now go backwards to previous 12 byte key
for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
// ok, we hit a 12 byte key i guess, so backup 6 more
p -= 6;
// is it there?
if ( p >= list && g_posdb.getDocId(p) == docId ) {
// sanity. return NULL if its negative! wtf????
if ( (p[0] & 0x01) == 0x00 ) return NULL;
// got it
return p;
}
// ok, no positive before us, try after us
p = current;
// advance over current 12 byte key
p += 12;
// now go forwards to next 12 byte key
for ( ; p < listEnd && (p[1] & 0x02) ; p += 6 );
// is it there?
if ( p + 12 < listEnd && g_posdb.getDocId(p) == docId ) {
// sanity. return NULL if its negative! wtf????
if ( (p[0] & 0x01) == 0x00 ) return NULL;
// got it
return p;
}
// . crap, i guess just had a single negative docid then
// . return that and the caller will see its negative
return current;
}
// reduce step
//step /= 2;
step >>= 1;
// . make divisible by 6!
// . TODO: speed this up!!!
step = step - (step % 6);
// sanity
if ( step % 6 ) { char *xx=NULL;*xx=0; }
// ensure never 0
if ( step <= 0 ) {
step = 6;
// return NULL if not found after a couple of minimum-step probes
if ( count++ >= 2 ) return NULL;
}
// go up or down then
if ( d < docId ) {
p = origp + step;
if ( p > listEnd ) p = listEnd - 6;
}
else {
p = origp - step;
if ( p < list ) p = list;
}
// and repeat
goto loop;
}
#endif