// Matt Wells, Copyright May 2012

// . format of an 18-byte posdb key
//
//   tttttttt tttttttt tttttttt tttttttt  t = termId (48 bits)
//   tttttttt tttttttt dddddddd dddddddd  d = docId (38 bits)
//   dddddddd dddddddd dddddd0r rrrggggg  r = siterank, g = langid
//   wwwwwwww wwwwwwww wwGGGGss ssvvvvFF  w = word position, s = wordspamrank
//   pppppb1N MMMMLZZD                    v = diversityrank, p = densityrank
//                                        M = multiplier, b = in outlink text
//                                        L = langIdShiftBit (upper bit)
//                                        Z = key compression bits, D = delbit
//
// G: 0 = body
//    1 = intitletag
//    2 = inheading
//    3 = inlist
//    4 = inmetatag
//    5 = inlinktext
//    6 = tag
//    7 = inneighborhood
//    8 = internalinlinktext
//    9 = inurl
//   10 = inmenu
//
// F: 0 = original term
//    1 = conjugate/sing/plural
//    2 = synonym
//    3 = hyponym
//
// NOTE: the N bit is 1 if the shard of the record is determined by the
// termid (t bits) and NOT the docid (d bits). N stands for "nosplit"
// and you can find that logic in XmlDoc.cpp and Msg4.cpp. We store
// the hash of the content like this so we can see if it is a dup.
//
// NOTE: the M bits hold a scaling factor (logarithmic) for link text voting
// so we do not need to repeat the same link text over and over again.
// Use the M bits to hold the # of inlinks the page has for other terms.
//
// NOTE: for inlinktext terms the wordspamrank is the siterank of the
// inlinker!
//
// NOTE: the densityrank for the title is based on the # of title words only.
// The same goes for incoming inlink text.
//
// NOTE: now we can b-step into the termlist looking for a docid match
// and not worry about misalignment from the double compression scheme,
// because if the 6th byte's low bit is clear that means it's a docid
// 12-byte key, otherwise it's the word position 6-byte key, since the
// delbit can't be clear for those!
//
// THEN we can play with a tuner for how these various things affect
// the search results ranking.
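
// . ILLUSTRATIVE SKETCH (not part of the original file): walking a posdb
//   termlist under the compression scheme described above. an 18-byte key
//   starts the list, a 12-byte key starts a new docid and 6-byte keys add
//   further word positions for that same docid. it uses the Posdb accessors
//   declared below; the "list"/"listSize" variables are placeholders.
//
//     char     *p     = list;
//     char     *end   = list + listSize;
//     long long docId = 0LL;
//     while ( p < end ) {
//         long ks = g_posdb.getKeySize ( p );
//         // only 18 and 12 byte keys carry a docid; a 6 byte key
//         // inherits the docid of the preceding key
//         if ( ks >= 12 ) docId = g_posdb.getDocId ( p );
//         long wordPos = g_posdb.getWordPos ( p );
//         p += ks;
//     }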
#ifndef _POSDB_H_
#define _POSDB_H_

#include "Rdb.h"
#include "Conf.h"
//#include "Indexdb.h"
#include "Titledb.h" // DOCID_MASK
#include "HashTableX.h"
#include "Sections.h"

#define MAXSITERANK      0x0f       // 4 bits
#define MAXLANGID        0x3f       // 6 bits (5 bits go in 'g' the other in 'L')
#define MAXWORDPOS       0x0003ffff // 18 bits
#define MAXDENSITYRANK   0x1f       // 5 bits
#define MAXWORDSPAMRANK  0x0f       // 4 bits
#define MAXDIVERSITYRANK 0x0f       // 4 bits
#define MAXHASHGROUP     0x0f       // 4 bits
#define MAXMULTIPLIER    0x0f       // 4 bits
#define MAXISSYNONYM     0x03       // 2 bits

// values for the G bits in the posdb key
#define HASHGROUP_BODY               0  // body implied
#define HASHGROUP_TITLE              1
#define HASHGROUP_HEADING            2  // body implied
#define HASHGROUP_INLIST             3  // body implied
#define HASHGROUP_INMETATAG          4
#define HASHGROUP_INLINKTEXT         5
#define HASHGROUP_INTAG              6
#define HASHGROUP_NEIGHBORHOOD       7
#define HASHGROUP_INTERNALINLINKTEXT 8
#define HASHGROUP_INURL              9
#define HASHGROUP_INMENU             10 // body implied
#define HASHGROUP_END                11

float getDiversityWeight ( unsigned char diversityRank );
float getDensityWeight   ( unsigned char densityRank   );
float getWordSpamWeight  ( unsigned char wordSpamRank  );
float getLinkerWeight    ( unsigned char wordSpamRank  );
char *getHashGroupString ( unsigned char hg );
float getHashGroupWeight ( unsigned char hg );
float getTermFreqWeight  ( long long termFreq , long long numDocsInColl );

#define SYNONYM_WEIGHT 0.90
#define WIKI_WEIGHT    0.10 // was 0.20
#define SITERANKDIVISOR 3.0
#define SITERANKMULTIPLIER 0.33333333
#define SAMELANGMULT 20.0
// FOREIGNLANGDIVISOR 2.0

#define POSDBKEY key144_t

#define BF_HALFSTOPWIKIBIGRAM 0x01 // "to be" in "to be or not to be"
#define BF_PIPED              0x02 // before a query pipe operator
#define BF_SYNONYM            0x04
#define BF_NEGATIVE           0x08 // query word has a negative sign before it
#define BF_BIGRAM             0x10 // term is a bigram of adjacent query words
#define BF_NUMBER             0x20 // is it like gbsortby:price? numeric?

void printTermList ( long i, char *list, long listSize ) ;

// if the query is 'the tigers' we weight the bigram "the tigers" x 1.20
// because it's in wikipedia.
// up this to 1.40 for the 'the time machine' query
#define WIKI_BIGRAM_WEIGHT 1.40
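
// . ILLUSTRATIVE SKETCH (not the real formula, which lives in Posdb.cpp):
//   the weight functions above are looked up per word position when scoring
//   and, roughly speaking, combined multiplicatively. omitting the siterank
//   boost and several other factors, a single position contributes
//   something like the following (all variable names are placeholders):
//
//     float w = getHashGroupWeight ( hashGroup     ) *
//               getDensityWeight   ( densityRank   ) *
//               getDiversityWeight ( diversityRank ) *
//               getWordSpamWeight  ( wordSpamRank  ) *
//               getTermFreqWeight  ( termFreq , numDocsInColl );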
class Posdb {

 public:

    // resets rdb
    void reset();

    // sets up our m_rdb from g_conf (global conf class)
    bool init ( );

    // init the rebuild/secondary rdb, used by PageRepair.cpp
    bool init2 ( long treeMem );

    bool verify ( char *coll );

    bool addColl ( char *coll, bool doVerify = true );

    // . xmldoc.cpp should call this
    // . store all posdb keys from revdbList into one hashtable
    //   and only add to the new list if not in there
    //bool makeList ( class RdbList *revdbList ,
    //                long long docId ,
    //                class Words *words );

    // . make an 18-byte key from all these components
    // . since it is 18 bytes, the big bit will be set
    void makeKey ( void              *kp            ,
                   long long          termId        ,
                   unsigned long long docId         ,
                   long               wordPos       ,
                   char               densityRank   ,
                   char               diversityRank ,
                   char               wordSpamRank  ,
                   char               siteRank      ,
                   char               hashGroup     ,
                   char               langId        ,
                   // multiplier: we convert into 7 bits in this function
                   long               multiplier    ,
                   bool               isSynonym     ,
                   bool               isDelKey      ,
                   bool               shardByTermId );

    // make just the 6-byte key
    void makeKey48 ( char *kp            ,
                     long  wordPos       ,
                     char  densityRank   ,
                     char  diversityRank ,
                     char  wordSpamRank  ,
                     char  hashGroup     ,
                     char  langId        ,
                     bool  isSynonym     ,
                     bool  isDelKey      );

    // store the multiplier in the M bits of the key
    void setMultiplierBits ( void *vkp , unsigned char mbits ) {
        key144_t *kp = (key144_t *)vkp;
        if ( mbits > MAXMULTIPLIER ) { char *xx=NULL;*xx=0; }
        kp->n0 &= 0xfc0f;
        // map the multiplier to its bits
        kp->n0 |= ((unsigned short)mbits) << 4;
    }

    void setDocIdBits ( void *vkp , unsigned long long docId ) {
        key144_t *kp = (key144_t *)vkp;
        kp->n1 &= 0x000003ffffffffffLL;
        kp->n1 |= (docId<<(32+10));
        kp->n2 &= 0xffffffffffff0000LL;
        kp->n2 |= docId>>22;
    }

    void setSiteRankBits ( void *vkp , char siteRank ) {
        key144_t *kp = (key144_t *)vkp;
        if ( siteRank > MAXSITERANK ) { char *xx=NULL;*xx=0; }
        kp->n1 &= 0xfffffe1fffffffffLL;
        kp->n1 |= ((unsigned long long)siteRank)<<(32+5);
    }

    void setLangIdBits ( void *vkp , char langId ) {
        key144_t *kp = (key144_t *)vkp;
        if ( langId > MAXLANGID ) { char *xx=NULL;*xx=0; }
        kp->n1 &= 0xffffffe0ffffffffLL;
        // put the lower 5 bits here
        kp->n1 |= ((unsigned long long)(langId&0x1f))<<(32);
        // and the upper 6th bit here. n0 is a short.
        // 0011 1111
        if ( langId & 0x20 ) kp->n0 |= 0x08;
    }

    // set the word position bits et al to this float
    void setFloat ( void *vkp , float f ) {
        *(float *)(((char *)vkp) + 2) = f; };

    void setInt ( void *vkp , long x ) {
        *(long *)(((char *)vkp) + 2) = x; };

    // and read the float as well
    float getFloat ( void *vkp ) {
        return *(float *)(((char *)vkp) + 2); };

    long getInt ( void *vkp ) {
        return *(long *)(((char *)vkp) + 2); };

    void setAlignmentBit ( void *vkp , char val ) {
        char *p = (char *)vkp;
        if ( val ) p[1] = p[1] | 0x02;
        else       p[1] = p[1] & 0xfd;
    };

    bool isAlignmentBitClear ( void *vkp ) {
        return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 );
    };

    void makeStartKey ( void *kp , long long termId ,
                        long long docId = 0LL ) {
        return makeKey ( kp,
                         termId ,
                         docId,
                         0, // wordpos
                         0, // density
                         0, // diversity
                         0, // wordspam
                         0, // siterank
                         0, // hashgroup
                         0, // langid
                         0, // multiplier
                         0, // issynonym/etc.
                         true ,   // isdelkey
                         false ); // shardbytermid?
    };

    void makeEndKey ( void *kp , long long termId ,
                      long long docId = MAX_DOCID ) {
        return makeKey ( kp,
                         termId ,
                         docId,
                         MAXWORDPOS,
                         MAXDENSITYRANK,
                         MAXDIVERSITYRANK,
                         MAXWORDSPAMRANK,
                         MAXSITERANK,
                         MAXHASHGROUP,
                         MAXLANGID,
                         MAXMULTIPLIER,
                         MAXISSYNONYM, // issynonym/etc.
                         false,  // isdelkey
                         true ); // shard by termid?
    };

    // we got two compression bits!
    unsigned char getKeySize ( void *key ) {
        if ( (((char *)key)[0])&0x04 ) return 6;
        if ( (((char *)key)[0])&0x02 ) return 12;
        return 18;
    };

    // PosdbTable uses this to skip from one docid to the next docid
    // in a posdblist
    char *getNextDocIdSublist ( char *p , char *listEnd ) {
        // key must be 12
        //if ( getKeySize(p) != 12 ) { char *xx=NULL;*xx=0; }
        // skip that first key
        p += 12;
        // skip the 6 byte keys
        for ( ; p < listEnd && getKeySize(p) == 6 ; p += 6 );
        // done
        return p;
    }

    long long getTermId ( void *key ) {
        return ((key144_t *)key)->n2 >> 16;
    };

    long long getDocId ( void *key ) {
        unsigned long long d = 0LL;
        d = ((unsigned char *)key)[11];
        d <<= 32;
        d |= *(unsigned long *)(((unsigned char *)key)+7);
        d >>= 2;
        return d;
        //long long d = ((key144_t *)key)->n2 & 0xffff;
        //d <<= 22;
        //d |= ((key144_t *)key)->n1 >> (32+8+2);
        //return d;
    };

    unsigned char getSiteRank ( void *key ) {
        return (((key144_t *)key)->n1 >> 37) & MAXSITERANK;
    };

    unsigned char getLangId ( void *key ) {
        if ( ((char *)key)[0] & 0x08 )
            return ((((key144_t *)key)->n1 >> 32) & 0x1f) | 0x20;
        else
            return ((((key144_t *)key)->n1 >> 32) & 0x1f) ;
    };

    unsigned char getHashGroup ( void *key ) {
        //return (((key144_t *)key)->n1 >> 10) & MAXHASHGROUP;
        return ((((unsigned char *)key)[3]) >>2) & MAXHASHGROUP;
    };

    long getWordPos ( void *key ) {
        //return (((key144_t *)key)->n1 >> 14) & MAXWORDPOS;
        return (*((unsigned long *)((unsigned char *)key+2))) >> (8+6);
    };

    inline void setWordPos ( char *key , unsigned long wpos ) {
        // truncate
        wpos &= MAXWORDPOS;
        if ( wpos & 0x01 ) key[3] |= 0x40;
        else               key[3] &= ~((unsigned char)0x40);
        if ( wpos & 0x02 ) key[3] |= 0x80;
        else               key[3] &= ~((unsigned char)0x80);
        wpos >>= 2;
        key[4] = ((char *)&wpos)[0];
        key[5] = ((char *)&wpos)[1];
    };

    unsigned char getWordSpamRank ( void *key ) {
        //return (((key144_t *)key)->n1 >> 6) & MAXWORDSPAMRANK;
        return ((((unsigned short *)key)[1]) >>6) & MAXWORDSPAMRANK;
    };

    unsigned char getDiversityRank ( void *key ) {
        //return (((key144_t *)key)->n1 >> 2) & MAXDIVERSITYRANK;
        return ((((unsigned char *)key)[2]) >>2) & MAXDIVERSITYRANK;
    };

    unsigned char getIsSynonym ( void *key ) {
        return (((key144_t *)key)->n1 ) & 0x03;
    };

    unsigned char getIsHalfStopWikiBigram ( void *key ) {
        return ((char *)key)[2] & 0x01;
    };

    unsigned char getDensityRank ( void *key ) {
        return ((*(unsigned short *)key) >> 11) & MAXDENSITYRANK;
    };

    inline void setDensityRank ( char *key , unsigned char dr ) {
        // shift up
        dr <<= 3;
        // clear out
        key[1] &= 0x07;
        // or in
        key[1] |= dr;
    };

    char isShardedByTermId ( void *key ) {
        return ((char *)key)[1] & 0x01; };

    void setShardedByTermIdBit ( void *key ) {
        char *k = (char *)key;
        k[1] |= 0x01;
    };

    unsigned char getMultiplier ( void *key ) {
        return ((*(unsigned short *)key) >> 4) & MAXMULTIPLIER; };

    // . HACK: for sectionhash:xxxxx posdb keys
    // . we use the w,G,s,v and F bits
    unsigned long getSectionSiteHash32 ( void *key ) {
        return *(unsigned long *)(((char *)key)+2); };

    void setSectionSiteHash32 ( void *key , long siteHash32 ) {
        *(unsigned long *)(((char *)key)+2) = siteHash32; };

    long long getTermFreq ( collnum_t collnum, long long termId ) ;

    //RdbCache *getCache ( ) { return &m_rdb.m_cache; };
    Rdb *getRdb ( ) { return &m_rdb; };

    Rdb m_rdb;

    DiskPageCache *getDiskPageCache ( ) { return &m_pc; };
    DiskPageCache m_pc;
};
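
// . ILLUSTRATIVE SKETCH (not part of the original file): building a key for
//   one word position and bracketing a termlist. the variable names are
//   just placeholders.
//
//     POSDBKEY k;
//     g_posdb.makeKey ( &k ,
//                       termId ,
//                       docId ,
//                       wordPos ,
//                       densityRank ,
//                       diversityRank ,
//                       wordSpamRank ,
//                       siteRank ,
//                       HASHGROUP_BODY ,
//                       langId ,
//                       0 ,      // multiplier
//                       false ,  // isSynonym
//                       false ,  // isDelKey
//                       false ); // shardByTermId
//
//     // to read a whole termlist, bracket it with start/end keys:
//     POSDBKEY startKey; g_posdb.makeStartKey ( &startKey , termId );
//     POSDBKEY endKey;   g_posdb.makeEndKey   ( &endKey   , termId );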
#define MAX_SUBLISTS 50

// . each QueryTerm has this attached additional info now:
// . these should be 1-1 with query terms, Query::m_qterms[]
class QueryTermInfo {
 public:
    class QueryTerm *m_qt;
    // the required lists for this query term, synonym lists, etc.
    RdbList  *m_subLists        [MAX_SUBLISTS];
    // flags to indicate if the bigram list should be scored higher
    char      m_bigramFlags     [MAX_SUBLISTS];
    // shrinkSubLists() sets these:
    long      m_newSubListSize  [MAX_SUBLISTS];
    char     *m_newSubListStart [MAX_SUBLISTS];
    char     *m_newSubListEnd   [MAX_SUBLISTS];
    char     *m_cursor          [MAX_SUBLISTS];
    char     *m_savedCursor     [MAX_SUBLISTS];
    // the corresponding QueryTerm for this sublist
    //class QueryTerm *m_qtermList [MAX_SUBLISTS];
    long      m_numNewSubLists; // how many are valid?
    long      m_numSubLists;
    // size of all m_subLists in bytes
    long long m_totalSubListsSize;
    // the term freq weight for this term
    float     m_termFreqWeight;
    // what query term # do we correspond to in Query.h
    long      m_qtermNum;
    // the word position of this query term in the Words.h class
    long      m_qpos;
    // the wikipedia phrase id if we start one
    long      m_wikiPhraseId;
    // the phrase id the term or bigram is in
    long      m_quotedStartId;
};
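
// . ILLUSTRATIVE SKETCH (not part of the original file): each QueryTermInfo
//   bundles the posdb sublists (main term, synonyms, bigrams, ...) for one
//   query term. after shrinkSubLists() a consumer might walk them like this
//   ("qti" is a placeholder):
//
//     QueryTermInfo *qti = ... ; // one entry per query term
//     for ( long i = 0 ; i < qti->m_numNewSubLists ; i++ ) {
//         char *p    = qti->m_newSubListStart [i];
//         char *pend = qti->m_newSubListEnd   [i];
//         bool  syn  = qti->m_bigramFlags [i] & BF_SYNONYM;
//         // scan the posdb keys in [p,pend) ...
//     }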
/*
#include "RdbList.h"

class PosdbList : public RdbList {

 public:

    // why do i have to repeat this for LinkInfo::set() calling our set()??
    void set ( char *list , long listSize , bool ownData ) {
        RdbList::set ( list           ,
                       listSize       ,
                       list           , // alloc
                       listSize       , // alloc size
                       0              , // fixed data size
                       ownData        ,
                       true           , // use half keys?
                       sizeof(key_t) ); // 12 bytes per key
    };

    // clear the low bits on the keys so terms are DELETED
    void clearDelBits ( );

    void print();

    // . these are made for special IndexLists, too
    // . getTermId() assumes a 12 byte key
    long long getCurrentTermId12 ( ) {
        return getTermId12 ( m_listPtr ); };
    long long getTermId12 ( char *rec ) {
        return (*(unsigned long long *)(&rec[4])) >> 16 ; };
    long long getTermId16 ( char *rec ) {
        return (*(unsigned long long *)(&rec[8])) >> 16 ; };

    // these 2 assume 12 and 6 byte keys respectively
    long long getCurrentDocId () {
        if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
        else                             return getDocId12(m_listPtr); };
    long long getDocId ( char *rec ) {
        if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
        else                       return getDocId12(rec); };
    long long getCurrentDocId12 ( ) {
        return getDocId12 ( m_listPtr ); };
    long long getDocId12 ( char *rec ) {
        return ((*(unsigned long long *)(rec)) >> 2) & DOCID_MASK; };
    long long getDocId6 ( char *rec ) {
        long long docid;
        *(long *)(&docid) = *(long *)rec;
        ((char *)&docid)[4] = rec[4];
        docid >>= 2;
        return docid & DOCID_MASK;
    };

    // this works with either 12 or 6 byte keys
    unsigned char getCurrentScore ( ) {
        return getScore(m_listPtr); };
    unsigned char getScore ( char *rec ) { return ~rec[5]; };
    // uncomplemented...
    void setScore ( char *rec , char score ) { rec[5] = score; };

    // for date lists only...
    long getCurrentDate ( ) { return ~*(long *)(m_listPtr+6); };
};
*/

#include "Query.h" // MAX_QUERY_TERMS, qvec_t

// max # search results that can be viewed without using TopTree
//#define MAX_RESULTS 1000

class PosdbTable {

 public:

    // . returns false on error and sets errno
    // . "termFreqs" are 1-1 with q->m_qterms[]
    // . sets m_q to point to q
    void init ( Query *q ,
                char debug ,
                void *logstate ,
                class TopTree *topTree ,
                //char *coll ,
                collnum_t collnum ,
                //IndexList *lists ,
                //long numLists ,
                class Msg2 *msg2 ,
                class Msg39Request *r );

    // pre-allocate m_whiteListTable
    bool allocWhiteListTable ( ) ;

    // pre-allocate memory since the intersection runs in a thread
    bool allocTopTree ( );

    // . returns false on error and sets errno
    // . we assume there are "m_numTerms" lists passed in (see set() above)
    //void intersectLists_r ( );
    //void intersectLists9_r ( );

    void getTermPairScoreForNonBody ( long i, long j,
                                      char *wpi, char *wpj,
                                      char *endi, char *endj,
                                      long qdist ,
                                      float *retMax );
    float getSingleTermScore ( long i, char *wpi , char *endi,
                               class DocIdScore *pdcs,
                               char **bestPos );
    void evalSlidingWindow ( char **ptrs ,
                             long nr ,
                             char **bestPos ,
                             float *scoreMatrix ,
                             long advancedTermNum );
    float getTermPairScoreForWindow ( long i, long j,
                                      char *wpi, char *wpj,
                                      long fixedDistance );
    float getTermPairScoreForAny ( long i, long j,
                                   char *wpi, char *wpj,
                                   char *endi, char *endj,
                                   class DocIdScore *pdcs );

    bool makeDocIdVoteBufForBoolQuery_r ( ) ;

    // some generic stuff
    PosdbTable();
    ~PosdbTable();
    void reset();

    // Msg39 needs to call these
    void freeMem ( ) ;

    // has init already been called?
    bool isInitialized ( ) { return m_initialized; };

    unsigned long long m_docId;
    unsigned long long m_docIdHack;

    bool m_hasMaxSerpScore;

    // hack for seo.cpp:
    float m_finalScore;
    float m_preFinalScore;

    // how long to add the last batch of lists
    long long m_addListsTime;
    long long m_t1 ;
    long long m_t2 ;

    long long m_estimatedTotalHits;

    long m_errno;

    long m_numSlots;

    long m_maxScores;

    //char *m_coll;
    collnum_t m_collnum;

    long  *m_qpos;
    long  *m_wikiPhraseIds;
    long  *m_quotedStartIds;
    //class DocIdScore *m_ds;
    long   m_qdist;
    float *m_freqWeights;
    //long long *m_freqs;
    char  *m_bflags;
    long  *m_qtermNums;
    float  m_bestWindowScore;
    //char **m_finalWinners1;
    //char **m_finalWinners2;
    //float *m_finalScores;
    char **m_windowTermPtrs;

    // how many docs in the collection?
    long long m_docsInColl;

    SectionStats m_sectionStats;
    SafeBuf      m_siteHashList;
    HashTableX   m_dt;

    class Msg2 *m_msg2;

    // if getting more than MAX_RESULTS results, use this top tree to hold
    // them rather than the m_top*[] arrays above
    class TopTree *m_topTree;

    //HashTableX m_docIdTable;

    SafeBuf m_scoreInfoBuf;
    SafeBuf m_pairScoreBuf;
    SafeBuf m_singleScoreBuf;

    //SafeBuf m_mergeBuf;

    // a reference to the query
    Query *m_q;

    // these are NOT in imap space, but in query term space, 1-1 with
    // Query::m_qterms[]
    //IndexList *m_lists;
    //long m_numLists;

    // has init() been called?
    bool m_initialized;

    // are we in debug mode?
    char m_debug;
    // for debug msgs
    long m_logstate;

    //long long m_numDocsInColl;

    class Msg39Request *m_r;

    // for gbsortby:item.price ...
    long m_sortByTermNum;
    long m_sortByTermNumInt;

    // for gbmin:price:1.99
    long m_minScoreTermNum;
    long m_maxScoreTermNum;

    // for gbmin:price:1.99
    float m_minScoreVal;
    float m_maxScoreVal;

    // for gbmin:count:99
    long m_minScoreTermNumInt;
    long m_maxScoreTermNumInt;

    // for gbmin:count:99
    long m_minScoreValInt;
    long m_maxScoreValInt;

    // the new intersection/scoring algo
    void intersectLists10_r ( );

    HashTableX m_whiteListTable;
    bool m_useWhiteTable;
    bool m_addedSites;

    // sets stuff used by intersectLists10_r()
    bool setQueryTermInfo ( );

    void shrinkSubLists ( class QueryTermInfo *qti );

    // for intersecting docids
    void addDocIdVotes ( class QueryTermInfo *qti , long listGroupNum );

    // for negative query terms...
    void rmDocIdVotes ( class QueryTermInfo *qti );

    // upper score bound
    float getMaxPossibleScore ( class QueryTermInfo *qti ,
                                long bestDist ,
                                long qdist ,
                                class QueryTermInfo *qtm ) ;

    // stuff set in the setQueryTermInfo() function:
    SafeBuf m_qiBuf;
    long    m_numQueryTermInfos;
    // the size of the smallest set of sublists. each sublist is
    // the main term or a synonym, etc., of the main term.
    long    m_minListSize;
    // which query term info has the smallest set of sublists
    long    m_minListi;

    // intersect docids from each QueryTermInfo into here
    SafeBuf m_docIdVoteBuf;

    long m_filtered;

    // boolean truth table for boolean queries
    HashTableX m_bt;
    HashTableX m_ct;
    // size of the data slot in m_bt
    long m_vecSize;

    // are all positive query terms in the same wikipedia phrase like
    // 'time enough for love'?
    bool m_allInSameWikiPhrase;

    long m_realMaxTop;
};

#define MAXDST 10

// distance used when measuring a word in the title/linktext/etc. against
// a word in the body
#define FIXED_DISTANCE 400

class PairScore {
 public:
    float m_finalScore;
    char  m_isSynonym1;
    char  m_isSynonym2;
    char  m_isHalfStopWikiBigram1;
    char  m_isHalfStopWikiBigram2;
    char  m_diversityRank1;
    char  m_diversityRank2;
    char  m_densityRank1;
    char  m_densityRank2;
    char  m_wordSpamRank1;
    char  m_wordSpamRank2;
    char  m_hashGroup1;
    char  m_hashGroup2;
    char  m_inSameWikiPhrase;
    char  m_fixedDistance;
    long  m_wordPos1;
    long  m_wordPos2;
    long long m_termFreq1;
    long long m_termFreq2;
    float m_tfWeight1;
    float m_tfWeight2;
    long  m_qtermNum1;
    long  m_qtermNum2;
    char  m_bflags1;
    char  m_bflags2;
    long  m_qdist;
};

class SingleScore {
 public:
    float m_finalScore;
    char  m_isSynonym;
    char  m_isHalfStopWikiBigram;
    char  m_diversityRank;
    char  m_densityRank;
    char  m_wordSpamRank;
    char  m_hashGroup;
    long  m_wordPos;
    long long m_termFreq;
    // float m_termFreqWeight;
    float m_tfWeight;
    long  m_qtermNum;
    char  m_bflags;
};

// we add up the pair scores of this many of the top-scoring pairs
// for inlink text only, so it is cumulative. but now we also
// have a parm "m_realMaxTop" which is <= MAX_TOP and can be used to
// tune this down.
#define MAX_TOP 10

// transparent query scoring info per docid
class DocIdScore {
 public:
    DocIdScore ( ) { reset(); }

    void reset ( ) {
        m_numPairs     = m_numSingles    =  0;
        m_pairsOffset  = m_singlesOffset = -1;
        m_pairScores   = NULL;
        m_singleScores = NULL;
    };

    // we use QueryChange::getDebugDocIdScore() to "deserialize", so to speak
    bool serialize ( class SafeBuf *sb );

    long long m_docId;
    // made this a double because of intScores which can't be captured
    // fully with a float. intScores are used to sort by spidered time
    // for example. see Posdb.cpp "intScore".
    double m_finalScore;
    char   m_siteRank;
    long   m_docLang; // langId
    long   m_numRequiredTerms;

    long m_numPairs;
    long m_numSingles;

    // . m_pairScores is just all the term pairs serialized
    // . they contain the query term # of each term in the pair and
    //   they have the match number for each pair, since now each
    //   pair of query terms can have up to MAX_TOP associated pairs
    //   whose scores we add together to get the final score for that pair
    // . record offset into PosdbTable::m_pairScoreBuf
    // . Msg39Reply::ptr_pairScoreBuf will be this
    long m_pairsOffset;

    // . record offset into PosdbTable::m_singleScoreBuf
    // . Msg39Reply::ptr_singleScoreBuf will be this
    long m_singlesOffset;

    //PairScore   m_pairScores  [MAXDST][MAXDST][MAX_TOP];
    //SingleScore m_singleScores[MAXDST]        [MAX_TOP];

    // Msg3a.cpp::mergeLists() should set these ptrs after it
    // copies over a top DocIdScore for storing the final results array
    class PairScore   *m_pairScores;
    class SingleScore *m_singleScores;
};

extern Posdb g_posdb;
extern Posdb g_posdb2;

extern RdbCache g_termFreqCache;
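
// . ILLUSTRATIVE SKETCH (not part of the original file): a DocIdScore
//   explains why a docid got its score. once the PairScore/SingleScore
//   pointers are set (see the Msg3a.cpp note above), a consumer might walk
//   them like this ("ds" is a placeholder):
//
//     DocIdScore *ds = ... ;
//     for ( long i = 0 ; i < ds->m_numSingles ; i++ ) {
//         SingleScore *ss = &ds->m_singleScores[i];
//         // ss->m_finalScore, ss->m_hashGroup, ss->m_wordPos, ...
//     }
//     for ( long i = 0 ; i < ds->m_numPairs ; i++ ) {
//         PairScore *ps = &ds->m_pairScores[i];
//         // ps->m_finalScore, ps->m_wordPos1, ps->m_wordPos2, ...
//     }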
// . b-step into the list looking for docid "docId"
// . assume "list" points to the start of the list, excluding the 6 termid bytes
inline char *getWordPosList ( long long docId , char *list , long listSize ) {
    // make the step divisible by 6 initially
    long step = (listSize / 12) * 6;
    // shortcut
    char *listEnd = list + listSize;
    // divide in half
    char *p = list + step;
    // for detecting not-founds
    char count = 0;
 loop:
    // save it
    char *origp = p;
    // scan up to the docid. we use this special bit to distinguish between
    // 6-byte and 12-byte posdb keys
    for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
    // ok, we hit a 12 byte key i guess, so back up 6 more
    p -= 6;
    // ok, we got a 12-byte key then i guess
    long long d = g_posdb.getDocId ( p );
    // we got a match, but it might be a NEGATIVE key so
    // we have to try to find the positive key in that case
    if ( d == docId ) {
        // if it's positive, no need to do anything else
        if ( (p[0] & 0x01) == 0x01 ) return p;
        // ok, it's negative, try to see if the positive is
        // in here, if not then return NULL.
        // save the current pos
        char *current = p;
        // back up to the 6 byte key before this 12 byte key
        p -= 6;
        // now go backwards to the previous 12 byte key
        for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
        // ok, we hit a 12 byte key i guess, so back up 6 more
        p -= 6;
        // is it there?
        if ( p >= list && g_posdb.getDocId(p) == docId ) {
            // sanity. return NULL if it's negative! wtf????
            if ( (p[0] & 0x01) == 0x00 ) return NULL;
            // got it
            return p;
        }
        // ok, no positive before us, try after us
        p = current;
        // advance over the current 12 byte key
        p += 12;
        // now go forwards to the next 12 byte key
        for ( ; p < listEnd && (p[1] & 0x02) ; p += 6 );
        // is it there?
        if ( p + 12 < listEnd && g_posdb.getDocId(p) == docId ) {
            // sanity. return NULL if it's negative! wtf????
            if ( (p[0] & 0x01) == 0x00 ) return NULL;
            // got it
            return p;
        }
        // . crap, i guess we just had a single negative docid then
        // . return that and the caller will see it's negative
        return current;
    }
    // reduce the step
    //step /= 2;
    step >>= 1;
    // . make divisible by 6!
    // . TODO: speed this up!!!
    step = step - (step % 6);
    // sanity
    if ( step % 6 ) { char *xx=NULL;*xx=0; }
    // ensure never 0
    if ( step <= 0 ) {
        step = 6;
        // return NULL if not found
        if ( count++ >= 2 ) return NULL;
    }
    // go up or down then
    if ( d < docId ) {
        p = origp + step;
        if ( p > listEnd ) p = listEnd - 6;
    }
    else {
        p = origp - step;
        if ( p < list ) p = list;
    }
    // and repeat
    goto loop;
}

#endif