// open-source-search-engine/Posdb.h
// 2021-05-06 01:52:55 +10:00
//
// 959 lines
// 26 KiB
// C++

// Matt Wells, Copyright May 2012
// . format of an 18-byte posdb key
// tttttttt tttttttt tttttttt tttttttt t = termId (48bits)
// tttttttt tttttttt dddddddd dddddddd d = docId (38 bits)
// dddddddd dddddddd dddddd0r rrrggggg r = siterank, g = langid
// wwwwwwww wwwwwwww wwGGGGss ssvvvvFF w = word position , s = wordspamrank
// pppppb1N MMMMLZZD v = diversityrank, p = densityrank
// M = multiplier, b = in outlink text
// L = langIdShiftBit (upper bit)
// G: 0 = body
// 1 = intitletag
// 2 = inheading
// 3 = inlist
// 4 = inmetatag
// 5 = inlinktext
// 6 = tag
// 7 = inneighborhood
// 8 = internalinlinktext
// 9 = inurl
//
// F: 0 = original term
// 1 = conjugate/sing/plural
// 2 = synonym
// 3 = hyponym
// NOTE: N bit is 1 if the shard of the record is determined by the
// termid (t bits) and NOT the docid (d bits). N stands for "nosplit"
// and you can find that logic in XmlDoc.cpp and Msg4.cpp. We store
// the hash of the content like this so we can see if it is a dup.
// NOTE: M bits hold scaling factor (logarithmic) for link text voting
// so we do not need to repeat the same link text over and over again.
// Use M bits to hold # of inlinks the page has for other terms.
// NOTE: for inlinktext terms the spam rank is the siterank of the
// inlinker!
// NOTE: densityrank for title is based on # of title words only. same goes
// for incoming inlink text.
// NOTE: now we can b-step into the termlist looking for a docid match
// and not worry about misalignment from the double compression scheme
// because if the 6th byte's low bit is clear that means it's a docid
// 12-byte key, otherwise it's the word position 6-byte key since the delbit
// can't be clear for those!
// THEN we can play with a tuner for how these various things affect
// the search results ranking.
#ifndef _POSDB_H_
#define _POSDB_H_
#include "Rdb.h"
#include "Conf.h"
//#include "Indexdb.h"
#include "Titledb.h" // DOCID_MASK
#include "HashTableX.h"
#include "Sections.h"
// Largest value each bit field of a posdb key can hold. Every constant is the
// all-ones mask for the field width given in its trailing comment, so the
// getters below can also use them as masks after shifting.
#define MAXSITERANK 0x0f // 4 bits
#define MAXLANGID 0x3f // 6 bits (5 bits go in 'g' the other in 'L')
#define MAXWORDPOS 0x0003ffff // 18 bits
#define MAXDENSITYRANK 0x1f // 5 bits
#define MAXWORDSPAMRANK 0x0f // 4 bits
#define MAXDIVERSITYRANK 0x0f // 4 bits
#define MAXHASHGROUP 0x0f // 4 bits
#define MAXMULTIPLIER 0x0f // 4 bits
#define MAXISSYNONYM 0x03 // 2 bits
// values for the 4-bit G (hash group) field in the posdb key. the group says
// which part of the document the term occurrence came from.
#define HASHGROUP_BODY 0 // body implied
#define HASHGROUP_TITLE 1
#define HASHGROUP_HEADING 2 // body implied
#define HASHGROUP_INLIST 3 // body implied
#define HASHGROUP_INMETATAG 4
#define HASHGROUP_INLINKTEXT 5
#define HASHGROUP_INTAG 6
#define HASHGROUP_NEIGHBORHOOD 7
#define HASHGROUP_INTERNALINLINKTEXT 8
#define HASHGROUP_INURL 9
#define HASHGROUP_INMENU 10 // body implied
// one past the last hash group defined above (not a hash group itself)
#define HASHGROUP_END 11
// Scoring helpers; their definitions are not in this header.
// NOTE(review): judging by the names, each of the first four presumably maps a
// rank stored in a posdb key to a floating point weight -- confirm against
// the definitions.
float getDiversityWeight ( unsigned char diversityRank );
float getDensityWeight ( unsigned char densityRank );
float getWordSpamWeight ( unsigned char wordSpamRank );
// NOTE(review): takes the word spam rank field; the key format notes at the
// top of this file say that for inlink text this field holds the siterank of
// the inlinker -- presumably that is what gets weighted here; confirm.
float getLinkerWeight ( unsigned char wordSpamRank );
// "hg" is one of the HASHGROUP_* values
char *getHashGroupString ( unsigned char hg );
float getHashGroupWeight ( unsigned char hg );
float getTermFreqWeight ( int64_t termFreq , int64_t numDocsInColl );
// scoring constants
// NOTE(review): how these are applied is decided in the scoring code, which
// is not part of this header.
#define SYNONYM_WEIGHT 0.90
#define WIKI_WEIGHT 0.10 // was 0.20
// SITERANKMULTIPLIER is (approximately) 1/SITERANKDIVISOR
#define SITERANKDIVISOR 3.0
#define SITERANKMULTIPLIER 0.33333333
//#define SAMELANGMULT 20.0 // FOREIGNLANGDIVISOR 2.0
// a full posdb key is 144 bits = 18 bytes
#define POSDBKEY key144_t
// BF_* bit flags; each value is a distinct single bit so they can be or'ed
// together into one char
#define BF_HALFSTOPWIKIBIGRAM 0x01 // "to be" in "to be or not to be"
#define BF_PIPED 0x02 // before a query pipe operator
#define BF_SYNONYM 0x04
#define BF_NEGATIVE 0x08 // query word has a negative sign before it
#define BF_BIGRAM 0x10 // NOTE(review): presumably "query term is a bigram"; the old comment here was a copy of BF_NEGATIVE's -- confirm
#define BF_NUMBER 0x20 // is it like gbsortby:price? numeric?
#define BF_FACET 0x40 // gbfacet:price
// NOTE(review): defined outside this header; from the signature it presumably
// prints the "listSize"-byte termlist at "list", with "i" identifying the
// list -- confirm against the definition.
void printTermList ( int32_t i, char *list, int32_t listSize ) ;
// if query is 'the tigers' we weight bigram "the tigers" x 1.20 because
// it's in wikipedia.
// this was raised to 1.40 for the 'the time machine' query
#define WIKI_BIGRAM_WEIGHT 1.40
// . Posdb owns the Rdb that stores posdb records and provides inline helpers
//   that pack/unpack the bit fields of a posdb key
// . the helpers address a key both through key144_t (n0 = low 16 bits,
//   n1 = middle 64 bits, n2 = high 64 bits) and through raw byte offsets, so
//   they assume n0 sits at byte 0 of the key (little-endian layout)
// . low 16 bits (n0) are "pppppb1N MMMMLZZD", see the key format notes at the
//   top of this file
class Posdb {

 public:

    // resets rdb
    void reset();

    // sets up our m_rdb from g_conf (global conf class)
    bool init ( );

    // init the rebuild/secondary rdb, used by PageRepair.cpp
    bool init2 ( int32_t treeMem );

    bool verify ( char *coll );

    bool addColl ( char *coll, bool doVerify = true );

    // . make a full 18-byte (144-bit) key from all these components
    // . "kp" must point to at least sizeof(key144_t) writable bytes
    // . multiplier: converted into the 4-bit M field (see MAXMULTIPLIER)
    void makeKey ( void *kp ,
                   int64_t termId ,
                   uint64_t docId ,
                   int32_t wordPos ,
                   char densityRank ,
                   char diversityRank ,
                   char wordSpamRank ,
                   char siteRank ,
                   char hashGroup ,
                   char langId ,
                   int32_t multiplier ,
                   bool isSynonym ,
                   bool isDelKey ,
                   bool shardByTermId );

    // make just the 6 byte (word position) key
    void makeKey48 ( char *kp ,
                     int32_t wordPos ,
                     char densityRank ,
                     char diversityRank ,
                     char wordSpamRank ,
                     char hashGroup ,
                     char langId ,
                     bool isSynonym ,
                     bool isDelKey );

    int printList ( RdbList &list ) ;

    // . store "mbits" (0..MAXMULTIPLIER) in the M field, bits 4-7 of n0
    // . cores on purpose if mbits is out of range
    void setMultiplierBits ( void *vkp , unsigned char mbits ) {
        key144_t *kp = (key144_t *)vkp;
        if ( mbits > MAXMULTIPLIER ) { char *xx=NULL;*xx=0; }
        // clear ONLY the four M bits. the old mask (0xfc0f) also wiped
        // bits 8 and 9, which are the N (shard by termid) bit and the
        // alignment bit of "pppppb1N"
        kp->n0 &= 0xff0f;
        kp->n0 |= ((uint16_t)mbits) << 4;
    }

    // . the low 22 bits of the 38-bit docid go in the top of n1, the
    //   upper 16 bits go in the bottom of n2
    void setDocIdBits ( void *vkp , uint64_t docId ) {
        key144_t *kp = (key144_t *)vkp;
        // truncate to 38 bits so a bogus docid can not spill into the
        // termid bits of n2
        docId &= 0x0000003fffffffffULL;
        kp->n1 &= 0x000003ffffffffffLL;
        kp->n1 |= (docId<<(32+10));
        kp->n2 &= 0xffffffffffff0000LL;
        kp->n2 |= docId>>22;
    }

    // . store "siteRank" (0..MAXSITERANK) in bits 37-40 of n1
    // . cores on purpose if siteRank is out of range
    void setSiteRankBits ( void *vkp , char siteRank ) {
        key144_t *kp = (key144_t *)vkp;
        // go unsigned so a negative char fails the range check instead
        // of being sign-extended all over the key
        unsigned char sr = (unsigned char)siteRank;
        if ( sr > MAXSITERANK ) { char *xx=NULL;*xx=0; }
        kp->n1 &= 0xfffffe1fffffffffLL;
        kp->n1 |= ((uint64_t)sr)<<(32+5);
    }

    // . store "langId" (0..MAXLANGID): lower 5 bits in bits 32-36 of n1,
    //   the 6th bit in the L bit (0x08) of n0
    // . cores on purpose if langId is out of range
    void setLangIdBits ( void *vkp , char langId ) {
        key144_t *kp = (key144_t *)vkp;
        // unsigned for the same reason as in setSiteRankBits()
        unsigned char lid = (unsigned char)langId;
        if ( lid > MAXLANGID ) { char *xx=NULL;*xx=0; }
        kp->n1 &= 0xffffffe0ffffffffLL;
        // put the lower 5 bits here
        kp->n1 |= ((uint64_t)(lid&0x1f))<<(32);
        // and the upper 6th bit here. it must be cleared too, otherwise
        // a key that used to hold a langid >= 32 keeps its stale L bit
        if ( lid & 0x20 ) kp->n0 |= 0x08;
        else              kp->n0 &= 0xfff7;
    }

    // . overwrite the 4 bytes at offset 2 of the key (the word position
    //   bits et al, i.e. the w,G,s,v and F fields) with a raw float/int
    void setFloat ( void *vkp , float f ) {
        *(float *)(((char *)vkp) + 2) = f; }

    void setInt ( void *vkp , int32_t x ) {
        *(int32_t *)(((char *)vkp) + 2) = x; }

    // and read the float/int back
    float getFloat ( void *vkp ) {
        return *(float *)(((char *)vkp) + 2); }

    int32_t getInt ( void *vkp ) {
        return *(int32_t *)(((char *)vkp) + 2); }

    // the alignment bit is 0x02 of byte 1 (the '1' in "pppppb1N")
    void setAlignmentBit ( void *vkp , char val ) {
        char *p = (char *)vkp;
        if ( val ) p[1] = p[1] | 0x02;
        else       p[1] = p[1] & 0xfd;
    }

    bool isAlignmentBitClear ( void *vkp ) {
        return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 );
    }

    // smallest key for termId (and optionally docId): all other fields
    // are 0 and "isDelKey" is passed as true
    void makeStartKey ( void *kp, int64_t termId ,
                        int64_t docId=0LL){
        makeKey ( kp,
                  termId ,
                  docId,
                  0, // wordpos
                  0, // density
                  0, // diversity
                  0, // wordspam
                  0, // siterank
                  0, // hashgroup
                  0, // langid
                  0, // multiplier
                  0, // issynonym/etc.
                  true , // isdelkey
                  false ); // shardbytermid?
    }

    // . largest key for termId (and optionally docId): every other field
    //   is passed its MAX* value
    // . NOTE(review): MAXISSYNONYM (0x03) is converted to the bool
    //   "isSynonym" parameter here, i.e. to "true". whether makeKey()
    //   then sets both F bits can not be told from this header -- confirm
    void makeEndKey ( void *kp,int64_t termId,
                      int64_t docId = MAX_DOCID ) {
        makeKey ( kp,
                  termId ,
                  docId,
                  MAXWORDPOS,
                  MAXDENSITYRANK,
                  MAXDIVERSITYRANK,
                  MAXWORDSPAMRANK,
                  MAXSITERANK,
                  MAXHASHGROUP,
                  MAXLANGID,
                  MAXMULTIPLIER,
                  MAXISSYNONYM, // issynonym/etc.
                  false, // isdelkey
                  true);// shard by termid?
    }

    // . we got two compression bits in byte 0 (the ZZ bits)
    // . 0x04 set means a 6 byte key, else 0x02 set means a 12 byte key,
    //   else it is a full 18 byte key
    unsigned char getKeySize ( void *key ) {
        if ( (((char *)key)[0])&0x04 ) return 6;
        if ( (((char *)key)[0])&0x02 ) return 12;
        return 18;
    }

    // . PosdbTable uses this to skip from one docid to the next docid
    //   in a posdblist
    // . "p" is assumed to point to a 12 byte key (not verified)
    // . returns a ptr to the first key that is not 6 bytes, or a ptr
    //   >= listEnd if there is none
    char *getNextDocIdSublist ( char *p , char *listEnd ) {
        // skip that first key
        p += 12;
        // skip the 6 byte keys
        for ( ; p < listEnd && getKeySize(p) == 6 ; p += 6 );
        // done
        return p;
    }

    // termid is the upper 48 bits of n2
    int64_t getTermId ( void *key ) {
        return ((key144_t *)key)->n2 >> 16;
    }

    // . bytes 7..11 of the key hold the 38 docid bits above two
    //   non-docid bits, so assemble those 40 bits and shift the two off
    int64_t getDocId ( void *key ) {
        uint64_t d = 0LL;
        d = ((unsigned char *)key)[11];
        d <<= 32;
        d |= *(uint32_t *)(((unsigned char *)key)+7);
        d >>= 2;
        return d;
    }

    unsigned char getSiteRank ( void *key ) {
        return (((key144_t *)key)->n1 >> 37) & MAXSITERANK;
    }

    // lower 5 bits come from n1, the 6th from the L bit (0x08) of byte 0
    unsigned char getLangId ( void *key ) {
        if ( ((char *)key)[0] & 0x08 )
            return ((((key144_t *)key)->n1 >> 32) & 0x1f) | 0x20;
        else
            return ((((key144_t *)key)->n1 >> 32) & 0x1f) ;
    }

    // one of the HASHGROUP_* values: bits 2-5 of byte 3
    unsigned char getHashGroup ( void *key ) {
        return ((((unsigned char *)key)[3]) >>2) & MAXHASHGROUP;
    }

    // word position is the top 18 bits of the 32 bits at byte offset 2
    int32_t getWordPos ( void *key ) {
        return (*((uint32_t *)((unsigned char *)key+2))) >> (8+6);
    }

    // . overwrite the 18 word position bits, leaving the other bits of
    //   byte 3 alone
    // . "wpos" is truncated to MAXWORDPOS
    inline void setWordPos ( char *key , uint32_t wpos ) {
        // truncate
        wpos &= MAXWORDPOS;
        // the two lowest bits live in the top two bits of byte 3
        if ( wpos & 0x01 ) key[3] |= 0x40;
        else               key[3] &= ~((unsigned char)0x40);
        if ( wpos & 0x02 ) key[3] |= 0x80;
        else               key[3] &= ~((unsigned char)0x80);
        // the remaining 16 bits fill bytes 4 and 5
        wpos >>= 2;
        key[4] = ((char *)&wpos)[0];
        key[5] = ((char *)&wpos)[1];
    }

    unsigned char getWordSpamRank ( void *key ) {
        return ((((uint16_t *)key)[1]) >>6) & MAXWORDSPAMRANK;
    }

    unsigned char getDiversityRank ( void *key ) {
        return ((((unsigned char *)key)[2]) >>2) & MAXDIVERSITYRANK;
    }

    // the 2-bit F field: 0 = original term, see the key format notes
    unsigned char getIsSynonym ( void *key ) {
        return (((key144_t *)key)->n1 ) & 0x03;
    }

    // reads the lowest bit of byte 2, i.e. the low bit of the F field
    unsigned char getIsHalfStopWikiBigram ( void *key ) {
        return ((char *)key)[2] & 0x01;
    }

    // density rank is the top 5 bits of n0
    unsigned char getDensityRank ( void *key ) {
        return ((*(uint16_t *)key) >> 11) & MAXDENSITYRANK;
    }

    // . overwrite the density rank, keeping the low 3 bits of byte 1
    // . "dr" is not range checked, bits shifted out of the byte are lost
    inline void setDensityRank ( char *key , unsigned char dr ) {
        // shift up
        dr <<= 3;
        // clear out
        key[1] &= 0x07;
        // or in
        key[1] |= dr;
    }

    // the N ("nosplit") bit is the low bit of byte 1
    char isShardedByTermId ( void *key ){return ((char *)key)[1] & 0x01; }

    void setShardedByTermIdBit ( void *key ) {
        char *k = (char *)key;
        k[1] |= 0x01;
    }

    unsigned char getMultiplier ( void *key ) {
        return ((*(uint16_t *)key) >> 4) & MAXMULTIPLIER; }

    // . HACK: for sectionhash:xxxxx posdb keys
    // . we use the w,G,s,v and F bits
    uint32_t getFacetVal32 ( void *key ) {
        return *(uint32_t *)(((char *)key)+2); }

    void setFacetVal32 ( void *key , int32_t facetVal32 ) {
        *(uint32_t *)(((char *)key)+2) = facetVal32; }

    int64_t getTermFreq ( collnum_t collnum, int64_t termId ) ;

    Rdb *getRdb ( ) { return &m_rdb; }

    Rdb m_rdb;
};
// Per-value statistics record for a facet.
// NOTE(review): this header only declares the layout; who fills the fields
// in, and when, is not visible here.
class FacetEntry {
public:
// # of search results that have this value:
int32_t m_count;
// # of docs that have this value:
int32_t m_outsideSearchResultsCount;
// NOTE(review): presumably a docid that has this value -- confirm
int64_t m_docId;
// cast as double/floats for floats:
int64_t m_sum;
// NOTE(review): presumably the largest/smallest value seen; whether the
// "cast as float" remark above also applies to these is unclear -- confirm
int32_t m_max;
int32_t m_min;
};
// capacity of every per-sublist array in QueryTermInfo
#define MAX_SUBLISTS 50
// . each QueryTerm has this attached additional info now:
// . these should be 1-1 with query terms, Query::m_qterms[]
// . all the [MAX_SUBLISTS] arrays below are parallel arrays
class QueryTermInfo {
public:
// the query term this info belongs to
class QueryTerm *m_qt;
// the required lists for this query term, synonym lists, etc.
RdbList *m_subLists [MAX_SUBLISTS];
// flags to indicate if bigram list should be scored higher
// NOTE(review): presumably a combination of the BF_* flags -- confirm
char m_bigramFlags [MAX_SUBLISTS];
// shrinkSubLists() set this:
int32_t m_newSubListSize [MAX_SUBLISTS];
char *m_newSubListStart [MAX_SUBLISTS];
char *m_newSubListEnd [MAX_SUBLISTS];
char *m_cursor [MAX_SUBLISTS];
char *m_savedCursor [MAX_SUBLISTS];
// the corresponding QueryTerm for this sublist
//class QueryTerm *m_qtermList [MAX_SUBLISTS];
// NOTE(review): presumably how many m_newSubList* entries are valid -- confirm
int32_t m_numNewSubLists;
// how many are valid?
int32_t m_numSubLists;
// size of all m_subLists in bytes
int64_t m_totalSubListsSize;
// the term freq weight for this term
float m_termFreqWeight;
// what query term # do we correspond to in Query.h
int32_t m_qtermNum;
// the word position of this query term in the Words.h class
int32_t m_qpos;
// the wikipedia phrase id if we start one
int32_t m_wikiPhraseId;
// phrase id term or bigram is in
int32_t m_quotedStartId;
};
/*
#include "RdbList.h"
class PosdbList : public RdbList {
public:
// why do i have to repeat this for LinkInfo::set() calling our set()??
void set ( char *list , int32_t listSize , bool ownData ) {
RdbList::set ( list ,
listSize ,
list , // alloc
listSize , // alloc size
0 , // fixed data size
ownData ,
true , // use half keys?
sizeof(key_t));// 12 bytes per key
};
// clear the low bits on the keys so terms are DELETED
void clearDelBits ( );
void print();
// . these are made for special IndexLists, too
// . getTermId() assumes as 12 byte key
int64_t getCurrentTermId12 ( ) {
return getTermId12 ( m_listPtr ); };
int64_t getTermId12 ( char *rec ) {
return (*(uint64_t *)(&rec[4])) >> 16 ;
};
int64_t getTermId16 ( char *rec ) {
return (*(uint64_t *)(&rec[8])) >> 16 ;
};
// these 2 assume 12 and 6 byte keys respectively
int64_t getCurrentDocId () {
if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
else return getDocId12(m_listPtr);
};
int64_t getDocId ( char *rec ) {
if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
else return getDocId12(rec);
};
int64_t getCurrentDocId12 ( ) {
return getDocId12 ( m_listPtr ); };
int64_t getDocId12 ( char *rec ) {
return ((*(uint64_t *)(rec)) >> 2) & DOCID_MASK; };
int64_t getDocId6 ( char *rec ) {
int64_t docid;
*(int32_t *)(&docid) = *(int32_t *)rec;
((char *)&docid)[4] = rec[4];
docid >>= 2;
return docid & DOCID_MASK;
};
// this works with either 12 or 6 byte keys
unsigned char getCurrentScore ( ) {
return getScore(m_listPtr); };
unsigned char getScore ( char *rec ) { return ~rec[5]; };
// uncomplemented...
void setScore ( char *rec , char score ) { rec[5] = score; };
// for date lists only...
int32_t getCurrentDate ( ) { return ~*(int32_t *)(m_listPtr+6); };
};
*/
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
// max # search results that can be viewed without using TopTree
//#define MAX_RESULTS 1000
// . PosdbTable holds the state for intersecting and scoring the posdb
//   termlists of one query (see intersectLists10_r() below)
// . only the interface is declared here, except for isInitialized()
class PosdbTable {
public:
// . "termFreqs" are 1-1 with q->m_qterms[]
// . sets m_q to point to q
// . NOTE(review): an older comment here said "returns false on error and
//   sets errno", but init() returns void -- confirm how errors are reported
void init (Query *q ,
char debug ,
void *logstate ,
class TopTree *topTree ,
//char *coll ,
collnum_t collnum ,
//IndexList *lists ,
//int32_t numLists ,
class Msg2 *msg2,
class Msg39Request *r );
// pre-allocate m_whiteListTable
bool allocWhiteListTable ( ) ;
// pre-allocate memory since intersection runs in a thread
bool allocTopTree ( );
// . the two declarations below are disabled; the comment that used to
//   describe them referred to "m_numTerms" lists and a set() function,
//   neither of which appears in this class any more
//void intersectLists_r ( );
//void intersectLists9_r ( );
// scoring helpers
// NOTE(review): from the parameter names, "i"/"j" are presumably query term
// numbers, "wpi"/"wpj" cursors into their position lists ending at
// "endi"/"endj" -- confirm against the definitions
void getTermPairScoreForNonBody ( int32_t i, int32_t j,
char *wpi, char *wpj,
char *endi, char *endj,
int32_t qdist ,
float *retMax );
float getSingleTermScore ( int32_t i, char *wpi , char *endi,
class DocIdScore *pdcs,
char **bestPos );
void evalSlidingWindow ( char **ptrs ,
int32_t nr ,
char **bestPos ,
float *scoreMatrix ,
int32_t advancedTermNum );
float getTermPairScoreForWindow ( int32_t i, int32_t j,
char *wpi,
char *wpj,
int32_t fixedDistance
);
float getTermPairScoreForAny ( int32_t i, int32_t j,
char *wpi, char *wpj,
char *endi, char *endj,
class DocIdScore *pdcs );
bool makeDocIdVoteBufForBoolQuery_r ( ) ;
// some generic stuff
PosdbTable();
~PosdbTable();
void reset();
// Msg39 needs to call these
void freeMem ( ) ;
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
uint64_t m_docId;
uint64_t m_docIdHack;
bool m_hasFacetTerm;
bool m_hasMaxSerpScore;
// hack for seo.cpp:
float m_finalScore;
float m_preFinalScore;
float m_siteRankMultiplier;
// how long to add the last batch of lists
int64_t m_addListsTime;
int64_t m_t1 ;
int64_t m_t2 ;
int64_t m_estimatedTotalHits;
int32_t m_errno;
int32_t m_numSlots;
int32_t m_maxScores;
//char *m_coll;
collnum_t m_collnum;
int32_t *m_qpos;
int32_t *m_wikiPhraseIds;
int32_t *m_quotedStartIds;
//class DocIdScore *m_ds;
int32_t m_qdist;
float *m_freqWeights;
//int64_t *m_freqs;
char *m_bflags;
int32_t *m_qtermNums;
float m_bestWindowScore;
//char **m_finalWinners1;
//char **m_finalWinners2;
//float *m_finalScores;
char **m_windowTermPtrs;
// how many docs in the collection?
int64_t m_docsInColl;
//SectionStats m_sectionStats;
//SafeBuf m_facetHashList;
//HashTableX m_dt;
class Msg2 *m_msg2;
// . top tree that holds the results
// . NOTE(review): the old comment mentioned MAX_RESULTS and "m_top*[]
//   arrays above"; MAX_RESULTS is commented out and no such arrays are
//   declared in this class
class TopTree *m_topTree;
//HashTableX m_docIdTable;
SafeBuf m_scoreInfoBuf;
SafeBuf m_pairScoreBuf;
SafeBuf m_singleScoreBuf;
SafeBuf m_stackBuf;
//SafeBuf m_mergeBuf;
// a reference to the query
Query *m_q;
int32_t m_nqt;
// these are NOT in imap space, but in query term space, 1-1 with
// Query::m_qterms[]
//IndexList *m_lists;
//int32_t m_numLists;
// has init() been called?
bool m_initialized;
// are we in debug mode?
char m_debug;
// for debug msgs
void *m_logstate;
//int64_t m_numDocsInColl;
class Msg39Request *m_r;
// for gbsortby:item.price ...
int32_t m_sortByTermNum;
int32_t m_sortByTermNumInt;
// fix core with these two
int32_t m_sortByTermInfoNum;
int32_t m_sortByTermInfoNumInt;
// for gbmin:price:1.99
int32_t m_minScoreTermNum;
int32_t m_maxScoreTermNum;
// for gbmin:price:1.99
float m_minScoreVal;
float m_maxScoreVal;
// for gbmin:count:99
int32_t m_minScoreTermNumInt;
int32_t m_maxScoreTermNumInt;
// for gbmin:count:99
int32_t m_minScoreValInt;
int32_t m_maxScoreValInt;
// the new intersection/scoring algo
void intersectLists10_r ( );
HashTableX m_whiteListTable;
bool m_useWhiteTable;
bool m_addedSites;
// sets stuff used by intersectLists10_r()
bool setQueryTermInfo ( );
void shrinkSubLists ( class QueryTermInfo *qti );
int64_t countUniqueDocids( QueryTermInfo *qti ) ;
// for intersecting docids
void addDocIdVotes ( class QueryTermInfo *qti , int32_t listGroupNum );
// for negative query terms...
void rmDocIdVotes ( class QueryTermInfo *qti );
// upper score bound
float getMaxPossibleScore ( class QueryTermInfo *qti ,
int32_t bestDist ,
int32_t qdist ,
class QueryTermInfo *qtm ) ;
// stuff set in setQueryTermInfo() function:
SafeBuf m_qiBuf;
int32_t m_numQueryTermInfos;
// the size of the smallest set of sublists. each sublist is
// the main term or a synonym, etc. of the main term.
int32_t m_minListSize;
// which query term info has the smallest set of sublists
int32_t m_minListi;
// intersect docids from each QueryTermInfo into here
SafeBuf m_docIdVoteBuf;
int32_t m_filtered;
// boolean truth table for boolean queries
HashTableX m_bt;
HashTableX m_ct;
// size of the data slot in m_bt
int32_t m_vecSize;
// are all positive query terms in same wikipedia phrase like
// 'time enough for love'?
bool m_allInSameWikiPhrase;
// NOTE(review): per the comment on MAX_TOP later in this file this is a
// parm that is <= MAX_TOP and tunes the number of summed pairs down
int32_t m_realMaxTop;
};
// NOTE(review): MAXDST only appears in commented-out array dimensions in
// this header ([MAXDST][MAXDST][MAX_TOP] in DocIdScore) -- confirm it is
// still used elsewhere
#define MAXDST 10
// distance used when measuring word from title/linktext/etc to word in body
#define FIXED_DISTANCE 400
// . scoring details recorded for one pair of query terms
// . fields ending in 1 describe the first term of the pair, fields ending
//   in 2 the second
class PairScore {
public:
float m_finalScore;
char m_isSynonym1;
char m_isSynonym2;
char m_isHalfStopWikiBigram1;
char m_isHalfStopWikiBigram2;
char m_diversityRank1;
char m_diversityRank2;
char m_densityRank1;
char m_densityRank2;
char m_wordSpamRank1;
char m_wordSpamRank2;
// HASHGROUP_* values -- NOTE(review): presumably, going by the name; confirm
char m_hashGroup1;
char m_hashGroup2;
char m_inSameWikiPhrase;
// NOTE(review): a char can not hold FIXED_DISTANCE (400), so this is
// presumably a flag saying the fixed distance was used -- confirm
char m_fixedDistance;
int32_t m_wordPos1;
int32_t m_wordPos2;
int64_t m_termFreq1;
int64_t m_termFreq2;
float m_tfWeight1;
float m_tfWeight2;
int32_t m_qtermNum1;
int32_t m_qtermNum2;
char m_bflags1;
char m_bflags2;
int32_t m_qdist;
};
// . scoring details recorded for a single query term
// . same kind of fields as PairScore, but for one term only
class SingleScore {
public:
float m_finalScore;
char m_isSynonym;
char m_isHalfStopWikiBigram;
char m_diversityRank;
char m_densityRank;
char m_wordSpamRank;
// HASHGROUP_* value -- NOTE(review): presumably, going by the name; confirm
char m_hashGroup;
int32_t m_wordPos;
int64_t m_termFreq; // float m_termFreqWeight;
float m_tfWeight;
int32_t m_qtermNum;
char m_bflags;
};
// . for inlink text only, the pair scores of up to this many of the
//   top-scoring pairs are added together, so the score is accumulative
// . the parm "m_realMaxTop" is <= MAX_TOP and can be used to tune this down
#define MAX_TOP 10

// transparent query scoring info per docid
class DocIdScore {

 public:

    // a new object starts out in the reset() state
    DocIdScore ( ) { reset(); }

    // . forget all pair/single score info: zero the counts, invalidate
    //   the offsets (-1) and drop the ptrs
    // . the other members are left as they are
    void reset ( ) {
        m_numSingles    = 0;
        m_numPairs      = 0;
        m_singlesOffset = -1;
        m_pairsOffset   = -1;
        m_pairScores    = NULL;
        m_singleScores  = NULL;
    }

    // the "deserialize" counterpart, so to speak, is
    // QueryChange::getDebugDocIdScore()
    bool serialize ( class SafeBuf *sb );

    int64_t m_docId;

    // a double, because intScores can't be captured fully with a float.
    // intScores are used to sort by spidered time for example. see
    // Posdb.cpp "intScore".
    double m_finalScore;

    char    m_siteRank;
    int32_t m_docLang; // langId
    int32_t m_numRequiredTerms;
    int32_t m_numPairs;
    int32_t m_numSingles;

    // . m_pairScores is just all the term pairs serialized
    // . they contain their query term #1 of each term in the pair and
    //   they have the match number for each pair, since now each
    //   pair of query terms can have up to MAX_TOP associated pairs
    //   whose scores we add together to get the final score for that pair
    // . record offset into PosdbTable::m_pairScoreBuf
    // . Msg39Reply::ptr_pairScoreBuf will be this
    int32_t m_pairsOffset;

    // . record offset into PosdbTable.m_singleScoreBuf
    // . Msg39Reply::ptr_singleScoreBuf will be this
    int32_t m_singlesOffset;

    // Msg3a.cpp::mergeLists() should set these ptrs after it
    // copies over a top DocIdScore for storing the final results array
    class PairScore   *m_pairScores;
    class SingleScore *m_singleScores;
};
// global Posdb instances, defined outside this header
// NOTE(review): g_posdb2 is presumably the rebuild/secondary instance that
// Posdb::init2() sets up -- confirm
extern Posdb g_posdb;
extern Posdb g_posdb2;
// NOTE(review): presumably caches the results of Posdb::getTermFreq() -- confirm
extern RdbCache g_termFreqCache;
// . b-step (binary search) into "list" looking for the 12-byte key of "docId"
// . "list" is the start of the termlist, excluding the 6 bytes of termid, so
//   it must begin with a 12-byte key and hold only 12- and 6-byte keys
// . we move in 6-byte units. (p[1] & 0x02) is the alignment bit: it is set
//   when p is on the start of a 6-byte or 12-byte key and clear when p is on
//   the upper (docid) half of a 12-byte key, which is how we re-align after
//   every jump
// . returns a ptr to the start of the positive 12-byte key of docId
// . if docId only has a negative key (delbit clear) that key is returned and
//   the caller will see it is negative
// . returns NULL if docId is not found or if the list is too small to hold
//   a 12-byte key
inline char *getWordPosList ( int64_t docId , char *list , int32_t listSize ) {
    // without at least one 12-byte key the scans below would read
    // outside of the list
    if ( ! list || listSize < 12 ) return NULL;
    // make step divisible by 6 initially
    int32_t step = (listSize / 12) * 6;
    // shortcut
    char *listEnd = list + listSize;
    // divide in half
    char *p = list + step;
    // for detecting not founds
    char count = 0;

    for ( ; ; ) {
        // save it
        char *origp = p;
        // scan back over key starts until we are on the upper half
        // of a 12-byte key, or at the head of the list
        for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
        // . we hit the upper half, so backup 6 more to the key start
        // . but if we ran into the head of the list we already are on
        //   the start of the first 12-byte key. backing up then made
        //   us read the docid from memory before the list
        if ( p > list ) p -= 6;
        // ok, we got a 12-byte key then
        int64_t d = g_posdb.getDocId ( p );

        // we got a match, but it might be a NEGATIVE key so
        // we have to try to find the positive keys in that case
        if ( d == docId ) {
            // if its positive, no need to do anything else
            if ( (p[0] & 0x01) == 0x01 ) return p;
            // ok, it's negative, try to see if the positive is
            // in here. save current pos
            char *current = p;
            // try the 12-byte key before us, if there is one
            if ( current > list ) {
                // back up to 6 byte chunk before this key
                p = current - 6;
                // now go backwards to previous 12 byte key
                for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
                // we are on its upper half, backup 6 more
                p -= 6;
                // is it there?
                if ( p >= list &&
                     g_posdb.getDocId(p) == docId ) {
                    // sanity. NULL if its negative!
                    if ( (p[0] & 0x01) == 0x00 )
                        return NULL;
                    // got it
                    return p;
                }
            }
            // ok, no positive before us, try after us.
            // advance over current 12 byte key
            p = current + 12;
            // now go forwards to next 12 byte key
            for ( ; p < listEnd && (p[1] & 0x02) ; p += 6 );
            // . like the backward scans this stops on the UPPER
            //   half of the 12-byte key, so back up to its start
            // . the old code read the docid from the upper half
            p -= 6;
            // is it there, and entirely inside the list?
            if ( p >= current + 12 &&
                 p + 12 <= listEnd &&
                 g_posdb.getDocId(p) == docId ) {
                // sanity. return NULL if its negative!
                if ( (p[0] & 0x01) == 0x00 ) return NULL;
                // got it
                return p;
            }
            // . i guess we just had a single negative docid then
            // . return that and the caller will see its negative
            return current;
        }

        // reduce step
        step >>= 1;
        // make divisible by 6!
        step = step - (step % 6);
        // sanity, core on purpose
        if ( step % 6 ) { char *xx=NULL;*xx=0; }
        // ensure never 0
        if ( step <= 0 ) {
            step = 6;
            // return NULL if not found
            if ( count++ >= 2 ) return NULL;
        }
        // go up or down then
        if ( d < docId ) {
            p = origp + step;
            // p == listEnd is outside of the list as well
            if ( p >= listEnd ) p = listEnd - 6;
        }
        else {
            p = origp - step;
            if ( p < list ) p = list;
        }
        // and repeat
    }
}
#endif