open-source-search-engine/Posdb.h
Matt 09de59f026 do not store cblock, etc. tags into tagdb to save
disk space. added tagdb file cache for better performance,
less disk accesses. will help reduce disk load.
put file cache sizes in master controls and if they change
then update the cache size dynamically.
2015-09-10 12:46:00 -06:00

959 lines
26 KiB
C++

// Matt Wells, Copyright May 2012
// . format of an 18-byte posdb key
// tttttttt tttttttt tttttttt tttttttt t = termId (48bits)
// tttttttt tttttttt dddddddd dddddddd d = docId (38 bits)
// dddddddd dddddddd dddddd0r rrrggggg r = siterank, g = langid
// wwwwwwww wwwwwwww wwGGGGss ssvvvvFF w = word position , s = wordspamrank
// pppppb1N MMMMLZZD v = diversityrank, p = densityrank
// M = multiplier, b = in outlink text
// L = langIdShiftBit (upper bit)
// G: 0 = body
// 1 = intitletag
// 2 = inheading
// 3 = inlist
// 4 = inmetatag
// 5 = inlinktext
// 6 = tag
// 7 = inneighborhood
// 8 = internalinlinktext
// 9 = inurl
// 10 = inmenu
//
// F: 0 = original term
// 1 = conjugate/sing/plural
// 2 = synonym
// 3 = hyponym
// NOTE: N bit is 1 if the shard of the record is determined by the
// termid (t bits) and NOT the docid (d bits). N stands for "nosplit"
// and you can find that logic in XmlDoc.cpp and Msg4.cpp. We store
// the hash of the content like this so we can see if it is a dup.
// NOTE: M bits hold scaling factor (logarithmic) for link text voting
// so we do not need to repeat the same link text over and over again.
// Use M bits to hold # of inlinks the page has for other terms.
// NOTE: for inlinktext terms the spam rank is the siterank of the
// inlinker!
// NOTE: densityrank for title is based on # of title words only. same goes
// for incoming inlink text.
// NOTE: now we can b-step into the termlist looking for a docid match
// and not worry about misalignment from the double compression scheme
// because if the 6th byte's low bit is clear that means its a docid
// 12-byte key, otherwise its the word position 6-byte key since the delbit
// can't be clear for those!
// THEN we can play with a tuner for how these various things affect
// the search results ranking.
#ifndef _POSDB_H_
#define _POSDB_H_
#include "Rdb.h"
#include "Conf.h"
//#include "Indexdb.h"
#include "Titledb.h" // DOCID_MASK
#include "HashTableX.h"
#include "Sections.h"
// largest value each posdb key field can hold, given its bit width in the
// key layout documented at the top of this file
#define MAXSITERANK 0x0f // 4 bits
#define MAXLANGID 0x3f // 6 bits (5 bits go in 'g' the other in 'L')
#define MAXWORDPOS 0x0003ffff // 18 bits
#define MAXDENSITYRANK 0x1f // 5 bits
#define MAXWORDSPAMRANK 0x0f // 4 bits
#define MAXDIVERSITYRANK 0x0f // 4 bits
#define MAXHASHGROUP 0x0f // 4 bits
#define MAXMULTIPLIER 0x0f // 4 bits
#define MAXISSYNONYM 0x03 // 2 bits
// values for G bits in the posdb key
#define HASHGROUP_BODY 0 // body implied
#define HASHGROUP_TITLE 1
#define HASHGROUP_HEADING 2 // body implied
#define HASHGROUP_INLIST 3 // body implied
#define HASHGROUP_INMETATAG 4
#define HASHGROUP_INLINKTEXT 5
#define HASHGROUP_INTAG 6
#define HASHGROUP_NEIGHBORHOOD 7
#define HASHGROUP_INTERNALINLINKTEXT 8
#define HASHGROUP_INURL 9
#define HASHGROUP_INMENU 10 // body implied
// one past the highest hashgroup value defined above
#define HASHGROUP_END 11
// . weight lookups for the rank fields stored in a posdb key
// . defined outside this header
float getDiversityWeight ( unsigned char diversityRank );
float getDensityWeight ( unsigned char densityRank );
float getWordSpamWeight ( unsigned char wordSpamRank );
// for inlink text the word spam rank field holds the siterank of the
// inlinker (see the key format notes at the top of this file)
float getLinkerWeight ( unsigned char wordSpamRank );
// "hg" is one of the HASHGROUP_* values above
char *getHashGroupString ( unsigned char hg );
float getHashGroupWeight ( unsigned char hg );
float getTermFreqWeight ( int64_t termFreq , int64_t numDocsInColl );
#define SYNONYM_WEIGHT 0.90
#define WIKI_WEIGHT 0.10 // was 0.20
// SITERANKMULTIPLIER is approximately 1/SITERANKDIVISOR
#define SITERANKDIVISOR 3.0
#define SITERANKMULTIPLIER 0.33333333
//#define SAMELANGMULT 20.0 // FOREIGNLANGDIVISOR 2.0
// a full posdb key is 144 bits = 18 bytes
#define POSDBKEY key144_t
// BF_* bit flags (see QueryTermInfo::m_bigramFlags and PosdbTable::m_bflags)
#define BF_HALFSTOPWIKIBIGRAM 0x01 // "to be" in "to be or not to be"
#define BF_PIPED 0x02 // before a query pipe operator
#define BF_SYNONYM 0x04
#define BF_NEGATIVE 0x08 // query word has a negative sign before it
// NOTE(review): the original comment here duplicated BF_NEGATIVE's; going by
// the name this flags a bigram term -- confirm against the code that sets it
#define BF_BIGRAM 0x10
#define BF_NUMBER 0x20 // is it like gbsortby:price? numeric?
#define BF_FACET 0x40 // gbfacet:price
// debug helper; "list" is "listSize" bytes long. defined outside this header
void printTermList ( int32_t i, char *list, int32_t listSize ) ;
// if query is 'the tigers' we weight bigram "the tigers" x 1.20 because
// its in wikipedia.
// up this to 1.40 for 'the time machine' query
#define WIKI_BIGRAM_WEIGHT 1.40
// . Posdb owns the Rdb holding the posdb keys and provides the bit-level
//   getters/setters for the key format described at the top of this file
// . the byte-offset accessors below treat a key as n0 (2 bytes) at offset 0,
//   n1 (8 bytes) at offset 2 and n2 (8 bytes) at offset 10, low byte first
// . "char *xx=NULL;*xx=0;" is this file's deliberate crash on a failed
//   sanity check
class Posdb {

 public:

	// resets rdb
	void reset();

	// sets up our m_rdb from g_conf (global conf class)
	bool init ( );

	// init the rebuild/secondary rdb, used by PageRepair.cpp
	bool init2 ( int32_t treeMem );

	bool verify ( char *coll );

	bool addColl ( char *coll, bool doVerify = true );

	// . make a full 18-byte (key144_t) key from all these components
	// . defined outside this header
	void makeKey ( void *kp ,
		       int64_t termId ,
		       uint64_t docId ,
		       int32_t wordPos ,
		       char densityRank ,
		       char diversityRank ,
		       char wordSpamRank ,
		       char siteRank ,
		       char hashGroup ,
		       char langId ,
		       // multiplier: ends up in the 4 M bits of the key
		       // (see MAXMULTIPLIER and setMultiplierBits())
		       int32_t multiplier ,
		       bool isSynonym ,
		       bool isDelKey ,
		       bool shardByTermId );

	// make just the 6 byte key
	void makeKey48 ( char *kp ,
			 int32_t wordPos ,
			 char densityRank ,
			 char diversityRank ,
			 char wordSpamRank ,
			 char hashGroup ,
			 char langId ,
			 bool isSynonym ,
			 bool isDelKey );

	int printList ( RdbList &list ) ;

	// . store "mbits" into the 4 M bits (bits 4-7 of n0)
	// . cores if mbits exceeds MAXMULTIPLIER
	// . clears ONLY the M bits. the old 0xfc0f mask also wiped bits 8
	//   and 9 of n0, which are the shard-by-termid (N) bit and the
	//   alignment bit
	void setMultiplierBits ( void *vkp , unsigned char mbits ) {
		key144_t *kp = (key144_t *)vkp;
		if ( mbits > MAXMULTIPLIER ) { char *xx=NULL;*xx=0; }
		kp->n0 &= 0xff0f;
		kp->n0 |= ((uint16_t)mbits) << 4;
	}

	// . store the docid: its low 22 bits go in the top of n1 and the
	//   next 16 bits in the bottom of n2
	// . the upper part is masked to 16 bits so an oversized docId can not
	//   spill into the termid bits of n2
	void setDocIdBits ( void *vkp , uint64_t docId ) {
		key144_t *kp = (key144_t *)vkp;
		kp->n1 &= 0x000003ffffffffffLL;
		kp->n1 |= (docId<<(32+10));
		kp->n2 &= 0xffffffffffff0000LL;
		kp->n2 |= (docId>>22) & 0xffffLL;
	}

	// . store the 4-bit siterank at bits 37-40 of n1
	// . cores if out of range. compared as unsigned so a negative char
	//   can not slip past the check and sign-extend over the docid bits
	void setSiteRankBits ( void *vkp , char siteRank ) {
		key144_t *kp = (key144_t *)vkp;
		unsigned char sr = (unsigned char)siteRank;
		if ( sr > MAXSITERANK ) { char *xx=NULL;*xx=0; }
		kp->n1 &= 0xfffffe1fffffffffLL;
		kp->n1 |= ((uint64_t)sr)<<(32+5);
	}

	// . store the 6-bit langid: lower 5 bits at bits 32-36 of n1, the
	//   6th (upper) bit in the L bit (0x08) of n0
	// . cores if out of range (compared as unsigned, see above)
	void setLangIdBits ( void *vkp , char langId ) {
		key144_t *kp = (key144_t *)vkp;
		unsigned char lid = (unsigned char)langId;
		if ( lid > MAXLANGID ) { char *xx=NULL;*xx=0; }
		// put the lower 5 bits here
		kp->n1 &= 0xffffffe0ffffffffLL;
		kp->n1 |= ((uint64_t)(lid&0x1f))<<(32);
		// and the upper 6th bit here. n0 is 16 bits. clear it first
		// so a previously stored langid >= 32 does not leave it set
		kp->n0 &= 0xfff7;
		if ( lid & 0x20 ) kp->n0 |= 0x08;
	}

	// . overwrite the 4 bytes at key offset 2 (the word position bits
	//   et al) with a float or int32_t
	void setFloat ( void *vkp , float f ) {
		*(float *)(((char *)vkp) + 2) = f; }

	void setInt ( void *vkp , int32_t x ) {
		*(int32_t *)(((char *)vkp) + 2) = x; }

	// and read them back
	float getFloat ( void *vkp ) {
		return *(float *)(((char *)vkp) + 2); }

	int32_t getInt ( void *vkp ) {
		return *(int32_t *)(((char *)vkp) + 2); }

	// the alignment bit is 0x02 of the key's second byte (the '1' in
	// "pppppb1N" of the key format)
	void setAlignmentBit ( void *vkp , char val ) {
		char *p = (char *)vkp;
		if ( val ) p[1] = p[1] | 0x02;
		else       p[1] = p[1] & 0xfd;
	}

	bool isAlignmentBitClear ( void *vkp ) {
		return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 );
	}

	// . key for "termId" (and optionally "docId") with every other
	//   field zeroed
	// . passes isDelKey=true and shardByTermId=false to makeKey()
	void makeStartKey ( void *kp, int64_t termId ,
			    int64_t docId=0LL){
		makeKey ( kp,
			  termId ,
			  docId,
			  0, // wordpos
			  0, // density
			  0, // diversity
			  0, // wordspam
			  0, // siterank
			  0, // hashgroup
			  0, // langid
			  0, // multiplier
			  0, // issynonym/etc.
			  true , // isdelkey
			  false ); // shardbytermid?
	}

	// . key for "termId" (and optionally "docId") with every other
	//   field at its maximum
	// . passes isDelKey=false and shardByTermId=true to makeKey()
	void makeEndKey ( void *kp,int64_t termId,
			  int64_t docId = MAX_DOCID ) {
		makeKey ( kp,
			  termId ,
			  docId,
			  MAXWORDPOS,
			  MAXDENSITYRANK,
			  MAXDIVERSITYRANK,
			  MAXWORDSPAMRANK,
			  MAXSITERANK,
			  MAXHASHGROUP,
			  MAXLANGID,
			  MAXMULTIPLIER,
			  MAXISSYNONYM, // issynonym/etc.
			  false, // isdelkey
			  true);// shard by termid?
	}

	// . we got two compression bits in the first byte of the key
	// . 0x04 set means a 6-byte key, else 0x02 set means a 12-byte key,
	//   else it is the full 18 bytes
	unsigned char getKeySize ( void *key ) {
		if ( (((char *)key)[0])&0x04 ) return 6;
		if ( (((char *)key)[0])&0x02 ) return 12;
		return 18;
	}

	// . PosdbTable uses this to skip from one docid to the next docid
	//   in a posdblist
	// . "p" is assumed to point at a 12-byte key (not checked)
	// . returns ptr to first key after it that is not a 6-byte key, or
	//   a ptr >= listEnd if there is none
	char *getNextDocIdSublist ( char *p , char *listEnd ) {
		// skip that first 12-byte key
		p += 12;
		// skip the 6 byte keys
		for ( ; p < listEnd && getKeySize(p) == 6 ; p += 6 );
		// done
		return p;
	}

	// termid is the upper 48 bits of n2. only valid on a full key
	int64_t getTermId ( void *key ) {
		return ((key144_t *)key)->n2 >> 16;
	}

	// . docid straddles n1 and n2. assemble it from key bytes 7-11
	//   and drop the 2 low bits that belong to the siterank/zero bit
	// . needs at least a 12-byte key
	int64_t getDocId ( void *key ) {
		uint64_t d = 0LL;
		d = ((unsigned char *)key)[11];
		d <<= 32;
		d |= *(uint32_t *)(((unsigned char *)key)+7);
		d >>= 2;
		return d;
	}

	unsigned char getSiteRank ( void *key ) {
		return (((key144_t *)key)->n1 >> 37) & MAXSITERANK;
	}

	// lower 5 bits come from n1, the 6th bit from the L bit (0x08) in
	// the first byte of the key
	unsigned char getLangId ( void *key ) {
		unsigned char lower = (((key144_t *)key)->n1 >> 32) & 0x1f;
		if ( ((char *)key)[0] & 0x08 ) return lower | 0x20;
		return lower;
	}

	// one of the HASHGROUP_* values (the G bits)
	unsigned char getHashGroup ( void *key ) {
		return ((((unsigned char *)key)[3]) >>2) & MAXHASHGROUP;
	}

	// the 18 w bits are the top of the 4 bytes at key offset 2
	int32_t getWordPos ( void *key ) {
		return (*((uint32_t *)((unsigned char *)key+2))) >> (8+6);
	}

	// . store "wpos" truncated to MAXWORDPOS
	// . its 2 low bits are the top 2 bits of key[3], the remaining 16
	//   bits fill key[4] and key[5]
	inline void setWordPos ( char *key , uint32_t wpos ) {
		// truncate
		wpos &= MAXWORDPOS;
		if ( wpos & 0x01 ) key[3] |= 0x40;
		else               key[3] &= ~((unsigned char)0x40);
		if ( wpos & 0x02 ) key[3] |= 0x80;
		else               key[3] &= ~((unsigned char)0x80);
		wpos >>= 2;
		key[4] = ((char *)&wpos)[0];
		key[5] = ((char *)&wpos)[1];
	}

	unsigned char getWordSpamRank ( void *key ) {
		return ((((uint16_t *)key)[1]) >>6) & MAXWORDSPAMRANK;
	}

	unsigned char getDiversityRank ( void *key ) {
		return ((((unsigned char *)key)[2]) >>2) & MAXDIVERSITYRANK;
	}

	// the 2 F bits: 0 = original term, 1 = conjugate/sing/plural,
	// 2 = synonym, 3 = hyponym
	unsigned char getIsSynonym ( void *key ) {
		return (((key144_t *)key)->n1 ) & 0x03;
	}

	// reads the low bit of the key's third byte (low bit of the F field)
	unsigned char getIsHalfStopWikiBigram ( void *key ) {
		return ((char *)key)[2] & 0x01;
	}

	// the 5 p bits are the top of the first 2 bytes of the key
	unsigned char getDensityRank ( void *key ) {
		return ((*(uint16_t *)key) >> 11) & MAXDENSITYRANK;
	}

	// . overwrite the top 5 bits of key[1] with "dr", keeping its low 3
	// . "dr" is not range checked. bits above the low 5 are shifted out
	inline void setDensityRank ( char *key , unsigned char dr ) {
		// shift up
		dr <<= 3;
		// clear out
		key[1] &= 0x07;
		// or in
		key[1] |= dr;
	}

	// the N ("nosplit") bit is 0x01 of the key's second byte
	char isShardedByTermId ( void *key ){return ((char *)key)[1] & 0x01; }

	void setShardedByTermIdBit ( void *key ) {
		char *k = (char *)key;
		k[1] |= 0x01;
	}

	unsigned char getMultiplier ( void *key ) {
		return ((*(uint16_t *)key) >> 4) & MAXMULTIPLIER; }

	// . HACK: for sectionhash:xxxxx posdb keys
	// . we use the w,G,s,v and F bits (the 4 bytes at key offset 2)
	uint32_t getFacetVal32 ( void *key ) {
		return *(uint32_t *)(((char *)key)+2); }

	void setFacetVal32 ( void *key , int32_t facetVal32 ) {
		*(uint32_t *)(((char *)key)+2) = facetVal32; }

	// defined outside this header
	int64_t getTermFreq ( collnum_t collnum, int64_t termId ) ;

	Rdb *getRdb ( ) { return &m_rdb; }

	Rdb m_rdb;
};
// . statistics kept for one facet value
// . NOTE(review): who fills these in is not visible in this header;
//   presumably the gbfacet: handling (see BF_FACET) -- confirm
class FacetEntry {
public:
// # of search results that have this value:
int32_t m_count;
// # of docs that have this value:
int32_t m_outsideSearchResultsCount;
int64_t m_docId;
// cast as double/floats for floats:
int64_t m_sum;
int32_t m_max;
int32_t m_min;
};
// capacity of every per-sublist array in QueryTermInfo
#define MAX_SUBLISTS 50
// . each QueryTerm has this attached additional info now:
// . these should be 1-1 with query terms, Query::m_qterms[]
class QueryTermInfo {
public:
class QueryTerm *m_qt;
// the required lists for this query term, synonym lists, etc.
RdbList *m_subLists [MAX_SUBLISTS];
// flags to indicate if bigram list should be scored higher
// (NOTE(review): presumably the BF_* bits defined above -- confirm)
char m_bigramFlags [MAX_SUBLISTS];
// shrinkSubLists() set this:
int32_t m_newSubListSize [MAX_SUBLISTS];
char *m_newSubListStart [MAX_SUBLISTS];
char *m_newSubListEnd [MAX_SUBLISTS];
char *m_cursor [MAX_SUBLISTS];
char *m_savedCursor [MAX_SUBLISTS];
// the corresponding QueryTerm for this sublist
//class QueryTerm *m_qtermList [MAX_SUBLISTS];
int32_t m_numNewSubLists;
// how many are valid?
int32_t m_numSubLists;
// size of all m_subLists in bytes
int64_t m_totalSubListsSize;
// the term freq weight for this term
float m_termFreqWeight;
// what query term # do we correspond to in Query.h
int32_t m_qtermNum;
// the word position of this query term in the Words.h class
int32_t m_qpos;
// the wikipedia phrase id if we start one
int32_t m_wikiPhraseId;
// phrase id term or bigram is in
int32_t m_quotedStartId;
};
/*
#include "RdbList.h"
class PosdbList : public RdbList {
public:
// why do i have to repeat this for LinkInfo::set() calling our set()??
void set ( char *list , int32_t listSize , bool ownData ) {
RdbList::set ( list ,
listSize ,
list , // alloc
listSize , // alloc size
0 , // fixed data size
ownData ,
true , // use half keys?
sizeof(key_t));// 12 bytes per key
};
// clear the low bits on the keys so terms are DELETED
void clearDelBits ( );
void print();
// . these are made for special IndexLists, too
// . getTermId() assumes as 12 byte key
int64_t getCurrentTermId12 ( ) {
return getTermId12 ( m_listPtr ); };
int64_t getTermId12 ( char *rec ) {
return (*(uint64_t *)(&rec[4])) >> 16 ;
};
int64_t getTermId16 ( char *rec ) {
return (*(uint64_t *)(&rec[8])) >> 16 ;
};
// these 2 assume 12 and 6 byte keys respectively
int64_t getCurrentDocId () {
if ( isHalfBitOn ( m_listPtr ) ) return getDocId6 (m_listPtr);
else return getDocId12(m_listPtr);
};
int64_t getDocId ( char *rec ) {
if ( isHalfBitOn ( rec ) ) return getDocId6 (rec);
else return getDocId12(rec);
};
int64_t getCurrentDocId12 ( ) {
return getDocId12 ( m_listPtr ); };
int64_t getDocId12 ( char *rec ) {
return ((*(uint64_t *)(rec)) >> 2) & DOCID_MASK; };
int64_t getDocId6 ( char *rec ) {
int64_t docid;
*(int32_t *)(&docid) = *(int32_t *)rec;
((char *)&docid)[4] = rec[4];
docid >>= 2;
return docid & DOCID_MASK;
};
// this works with either 12 or 6 byte keys
unsigned char getCurrentScore ( ) {
return getScore(m_listPtr); };
unsigned char getScore ( char *rec ) { return ~rec[5]; };
// uncomplemented...
void setScore ( char *rec , char score ) { rec[5] = score; };
// for date lists only...
int32_t getCurrentDate ( ) { return ~*(int32_t *)(m_listPtr+6); };
};
*/
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
// max # search results that can be viewed without using TopTree
//#define MAX_RESULTS 1000
// . per-query state for the posdb intersection/scoring code
//   (see intersectLists10_r() below)
// . only the interface is declared here; the member functions are defined
//   outside this header
class PosdbTable {
public:
// . returns false on error and sets errno
// . "termFreqs" are 1-1 with q->m_qterms[]
// . sets m_q to point to q
// NOTE(review): the comment above predates the current signature -- init()
// returns void and takes no "termFreqs" parameter
void init (Query *q ,
char debug ,
void *logstate ,
class TopTree *topTree ,
//char *coll ,
collnum_t collnum ,
//IndexList *lists ,
//int32_t numLists ,
class Msg2 *msg2,
class Msg39Request *r );
// pre-allocate m_whiteListTable
bool allocWhiteListTable ( ) ;
// pre-allocate memory since intersection runs in a thread
bool allocTopTree ( );
// . returns false on error and sets errno
// . we assume there are "m_numTerms" lists passed in (see set() above)
//void intersectLists_r ( );
//void intersectLists9_r ( );
// scoring helpers. "i" and "j" are term indexes, "wpi"/"wpj" point into
// their position lists and "endi"/"endj" are the ends of those lists
// (NOTE(review): inferred from the parameter names -- confirm in the .cpp)
void getTermPairScoreForNonBody ( int32_t i, int32_t j,
char *wpi, char *wpj,
char *endi, char *endj,
int32_t qdist ,
float *retMax );
float getSingleTermScore ( int32_t i, char *wpi , char *endi,
class DocIdScore *pdcs,
char **bestPos );
void evalSlidingWindow ( char **ptrs ,
int32_t nr ,
char **bestPos ,
float *scoreMatrix ,
int32_t advancedTermNum );
float getTermPairScoreForWindow ( int32_t i, int32_t j,
char *wpi,
char *wpj,
int32_t fixedDistance
);
float getTermPairScoreForAny ( int32_t i, int32_t j,
char *wpi, char *wpj,
char *endi, char *endj,
class DocIdScore *pdcs );
bool makeDocIdVoteBufForBoolQuery_r ( ) ;
// some generic stuff
PosdbTable();
~PosdbTable();
void reset();
// Msg39 needs to call these
void freeMem ( ) ;
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
uint64_t m_docId;
uint64_t m_docIdHack;
bool m_hasFacetTerm;
bool m_hasMaxSerpScore;
// hack for seo.cpp:
float m_finalScore;
float m_preFinalScore;
float m_siteRankMultiplier;
// how long to add the last batch of lists
int64_t m_addListsTime;
int64_t m_t1 ;
int64_t m_t2 ;
int64_t m_estimatedTotalHits;
int32_t m_errno;
int32_t m_numSlots;
int32_t m_maxScores;
//char *m_coll;
collnum_t m_collnum;
// the next arrays carry the same kind of info as the like-named
// QueryTermInfo members (m_qpos, m_wikiPhraseId, m_quotedStartId, ...)
// NOTE(review): how they are indexed is not visible here -- confirm
int32_t *m_qpos;
int32_t *m_wikiPhraseIds;
int32_t *m_quotedStartIds;
//class DocIdScore *m_ds;
int32_t m_qdist;
float *m_freqWeights;
//int64_t *m_freqs;
char *m_bflags;
int32_t *m_qtermNums;
float m_bestWindowScore;
//char **m_finalWinners1;
//char **m_finalWinners2;
//float *m_finalScores;
char **m_windowTermPtrs;
// how many docs in the collection?
int64_t m_docsInColl;
//SectionStats m_sectionStats;
//SafeBuf m_facetHashList;
//HashTableX m_dt;
class Msg2 *m_msg2;
// if getting more than MAX_RESULTS results, use this top tree to hold
// them rather than the m_top*[] arrays above
// NOTE(review): no m_top*[] arrays are declared in this class any more
class TopTree *m_topTree;
//HashTableX m_docIdTable;
SafeBuf m_scoreInfoBuf;
// DocIdScore::m_pairsOffset / m_singlesOffset are offsets into these two
SafeBuf m_pairScoreBuf;
SafeBuf m_singleScoreBuf;
SafeBuf m_stackBuf;
//SafeBuf m_mergeBuf;
// a reference to the query
Query *m_q;
int32_t m_nqt;
// these are NOT in imap space, but in query term space, 1-1 with
// Query::m_qterms[]
//IndexList *m_lists;
//int32_t m_numLists;
// has init() been called?
bool m_initialized;
// are we in debug mode?
char m_debug;
// for debug msgs
void *m_logstate;
//int64_t m_numDocsInColl;
class Msg39Request *m_r;
// for gbsortby:item.price ...
int32_t m_sortByTermNum;
int32_t m_sortByTermNumInt;
// fix core with these two
int32_t m_sortByTermInfoNum;
int32_t m_sortByTermInfoNumInt;
// for gbmin:price:1.99
int32_t m_minScoreTermNum;
int32_t m_maxScoreTermNum;
// for gbmin:price:1.99
float m_minScoreVal;
float m_maxScoreVal;
// for gbmin:count:99
int32_t m_minScoreTermNumInt;
int32_t m_maxScoreTermNumInt;
// for gbmin:count:99
int32_t m_minScoreValInt;
int32_t m_maxScoreValInt;
// the new intersection/scoring algo
void intersectLists10_r ( );
HashTableX m_whiteListTable;
bool m_useWhiteTable;
bool m_addedSites;
// sets stuff used by intersect10_r()
bool setQueryTermInfo ( );
void shrinkSubLists ( class QueryTermInfo *qti );
int64_t countUniqueDocids( QueryTermInfo *qti ) ;
// for intersecting docids
void addDocIdVotes ( class QueryTermInfo *qti , int32_t listGroupNum );
// for negative query terms...
void rmDocIdVotes ( class QueryTermInfo *qti );
// upper score bound
float getMaxPossibleScore ( class QueryTermInfo *qti ,
int32_t bestDist ,
int32_t qdist ,
class QueryTermInfo *qtm ) ;
// stuff set in setQueryTermInfo() function:
SafeBuf m_qiBuf;
int32_t m_numQueryTermInfos;
// the size of the smallest set of sublists. each sublists is
// the main term or a synonym, etc. of the main term.
int32_t m_minListSize;
// which query term info has the smallest set of sublists
int32_t m_minListi;
// intersect docids from each QueryTermInfo into here
SafeBuf m_docIdVoteBuf;
int32_t m_filtered;
// boolean truth table for boolean queries
HashTableX m_bt;
HashTableX m_ct;
// size of the data slot in m_bt
int32_t m_vecSize;
// are all positive query terms in same wikipedia phrase like
// 'time enough for love'?
bool m_allInSameWikiPhrase;
// tunable cap that is <= MAX_TOP (see the comment above MAX_TOP)
int32_t m_realMaxTop;
};
// NOTE(review): MAXDST is only referenced in this header by the commented-out
// m_pairScores[MAXDST][MAXDST][MAX_TOP] array in DocIdScore -- confirm it is
// still used elsewhere
#define MAXDST 10
// distance used when measuring word from title/linktext/etc to word in body
#define FIXED_DISTANCE 400
// . scoring record for one pair of query terms in one document
// . the "1" and "2" suffixes are the first and second term of the pair
// . the rank/hashgroup/wordpos fields mirror the like-named posdb key fields
class PairScore {
public:
float m_finalScore;
char m_isSynonym1;
char m_isSynonym2;
char m_isHalfStopWikiBigram1;
char m_isHalfStopWikiBigram2;
char m_diversityRank1;
char m_diversityRank2;
char m_densityRank1;
char m_densityRank2;
char m_wordSpamRank1;
char m_wordSpamRank2;
char m_hashGroup1;
char m_hashGroup2;
char m_inSameWikiPhrase;
// NOTE(review): a char can not hold FIXED_DISTANCE (400), so this is
// presumably a flag rather than the distance itself -- confirm
char m_fixedDistance;
int32_t m_wordPos1;
int32_t m_wordPos2;
int64_t m_termFreq1;
int64_t m_termFreq2;
float m_tfWeight1;
float m_tfWeight2;
int32_t m_qtermNum1;
int32_t m_qtermNum2;
char m_bflags1;
char m_bflags2;
int32_t m_qdist;
};
// . scoring record for a single query term in one document
// . same fields as PairScore but for one term only
class SingleScore {
public:
float m_finalScore;
char m_isSynonym;
char m_isHalfStopWikiBigram;
char m_diversityRank;
char m_densityRank;
char m_wordSpamRank;
char m_hashGroup;
int32_t m_wordPos;
int64_t m_termFreq; // float m_termFreqWeight;
float m_tfWeight;
int32_t m_qtermNum;
char m_bflags;
};
// . the scores of up to this many top-scoring pairs are summed, for
//   inlink text only, so that score is accumulative
// . the "m_realMaxTop" parm is <= MAX_TOP and can tune the count down
#define MAX_TOP 10

// . transparent query scoring info for one docid
// . the PairScore/SingleScore records themselves live in separate buffers;
//   this class only records where they are and how many there are
class DocIdScore {

 public:

	// a new object starts out in the reset() state
	DocIdScore ( ) {
		reset();
	}

	// . forget any attached pair/single score records
	// . counts go to 0, offsets to -1 (none) and ptrs to NULL
	// . leaves m_docId, m_finalScore, m_siteRank, m_docLang and
	//   m_numRequiredTerms untouched
	void reset ( ) {
		m_numSingles    = 0;
		m_numPairs      = 0;
		m_singlesOffset = -1;
		m_pairsOffset   = -1;
		m_pairScores    = NULL;
		m_singleScores  = NULL;
	}

	// we use QueryChange::getDebugDocIdScore() to "deserialize" per se
	bool serialize ( class SafeBuf *sb );

	int64_t m_docId;

	// a double, not a float, because intScores (used to sort by
	// spidered time for example) can not be captured fully by a float.
	// see "intScore" in Posdb.cpp
	double m_finalScore;

	char m_siteRank;
	// langId
	int32_t m_docLang;
	int32_t m_numRequiredTerms;

	int32_t m_numPairs;
	int32_t m_numSingles;

	// . offset of our serialized term pairs in
	//   PosdbTable::m_pairScoreBuf
	// . each pair records the query term # of both of its terms and
	//   its match number, because a pair of query terms can have up
	//   to MAX_TOP associated pairs whose scores get added together
	//   for that pair's final score
	// . Msg39Reply::ptr_pairScoreBuf will be this
	int32_t m_pairsOffset;

	// . offset of our single term scores in
	//   PosdbTable::m_singleScoreBuf
	// . Msg39Reply::ptr_singleScoreBuf will be this
	int32_t m_singlesOffset;

	// Msg3a.cpp::mergeLists() should point these at the records after
	// it copies a top DocIdScore into the final results array
	class PairScore   *m_pairScores;
	class SingleScore *m_singleScores;
};
// the global posdb instance; getWordPosList() below decodes docids with it
extern Posdb g_posdb;
// NOTE(review): presumably the rebuild/secondary instance set up through
// Posdb::init2() (used by PageRepair.cpp) -- confirm
extern Posdb g_posdb2;
// NOTE(review): presumably caches results of Posdb::getTermFreq() -- confirm
extern RdbCache g_termFreqCache;
// . b-step (binary step) into "list" looking for docid "docId"
// . "list" is the start of the list, excluding the 6 bytes of termid, so
//   it is a run of 12-byte docid keys each followed by zero or more 6-byte
//   word position keys
// . returns a ptr to the 12-byte key of "docId", or NULL if not found
// . if only a NEGATIVE key (low bit of first byte clear) exists for "docId"
//   that key is returned and the caller has to check for it
// . stepping is always by 6 bytes. bit 0x02 of p[1] tells us what we are
//   on: it is set on the first 6 bytes of any key (the alignment bit) and
//   clear on the upper, docid-holding, 6 bytes of a 12-byte key
inline char *getWordPosList ( int64_t docId , char *list , int32_t listSize ) {
	// need at least one 12-byte key or there is nothing to look at
	if ( ! list || listSize < 12 ) return NULL;
	// make step divisible by 6 initially
	int32_t step = (listSize / 12) * 6;
	// shortcut
	char *listEnd = list + listSize;
	// divide in half
	char *p = list + step;
	// for detecting not founds
	char count = 0;

	for ( ; ; ) {
		// save it
		char *origp = p;
		// scan backwards until we land on the upper half of a
		// 12-byte key, or on the very first key of the list
		for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
		// . on an upper half, so back up 6 more to the key's start
		// . if p == list we are already on the start of the first
		//   key. backing up would read before the list
		if ( p > list ) p -= 6;
		// ok, we got a 12-byte key then i guess
		int64_t d = g_posdb.getDocId ( p );
		// we got a match, but it might be a NEGATIVE key so
		// we have to try to find the positive keys in that case
		if ( d == docId ) {
			// if its positive, no need to do anything else
			if ( (p[0] & 0x01) == 0x01 ) return p;
			// ok, it's negative, try to see if the positive is
			// in here, if not then return it so caller sees it.
			char *current = p;
			// look at the 12-byte key before us, if any
			if ( current > list ) {
				// back up to the 6 bytes before this key
				p = current - 6;
				// now go backwards to previous 12 byte key
				for ( ; p > list && (p[1] & 0x02) ; p -= 6 );
				// on its upper half, so backup 6 more
				p -= 6;
				// is it there?
				if ( p >= list &&
				     g_posdb.getDocId(p) == docId ) {
					// sanity. NULL if its negative!
					if ( (p[0] & 0x01) == 0x00 )
						return NULL;
					// got it
					return p;
				}
			}
			// ok, no positive before us, try after us.
			// advance over current 12 byte key
			p = current + 12;
			// . now go forwards to next 12 byte key
			// . this stops on that key's UPPER half (bit clear)
			for ( ; p < listEnd && (p[1] & 0x02) ; p += 6 );
			// . so back up to its start before decoding it
			// . NOTE(review): the original decoded the docid from
			//   the upper half here and used "p + 12 < listEnd"
			if ( p < listEnd && p > current + 12 ) p -= 6;
			// is it there?
			if ( p + 12 <= listEnd &&
			     g_posdb.getDocId(p) == docId ) {
				// sanity. return NULL if its negative!
				if ( (p[0] & 0x01) == 0x00 ) return NULL;
				// got it
				return p;
			}
			// . i guess we just had a single negative docid then
			// . return that and the caller will see its negative
			return current;
		}
		// reduce step
		step >>= 1;
		// make divisible by 6!
		step = step - (step % 6);
		// sanity
		if ( step % 6 ) { char *xx=NULL;*xx=0; }
		// ensure never 0
		if ( step <= 0 ) {
			step = 6;
			// return NULL if not found
			if ( count++ >= 2 ) return NULL;
		}
		// go up or down then
		if ( d < docId ) {
			p = origp + step;
			// stay inside the list. p == listEnd is out too
			if ( p >= listEnd ) p = listEnd - 6;
		}
		else {
			p = origp - step;
			if ( p < list ) p = list;
		}
		// and repeat
	}
}
#endif