open-source-search-engine/IndexTable.h

310 lines
11 KiB
C++

// Matt Wells, copyright Aug 2001
// . calls for intersecting a bunch of IndexLists to generate docIds
// . IndexLists are data-less lists (keys only)
// . each key in an IndexList is a termId/score/adultBit/docId tuple
// . we try to use as small a sublist of each IndexList as possible to avoid
// wasting network bandwidth
// . TODO: split into 2+ classes
// TODO: implement site clustering??????? in getNumResults()
// TODO: if we have in cache we can hash right into the table, but
// we must do that before blocking on something in case it disappears
// from the cache
// TODO: it is possible to get a better scoring result, even if we found
// 10 docIds in the heads of ALL the IndexLists. Because it may have
// a really high score in 3 of the IndexLists, but a low score in the
// fourth, but it's sum may be the highest of all docIds.
// TODO: the search "cell phone cable hp jornada 680", w/o quotes, should
// be quote forced anyway. Some pages will match all but "cable hp"
// so we should break that up into it's 2 terms, cable and hp
// 6912 (30%) of queries are 1 word queries 138
// 7825 (34%) of queries are 2 word queries 156
// 4512 (20%) of queries are 3 word queries 90
// 1771 ( 8%) of queries are 4 word queries 32
// 869 ( 4%) of queries are 5 word queries 17
// 391 ( 2%) of queries are 6 word queries 8
// 290 ( 1%) of queries are 7 word queries 6
// 183 ( 1%) of queries are 8 word queries (4 per second)
#ifndef _INDEXTABLE_H_
#define _INDEXTABLE_H_
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
#include "Indexdb.h" // makeStartKey(), getTruncationLimit()
#include "IndexList.h" // for m_lists[]
#include "Titledb.h" // g_titledb.getTotalNumDocs()
#include "IndexReadInfo.h" // MAX_TIERS
// max # search results that can be viewed
#define MAX_RESULTS 1000
class IndexTable {
public:
// . returns false on error and sets errno
// . we now support multiple plus signs before the query term
// . start/endTermNums apply to phrase termIds only
// . allows us to set multiple bits when a phrase termId is matched
// in case the singleton was truncated, but doc has the phrase
// . if you want Default AND behaviour set requireAllTerms to true
// it is much faster, too
void init (Query *q,bool isDebug,void *logstate,bool requireAllTerms,
class TopTree *topTree );
// has init already been called?
bool isInitialized ( ) { return m_initialized; };
// sets m_positiveBits, etc.
//void prepareToAddLists ( );
// . returns false on error and sets errno
// . we assume there are "m_numTerms" lists passed in (see set() above)
void addLists_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
int32_t numTiers ,
int32_t numListsPerTier ,
Query *q ,
int32_t docsWanted ,
int32_t *totalListSizes ,
bool useDateLists ,
bool sortByDate ,
float sortByDateWeight );
// . these are set from calling addLists() above
// . we log all matching topDocIds if isDebug is true
int64_t *getTopDocIds ( int32_t tier ) { return m_topDocIds[tier]; };
unsigned char *getTopBitScores ( int32_t tier )
{ return m_topBitScores[tier]; };
char *getTopExplicits ( int32_t tier ) { return m_topExplicits[tier]; };
int32_t *getTopScores ( int32_t tier ) { return m_topScores[tier]; };
//uint32_t *getTopBitScores () { return m_finalTopBitScores; };
// make sure to call getTopDocIds() before calling this
int32_t getNumTopDocIds ( int32_t tier ) { return m_numTopDocIds[tier]; };
// . get how many results we have in the topDocIds list
// . if "thatIncludeAllTerms" is true, results must have all terms
// from all indexLists that we haven't read ALL of yet
int32_t getNumExactExplicitMatches ( int32_t tier ) {
return m_numExactExplicitMatches[tier];};
int32_t getNumExactImplicitMatches ( int32_t tier ) {
return m_numExactImplicitMatches[tier];};
// some generic stuff
IndexTable();
~IndexTable();
void reset();
// . call to set the m_final* member vars from the m_top* member vars
// . ALWAYS call this BEFORE calling
// getTopDocIds(), getNumTopDocIds() or getNumExactMatches()
void filterTopDocIds ( ) ;
// how long to add the last batch of lists
int64_t m_addListsTime;
uint32_t m_totalDocIds;
int32_t m_numPanics;
int32_t m_numCollisions;
int32_t m_numPtrs; // in the beginning at least
int32_t m_numLoops;
// how long to get top docIds
int64_t m_setTopDocIdsTime;
int64_t m_estimatedTotalHits;
int32_t m_numSlots;
// Msg39 needs to call these
void freeMem ( ) ;
bool alloc (IndexList lists[MAX_TIERS][MAX_QUERY_TERMS],
int32_t numTiers ,
int32_t numListsPerTier ,
int32_t docsWanted ,
bool sortByDate );
bool doRecall() { return m_doRecall; };
int32_t getNumDocsInTier ( int32_t i ) { return m_numDocsInTier[i]; };
// . sets m_scoreWeights[] based on termFreqs (IDF)
void setScoreWeights ( Query *q );
void setScoreWeights ( Query *q , bool phrase );
int32_t *getScoreWeights ( ) { return m_scoreWeights; };
private:
void addLists2_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
int32_t numTiers ,
int32_t numListsPerTier ,
Query *q ,
int32_t docsWanted ,
int32_t *imap ,
bool lastRound ,
int32_t numBaseLists ,
bool useDateLists ,
bool sortByDate ,
float sortByDateWeight,
int32_t *minHardCountPtr );
void hashTopDocIds2 ( uint32_t *maxDocId ,
char **docIdPtrs ,
int32_t *scores ,
qvec_t *explicitBits ,
int16_t *hardCounts ,
uint32_t mask ,
int32_t numSlots ) ;
// . used for getting which topDocId to kick out of the top list
int32_t getWeakestTopDocId ( char **topp ,
int32_t *tops ,
unsigned char *topb ,
int32_t numTop ,
unsigned char *minBitScore2 ,
int32_t *score ,
char **docIdPtr ) ;
// . get the termBits for the termId represented by this list
// . only phrases may set multiple bits
qvec_t getTermImplicitBitMask_r ( int32_t i );
// . set the m_bitScores[] array
// . "count" is the # of query term (single or phrase) bit combinations
void setBitScores ( int32_t count );
// are lists swapped?
bool m_swapped [ MAX_TIERS ] [ MAX_QUERY_TERMS ] ;
// these describe the lists associated with each m_termId
int32_t m_scoreWeights [ MAX_QUERY_TERMS ];
// for each tier we have a list of the top docids
int64_t m_topDocIds [ MAX_TIERS ] [ MAX_RESULTS ];
char *m_topDocIdPtrs [ MAX_TIERS ] [ MAX_RESULTS ];
int32_t m_topScores [ MAX_TIERS ] [ MAX_RESULTS ];
//int16_t m_topHardCounts [ MAX_TIERS ] [ MAX_RESULTS ];
unsigned char m_topBitScores [ MAX_TIERS ] [ MAX_RESULTS ];
char m_topExplicits [ MAX_TIERS ] [ MAX_RESULTS ];
int32_t m_numTopDocIds [ MAX_TIERS ] ;
int32_t m_numExactExplicitMatches [ MAX_TIERS ];
int32_t m_numExactImplicitMatches [ MAX_TIERS ];
int32_t m_numTiers;
// when filterTopDocIds() is called it uniquifies and combines
// m_topDocIds[*][] into m_finalDocIds
/* int64_t m_finalTopDocIds [ MAX_RESULTS ];
int32_t m_finalTopScores [ MAX_RESULTS ];
//uint32_t m_finalTopBitScores [ MAX_RESULTS ];
int32_t m_finalNumExactExplicitMatches ;
int32_t m_finalNumExactImplicitMatches ;
int32_t m_finalNumTopDocIds ;*/
// a reference to the query
Query *m_q;
// has init() been called?
bool m_initialized;
// are we in debug mode?
bool m_isDebug;
// for debug msgs
int32_t m_logstate;
// . did we already call m_q->setBitScores() for this query?
// . don't call it more than once
bool m_alreadySet;
bool m_doRecalc;
bool m_requireAllTerms;
char **m_topDocIdPtrs2;
int32_t *m_topScores2;
qvec_t *m_topExplicits2;
int16_t *m_topHardCounts2;
int32_t m_maxTopDocIds2;
int32_t m_numTopDocIds2;
int32_t m_nexti;
int32_t m_oldnexti;
bool m_doRecall;
// allocated memory
char *m_buf;
int32_t m_bufSize;
char *m_bufMiddle;
// for large hashtable for sortByDate
char *m_bigBuf;
int32_t m_bigBufSize;
// the imap stuff
int32_t m_imap [ MAX_QUERY_TERMS ];
int32_t m_sizes [ MAX_QUERY_TERMS ];
int32_t m_blocksize [ MAX_QUERY_TERMS ];
int32_t m_nb;
class TopTree *m_topTree;
// these are for removing component lists replaced by their compounds
int32_t *m_componentCodes;
//char *m_ignore;
//bool m_scoresSet;
int32_t m_numDocsInTier [ MAX_TIERS ] ;
};
// . get the LOWEST scoring docId from our list of top docIds
// . set "minBitScore22" and "score" for that lowest docId
// . inline this for speed
// . BUT lower docIds are considered higher scoring than higher docIds
inline int32_t IndexTable::getWeakestTopDocId ( char **topp ,
int32_t *tops ,
unsigned char *topb ,
int32_t numTop ,
unsigned char *minBitScore2 ,
int32_t *score ,
char **docIdPtr ) {
int64_t tmp = 0LL;
int32_t minScore = 0x7fffffff;
unsigned char minBitScore = 0xff;
char *minDocIdPtr = (char *)&tmp;
int32_t mini = 0;
for ( int32_t i = 0 ; i < numTop ; i++ ) {
if ( topb [i] > minBitScore ) continue;
if ( topb [i] < minBitScore ) goto gotIt;
if ( tops [i] > minScore ) continue;
if ( tops [i] < minScore ) goto gotIt;
if ( *(uint32_t *)(topp[i]+1 ) <
*(uint32_t *)(minDocIdPtr+1) ) continue;
if ( *(uint32_t *)(topp[i]+1 ) >
*(uint32_t *)(minDocIdPtr+1) ) goto gotIt;
if ( (*(unsigned char *)(topp[i] ) & 0xfc) <
(*(unsigned char *)(minDocIdPtr) & 0xfc) ) continue;
// ties should not be happening for docid, unless
// it tied with initial setting of minDocIdPtr, in that
// case we should add it!
gotIt:
minScore = tops [i];
minBitScore = topb [i];
minDocIdPtr = topp [i];
mini = i;
}
// set the callers ptrs
*minBitScore2 = minBitScore;
*score = minScore;
*docIdPtr = minDocIdPtr;
// return the lowest scoring docId's position
return mini;
}
#endif