open-source-search-engine/IndexTable.h

// Matt Wells, copyright Aug 2001

// . calls for intersecting a bunch of IndexLists to generate docIds
// . IndexLists are data-less lists (keys only)
// . each key in an IndexList is a termId/score/adultBit/docId tuple
// . we try to use as small a sublist of each IndexList as possible to avoid
//   wasting network bandwidth
// . TODO: split into 2+ classes

// TODO: implement site clustering??????? in getNumResults()

// TODO: if we have in cache we can hash right into the table, but
//       we must do that before blocking on something in case it disappears
//       from the cache

// TODO: it is possible to get a better scoring result, even if we found
//       10 docIds in the heads of ALL the IndexLists. Because it may have
//       a really high score in 3 of the IndexLists, but a low score in the
//       fourth, but it's sum may be the highest of all docIds.

// TODO: the search "cell phone cable hp jornada 680", w/o quotes, should
//       be quote forced anyway. Some pages will match all but "cable hp"
//       so we should break that up into it's 2 terms, cable and hp

// 6912 (30%) of queries are 1 word queries  138
// 7825 (34%) of queries are 2 word queries  156
// 4512 (20%) of queries are 3 word queries  90
// 1771 ( 8%) of queries are 4 word queries  32
// 869  ( 4%) of queries are 5 word queries  17
// 391  ( 2%) of queries are 6 word queries  8
// 290  ( 1%) of queries are 7 word queries  6
// 183  ( 1%) of queries are 8 word queries (4 per second)

#ifndef _INDEXTABLE_H_
#define _INDEXTABLE_H_

#include "Query.h"         // MAX_QUERY_TERMS, qvec_t
#include "Indexdb.h"       // makeStartKey(), getTruncationLimit()
#include "IndexList.h"     // for m_lists[]
#include "Titledb.h"       // g_titledb.getTotalNumDocs()
#include "IndexReadInfo.h" // MAX_TIERS

// max # search results that can be viewed
#define MAX_RESULTS 1000

class IndexTable {

 public:

	// . returns false on error and sets errno
	// . we now support multiple plus signs before the query term
	// . start/endTermNums apply to phrase termIds only
	// . allows us to set multiple bits when a phrase termId is matched
	//   in case the singleton was truncated, but doc has the phrase
	// . if you want Default AND behaviour set requireAllTerms to true
	//   it is much faster, too
	void init (Query *q,bool isDebug,void *logstate,bool requireAllTerms,
		   class TopTree *topTree );

	// has init already been called?
	bool isInitialized ( ) { return m_initialized; };

	// sets m_positiveBits, etc.
	//void prepareToAddLists ( );

	// . returns false on error and sets errno
	// . we assume there are "m_numTerms" lists passed in (see set() above)
	void addLists_r ( IndexList   lists[MAX_TIERS][MAX_QUERY_TERMS] ,
			  int32_t        numTiers        ,
			  int32_t        numListsPerTier ,
			  Query      *q               ,
			  int32_t        docsWanted      ,
			  int32_t       *totalListSizes  ,
			  bool        useDateLists    ,
			  bool        sortByDate      ,
			  float       sortByDateWeight );

	// . these are set from calling addLists() above
	// . we log all matching topDocIds if isDebug is true
	int64_t *getTopDocIds ( int32_t tier ) { return m_topDocIds[tier]; };

	unsigned char *getTopBitScores ( int32_t tier )
		{ return m_topBitScores[tier]; };

	char *getTopExplicits ( int32_t tier ) { return m_topExplicits[tier]; };

	int32_t      *getTopScores ( int32_t tier ) { return m_topScores[tier]; };
	//uint32_t *getTopBitScores () { return m_finalTopBitScores; };

	// make sure to call getTopDocIds() before calling this
	int32_t   getNumTopDocIds ( int32_t tier ) { return m_numTopDocIds[tier]; };

	// . get how many results we have in the topDocIds list
	// . if "thatIncludeAllTerms" is true, results must have all terms
	//   from all indexLists that we haven't read ALL of yet
	int32_t getNumExactExplicitMatches ( int32_t tier ) {
		return m_numExactExplicitMatches[tier];};

	int32_t getNumExactImplicitMatches ( int32_t tier ) {
		return m_numExactImplicitMatches[tier];};

	// some generic stuff
	 IndexTable();
	~IndexTable();
	void reset();

	// . call to set the m_final* member vars from the m_top* member vars
	// . ALWAYS call this BEFORE calling
	//   getTopDocIds(), getNumTopDocIds() or getNumExactMatches()
	void filterTopDocIds ( ) ;

	// how long to add the last batch of lists
	int64_t       m_addListsTime;
	uint32_t   m_totalDocIds;
	int32_t            m_numPanics;
	int32_t            m_numCollisions;
	int32_t            m_numPtrs; // in the beginning at least
	int32_t            m_numLoops;

	// how long to get top docIds
	int64_t       m_setTopDocIdsTime;

	int64_t       m_estimatedTotalHits;

	int32_t            m_numSlots;

	// Msg39 needs to call these
	void freeMem ( ) ;
	bool alloc (IndexList lists[MAX_TIERS][MAX_QUERY_TERMS],
		    int32_t      numTiers                  ,
		    int32_t      numListsPerTier           ,
		    int32_t      docsWanted                ,
		    bool      sortByDate                );

	bool        doRecall() { return m_doRecall; };

	int32_t getNumDocsInTier ( int32_t i ) { return m_numDocsInTier[i]; };

	// . sets m_scoreWeights[] based on termFreqs (IDF)
	void setScoreWeights ( Query *q );
	void setScoreWeights ( Query *q , bool phrase );
	int32_t *getScoreWeights ( ) { return m_scoreWeights; };

 private:

	void addLists2_r ( IndexList   lists[MAX_TIERS][MAX_QUERY_TERMS] ,
			   int32_t        numTiers        ,
			   int32_t        numListsPerTier ,
			   Query      *q               ,
			   int32_t        docsWanted      ,
			   int32_t       *imap            ,
			   bool        lastRound       ,
			   int32_t        numBaseLists    ,
			   bool        useDateLists    ,
			   bool        sortByDate      ,
			   float       sortByDateWeight,
			   int32_t       *minHardCountPtr );

	void hashTopDocIds2 ( uint32_t  *maxDocId     ,
			      char          **docIdPtrs    ,
			      int32_t           *scores       ,
			      qvec_t         *explicitBits ,
			      int16_t          *hardCounts   ,
			      uint32_t   mask         ,
			      int32_t            numSlots     ) ;


	// . used for getting which topDocId to kick out of the top list
	int32_t getWeakestTopDocId ( char          **topp         ,
				  int32_t           *tops         ,
				  unsigned char  *topb         ,
				  int32_t            numTop       ,
				  unsigned char  *minBitScore2 ,
				  int32_t           *score        ,
				  char          **docIdPtr     ) ;

	// . get the termBits for the termId represented by this list
	// . only phrases may set multiple bits
	qvec_t getTermImplicitBitMask_r ( int32_t i );

	// . set the m_bitScores[] array
	// . "count" is the # of query term (single or phrase) bit combinations
	void setBitScores ( int32_t count );

	// are lists swapped?
	bool m_swapped [ MAX_TIERS ] [ MAX_QUERY_TERMS ] ;

	// these describe the lists associated with each m_termId
	int32_t       m_scoreWeights  [ MAX_QUERY_TERMS ];

	// for each tier we have a list of the top docids
	int64_t       m_topDocIds       [ MAX_TIERS ] [ MAX_RESULTS ];
	char           *m_topDocIdPtrs    [ MAX_TIERS ] [ MAX_RESULTS ];
	int32_t            m_topScores       [ MAX_TIERS ] [ MAX_RESULTS ];
	//int16_t           m_topHardCounts   [ MAX_TIERS ] [ MAX_RESULTS ];
	unsigned char   m_topBitScores    [ MAX_TIERS ] [ MAX_RESULTS ];
	char            m_topExplicits    [ MAX_TIERS ] [ MAX_RESULTS ];
	int32_t            m_numTopDocIds    [ MAX_TIERS ] ;
	int32_t            m_numExactExplicitMatches [ MAX_TIERS ];
	int32_t            m_numExactImplicitMatches [ MAX_TIERS ];
	int32_t            m_numTiers;

	// when filterTopDocIds() is called it uniquifies and combines
	// m_topDocIds[*][] into m_finalDocIds
	/*	int64_t       m_finalTopDocIds    [ MAX_RESULTS ];
	int32_t            m_finalTopScores    [ MAX_RESULTS ];
	//uint32_t   m_finalTopBitScores [ MAX_RESULTS ];
	int32_t            m_finalNumExactExplicitMatches ;
	int32_t            m_finalNumExactImplicitMatches ;
	int32_t            m_finalNumTopDocIds ;*/

	// a reference to the query
	Query          *m_q;

	// has init() been called?
	bool            m_initialized;

	// are we in debug mode?
	bool            m_isDebug;

	// for debug msgs
	int32_t            m_logstate;

	// . did we already call m_q->setBitScores() for this query?
	// . don't call it more than once
	bool            m_alreadySet;

	bool            m_doRecalc;

	bool             m_requireAllTerms;
	char           **m_topDocIdPtrs2;
	int32_t            *m_topScores2;
	qvec_t          *m_topExplicits2;
	int16_t           *m_topHardCounts2;
	int32_t             m_maxTopDocIds2;
	int32_t             m_numTopDocIds2;
	int32_t             m_nexti;
	int32_t             m_oldnexti;
	bool             m_doRecall;

	// allocated memory
	char *m_buf;
	int32_t  m_bufSize;
	char *m_bufMiddle;

	// for large hashtable for sortByDate
	char *m_bigBuf;
	int32_t  m_bigBufSize;

	// the imap stuff
	int32_t m_imap      [ MAX_QUERY_TERMS ];
	int32_t m_sizes     [ MAX_QUERY_TERMS ];
	int32_t m_blocksize [ MAX_QUERY_TERMS ];
	int32_t m_nb;

	class TopTree *m_topTree;

	// these are for removing component lists replaced by their compounds
	int32_t *m_componentCodes;
	//char       *m_ignore;
	//bool        m_scoresSet;
	int32_t  m_numDocsInTier [ MAX_TIERS ] ;
};

// . get the LOWEST scoring docId from our list of top docIds
// . set "minBitScore22" and "score" for that lowest docId
// . inline this for speed
// . BUT lower docIds are considered higher scoring than higher docIds
inline int32_t IndexTable::getWeakestTopDocId ( char          **topp         ,
					     int32_t           *tops         ,
					     unsigned char  *topb         ,
					     int32_t            numTop       ,
					     unsigned char  *minBitScore2 ,
					     int32_t           *score        ,
					     char          **docIdPtr     ) {
	int64_t      tmp         = 0LL;
	int32_t           minScore    = 0x7fffffff;
	unsigned char  minBitScore = 0xff;
	char          *minDocIdPtr = (char *)&tmp;
	int32_t           mini        = 0;
	for ( int32_t i = 0 ; i < numTop ; i++ ) {
		if ( topb [i] > minBitScore ) continue;
		if ( topb [i] < minBitScore ) goto gotIt;
		if ( tops [i] > minScore    ) continue;
		if ( tops [i] < minScore    ) goto gotIt;
		if ( *(uint32_t *)(topp[i]+1  )  <
		     *(uint32_t *)(minDocIdPtr+1)    ) continue;
		if ( *(uint32_t *)(topp[i]+1  )  >
		     *(uint32_t *)(minDocIdPtr+1)    ) goto gotIt;
		if ( (*(unsigned char *)(topp[i]    ) & 0xfc) <
		     (*(unsigned char *)(minDocIdPtr) & 0xfc) ) continue;
		// ties should not be happening for docid, unless
		// it tied with initial setting of minDocIdPtr, in that
		// case we should add it!
	gotIt:
		minScore     = tops [i];
		minBitScore  = topb [i];
		minDocIdPtr  = topp [i];
		mini         = i;
	}
	// set the callers ptrs
	*minBitScore2 = minBitScore;
	*score        = minScore;
	*docIdPtr     = minDocIdPtr;
	// return the lowest scoring docId's position
	return mini;
}

#endif