open-source-search-engine/IndexReadInfo.h
2013-08-02 13:12:24 -07:00

144 lines
4.2 KiB
C++

// Matt Wells, copyright Oct 2001
// . used for looking up IndexLists for queries
// . call init() to get initial read info per IndexList (1 per termId in query)
// . call update() to update read info for next read of lists
// . use getStartKey() , getEndKey(), getNumRecsToRead() to extract read info
// . tries to keep the amount of reading to a minimum
// . if the # of results is not achieved then call update() to get read info
//   for another read to hopefully get the # of requested docIds
#ifndef _INDEXREADINFO_H_
#define _INDEXREADINFO_H_
#include "Query.h" // MAX_QUERY_TERMS
#include "IndexList.h"
#include "Titledb.h"
#include "Indexdb.h"
// how many tiers might we break an indexlist into?
// (also the length of IndexReadInfo::m_stage[])
#define MAX_TIERS 3
// . define the default read size of each stage
// . each docid is 6 bytes, but first is 12
// . each value below is a docid count multiplied by that 6-byte size
// . history: stage0 was 5000, then 8000; the counts are now powers of ten
//   (10,000 / 100,000 / 1,000,000) to see how those perform
#define STAGE0 (10000 *6)
#define STAGE1 (100000 *6)
#define STAGE2 (1000000 *6)
// sum of all stage sizes defined above (no STAGE3 is defined in this header)
#define STAGESUM (STAGE0 + STAGE1 + STAGE2) // + STAGE3)
// Holds the per-term read state (start key, end key, read size, ignore flag)
// used when fetching IndexLists for a query, one slot per termId in the query.
// Typical use: init() once, read the lists, then update() before each
// additional read until isDone() reports true.
class IndexReadInfo {

 public:

	// constructor; documented as doing nothing more than zeroing m_numLists
	// (the definition is not in this header)
	IndexReadInfo ();

	// . computes the min start key and max end key for every termId
	// . the supplied data is NOT copied, so the caller must keep it alive
	//   (do not let it fall off the stack)
	// . "stage0" is how many docIds to read from each IndexList on the
	//   first pass (dynamic truncation)
	void init ( Query         *q           ,
		    long long     *termFreqs   ,
		    long           docsWanted  ,
		    char           callNum     ,
		    long           stage0      ,
		    long          *tierStage   ,
		    bool           useDateLists,
		    bool           sortByDate  ,
		    unsigned long  date1       ,
		    unsigned long  date2       ,
		    bool           isDebug     );

	// . refreshes the start keys and the amount to read for each list so
	//   another read can be issued
	// . meant to be called once a read has completed and
	//   IndexTable::addLists() has hashed the lists and counted results
	// . moves the start key of list #i to (last key in lists[i]) + 1
	void update ( IndexList *lists, long numLists, char callNum );

	void update2 ( long tier );

	// (an updateForMsg3b ( lastParts , termFreqs , numLists ) variant
	//  used to be declared here; it is disabled)

	void update ( long long *termFreqs, long numLists, char callNum );

	// same idea as update() on full lists, but driven only by the last
	// part of each list and each list's size
	void update ( char *lastParts, long *listSizes, long numLists );

	// . accessors for the per-list read info; valid after update()
	// . start/end keys come back as raw byte buffers
	char *getStartKeys   ()         { return m_startKeys; }
	char *getEndKeys     ()         { return m_endKeys;   }
	char  getIgnored     ( long i ) { return m_ignore[i]; }
	char  getHalfKeySize ()         { return m_hks;       }

	// direct access to the read sizes, one list or the whole array
	long  getReadSize    ( long i ) { return m_readSizes[i]; }
	long *getReadSizes   ()         { return m_readSizes;    }

	// . true when the required # of results was reached, or when every
	//   list has been exhausted
	// . only meaningful AFTER update() has been called
	bool isDone () { return m_isDone; }

	// estimate of the total # of results; only call after init()
	long long getEstimatedTotalHits ();

	long getNumLists () { return m_numLists; }

	long getStage0Default ();

 private:

	// . where the next portion of each list should be read from
	// . first set by init(), then advanced as lists are added
	// . a single list may be read several times when too few hits come back
	// . stored as raw key bytes, MAX_KEY_BYTES per query term, instead of
	//   the older typed key_t / key128_t arrays
	char m_startKeys [ MAX_QUERY_TERMS * MAX_KEY_BYTES ];
	char m_endKeys   [ MAX_QUERY_TERMS * MAX_KEY_BYTES ];

	// how much (docIds/recs/keys) to read from each list
	long m_readSizes [ MAX_QUERY_TERMS ];
	char m_ignore    [ MAX_QUERY_TERMS ];

	// . the query being served
	// . the arrays above line up 1-1 with the arrays in m_q (one entry per
	//   termId)
	Query *m_q;

	// number of index lists being read
	long m_numLists;

	// can become true once update() has run
	bool m_isDone;

	// . dynamic truncation: # of docs to read from each list, per stage
	// . settable per query rather than fixed
	long m_stage [ MAX_TIERS ];
	//long m_stageSum;

	// m_hks is what getHalfKeySize() returns; m_ks is presumably the full
	// key size -- confirm against the implementation
	char m_ks;
	char m_hks;

	char          m_useDateLists;
	char          m_sortByDate;
	unsigned long m_date1;
	unsigned long m_date2;
	bool          m_isDebug;
};
#endif