// Matt Wells, copyright Sep 2001 // . gets an RdbList from disk // . reads from N specified files and stores results in N RdbLists #ifndef _MSG3_H_ #define _MSG3_H_ // . max # of rdb files an rdb can have w/o merging // . merge your files to keep the number of them low to cut down # of seeks // . we try to keep it down to only 1 file through merging // . now that we embed a title file num in tfndb for each docid, titledb // only needs to be merged to collide positive/negative recs to save disk // space, so we do not want to be limited by number of files for titledb // . we bumped this up to 512 to help get more sites out of site search //#define MAX_RDB_FILES 512 // allow us to spider for a while without having to merge //#define MAX_RDB_FILES 2048 // make Msg5 footprint smaller //#define MAX_RDB_FILES 512 // make Msg5 footprint smaller since we have "whitelist" in msg2.cpp // we need to run one msg5 per whitelisted site then and we can have up to // 500 sites in the whitelist. #define MAX_RDB_FILES 1024 //#define MSG3_BUF_SIZE ((sizeof(RdbScan)+sizeof(key_t)+sizeof(RdbList)+20)*6) //#define MSG3_BUF_SIZE ((sizeof(RdbScan)+MAX_KEY_BYTES+sizeof(RdbList)+20)*6) #define MSG3_BUF_SIZE 64 #include "RdbList.h" #include "RdbScan.h" class Msg3 { public: Msg3(); ~Msg3(); // just sets # of read lists (m_numScansCompleted) to 0 void reset(); // . try to get at least minRecSizes worth of records // . endKey of "list" may be less than "endKey" provided // . sometimes there is a disk read error (due to merge deleting files) // and retryNum/maxRetries help define the retries // . if "numFiles" is -1, it means read ALL files available // . if "justGetEndKey" is true, then the call just sets // m_msg3.m_endKey and m_msg3.m_constrainKey. This is just used // by Msg5.cpp to constrain the endKey so it can read the recs // from the tree using that endKey, and not waste time. bool readList ( char rdbId , char *coll , //key_t startKey , //key_t endKey , char *startKey , char *endKey , long minRecSizes , // scan size(-1 all) long startFileNum , // first file to scan long numFiles , // rel.2 startFileNum void *state , // for callback void (* callback ) ( void *state ) , long niceness , // = MAX_NICENESS , long retryNum , // = 0 , long maxRetries , // = -1 bool compensateForMerge , long long syncPoint , // = -1 (none) bool justGetEndKey = false , bool allowPageCache = true , bool hitDisk = true ); // for retrieving unmerged lists RdbList *getList ( long i ) {return &m_lists[i];}; long getTfn ( long i ) {return m_tfns[i];}; long getNumLists ( ) {return m_numScansCompleted; }; // keep public for doneScanningWrapper to use bool doneScanning ( ); // on read/write error we sleep and retry bool doneSleeping (); long m_numScansStarted; long m_numScansCompleted; void *m_state ; void (* m_callback )( void *state ); //private: // this might increase m_minRecSizes void compensateForNegativeRecs ( class RdbBase *base ) ; // . sets page ranges for RdbScan (m_startpg[i], m_endpg[i]) // . returns the endKey for all RdbScans //key_t setPageRanges ( class RdbBase *base , void setPageRanges ( class RdbBase *base , long *fileNums , long numFileNums , //key_t startKey , //key_t endKey , char *startKey , char *endKey , //long minRecSizes ); long minRecSizes ); // . buries bad pages from the m_lists we read from disk // . usually modifies m_badStartKey, m_badEndKey // . "n" is the bad list index into m_lists[] void extractBadness ( long n ); // the rdb we're scanning for char m_rdbId; char *m_coll; // the scan classes, 1 per file, used to read from that file RdbScan *m_scans ; // [ MAX_RDB_FILES ]; // page ranges for each scan computed in setPageRanges() long *m_startpg ; // [ MAX_RDB_FILES ]; long *m_endpg ; // [ MAX_RDB_FILES ]; //key_t *m_hintKeys ; // [ MAX_RDB_FILES ]; char *m_hintKeys ; // [ MAX_RDB_FILES ]; long *m_hintOffsets ; // [ MAX_RDB_FILES ]; long m_startFileNum; long m_numFiles ; long *m_fileNums ; // [ MAX_RDB_FILES ]; long m_numFileNums; // hold the lists we read from disk here RdbList *m_lists ; // [ MAX_RDB_FILES ]; long *m_tfns ; // [ MAX_RDB_FILES ]; // key range to read //key_t m_fileStartKey; //key_t m_startKey; //key_t m_endKey; char *m_fileStartKey; char m_startKey[MAX_KEY_BYTES]; char m_endKey[MAX_KEY_BYTES]; // end key to use when calling constrain_r() //key_t m_constrainKey; char m_constrainKey[MAX_KEY_BYTES]; // min bytes to read long m_minRecSizes; // keep some original copies incase errno == ETRYAGAIN //key_t m_endKeyOrig; char m_endKeyOrig[MAX_KEY_BYTES]; long m_minRecSizesOrig; long m_niceness; // last error received from doing all reads int m_errno; // only retry up to m_maxRetries times in case it was a fluke long m_retryNum; long m_maxRetries; // for debugging long long m_startTime; // . these hints make a call to constrain() fast // . used to quickly contrain the tail of a 1-list read long m_hintOffset; //key_t m_hintKey; char m_hintKey[MAX_KEY_BYTES]; bool m_compensateForMerge; //long long m_syncPoint; char m_buf[MSG3_BUF_SIZE]; char *m_alloc; long m_allocSize; long m_numChunks; char m_ks; // for allowing the page cache bool m_allowPageCache; bool m_listsChecked; bool m_hadCorruption; bool m_hitDisk; }; extern long g_numIOErrors; #endif