#include "gb-include.h" #include "IndexReadInfo.h" #include "Datedb.h" IndexReadInfo::IndexReadInfo() { m_numLists = 0; m_isDone = false; } // . initialize initial read info // . sets m_readSizes[i] for each list // . sets startKey/endKey for each list, too // . startKey set passed endKey to indicate no reading void IndexReadInfo::init ( Query *q , long long *termFreqs , long docsWanted , char callNum , long stage0 , long *tierStage, bool useDateLists , bool sortByDate , unsigned long date1 , unsigned long date2 , bool isDebug ) { // save ptr but don't copy m_q = q; m_useDateLists = useDateLists; m_sortByDate = sortByDate; m_date1 = date1; m_date2 = date2; m_isDebug = isDebug; if ( m_useDateLists ) m_ks = 16; else m_ks = 12; m_hks = m_ks - 6; // . now set m_includeList array // . set to false if we determine termId to be ousted due to dpf // . loop over each termId in the query for ( long i = 0 ; i < m_q->getNumTerms() ; i++ ) { // ignore some //m_ignore [i] = m_q->m_ignore[i]; // no need to gen keys if ignored //if ( m_ignore[i] ) continue; // nothing ignored initially m_ignore[i] = false; // make our arrays 1-1 with those in Query class, q if ( m_useDateLists ) { // remember, date is complemented in the key, so use // the larger date for the startKey *(key128_t *)&m_startKeys [i*m_ks] = g_datedb.makeStartKey(m_q->getTermId(i),m_date2); *(key128_t *)&m_endKeys [i*m_ks] = g_datedb.makeEndKey (m_q->getTermId(i),m_date1); continue; } *(key_t *)&m_startKeys [i*m_ks] = g_indexdb.makeStartKey ( m_q->getTermId(i) ); *(key_t *)&m_endKeys [i*m_ks] = g_indexdb.makeEndKey ( m_q->getTermId(i) ); } // no negatives for ( long i = 0; i < MAX_TIERS; i++ ){ if ( tierStage[i] < 0 ) tierStage[i] = 0; } // -1 means to use default if ( stage0 <= 0 ) { // adjust for dateLists, proportionally if ( m_useDateLists ) m_stage[0] = (tierStage[0] * (16-6)) / (12-6); else m_stage[0] = tierStage[0]; // STAGE0; } else m_stage[0] = stage0 * m_hks + 6; // for all the other stages just get the same tier size for ( long i = 1; i < MAX_TIERS; i++ ){ // adjust for dateLists, proportionally if ( m_useDateLists ) m_stage[i] = (tierStage[i] * (16-6)) / (12-6); else m_stage[i] = tierStage[i]; } // set # of lists m_numLists = m_q->getNumTerms(); // we're not done yet, we haven't even begun m_isDone = false; // . how many docs do we need to read to get docsWanted hits? // . HITS = (X2 * ... * XN) / T^N // . where Xi is docs read from each list // . T is the total # of docs in the index // . this assumes no dependence between the words // . So let's just start off reading 10,000, then 30k more then 60k // . So we break up our 100k truncation limit that way long toRead = m_stage[(int)callNum]; long long def = getStage0Default() ; long long *tf = termFreqs ; // . ...but if we're only reading 1 list... // . keys are 6 bytes each, first key is 12 bytes // . this made our result count inaccurate // . since we had to round up to half a PAGE_SIZE // (#defined to be 16k in RdbMap.h) we would never estimate at lower // than about 4,000 docids for one-word queries // . so, since we're going to read at least a PAGE_SIZE anyway, // removing this should not slow us down!! // . actually, should speed us up if all the guys site cluster which // is especially probable for rare terms --- all from the same site // . 
        // . So let's just start off reading 10,000, then 30k more then 60k
        // . So we break up our 100k truncation limit that way
        long       toRead = m_stage[(int)callNum];
        long long  def    = getStage0Default() ;
        long long *tf     = termFreqs ;
        // . ...but if we're only reading 1 list...
        // . keys are 6 bytes each, first key is 12 bytes
        // . this made our result count inaccurate
        // . since we had to round up to half a PAGE_SIZE
        //   (#defined to be 16k in RdbMap.h) we would never estimate at lower
        //   than about 4,000 docids for one-word queries
        // . so, since we're going to read at least a PAGE_SIZE anyway,
        //   removing this should not slow us down!!
        // . actually, should speed us up if all the guys site cluster, which
        //   is especially probable for rare terms --- all from the same site
        // . SECONDLY, now I use Msg39::getStageNum() to do prettier
        //   clustering, and that requires us to be consistent with our
        //   stages from Next 10 to Next 10
        //if ( m_q->getNumTerms() <= 1 ) toRead = docsWanted * 6 + 6;

        // now loop through all non-ignored lists
        for ( long i = 0 ; i < m_numLists ; i++ ) {
                // ignore lists that should be
                if ( m_ignore[i] ) { m_readSizes[i] = 0; continue; }
                // don't include excluded lists in this calculation
                if      ( m_q->m_qterms[i].m_termSign == '-' )
                        m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
                else if ( m_q->m_qterms[i].m_underNOT )
                        m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
                else if ( m_q->m_qterms[i].m_piped )
                        m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
                //m_readSizes[i] = g_indexdb.getTruncationLimit()*6+6;
                //m_readSizes[i] = g_indexdb.getTruncationLimit()*6 ;
                // . this is set to max if we got more than 1 ignored list
                // . later we will use dynamic truncation
                /*else if (useNewTierSizing && m_q->m_termFreqs[i] > tierStage2)
                        m_readSizes[i] = tierStage2;
                else if (useNewTierSizing && m_q->m_termFreqs[i] > tierStage1)
                        m_readSizes[i] = tierStage1;*/
                else
                        m_readSizes[i] = toRead;
                // . when the user specifies the s0=X cgi parm and X is like
                //   4M, try to avoid allocating so much space when we do not
                //   need it
                // . Mark is using s0 to get exact hit counts
                long long max = tf[i] * m_hks + m_hks + GB_INDEXDB_PAGE_SIZE*10 ;
                if ( max < def ) max = def;
                if ( m_readSizes[i] > max ) m_readSizes[i] = max;
                // debug msg
                if ( m_isDebug || g_conf.m_logDebugQuery )
                        logf ( LOG_DEBUG,"query: ReadInfo: "
                               "newreadSizes[%li]=%li",i, m_readSizes[i] );
                // sanity check
                if ( m_readSizes[i] > ( 500 * 1024 * 1024 ) ||
                     m_readSizes[i] < 0 ) {
                        log( "minRecSize = %li", m_readSizes[i] );
                        char *xx = NULL; *xx = 0;
                }
        }

        // return for now
        return;
}

long IndexReadInfo::getStage0Default ( ) {
        return STAGE0;
}

// . updates m_readSizes
// . sets m_isDone to true if all lists are exhausted
void IndexReadInfo::update ( IndexList *lists, long numLists, char callNum ) {
        // loop over all lists and update m_startKeys[i]
        for ( long i = 0 ; i < numLists ; i++ ) {
                // ignore lists that should be
                if ( m_ignore[i] ) continue;
                // . how many docIds did we read into this list?
                // . double the size since the lists are compressed to half now
                //long docsRead = lists[i].getListSize() / 6 ;
                // . remove the endKey put at the end by RdbList::appendLists()
                // . iff we did NOT do a merge
                //if ( ! didMerge && docsRead > 0 ) docsRead--;
                // debug
                //log("startKey for list #%li is n1=%lx,n0=%llx "
                //    "(docsRead=%li)",
                //    i,m_startKeys[i].n1,m_startKeys[i].n0,docsRead);
                // . if we read less than we were supposed to, this list is
                //   exhausted, so we set m_ignore[i] to true so we don't
                //   read it again
                // . we also now update termFreq to its exact value
                // . ok, this condition doesn't apply now because when we
                //   append lists so that they are all less than a common
                //   endKey, some lists lose some keys, so the minRecSizes
                //   goes down
                // . we should just check whether the # read is 0!
                //if ( docsRead < m_docsToRead[i] ) {
                if ( lists[i].getListSize() < m_readSizes[i] ) {
                        m_ignore [i] = true;
                        //m_readSizes[i] = 0;
                        continue;
                }
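                // . illustrative sketch of what the next few lines do (byte
                //   counts assume plain indexdb lists, m_ks=12 / m_hks=6; the
                //   real on-disk layout is defined by Indexdb/RdbList, not
                //   here): a returned list looks like
                //     [12-byte full key][6-byte half key]...[6-byte half key]
                //   so the last half key starts at list + listSize - m_hks;
                //   copying it over the low bytes of m_startKeys[i] and then
                //   adding 1 makes the next tier's read resume just past the
                //   last key we already have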
                // if we didn't meet our quota...
                //else if ( docsRead < m_docsToRead[i] )
                //        m_startKeys [i] = m_endKeys [i] ;

                // point to last compressed 6 byte key in list
                char *list     = (char *)lists[i].getList();
                long  listSize = lists[i].getListSize();
                // don't seg fault
                if ( listSize < m_hks ) {
                        m_ignore [i] = true;
                        // keep the old readsize
                        //m_readSizes[i] = 0;
                        continue;
                }
                // we now do NOT call appendLists() again since
                // we're using fast superMerges
                //char *lastPart = list + listSize - 6;
                char *lastPart = list + listSize - m_hks;
                // . we update m_startKey to the endKey of each list
                // . get the startKey now
                //key_t startKey = m_startKeys[i];
                char *startKey = &m_startKeys[i*m_ks];
                // . load lastPart into the lower 6 bytes of "startKey"
                // . little endian
                //memcpy ( &startKey , lastPart , 6 );
                memcpy ( startKey , lastPart , m_hks );
                // debug msg
                //log("pre-startKey for list #%li is n1=%lx,n0=%llx",
                //    i,startKey.n1,startKey.n0);
                // sanity checks
                //if ( startKey < m_startKeys[i] ) {
                if ( KEYCMP(startKey,&m_startKeys[i*m_ks],m_ks) < 0 ) {
                        log("query: bad startKey. "
                            "a.n1=%016llx a.n0=%016llx < "
                            "b.n1=%016llx b.n0=%016llx" ,
                            KEY1(startKey,m_ks), KEY0(startKey),
                            KEY1(&m_startKeys[i*m_ks],m_ks),
                            KEY0(&m_startKeys[i*m_ks]) );
                        //startKey.n1 = 0xffffffff;
                        //startKey.n0 = 0xffffffffffffffffLL;
                }
                // update startKey to read the next piece now
                //m_startKeys[i] = startKey;
                KEYSET(&m_startKeys[i*m_ks],startKey,m_ks);
                // add 1 to startKey
                //m_startKeys[i] += (unsigned long) 1;
                KEYADD(&m_startKeys[i*m_ks],1,m_ks);
                // debug msg
                //log("NOW startKey for list #%li is n1=%lx,n0=%llx",
                //    i,m_startKeys[i].n1,m_startKeys[i].n0);
                // . increase termFreqs if we read more than was estimated
                // . no! that just changes the # of total results when
                //   clicking Next 10
                //if ( docsRead > m_q->m_termFreqs[i] )
                //        m_q->m_termFreqs[i] = docsRead;
        }

        // break out if any list can still read more
        long i;
        for ( i = 0 ; i < numLists ; i++ )
                if ( ! m_ignore[i] ) break;
        // if all lists are exhausted, set m_isDone
        if ( i >= numLists ) { m_isDone = true; return; }

        // . based on the # of results we got, how much more do we have to
        //   read to get what we want, "docsWanted"?
        // . just base it on linear proportion
        // . keep in mind, if we double the amount to read we will quadruple
        //   the results if reading 2 indexLists, x8 if reading from 3.
        // . that doesn't take into account phrases though...
        // . let's just do it this way

        // loop over all lists and bump m_readSizes[i] for the next read
        for ( long i = 0 ; i < numLists ; i++ ) {
                // ignore lists that should be
                if ( m_ignore[i] ) continue;
                // update each list's docs to read
                m_readSizes[i] = m_stage[(int)callNum];
                /*if      ( m_readSizes[i] < m_stage[0] )
                        m_readSizes[i] = m_stage0;
                else if ( m_readSizes[i] < m_stage[1] )
                        m_readSizes[i] = m_stage1;
                else
                        m_readSizes[i] = m_stage2;*/
                // debug msg
                log("newreadSizes[%li]=%li",i,m_readSizes[i]);
        }
}

// . updates m_readSizes
// . sets m_isDone to true if all lists are exhausted
// . used by virtual split in msg3b to check if we're done or not.
void IndexReadInfo::update ( long long *termFreqs, long numLists,
                             char callNum ) {
        // loop over all lists and see which are now exhausted
        for ( long i = 0 ; i < numLists ; i++ ) {
                // ignore lists that should be
                if ( m_ignore[i] ) continue;
                // . how many bytes did we read? Since these are half keys,
                //   multiply termFreqs by 6 and add 6 for the first key,
                //   which is a full 12 bytes
                long long listSize = termFreqs[i] * 6 + 6;
                if ( listSize < m_readSizes[i] ) {
                        m_ignore [i] = true;
                        //m_readSizes[i] = 0;
                        continue;
                }
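                // . e.g. (illustrative numbers only): a term with an
                //   estimated termFreq of 1,000 docids maps to
                //   1,000*6 + 6 = 6,006 bytes; if we had asked to read
                //   30,006 bytes for it, the whole list already fits in what
                //   we requested, so it was just marked exhausted above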
                // if we didn't meet our quota...
                //else if ( docsRead < m_docsToRead[i] )
                //        m_startKeys [i] = m_endKeys [i] ;
                // point to last compressed 6 byte key in list
                //char *list = (char *)lists[i].getList();
                // don't seg fault
                if ( listSize < m_hks ) {
                        m_ignore [i] = true;
                        //m_readSizes[i] = 0;
                        continue;
                }
        }

        // break out if any list can still read more
        long i;
        for ( i = 0 ; i < numLists ; i++ )
                if ( ! m_ignore[i] ) break;
        // if all lists are exhausted, set m_isDone
        if ( i >= numLists ) { m_isDone = true; return; }

        // . based on the # of results we got, how much more do we have to
        //   read to get what we want, "docsWanted"?
        // . just base it on linear proportion
        // . keep in mind, if we double the amount to read we will quadruple
        //   the results if reading 2 indexLists, x8 if reading from 3.
        // . that doesn't take into account phrases though...
        // . let's just do it this way

        // loop over all lists and bump m_readSizes[i] for the next tier
        for ( long i = 0 ; i < numLists ; i++ ) {
                // debug msg
                //log("oldreadSizes[%li]=%li",i,m_readSizes[i]);
                // update each list's docs to read if we're not on the
                // last tier
                if ( ! m_ignore[i] && callNum < MAX_TIERS &&
                     m_readSizes[i] < m_stage[(int)callNum] )
                        m_readSizes[i] = m_stage[(int)callNum];
                /*if      ( m_readSizes[i] < m_stage0 )
                        m_readSizes[i] = m_stage0;
                else if ( m_readSizes[i] < m_stage1 )
                        m_readSizes[i] = m_stage1;
                else
                        m_readSizes[i] = m_stage2;*/
                // debug msg
                if ( m_isDebug || g_conf.m_logDebugQuery )
                        logf ( LOG_DEBUG,"query: ReadInfo: "
                               "newreadSizes[%li]=%li",i,m_readSizes[i] );
        }
}
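// . a rough sketch of how this class is meant to be driven, kept here for
//   reference only; the real driver is Msg39/Msg3b, and the names used below
//   (exampleReadLoop, readIndexList) are made up for illustration, as is the
//   assumption that m_numLists/m_isDone are directly accessible to the caller
#if 0
static void exampleReadLoop ( IndexReadInfo *ri , Query *q ,
                              long long *termFreqs , long *tierStage ,
                              IndexList *lists ) {
        // tier 0: pick read sizes and startKey/endKey ranges per term
        ri->init ( q , termFreqs , 10 /*docsWanted*/ , 0 /*callNum*/ ,
                   -1 /*stage0: use default*/ , tierStage ,
                   false /*useDateLists*/ , false /*sortByDate*/ ,
                   0 /*date1*/ , 0xffffffff /*date2*/ , false /*isDebug*/ );
        for ( char tier = 0 ; tier < MAX_TIERS ; tier++ ) {
                // the caller would read m_readSizes[i] bytes starting at
                // m_startKeys[i] into lists[i] for each un-ignored term
                // (hypothetical helper, not a real function):
                //   readIndexList ( &lists[i] , ... );
                // then advance the start keys and bump sizes for the next tier
                ri->update ( lists , ri->m_numLists , tier );
                if ( ri->m_isDone ) break;
        }
}
#endif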