open-source-search-engine/IndexReadInfo.cpp

#include "gb-include.h"
#include "IndexReadInfo.h"
#include "Datedb.h"
IndexReadInfo::IndexReadInfo() {
m_numLists = 0;
m_isDone = false;
}
// . initialize initial read info
// . sets m_readSizes[i] for each list
// . sets startKey/endKey for each list, too
// . startKey set past endKey to indicate no reading
void IndexReadInfo::init ( Query *q ,
int64_t *termFreqs ,
int32_t docsWanted , char callNum ,
int32_t stage0 ,
int32_t *tierStage,
bool useDateLists , bool sortByDate ,
uint32_t date1 , uint32_t date2 ,
bool isDebug ) {
// save ptr but don't copy
m_q = q;
m_useDateLists = useDateLists;
m_sortByDate = sortByDate;
m_date1 = date1;
m_date2 = date2;
m_isDebug = isDebug;
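// . key sizes: datedb keys are 16 bytes, indexdb keys are 12
// . m_hks is the compressed "half key" size, i.e. the full key minus
// the shared upper 6 bytes holding the termid: 10 for datedb, 6 for
// indexdb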
if ( m_useDateLists ) m_ks = 16;
else m_ks = 12;
m_hks = m_ks - 6;
// . now set the startKey/endKey for each term and clear m_ignore
// . m_ignore[i] gets set to true later, in update(), once a term's
// list is exhausted
// . loop over each termId in the query
for ( int32_t i = 0 ; i < m_q->getNumTerms() ; i++ ) {
// ignore some
//m_ignore [i] = m_q->m_ignore[i];
// no need to gen keys if ignored
//if ( m_ignore[i] ) continue;
// nothing ignored initially
m_ignore[i] = false;
// make our arrays 1-1 with those in Query class, q
if ( m_useDateLists ) {
// remember, date is complemented in the key, so use
// the larger date for the startKey
*(key128_t *)&m_startKeys [i*m_ks] =
g_datedb.makeStartKey(m_q->getTermId(i),m_date2);
*(key128_t *)&m_endKeys [i*m_ks] =
g_datedb.makeEndKey (m_q->getTermId(i),m_date1);
continue;
}
*(key_t *)&m_startKeys [i*m_ks] =
g_indexdb.makeStartKey ( m_q->getTermId(i) );
*(key_t *)&m_endKeys [i*m_ks] =
g_indexdb.makeEndKey ( m_q->getTermId(i) );
}
// no negatives
for ( int32_t i = 0; i < MAX_TIERS; i++ ){
if ( tierStage[i] < 0 )
tierStage[i] = 0;
}
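// . tierStage[] looks like per-tier read budgets in bytes, sized for
// 6-byte indexdb half keys; the dateLists branches below scale them by
// (16-6)/(12-6) so roughly the same number of docids is read per tier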
// . stage0 <= 0 means use the default tier size
// . otherwise stage0 is a docid count (the s0=X cgi parm) and gets
// converted to a byte size below
if ( stage0 <= 0 ) {
// adjust for dateLists, proportionally
if ( m_useDateLists )
m_stage[0] = (tierStage[0] * (16-6)) / (12-6);
else
m_stage[0] = tierStage[0]; // STAGE0;
}
else
m_stage[0] = stage0 * m_hks + 6;
// for all the other stages just get the same tier size
for ( int32_t i = 1; i < MAX_TIERS; i++ ){
// adjust for dateLists, proportionally
if ( m_useDateLists )
m_stage[i] = (tierStage[i] * (16-6)) / (12-6);
else
m_stage[i] = tierStage[i];
}
// set # of lists
m_numLists = m_q->getNumTerms();
// we're not done yet, we haven't even begun
m_isDone = false;
// . how many docs do we need to read to get docsWanted hits?
// . HITS = (X1 * X2 * ... * XN) / T^(N-1)
// . where Xi is the # of docids read from list i
// . T is the total # of docs in the index
// . this assumes no dependence between the words
// . So let's just start off reading 10,000, then 30k more, then 60k
// . So we break up our 100k truncation limit that way
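// . e.g. with T = 1,000,000 docs and two terms read at X1 = X2 = 10,000
// docids each, we'd expect roughly 10,000 * 10,000 / 1,000,000 = 100
// intersecting docids, assuming the terms are independent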
int32_t toRead = m_stage[(int)callNum];
int64_t def = getStage0Default() ;
int64_t *tf = termFreqs ;
// . ...but if we're only reading 1 list...
// . keys are 6 bytes each, first key is 12 bytes
// . this made our result count inaccurate
// . since we had to round up to half a PAGE_SIZE
// (#defined to be 16k in RdbMap.h) we would never estimate at lower
// than about 4,000 docids for one-word queries
// . so, since we're going to read at least a PAGE_SIZE anyway,
// removing this should not slow us down!!
// . actually, removing it should speed us up if all the results site
// cluster, which is especially probable for rare terms --- all from
// the same site
// . SECONDLY, now i use Msg39::getStageNum() to do prettier clustering
// and that requires us to be consistent with our stages from Next
// 10 to Next 10
//if ( m_q->getNumTerms() <= 1 ) toRead = docsWanted * 6 + 6;
// now loop through all non-ignored lists
for ( int32_t i = 0 ; i < m_numLists ; i++ ) {
// ignore lists that should be
if ( m_ignore[i] ) { m_readSizes[i]=0; continue; }
// don't include excluded lists in this calculation
if ( m_q->m_qterms[i].m_termSign == '-' )
m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
else if ( m_q->m_qterms[i].m_underNOT )
m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
else if ( m_q->m_qterms[i].m_piped )
m_readSizes[i] = m_stage[MAX_TIERS - 1] ; // STAGESUM;
//m_readSizes[i] = g_indexdb.getTruncationLimit() *6+6;
// m_readSizes[i] = g_indexdb.getTruncationLimit()*6 ;
// . this is set to max if we got more than 1 ignored list
// . later we will use dynamic truncation
/*else if (useNewTierSizing && m_q->m_termFreqs[i] > tierStage2)
m_readSizes[i] = tierStage2;
else if (useNewTierSizing && m_q->m_termFreqs[i] > tierStage1)
m_readSizes[i] = tierStage1;*/
else m_readSizes[i] = toRead;
// . when the user specifies the s0=X cgi parm and X is huge, like 4M,
// try to avoid allocating more space than we actually need
// . Mark is using s0 to get exact hit counts
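// . cap the read at roughly the term's whole list: termFreq half keys
// of m_hks bytes each, plus about 10 RdbMap pages of slack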
int64_t max = tf[i] * m_hks+m_hks +GB_INDEXDB_PAGE_SIZE*10 ;
if ( max < def ) max = def;
if ( m_readSizes[i] > max ) m_readSizes[i] = max;
// debug msg
if ( m_isDebug || g_conf.m_logDebugQuery )
logf ( LOG_DEBUG,"query: ReadInfo: "
"newreadSizes[%"INT32"]=%"INT32"",i,
m_readSizes[i] );
// sanity check
if ( m_readSizes[i] > ( 500 * 1024 * 1024 ) ||
m_readSizes[i] < 0 ){
log( "minRecSize = %"INT32"", m_readSizes[i] );
char *xx=NULL; *xx=0; // intentional crash to force a core dump
}
}
// return for now
return;
}
int32_t IndexReadInfo::getStage0Default ( ) { return STAGE0; }
// . updates m_readSizes
// . sets m_isDone to true if all lists are exhausted
void IndexReadInfo::update ( IndexList *lists, int32_t numLists,
char callNum ) {
// loop over all lists and update m_startKeys[i]
for ( int32_t i = 0 ; i < numLists ; i++ ) {
// ignore lists that should be
if ( m_ignore[i] ) continue;
// . how many docIds did we read into this list?
// . double the size since the lists are compressed to half keys now
//int32_t docsRead = lists[i].getListSize() / 6 ;
// . remove the endKey put at the end by RdbList::appendLists()
// . iff we did NOT do a merge
//if ( ! didMerge && docsRead > 0 ) docsRead--;
// debug
//log("startKey for list #%"INT32" is n1=%"XINT32",n0=%"XINT64" "
// "(docsRead=%"INT32")",
// i,m_startKeys[i].n1,m_startKeys[i].n0,docsRead);
// . if we read less than we were supposed to, this list is exhausted,
// so we set m_ignore[i] to true so we don't read it again
// . we also now update termFreq to its exact value
// . ok, this condition doesn't apply now because when we append lists
// so that they are all less than a common endKey, some lists lose keys
// and the minRecSizes goes down
// . we should really just check whether the # read is 0!
//if ( docsRead < m_docsToRead[i] ) {
if ( lists[i].getListSize() < m_readSizes[i] ) {
m_ignore [i] = true;
//m_readSizes[i] = 0;
continue;
}
// if we didn't meet our quota...
//else if ( docsRead < m_docsToRead[i] )
// m_startKeys [i] = m_endKeys [i] ;
// point to the last compressed (m_hks byte) key in the list
char *list = (char *)lists[i].getList();
int32_t listSize = lists[i].getListSize();
// don't seg fault
if ( listSize < m_hks ) {
m_ignore [i] = true;
// keep the old readsize
// m_readSizes[i] = 0;
continue;
}
// we now do NOT call appendLists() again since
// we're using fast superMerges
//char *lastPart = list + listSize - 6;
char *lastPart = list + listSize - m_hks;
// . we advance each list's m_startKey to just past the last key we read
// . get the startKey now
//key_t startKey = m_startKeys[i];
char *startKey = &m_startKeys[i*m_ks];
// . load lastPart into the lower m_hks bytes of "startKey"
// . little endian
//memcpy ( &startKey , lastPart , 6 );
memcpy ( startKey , lastPart , m_hks );
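// only the low m_hks bytes change, so the termid prefix in the upper
// 6 bytes of startKey stays intact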
// debug msg
//log("pre-startKey for list #%"INT32" is n1=%"XINT32",n0=%"XINT64"",
// i,startKey.n1,startKey.n0);
// sanity checks
//if ( startKey < m_startKeys[i] ) {
if ( KEYCMP(startKey,&m_startKeys[i*m_ks],m_ks)<0 ) {
log("query: bad startKey. "
"a.n1=%016"XINT64" a.n0=%016"XINT64" < "
"b.n1=%016"XINT64" b.n0=%016"XINT64"" ,
KEY1(startKey,m_ks),
KEY0(startKey ),
KEY1(&m_startKeys[i*m_ks],m_ks),
KEY0(&m_startKeys[i*m_ks] ));
//startKey.n1 = 0xffffffff;
//startKey.n0 = 0xffffffffffffffffLL;
}
// update startKey to read the next piece now
//m_startKeys[i] = startKey;
KEYSET(&m_startKeys[i*m_ks],startKey,m_ks);
// add 1 to startKey
//m_startKeys[i] += (uint32_t) 1;
KEYADD(&m_startKeys[i*m_ks],1,m_ks);
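// the next read for this list now starts just past the last key we
// already have, so successive tiers cover contiguous key ranges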
// debug msg
//log("NOW startKey for list #%"INT32" is n1=%"XINT32",n0=%"XINT64"",
// i,m_startKeys[i].n1,m_startKeys[i].n0);
// . increase termFreqs if we read more than was estimated
// . no! just changes # of total results when clicking Next 10
//if ( docsRead > m_q->m_termFreqs[i] )
// m_q->m_termFreqs[i] = docsRead;
}
// break at the first list that can still read more, if any
int32_t i;
for ( i = 0 ; i < numLists ; i++ ) if ( ! m_ignore[i] ) break;
// if all lists are exhausted, set m_isDone
if ( i >= numLists ) { m_isDone = true; return; }
// . based on # of results we got how much more should we have to read
// to get what we want, "docsWanted"
// . just base it on linear proportion
// . keep in mind, if we double the amount to read we will quadruple
// the results if reading 2 indexLists, x8 if reading from 3.
// . that doesn't take into account phrases though...
// . let's just do it this way
// loop over all lists and bump m_readSizes[i] up to this tier's budget
for ( int32_t i = 0 ; i < numLists ; i++ ) {
// ignore lists that should be
if ( m_ignore[i] ) continue;
// update each list's docs to read
m_readSizes[i] = m_stage[(int)callNum];
/* if ( m_readSizes[i] < m_stage[0])
m_readSizes[i] = m_stage0;
else if ( m_readSizes[i] < m_stage[1])
m_readSizes[i] = m_stage1;
else
m_readSizes[i] = m_stage2;*/
// debug msg
if ( m_isDebug || g_conf.m_logDebugQuery )
logf ( LOG_DEBUG,"query: ReadInfo: "
"newreadSizes[%"INT32"]=%"INT32"",i,m_readSizes[i] );
}
}
// . updates m_readSizes
// . sets m_isDone to true if all lists are exhausted
// . used by virtual split in msg3b to check if we're done or not.
void IndexReadInfo::update ( int64_t *termFreqs,
int32_t numLists,
char callNum ) {
// loop over all lists and update m_startKeys[i]
for ( int32_t i = 0 ; i < numLists ; i++ ) {
// ignore lists that should be
if ( m_ignore[i] ) continue;
// . how many bytes did we read? since these are half keys, multiply
// termFreqs by 6 and add 6 for the first key, which is a full 12 bytes
int64_t listSize = termFreqs[i] * 6 + 6;
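// . note: this sizing assumes standard 12-byte indexdb keys with 6-byte
// half keys; date lists (m_ks = 16) would use m_hks-byte half keys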
if ( listSize < m_readSizes[i] ) {
m_ignore [i] = true;
//m_readSizes[i] = 0;
continue;
}
// if we didn't meet our quota...
//else if ( docsRead < m_docsToRead[i] )
// m_startKeys [i] = m_endKeys [i] ;
// point to last compressed 6 byte key in list
//char *list = (char *)lists[i].getList();
// a list smaller than one half key is exhausted too
if ( listSize < m_hks ) {
m_ignore [i] = true;
//m_readSizes[i] = 0;
continue;
}
}
// break at the first list that can still read more, if any
int32_t i;
for ( i = 0 ; i < numLists ; i++ ) if ( ! m_ignore[i] ) break;
// if all lists are exhausted, set m_isDone
if ( i >= numLists ) { m_isDone = true; return; }
// . based on # of results we got how much more should we have to read
// to get what we want, "docsWanted"
// . just base it on linear proportion
// . keep in mind, if we double the amount to read we will quadruple
// the results if reading 2 indexLists, x8 if reading from 3.
// . that doesn't take into account phrases though...
// . let's just do it this way
// loop over all lists and bump m_readSizes[i] up to this tier's budget
for ( int32_t i = 0 ; i < numLists ; i++ ) {
// debug msg
//log("oldreadSizes[%"INT32"]=%"INT32"",i,m_readSizes[i]);
// update each list's docs to read if we're not on the last
// tier
if ( !m_ignore[i] && callNum < MAX_TIERS &&
m_readSizes[i] < m_stage[(int)callNum] )
m_readSizes[i] = m_stage[(int)callNum];
/*if ( m_readSizes[i] < m_stage0)
m_readSizes[i] = m_stage0;
else if ( m_readSizes[i] < m_stage1)
m_readSizes[i] = m_stage1;
else
m_readSizes[i] = m_stage2;*/
// debug msg
if ( m_isDebug || g_conf.m_logDebugQuery )
logf ( LOG_DEBUG,"query: ReadInfo: "
"newreadSizes[%"INT32"]=%"INT32"",i,m_readSizes[i] );
}
}