// open-source-search-engine/IndexTable.cpp

#include "gb-include.h"
#include "IndexTable.h"
#include "Stats.h"
#include <math.h>
#include "Conf.h"
#include "Mem.h" // getHighestLitBitValue()
#include "TopTree.h"
#include "sort.h"
// global var
//TopTree *g_topTree;
// when using Msg39.cpp to call addLists_r() on 2 5MB prefab lists:
// 1. it takes 100ms just to load the 10MB from main mem into the L1 cache (133MB/s)
// 2. Then we read AND write 10MB to the hash table in the L1 cache,
// which @ 400MB/s takes 25ms + 25ms = 50ms.
// 2b. Also, when hashing 2 million keys, each key will read and write a
// full slot in the hash table; each slot is 10 bytes, so that's 20 megs
// @ 400MB/s which is 50ms + 50ms = 100ms.
// 2c. Then there's chaining, about 10% of docids will chain, 10ms
// 3. Then we loop 2432 times over the 20k hash table in the L1 cache to
// get the winners which is reading 49MB @ 400MB/s = 122ms.
// 4. total mem access time is 382ms
// 5. so out of the 620ms or so for intersecting 'the .. sex', 382ms is
// just the bare memory bottleneck!
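// (that is, 100ms load + 50ms table r/w + 100ms slot r/w + 10ms chaining
// + 122ms winner scan = 382ms, the total given in #4)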
// NOTE: i used membustest.cpp to measure L1 cache speed, main mem speed, etc.
// on lenny
IndexTable::IndexTable() {
// top docid info
m_q = NULL;
m_topDocIdPtrs2 = NULL;
m_topScores2 = NULL;
m_topExplicits2 = NULL;
m_topHardCounts2= NULL;
m_buf = NULL;
m_bigBuf = NULL;
reset();
}
IndexTable::~IndexTable() { reset(); }
void IndexTable::reset() {
m_initialized = false;
m_numTiers = 0;
//m_alreadySet = false;
m_estimatedTotalHits = -1;
m_doRecall = true;
// filterTopDocIds is now in msg3a, and so we need to init some stuff
// or else it cores when addlists2_r is not executed.
for ( long i = 0; i < MAX_TIERS; i++ ){
m_numTopDocIds[i] = 0;
m_numExactExplicitMatches[i] = 0;
m_numExactImplicitMatches[i] = 0;
}
freeMem();
}
// . max score weight
// . we base score weight on the termFreq(# of docs that have that term)
// . it is now also based on the # of plus signs prepended before a term
#define MAX_SCORE_WEIGHT 1000
// . HACK: date hack
// . this is a special score weight i added for sorting by date
// . it scales the score of date indexlists
// . score of termIds 0xdadadada and 0xdadadad2 represent date of the document
// . 0xdadadada's score is the most significant 8 bits of the 16-bit date and
// . 0xdadadad2's score is the least significant 8 bits
// . the date is stored in days since the epoch (16 bits needed = 2 scores)
// . unfortunately, it may not always work too well
// . the max score any doc can have (w/o date) is about 16,000
// . so when we weight the date by DATE_WEIGHT and add to the doc's score
// just a 1 second difference will supersede any difference in term scores
#define DATE_WEIGHT (2 * MAX_QUERY_TERMS * MAX_SCORE_WEIGHT)
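// . arithmetic check, assuming MAX_QUERY_TERMS is 16 as the max-score
//   comment above setScoreWeights() below assumes: DATE_WEIGHT =
//   2 * 16 * 1000 = 32,000, so a single unit of date score already
//   outweighs the ~16,000 max term score mentioned above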
// . returns false on error and sets g_errno
// . NOTE: termIds,termFreqs,termSigns are just referenced by us, not copied
// . sets m_startKeys, m_endKeys and m_minNumRecs for each termId
// . TODO: ensure that m_termFreqs[] are all UPPER BOUNDS on the actual #!!
// we should be able to get an upper bound estimate from the b-tree
// quickly!
// . we now support multiple plus signs before the query term
void IndexTable::init ( Query *q , bool isDebug , void *logstate ,
bool requireAllTerms , TopTree *topTree ) {
// bail if already initialized
if ( m_initialized ) return;
// clear everything
reset();
// we are now
m_initialized = true;
// set debug flag
m_isDebug = isDebug;
// list heads are not swapped yet
for ( long i = 0 ; i < q->getNumTerms() ; i++ )
// loop over all lists in this term's tiers
for ( long j = 0 ; j < MAX_TIERS ; j++ )
m_swapped[j][i] = false;
// . are we default AND? it is much faster.
// . using ~/queries/queries.X.rex on rex:
// . 81q/s raw=8&sc=0&dr=0&rat=1 sum = 288.0 avg = 0.3 sdev = 0.5
// . 70q/s raw=8&sc=1&dr=1&rat=1 sum = 185.0 avg = 0.4 sdev = 0.5
// . -O2 sum = 204.00 avg = 0.15 sdev = 0.26
// . 45q/s raw=8&sc=0&dr=0&rat=0 sum = 479.0 avg = 1.1 sdev = 1.1
// . 38q/s raw=8&sc=1&dr=1&rat=0 sum = 429.0 avg = 1.2 sdev = 1.2
// . -O2 raw=8&sc=0&dr=0&rat=0 sum = 351.00 avg = 0.60 sdev = 0.69
// . speed up mostly from not having to look up as many title recs?
// . the Msg39 reported time to intersect is 4 times higher for rat=0
// . grep "intersected lists took" rat1 | awk '{print $8}' | add
// . do this on host0c for speed testing:
// ?q=windows+2000+server+product+key&usecache=0&debug=1&rat=1
m_requireAllTerms = requireAllTerms;
// make sure our max score isn't too big
//long a = MAX_QUERY_TERMS * MAX_QUERY_TERMS * 300 * 255 + 255;
//long long aa=MAX_QUERY_TERMS * MAX_QUERY_TERMS * 300LL * 255LL + 255;
//if ( a != aa ) {
//log("IndexTable::set: MAX_QUERY_TERMS too big"); exit(-1); }
// save it
m_topTree = topTree;
//g_topTree = topTree;
// . HACK: we must always get the top 500 results for now
// . so next results page doesn't have same results as a previous page
// . TODO: refine this
// . this is repeated in IndexReadInfo.cpp
// . HACK: this prevents dups when clicking "Next X Results" link
//if ( docsWanted < 500 ) docsWanted = 500;
// remember the query class, it has all the info about the termIds
m_q = q;
// just a short cut
m_componentCodes = m_q->m_componentCodes;
// for debug msgs
m_logstate = (long)logstate;
// after this many docids for a term, we start dropping some
//long truncLimit = g_indexdb.getTruncationLimit();
// . set score weight for each term based on termFreqs
// . termFreqs is just the max size of an IndexList
setScoreWeights ( m_q );
// . HACK: date hack
// . set termSigns of the special date termIds to 'd' so addList()
// knows to throw these scores into the highest tier (just for them)
// . also re-set the weights so the most significant 8-bits of our
// 16-bit date is weighted by 256, and the low by 1
/*
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
if ( m_q->m_termIds[i] == (0xdadadada & TERMID_MASK) ) {
m_q->m_termSigns[i] = 'd';
m_scoreWeights[i] = 256;
}
else if ( m_q->m_termIds[i] == (0xdadadad2 & TERMID_MASK) ) {
m_q->m_termSigns[i] = 'd';
m_scoreWeights[i] = 1;
}
}
*/
}
// . the score weight is in [1,MAX_SCORE_WEIGHT=1000]
// . so max score would be MAX_QUERY_TERMS*1000*255 = 16*1000*255 = 4,080,000
// . now we use MAX_QUERY_TERMS * MAX_SCORE to be the actual max score so
// that we can use 4,080,000 for truncated hard-required terms
// . TODO: make adjustable from an xml interpolated point map
void IndexTable::setScoreWeights ( Query *q ) {
// set weights for phrases and singletons independently
setScoreWeights ( q , true );
setScoreWeights ( q , false );
}
void IndexTable::setScoreWeights ( Query *q , bool phrases ) {
// get an estimate on # of docs in the database
//long long numDocs = g_titledb.getGlobalNumDocs();
// this should only be zero if we have 0 docs, so make it 1 if so
//if ( numDocs <= 0 ) numDocs = 1;
// . compute the total termfreqs
// . round small termFreqs up to half a GB_PAGE_SIZE
double minFreq = 0x7fffffffffffffffLL;
for ( long i = 0 ; i < q->getNumTerms() ; i++ ) {
// ignore termIds we should
//if ( q->m_ignore[i] ) continue;
// component lists are merged into one compound list
if ( m_componentCodes[i] >= 0 ) continue;
// . if we're setting phrases, it must be an UNQUOTED phrase
// . quoted phrases have a term sign!
if ( phrases ) {
if ( q->isInQuotes (i) ) continue;
if ( ! q->isPhrase (i) ) continue;
if ( q->getTermSign(i) != '\0' ) continue;
}
else if ( ! q->isInQuotes (i) &&
q->isPhrase (i) &&
q->getTermSign(i) == '\0' )
continue;
// is this the new min?
if ( q->getTermFreq(i) < minFreq ) minFreq = q->getTermFreq(i);
}
// to balance things out don't allow minFreq below "absMin"
double absMin = GB_PAGE_SIZE/(2*sizeof(key_t));
if ( minFreq < absMin ) minFreq = absMin;
// loop through each term computing the score weight for it
for ( long i = 0 ; i < q->getNumTerms() ; i++ ) {
// reserve half the weight for up to 4 plus signs
//long long max = MAX_SCORE_WEIGHT / 3;
// i eliminated the multi-plus thing
//long long max = MAX_SCORE_WEIGHT ;
// . 3 extra plusses can triple the score weight
// . each extra plus adds "extra" to the score weight
//long extra = (2 * MAX_SCORE_WEIGHT) / 9;
// add 1/6 for each plus over 1
//if ( q->m_numPlusses[i] > 0 )
// max += (q->m_numPlusses[i] - 1) * extra;
// but don't exceed the absolute max
//if ( max > MAX_SCORE_WEIGHT ) max = MAX_SCORE_WEIGHT;
// ignore termIds we should
//if ( q->m_ignore[i] ) { m_scoreWeights[i] = 0; continue; }
// component lists are merged into one compound list
if ( m_componentCodes[i] >= 0 ) continue;
// if we're setting phrases, it must be a phrase
if ( phrases ) {
if ( q->isInQuotes (i) ) continue;
if ( ! q->isPhrase (i) ) continue;
if ( q->getTermSign(i) != '\0' ) continue;
}
else if ( ! q->isInQuotes (i) &&
q->isPhrase (i) &&
q->getTermSign(i) == '\0' )
continue;
// increase small term freqs to the minimum
double freq = q->getTermFreq(i);
if ( freq < absMin ) freq = absMin;
// get ratio into [1,inf)
double ratio1 = 2.71828 * freq / minFreq; // R1
// . natural log it
// . gives a x8 whereas log10 would give a x4 for a wide case
double ratio2 = log ( ratio1 ); // R2
// square
// ratio = ratio * ratio;
// make bigger now for '"warriors of freedom" game' query
// so it weights 'game' less
//double ratio3 = pow ( ratio2 , 2.6 ); // R3
double ratio3 = pow ( ratio2, (double)g_conf.m_queryExp );// R3
// now invert
double ratio4 = 1.0 / ratio3; // R4
// Example for 'boots in the uk' query: (GB_PAGE_SIZE is 32k, absMin=1365)
// TERM df R1 R2 R3 R4 W1
// boots(184) --> 7500000 7465 8.9179 295.58 .00338 1.1255
// uk (207) --> 78000000 77636 11.2597 541.97 .001845 .6143
// "boots.. uk"(25)--> 2731 2.71828 1.0 1.0 1.0 333
// don't exceed multiplier
//if ( ratio < 1.0 / g_conf.m_queryMaxMultiplier )
// ratio = 1.0 / g_conf.m_queryMaxMultiplier;
// get the pure weight
long weight= (long)(((double)MAX_SCORE_WEIGHT/3) * ratio4);//W1
// ensure at least 1
if ( weight < 1 ) weight = 1;
// don't breach MAX_SCORE_WEIGHT
if ( weight > MAX_SCORE_WEIGHT ) weight = MAX_SCORE_WEIGHT;
// store it for use by addLists_r
m_scoreWeights[i] = weight; // (300 - score) + 100 ;
// . if this is a phrase then give it a boost
// . NOTE: this might exceed MAX_SCORE_WEIGHT then!!
if ( q->isPhrase(i) )
m_scoreWeights[i] = (long)
( ( (float)m_scoreWeights[i] *
g_conf.m_queryPhraseWeight )/
100.0 ) ;
// . apply user-defined weights
// . we apply this with complete disregard for date weighting
QueryTerm *qt = &q->m_qterms[i];
long long w ;
if ( qt->m_userType == 'r' ) w = (long long)m_scoreWeights[i] ;
else w = 1LL;
w *= (long long)qt->m_userWeight;
// it can be multiplied by up to 256 (the term count)
long long max = 0x7fffffff / 256;
if ( w > max ) {
log("query: Weight breech. Truncating to %llu.",max);
w = max;
}
m_scoreWeights[i] = w;
// . stop words always get a weight of 1 regardless
// . we don't usually look them up unless all words in the
// query are stop words
//if ( q->m_isStopWords[i] ) weight = 1;
// log it
if ( m_isDebug || g_conf.m_logDebugQuery )
logf(LOG_DEBUG,"query: [%lu] term #%li has freq=%lli "
"r1=%.3f r2=%.3f r3=%.3f r4=%.3f score weight=%li",
m_logstate,i,q->m_termFreqs[i],
ratio1,ratio2,ratio3,ratio4,m_scoreWeights[i]);
}
}
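// . a minimal, self-contained sketch of the per-term weight computed above,
//   kept out of the build with #if 0; the parameter names are illustrative
//   stand-ins (queryExp for g_conf.m_queryExp, phraseWeight for
//   g_conf.m_queryPhraseWeight) and the user-weight logic is omitted
#if 0
static long sketchScoreWeight ( double freq         , // this term's doc freq
                                double minFreq      , // smallest freq in query
                                double queryExp     , // ~2.6 in the example
                                bool   isPhrase     ,
                                float  phraseWeight ) { // in percent
	// round tiny freqs up, exactly like the loop above
	double absMin = GB_PAGE_SIZE / (2*sizeof(key_t));
	if ( minFreq < absMin ) minFreq = absMin;
	if ( freq    < absMin ) freq    = absMin;
	// R1..R4 collapsed into one expression
	double ratio  = 1.0/pow ( log(2.71828 * freq / minFreq) , queryExp );
	long   weight = (long)(((double)MAX_SCORE_WEIGHT/3) * ratio);
	if ( weight < 1                ) weight = 1;
	if ( weight > MAX_SCORE_WEIGHT ) weight = MAX_SCORE_WEIGHT;
	// unquoted phrases get a percentage boost (and may exceed the max)
	if ( isPhrase ) weight = (long)(((float)weight * phraseWeight)/100.0);
	return weight;
}
#endif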
// doubling this will halve the number of loops (m_numLoops) required, but if
// we hit the L2 cache more, then it might not be worth it. TODO: do more tests.
#define RAT_SLOTS (1024)
// . the 1.4Ghz Athlon has a 64k L1 data cache (another 64k for instructions)
// . TODO: put in conf file OR better yet... auto detect it!!!
// . we're like 3 times faster on host0 when L1 cache is 32k instead of 64k
//#define L1_DATA_CACHE_SIZE (64*1024)
//#define L1_DATA_CACHE_SIZE (32*1024)
// this is only 8k on my new pentium 4s!!!
#define L1_DATA_CACHE_SIZE (8*1024)
// . this is in bytes. the minimal size of a cached chunk of mem.
// . err on the large side if you don't know
// . this is 64 bytes for Athlon's from what I've heard
// . this is 64bytes for my new pentium 4s, too
#define L1_CACHE_LINE_SIZE 64
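// . hashTopDocIds2() re-hashes one block of previously-winning docids from
//   m_topDocIdPtrs2[] into the little hash table before the next termlist
//   block is intersected with it (rat only)
// . a sketch of the layout it walks: winners are stored in blocks, and each
//   block ends with a sentinel entry whose score is 0 and whose "docid ptr"
//   slot actually holds the maxDocId used for that block, e.g.
//     [ptr,score!=0][ptr,score!=0] ... [maxDocId,score==0] | next block ...
// . it returns that block's maxDocId so the caller hashes the same docid
//   range out of the new termlists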
void IndexTable::hashTopDocIds2 ( unsigned long *maxDocId ,
char **docIdPtrs ,
long *scores ,
qvec_t *explicitBits ,
short *hardCounts ,
unsigned long mask ,
long numSlots ) {
// if none left to hash, we're done
if ( m_nexti >= m_numTopDocIds2 ) {
*maxDocId = (unsigned long)0xffffffff;
return;
}
*maxDocId = 0;
long maxi = m_nexti + (numSlots >> 1);
if ( maxi > m_numTopDocIds2 ) maxi = m_numTopDocIds2;
long oldmaxi = maxi;
// move maxi down to a maxDocId slot
while ( maxi > 0 && m_topScores2[maxi-1] != 0 ) maxi--;
// sometimes the block can be bigger than numSlots when we overestimate
// maxDocId and the second list hits the first, say, 80% or more of the
// time
if ( maxi == 0 || maxi <= m_nexti ) {
maxi = oldmaxi;
long bigmax = m_nexti + numSlots;
if ( bigmax > m_numTopDocIds2 ) bigmax = m_numTopDocIds2;
// we can equal bigmax if we have exactly m_numSlots(1024)
// winners!! VERY RARE!! but it happened for me on the query
// 'https://www.highschoolalumni.com/'. we filled up our hash
// table exactly so we got m_numSlots winners, and before this
// was "maxi < bigmax" which stopped short of what we needed.
// maxi should technically be allowed to equal m_numTopDocIds2
while ( maxi <= bigmax && m_topScores2[maxi-1] != 0 ) maxi++;
if ( m_topScores2[maxi-1] != 0 ) {
log(LOG_LOGIC,"query: bad top scores2.");
char *xx = NULL; *xx = 0; }
}
// set maxDocId
*maxDocId = (long)m_topDocIdPtrs2[maxi-1];
// sanity check
if ( *maxDocId == 0 ) {
log(LOG_LOGIC,"query: bad maxDocId.");
char *xx = NULL; *xx = 0; }
// debug msg
if ( m_isDebug || g_conf.m_logDebugQuery )
logf(LOG_DEBUG,"query: Hashing %li top docids2, [%li, %li)",
maxi-m_nexti,m_nexti,maxi);
long nn;
// we use a score of 0 to denote docid blocks
for ( long i = m_nexti ; i < maxi ; i++ ) {
// . if the score is zero, that's a maxDocId tag block
// . all the docids before this should be < this max
if ( m_topScores2[i] == 0 ) continue;
// hash the top 32 bits of this docid
nn = (*(unsigned long *)(m_topDocIdPtrs2[i]+1) ) & mask ;
chain:
// . if empty, take right away
// . this is the most common case so we put it first
if ( docIdPtrs[nn] == NULL ) {
// hold ptr to our stuff
docIdPtrs [ nn ] = m_topDocIdPtrs2[i];
// store score
scores [ nn ] = m_topScores2 [i];
// now we use the explicitBits!
explicitBits [ nn ] = m_topExplicits2[i];
// and hard counts
hardCounts [ nn ] = m_topHardCounts2[i];
// add the next one
continue;
}
// if docid bits don't match, chain to next bucket
// since we don't have dups and this is the first list we do
// not need to check for matching docids
if ( ++nn >= numSlots ) nn = 0;
goto chain;
}
// save the old one in case of a panic and rollback
m_oldnexti = m_nexti;
// advance next i
m_nexti = maxi;
}
// alloc m_topDocIdPtrs2/m_topScores2
bool IndexTable::alloc (IndexList lists[MAX_TIERS][MAX_QUERY_TERMS],
long numTiers ,
long numListsPerTier ,
long docsWanted ,
bool sortByDate ) {
// pre-allocate all the space we need for intersecting the lists
long long need = 0;
long nqt = numListsPerTier; // number of query terms
// component lists are merged into compound lists
nqt -= m_q->getNumComponentTerms();
long ntt = nqt * MAX_TIERS;
need += ntt * 256 * sizeof(char *) ; // ptrs
need += ntt * 256 * sizeof(char *) ; // pstarts
need += ntt * 256 * sizeof(char *) ; // oldptrs
need += nqt * 256 * sizeof(long ) + nqt * sizeof(long *);// scoretbls
need += ntt * sizeof(char ) ; // listSigns
need += ntt * sizeof(char ) ; // listHardCount
need += ntt * sizeof(long *) ; // listScoreTablePtrs
need += ntt * sizeof(qvec_t) ; // listExplicitBits
need += ntt * sizeof(char *) ; // listEnds
need += ntt * sizeof(char ) ; // listHash
need += ntt * sizeof(qvec_t) ; // listPoints
// mark spot for m_topDocIdPtrs2 arrays
long off = need;
// if sorting by date we need a much larger hash table than normal
if ( sortByDate ) {
// we need a "ptr end" (ptrEnds[i]) for each ptr since the
// buckets are not delineated by a date or score
need += ntt * 256 * sizeof(char *) ;
// get number of docids in largest list
long max = 0;
for ( long i = 0 ; i < numTiers ; i++ ) {
for ( long j = 0 ; j < numListsPerTier ; j++ ) {
// datedb lists are 10 bytes per half key
long nd = lists[i][j].getListSize() / 10;
if ( nd > max ) max = nd;
}
}
// how big to make hash table?
long slotSize = 4+4+2+sizeof(qvec_t);
long long need = slotSize * max;
// have some extra slots in between for speed
need = (need * 5 ) / 4;
// . do not go overboard
// . let's try to keep it in the L2 cache
// . we can only do like 1 million random access to main
// memory per second. L2 mem should be much higher.
//if ( need > 30*1024*1024 ) need = 30*1024*1024;
//if ( need > 512*1024 ) need = 512*1024;
//if ( need > 1024*1024 ) need = 1024*1024;
// . this is a balance between two latency functions. with more
// "need" we can have a bigger hash table which decreases
// m_numLoops, but increases our memory access latency since
// we'll be hitting more of main memory. use hashtest.cpp
// with a high # of slots to see the effects of increasing
// the size of the hash table. every loop in m_numLoops
// means we read the entire datedb lists and we can only do
// that at most at 2.5GB/sec or so on the current hardware.
// . i don't expect memory access times to improve as fast as
// memory throughput, so in the future to maintain optimal
// behaviour, we should decrease the max for "need"???
//if ( need > 4*1024*1024 ) need = 4*1024*1024;
if ( need > 2*1024*1024 ) need = 2*1024*1024;
//if ( need > 512*1024 ) need = 512*1024;
// we need to have AT LEAST 1024 slots in our hash table
if ( need < slotSize*1024 ) need = slotSize*1024;
// clear this out in case it was set
if ( m_bigBuf ) {
mfree ( m_bigBuf , m_bigBufSize , "IndexTable2" );
m_bigBuf = NULL;
}
tryagain:
// alloc again
m_bigBuf = (char *)mmalloc ( need , "IndexTable2" );
if ( ! m_bigBuf && need > 1024*1024 ) {
need >>= 1; goto tryagain; }
// set it
m_bigBufSize = need;
if ( ! m_bigBuf ) {
log("query: Could not allocate %lli for query "
"resolution.",need);
return false;
}
}
// if not default AND, no need for the list of docids from intersecting
// the last two termlists (and their associated phrase termlists)
if ( ! m_requireAllTerms ) {
// do it
m_bufSize = need;
m_buf = (char *) mmalloc ( m_bufSize , "IndexTable" );
if ( ! m_buf ) return log("query: Table alloc(%lli)"
": %s",need,mstrerror(g_errno));
// save it for error checking
m_bufMiddle = m_buf + off;
m_topDocIdPtrs2 = NULL;
m_topScores2 = NULL;
m_topExplicits2 = NULL;
m_topHardCounts2= NULL;
return true;
}
// sanity check
if ( m_topDocIdPtrs2 ) {
log(LOG_LOGIC,"query: bad top docid ptrs2.");
char *xx = NULL; *xx = 0; }
// calc list sizes
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
m_sizes[i] = 0;
// component lists are merged into one compound list
if ( m_componentCodes[i] >= 0 ) continue;
for ( long j = 0 ; j < numTiers ; j++ )
m_sizes [i] += lists[j][i].getListSize();
}
// . get imap here now so we can get the smallest block size to
// set m_maxNumTopDocIds2
// . get the smallest list (add all its tiers) to find least number
// of docids we'll have in m_topDocIdPtrs2
m_nb = m_q->getImap ( m_sizes , m_imap , m_blocksize );
// bail now if we have none!
if ( m_nb <= 0 ) return true;
long long min = 0;
for ( long i = 0 ; i < m_blocksize[0]; i++ )
for ( long j = 0 ; j < numTiers ; j++ )
min += lists[j][m_imap[i]].getListSize() / 6 ;
// debug msg
//log("minSize = %li docids q=%s",min,m_q->m_orig);
// now add in space for m_topDocIdPtrs2 (holds winners of a 2-list
// intersection; more than 2 lists if we have phrases) [imap]
long long nd = (105 * min) / 100 + 10 ;
need += (4+ // m_topDocIdPtrs2
4+ // m_topScores2
sizeof(qvec_t)+ // m_topExplicits2
2 // m_topHardCounts2
) * nd;
// do it
m_bufSize = need;
m_buf = (char *) mmalloc ( m_bufSize , "IndexTable" );
if ( ! m_buf ) return log("query: table alloc(%lli): %s",
need,mstrerror(g_errno));
// save it for error checking
m_bufMiddle = m_buf + off;
// . allow for 5% more for inserting maxDocIds every block
// so hashTopDocIds2() can use that
// . actually, double in case we get all the termIds, but only one
// per slot scan/hash, so every other guy is a maxdocid
// . no, now we use lastGuy/lastGuy2 to check if our new maxDocId
// is within 50 or less of the second last maxDocId added, and
// if it is, we bury the lastGuy and replace him.
char *p = m_buf + off;
m_topDocIdPtrs2 = (char **)p ; p += 4 * nd;
m_topScores2 = (long *)p ; p += 4 * nd;
m_topExplicits2 = (qvec_t *)p ; p += sizeof(qvec_t) * nd;
m_topHardCounts2= (short *)p ; p += 2 * nd;
m_maxTopDocIds2 = nd;
m_numTopDocIds2 = 0;
return true;
}
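// . a rough sketch of how m_buf is laid out by alloc() above and then carved
//   up again by addLists2_r() below (the right-hand arrays are only
//   allocated when requireAllTerms):
//     m_buf                                m_bufMiddle (= m_buf + off)
//     | ptrs[], pstarts[] (and ptrEnds[]   | m_topDocIdPtrs2 [nd]
//     | for date lists), oldptrs[],        | m_topScores2    [nd]
//     | score tables, listSigns,           | m_topExplicits2 [nd]
//     | listHardCount, listScoreTablePtrs, | m_topHardCounts2[nd]
//     | listEnds, listHash,                |
//     | listExplicitBits, listPoints       |
// . addLists2_r() sanity checks that its carve-outs never cross m_bufMiddle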
// realloc to save mem if we're rat
void IndexTable::freeMem ( ) {
if ( m_bigBuf ) {
mfree ( m_bigBuf , m_bigBufSize , "IndexTable2" );
m_bigBuf = NULL;
}
if ( ! m_buf ) return;
//mfree ( (char *)m_topDocIdPtrs2 , m_maxTopDocIds2 * 10,"IndexTable");
mfree ( m_buf , m_bufSize , "IndexTable" );
m_buf = NULL;
m_topDocIdPtrs2 = NULL;
m_topScores2 = NULL;
m_topExplicits2 = NULL;
m_topHardCounts2= NULL;
m_maxTopDocIds2 = 0;
m_numTopDocIds2 = 0;
}
void IndexTable::addLists_r (IndexList lists[MAX_TIERS][MAX_QUERY_TERMS],
long numTiers ,
long numListsPerTier ,
Query *q ,
long docsWanted ,
long *totalListSizes ,
bool useDateLists ,
bool sortByDate ,
float sortByDateWeight ) {
// sanity check
if ( ! useDateLists && sortByDate ) {
log(LOG_LOGIC,"query: bad useDateLists/sortByDate.");
char *xx = NULL; *xx = 0; }
// if we don't get enough results should we grow the lists and recall?
m_doRecall = true;
// bail if nothing to intersect... if all lists are empty
if ( ! m_buf /*m_nb <= 0*/ ) return;
// set start time
long long t1 = gettimeofdayInMilliseconds();
char hks = 6; // half key size (size of everything except the termid)
char fks = 12;
// dateLists are 4 more bytes per key than standard 12-byte key lists
if ( useDateLists ) { hks += 4; fks += 4; }
// set up for a pointer (array index actually) sort of the lists
//long imap [ MAX_QUERY_TERMS ];
// now swap the top 12 bytes of each list back into original order
for ( long i = 0 ; i < numListsPerTier ; i++ ) {
// loop over all lists in this term's tiers
for ( long j = 0 ; j < numTiers ; j++ ) {
// skip if list is empty, too
if ( lists[j][i].isEmpty() ) continue;
// skip if already swapped
if ( m_swapped[j][i] ) continue;
// flag it
m_swapped[j][i] = true;
// point to start
char *p = lists[j][i].getList();
// remember to swap back when done!!
//char ttt[6];
//memcpy ( ttt , p , 6 );
//memcpy ( p , p + 6 , 6 );
//memcpy ( p + 6 , ttt , 6 );
char ttt[10];
memcpy ( ttt , p , hks );
memcpy ( p , p + hks , 6 );
memcpy ( p + 6 , ttt , hks );
// point to the low "hks" bytes now
p += 6;
// turn half bit on
*p |= 0x02;
}
}
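// . what the swap above accomplishes, roughly: each termlist begins with one
//   full fks-byte key followed by hks-byte "half" keys, so rearranging the
//   first key to put its docid/score half 6 bytes into the list, with the
//   half bit set, lets addLists2_r() scan the whole list as one uniform run
//   of hks-byte half keys starting at list+6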
// if query is boolean, turn off rat
if ( m_q->m_isBooleanQuery ) m_requireAllTerms = false;
// count # of panics
m_numPanics = 0;
// and # of collisions
m_numCollisions = 0;
// count # loops we do
m_numLoops = 0;
// . set m_q->m_bitScores[]
// . see Query.h for description of m_bitScores[]
// . this is now preset in Msg39.cpp since it may now require an
// alloc since we allow more than 16 query terms for metalincs
if ( ! m_q->m_bmap || ! m_q->m_bitScores ) { //alreadySet ) {
//m_q->setBitScores();
//m_alreadySet = true;
log (LOG_LOGIC,"query: bit map or scores is NULL. Fix this.");
return;
}
long minHardCount = 0;
// if not rat, do it now
if ( ! m_requireAllTerms ) {
// no re-arranging the query terms for default OR searches
// because it is only beneficial when doing sequential
// intersections to minimize the intersection and speed things up
long count = 0;
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
// component lists are merged into one compound list
if ( m_componentCodes[i] >= 0 ) continue;
m_imap[count++] = i;
}
addLists2_r ( lists ,
numTiers ,
count , // numListsPerTier
q ,
docsWanted ,
m_imap ,
true ,
0 ,
useDateLists ,
sortByDate ,
sortByDateWeight,
&minHardCount );
goto swapBack;
}
{ // i added this superfluous '{' so we can call "goto swapBack" above.
// TODO: use ptrs to score so when hashing m_topDocIdPtrs2 first and
// another non-required list second we can just add to the scores
// directly? naahhh. too complicated.
// get a map that sorts the query terms for optimal intersectioning
//long blocksize [ MAX_QUERY_TERMS ];
//long nb = m_q->getImap ( sizes , imap , blocksize );
// . if first list is required and has size 0, no results
// . 'how do they do that' is reduced to a signless phrase
// . careful, i still saw some chinese crap whose two terms were
// both signless phrases somehow... hmmm, m_nb was 0 and m_imap
// was not set
//if (m_nb <= 0||/*m_q->isRequired(imap[0])&&*/ m_sizes[m_imap[0]]<12){
if (m_nb <=0 || /*m_q->isRequired(imap[0])&&*/ m_sizes[m_imap[0]]<fks){
m_doRecall = false;
goto swapBack;
}
// count number of base lists
long numBaseLists = m_blocksize[0];
// how many lists to intersect initially? that is numLists.
long numLists = m_blocksize[0];
// if we got a second block, add his lists (single term plus phrases)
if ( m_nb > 1 ) numLists += m_blocksize[1];
// component lists are merged into one compound list
long total = 0;
for ( long i = 0 ; i < m_nb ; i++ ) // q->m_numTerms ; i++ )
//if ( m_componentCodes[i] < 0 ) total++;
total += m_blocksize[i];
// if this is true we set m_topDocIds/m_topScores/etc. in addLists2_r()
bool lastRound = (numLists == total); // numListsPerTier);
addLists2_r ( lists , numTiers , numLists , q , docsWanted ,
m_imap , lastRound , numBaseLists , useDateLists ,
sortByDate , sortByDateWeight , &minHardCount );
// . were both lists under the sum of tiers size?
// . if they were and we get no results, no use to read more, so we
// set m_doRecall to false to save time
bool underSized = true;
long m0 = m_imap[0];
long m1 = 0 ;
if ( m_nb > 1 ) m1 = m_imap[m_blocksize[0]];
if ( m_sizes[m0] >= totalListSizes[m0]&&m_q->getTermSign(m0)!='-')
underSized = false;
if(m_nb>1&& m_sizes[m1]>=totalListSizes[m1]&&m_q->getTermSign(m1)!='-')
underSized = false;
// . offset into imap
// . imap[off] must NOT be a signless phrase term
long off = numLists;
// follow up calls
for ( long i = 2 ; i < m_nb ; i++ ) {
// if it is the lastRound then addLists2_r() will compute
// m_topDocIds/m_topScores arrays
lastRound = (i == m_nb - 1);
// . if we have no more, might as well stop!
// . remember, maxDocIds is stored in here, so it will be at
// least 1
if ( m_numTopDocIds2 <= 1 ) break;
// is this list undersized?
long mx = m_imap[off];
if (m_sizes[mx]>=totalListSizes[mx]&&m_q->getTermSign(mx)!='-')
underSized = false;
// set number of lists
numLists = m_blocksize[i];
// add it to the intersection
addLists2_r ( lists , numTiers , numLists , q , docsWanted ,
m_imap + off , lastRound, 0 , useDateLists ,
sortByDate , sortByDateWeight , &minHardCount );
// skip to next block of lists
off += m_blocksize[i];
}
// . now if we have no results and underSized is true, there is no
// use to read more for each list... pointless
// . remember, maxDocIds is stored in here, so it will be at least 1
if ( m_numTopDocIds2 <= 1 && underSized ) {
// debug
//fprintf(stderr,"UNDERSIZED quitting\n");
m_doRecall = false;
}
}
swapBack:
// compute total number of docids we dealt with
m_totalDocIds = 0;
// now swap the top 12 bytes of each list back into original order
for ( long i = 0 ; i < numListsPerTier ; i++ ) {
// loop over all lists in this term's tiers
for ( long j = 0 ; j < numTiers ; j++ ) {
// skip if list is empty, too
if ( lists[j][i].isEmpty() ) continue;
// compute total number of docids we dealt with
//m_totalDocIds += (lists[j][i].getListSize()-6)/6;
// date lists have 5-byte scores, not 1-byte scores
m_totalDocIds += (lists[j][i].getListSize()-6)/hks;
// point to start
//char *p = lists[j][i].getList();
// remember to swap back when done!!
//char ttt[6];
//memcpy ( ttt , p , 6 );
//memcpy ( p , p + 6 , 6 );
//memcpy ( p + 6 , ttt , 6 );
// turn half bit off again
//*p &= 0xfd;
}
}
// get time now
long long now = gettimeofdayInMilliseconds();
// store the addLists time
m_addListsTime = now - t1;
// . measure time to add the lists in bright green
// . use darker green if rat is false (default OR)
long color;
if ( ! m_requireAllTerms ) color = 0x00008000 ;
else color = 0x0000ff00 ;
g_stats.addStat_r ( 0 , t1 , now , color );
}
// . DO NOT set g_errno cuz this is probably in a thread
// . these lists should be 1-1 with the query terms
// . there are multiple tiers of lists, each tier is an array of lists
// one for each query term
// . we try to restrict the hash table to the L1 cache to avoid slow mem
// . should result in a 10x speed up on the athlon, 50x on newer pentiums,
// even more in the future as the gap between L1 cache mem speed and
// main memory widens
void IndexTable::addLists2_r ( IndexList lists[MAX_TIERS][MAX_QUERY_TERMS] ,
long numTiers ,
long numListsPerTier ,
Query *q ,
long docsWanted ,
long *imap ,
bool lastRound ,
long numBaseLists ,
bool useDateLists ,
bool sortByDate ,
float sortByDateWeight ,
long *minHardCountPtr ) {
// set up for rat
long rat = m_requireAllTerms;
m_nexti = 0;
// sanity test -- fails if 2nd guy is short negative
/*
long size0 = 0;
long size1 = 0;
if ( rat && numListsPerTier == 2 ) {
for ( long i = 0 ; i < numTiers ; i++ ) {
size0 += lists[i][imap[0]].getListSize();
size1 += lists[i][imap[1]].getListSize();
}
if ( size0 > size1 ) { char *xx = NULL; *xx = 0; }
}
*/
// getting 100 takes same time as getting 1
if ( docsWanted < 100 ) docsWanted = 100;
qvec_t requiredBits = m_q->m_requiredBits ;
unsigned char *bitScores = m_q->m_bitScores;
qvec_t *bmap = m_q->m_bmap;
// . keep track of max tier we've processed
// . if Msg39 finds that too many docids are clustered away it
// will recall us with docsWanted increased and the numTiers will
// be the same as the previous call
if ( numTiers < m_numTiers || numTiers > m_numTiers + 1 )
log(LOG_LOGIC,"query: indextable: Bad number of tiers."
" %li vs. %li", numTiers, m_numTiers );
else if ( numTiers == m_numTiers + 1 )
m_numTiers++;
// current tier #
long tier = numTiers - 1;
// sanity check
if ( ! rat &&
numListsPerTier != m_q->getNumTerms()-m_q->getNumComponentTerms())
log(LOG_LOGIC,"query: indextable: List count mismatch.");
// truncate this
if (!m_topTree) if (docsWanted > MAX_RESULTS) docsWanted = MAX_RESULTS;
// convenience ptrs
char **topp = m_topDocIdPtrs [ tier ];
long *tops = m_topScores [ tier ];
unsigned char *topb = m_topBitScores [ tier ];
char *tope = m_topExplicits [ tier ];
// assume no top docIds now
long numTopDocIds = 0;
////////////////////////////////////
// begin hashing setup
////////////////////////////////////
// count # of docs that EXPLICITLY have all query singleton terms
long explicitCount = 0;
// count # of docs that IMPLICITLY have all query singleton terms
long implicitCount = 0;
// highest bscore we can have
//unsigned char maxbscore = bitScores [ requiredBits ];
// . count all mem that should be in the L1 cache
// . the less mem we use the more will be available for the hash table
long totalMem = 0;
// . we mix up the docIdBits a bit before hashing using this table
// . TODO: what was the reason for this? particular type of query
// was too slow w/o it?
// . a suburl:com collision problem, where all docids have the same
// score and are spaced equally apart somehow made us have too
// many collisions before, probably because we included the score
// in the hash...? use bk revtool IndexTable.cpp to see the old file.
/*
static unsigned long s_mixtab [ 256 ] ;
// is the table initialized?
static bool s_mixInit = false;
if ( ! s_mixInit ) {
srand ( 1945687 );
for ( long i = 0 ; i < 256 ; i++ )
s_mixtab [i]= ((unsigned long)rand());
s_mixInit = true;
// randomize again
srand ( time(NULL) );
}
*/
// s_mixtab should be in the L1 cache cuz we use it to hash
//totalMem += 256 * sizeof(long);
// . now form a set of ptrs for each list
// . each ptr points to the first 6-byte key for a particular score
char *p = m_buf;
long nqt = numListsPerTier; // q->getNumQueryTerms();
long ntt = numListsPerTier * numTiers;
// we now point into m_buf to save stack since we're in a thread
//char *ptrs [ MAX_QUERY_TERMS * MAX_TIERS * 256 ];
//char *pstarts [ MAX_QUERY_TERMS * MAX_TIERS * 256 ];
//long scoreTable [ MAX_QUERY_TERMS ] [ 256 ];
//char *oldptrs [ MAX_QUERY_TERMS * MAX_TIERS * 256 ];
//char listSigns [ MAX_QUERY_TERMS * MAX_TIERS ];
//long *listScoreTablePtrs [ MAX_QUERY_TERMS * MAX_TIERS ];
//qvec_t listExplicitBits [ MAX_QUERY_TERMS * MAX_TIERS ];
//char *listEnds [ MAX_QUERY_TERMS * MAX_TIERS ];
//char listHash [ MAX_QUERY_TERMS * MAX_TIERS ];
//qvec_t listPoints [ MAX_QUERY_TERMS * MAX_TIERS ];
char **ptrs = (char **)p; p += ntt * 256 * sizeof(char *);
char **pstarts = (char **)p; p += ntt * 256 * sizeof(char *);
char **ptrEnds = NULL;
if ( useDateLists ) {
ptrEnds = (char **)p; p += ntt * 256 * sizeof(char *);
}
char **oldptrs = (char **)p; p += ntt * 256 * sizeof(char *);
long **scoreTable = (long **)p; p += nqt * sizeof(long *);
// one score table per query term
for ( long i = 0 ; i < nqt ; i++ ) {
scoreTable [ i ] = (long *)p; p += 256 * sizeof(long); }
// we have to keep this info handy for each list
char *listSigns = (char *)p; p += ntt * sizeof(char );
char *listHardCount = (char *)p; p += ntt * sizeof(char );
long **listScoreTablePtrs = (long **)p; p += ntt * sizeof(long *);
char **listEnds = (char **)p; p += ntt * sizeof(char *);
char *listHash = (char *)p; p += ntt * sizeof(char );
qvec_t *listExplicitBits = (qvec_t *)p; p += ntt * sizeof(qvec_t);
qvec_t *listPoints = (qvec_t *)p; p += ntt * sizeof(qvec_t);
// do not breach
if ( p > m_bufMiddle ) {
log(LOG_LOGIC,"query: indextable: Got table "
"mem breech.");
char *xx = NULL; *xx = 0;
}
char hks = 6; // half key size (size of everything except the termid)
// dateLists are 4 more bytes per key than standard 12-byte key lists
if ( useDateLists ) hks += 4;
long numPtrs = 0;
long numLists = 0;
unsigned long numDocIds = 0;
long numSorts = 0;
// . make the ebitMask
// . used when rat=1 to determine the hits from both lists (or the one list)
qvec_t ebitMask = 0;
// how many of the terms we are intersecting are "required" and
// therefore do not have an associated explicit bit. this allows us
// to support queries of many query terms.
long minHardCount = *minHardCountPtr;
// each list can have up to 256 ptrs, corrupt data may mess this up
long maxNumPtrs = ntt * 256;
// only log this error message once per call to this routine
char pflag = 0;
// time the gbsorting for the datelists
long long t1 = gettimeofdayInMilliseconds();
for ( long i = 0 ; i < numListsPerTier ; i++ ) {
// map i to a list number
long m = imap[i];
// skip if ignored
//if ( m_q->m_ignore[i] ) continue;
// skip if list in first tier is empty
if ( lists[0][m].isEmpty() ) continue;
// . make score table to avoid IMUL instruction (16-36 cycles)
// . 1k per table! will that fit in L1/L2 cache, too?
// . if too big, limit j to ~32 to cover most common scores
// . remember that scores in the indexdb keys are complemented
// so higher guys appear on top
// . if score is weighted with a relative 0 weight, that means
// that the term is required, but should not contribute to
// the final ranking in any way, so use a score of 1 despite
// the term count of this term in the page
QueryTerm *qt = &q->m_qterms[m];
if ( qt->m_userWeight == 0 && qt->m_userType == 'r' )
for ( long k = 0 ; k < 256 ; k++ )
scoreTable[i][k] = 1;
else
for ( long k = 0 ; k < 256 ; k++ )
scoreTable[i][k] = m_scoreWeights[m] * (255-k);
// sort by date uses the dates as scores, but will add the
// normal score to the date, after weighting it with this
if ( sortByDate ) // && sortByDateWeight > 0.0 )
for ( long k = 0 ; k < 256 ; k++ )
scoreTable[i][k] = (long)((float)
scoreTable[i][k] *
sortByDateWeight);
// are we using the hard count instead of an explicit bit?
char hc = 0;
if ( qt->m_explicitBit == 0 ) {
// . it is either a phrase in quotes or a single word
// . or something like cd-rom or www.abc.com
if ( qt->m_isPhrase ) {
if ( qt->m_inQuotes ) minHardCount++;
if ( qt->m_termSign == '+' ) minHardCount++;
if ( qt->m_termSign == '*' ) minHardCount++;
hc = 1;
}
else {
minHardCount++;
hc = 1;
}
}
// count mem from score tables
totalMem += 256 * sizeof(long);
// loop over all lists in this term's tiers
for ( long j = 0 ; j < numTiers ; j++ ) {
// skip if first tier list is empty, too
if ( lists[j][m].isEmpty() ) continue;
// corrupt data can make numPtrs too big
if ( numPtrs >= maxNumPtrs ) continue;
//if ( numPtrs >= MAX_QUERY_TERMS*MAX_TIERS*256 )
// continue;
// get the sign
char sign = m_q->getTermSign(m);
// are we date?
if ( useDateLists && sortByDate ) sign = 'd';
else if (useDateLists) sign='e';
// set sign of list (\0,- or d)
listSigns[numLists] = sign;
// point to its table
listScoreTablePtrs[numLists] = scoreTable[i];
// and its explicit bit
listExplicitBits[numLists] = qt->m_explicitBit ;//1<<m;
// some have a hard count instead of an explicit bit
listHardCount[numLists] = hc;
// rat is special
if ( rat ) {
//if ( i < numBaseLists )
// listExplicitBits[numLists] = 0x00 ;
if ( sign == '-' )
ebitMask |= 0x00; // dummy
else if ( m_q->isPhrase(m) && sign&& sign!='d')
ebitMask |= listExplicitBits[numLists];
else if ( ! m_q->isPhrase(m) )
ebitMask |= listExplicitBits[numLists];
}
// the end of this list
listEnds[numLists] = lists[j][m].getListEnd();
// . should list be hashed first?
// . only applies to rat=1
if ( i < numBaseLists ) listHash[numLists] = true;
else listHash[numLists] = false;
if ( ! rat ) listHash[numLists] = true;
// and the index of its first ptr in the ptrs[] array
listPoints[numLists] = numPtrs;
// it should be in L1 cache, too
totalMem += 1 + 4 + 2 + 4 + 2;
// inc list count
numLists++;
// . now fill the ptr[] array
// . reset ptrs to start of list
lists[j][m].resetListPtr();
// point to start
char *p = lists[j][m].getList();
// and end
char *pend = lists[j][m].getListEnd();
// add to total docids
//numDocIds += (lists[j][m].getListSize() - 6) / 6;
numDocIds += (lists[j][m].getListSize() - 6) / hks;
// this is now done in addLists_r()
// remember to swap back when done!!
//char ttt[6];
//memcpy ( ttt , p , 6 );
//memcpy ( p , p + 6 , 6 );
//memcpy ( p + 6 , ttt , 6 );
// point to the low 6 bytes now
p += 6;
// turn half bit on
*p |= 0x02;
// find the tiers
ptrs [ numPtrs ] = p;
// . ok, no need to set pstarts[] and ptrs[] any more
// since we do not have score tiers for date sort
// . we do not do score tiering for date lists, since
// we use 5 bytes for the score, not one, so we must
// scan the whole list for every maxDocId used. This
// may be a few times slower. If we had a large L1
// cache this would hurt us less. but L2 caches are
// like 1MB nowadays and having the hash table
// contained in the L2 cache should only be like 3
// times as slow.
//if ( useDateLists ) continue;
// to make sort by date faster, we now artificially
// create 256 buckets on each date list and then
// sort each bucket by docid
if ( useDateLists ) {
long listSize = pend - p;
long pstep = listSize / 250;
// . do not go too low
// . TODO: try lowering this to see if it
// gets faster. i could not test lower
// values because buzz started
// querying again.
if ( pstep < 10000 ) pstep = 10000;
// make sure it is divisible by 10
while ( (pstep % hks) ) pstep--;
// do not go crazy
if ( pstep <= 0 ) pstep = listSize;
// now make 256 approximately equally sized
// buckets of datedb records . a lot of times
// the dates are the same and they are already
// sorted by docid, so we might want to check
// for that to save time. (TODO)
for ( ; p < pend ; p += pstep ) {
// get the end
char *end = p + pstep;
if ( end > pend ) end = pend;
// store the current ptr info
pstarts [ numPtrs ] = p;
ptrEnds [ numPtrs ] = end;
ptrs [ numPtrs++ ] = p;
long size = end - p;
// count it
numSorts++;
// now sort each p
hsort_10_1_5((uint8_t *)p, size / 10 );
//gbsort ( p, size, hks, dateDocIdCmp );
}
// skip the rest
continue;
}
// set this new var so we don't rollback too much
// when re-hashing because of a panic
pstarts [ numPtrs++ ] = p;
// advance
p += 6;
// fill up the ptrs array
//for ( ; p < pend ; p += 6 ) {
for ( ; p < pend ; p += hks ) {
// if we got a new score, add a new ptr
if ( p[5] != p[-1] ) {
// if data is corrupt, we have
// to watch so we don't breach!!
if ( numPtrs >= maxNumPtrs ) {
if ( ! pflag )
log(LOG_LOGIC,
"query: Got "
"indextable "
"breech. np=%li",
numPtrs);
pflag = 1;
break;
}
//if ( numPtrs >=
// MAX_QUERY_TERMS*MAX_TIERS*256 )
// break;
// speed test -- remove the small head
// score is COMPLEMENTED!!!
//unsigned char c =(unsigned char)p[5];
//if ( 255-c <= (unsigned char)25 ) {
pstarts [ numPtrs ] = p;
ptrs [ numPtrs++ ] = p;
//}
}
// . jump ahead by 300 keys if the key that far ahead has the same score as us
// . this saves a good amount of time!
while (p+300*6 < pend && p[300*6+5] == p[5] )
p += 300*6;
}
}
}
// time the gbsorting for the datelists
if ( m_isDebug || g_conf.m_logDebugQuery ) {
long long t2 = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,"query: Took %lli ms to prepare list ptrs. "
"numDocIds=%lu numSorts=%li",
t2 - t1 , numDocIds , numSorts );
}
// save it
*minHardCountPtr = minHardCount;
// count mem from ptrs
totalMem += numPtrs * sizeof(char *);
// . and what they point to too! first line should/will be in cache.
// . assume a cache line size of 64 bytes
totalMem += numPtrs * L1_CACHE_LINE_SIZE;
// if we got no non-empty lists, we're done
//if ( n == 0 ) goto done;
// count miscellaneous mem access (like "point" and "next")
totalMem += 256;
// convenience vars
register long i = 0 ;
long j;
// a dummy var
long long tmpHi = 0x7fffffffffffffffLL;
// . the info of the weakest entry in the top winners
// . if its is full and we get another winner, the weakest will be
// replaced by the new winner
unsigned char minTopBitScore = 0 ;
long minTopScore = 0 ;
char *minTopDocIdPtr = (char *)&tmpHi;
// . this is the hash table
// . it is small cuz we are trying to stuff it all into the L1 cache
// . it could be as big as 128k, so reserve enough to take advantage
// . some of the cache will be used for scoreTables[], ptrs[] etc.
// . i tried using a struct that holds these 3 but it made everything
// about 10% slower
char localBuf[10000*(4+4+2+sizeof(qvec_t))];
char *pp = localBuf;
//char *docIdPtrs [ 10000 ];
//long scores [ 10000 ];
//qvec_t explicitBits [ 10000 ];
//short hardCounts [ 10000 ];
// . how many slots in the in-mem-cache hash table... up to 10000
// . use the remainder of the L1 cache to hold this hash table
long availSlots = (L1_DATA_CACHE_SIZE - totalMem) / 10;
// make a power of 2 for easy hashing (avoids % operator)
long numSlots = getHighestLitBitValue ( availSlots );
// don't go below this min even if we must leave the cache
if ( numSlots < 1024 ) numSlots = 1024;
// damn, now we have to keep this fixed for rat because hashTopDocIds2
// needs to have numSlots consistent for all calls to addLists2_r()
// because m_topDocIdPtrs2 needs to hash full blocks. blocks are
// separated by maxDocId values, which are stored as docId ptrs in
// m_topDocIdPtrs2, and indicated by having a score of 0. so if we
// stored one block into m_topDocIdPtrs2 when numSlots was 2048 that
// block might not hash into a 1024 slot table...
if ( rat ) numSlots = RAT_SLOTS;
// sanity check
if ( numSlots > 10000 ) {
log(LOG_LOGIC,"query: bad numSlots.");
char *xx = NULL; *xx = 0; }
// sort by date should try to maximize maxDocId and the size of the
// hash table in order to prevent rescanning the termlists
if ( sortByDate ) {
numSlots = m_bigBufSize / (4+4+2+sizeof(qvec_t));
// make a power of 2 for easy hashing (avoids % operator)
numSlots = getHighestLitBitValue ( numSlots );
// don't go below this min even if we must leave the cache
if ( numSlots < 1024 ) {
log(LOG_LOGIC,"query: bad numSlots2.");
char *xx = NULL; *xx = 0; }
// point to the buffer space
pp = m_bigBuf;
}
char **docIdPtrs = (char **)pp; pp += numSlots*4;
long *scores = (long *)pp; pp += numSlots*4;
short *hardCounts = (short *)pp; pp += numSlots*2;
qvec_t *explicitBits = (qvec_t *)pp; pp += numSlots*sizeof(qvec_t);
// for hashing we need a mask to use instead of the % operator
unsigned long mask = (unsigned long)numSlots - 1;
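// e.g. with numSlots = RAT_SLOTS = 1024 the mask is 0x3ff, so a docid's
// bucket is just the low 10 bits of the top 32 bits of its key (the
// "& mask" hashing below), and collisions chain linearly to the next slot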
// empty all buckets
for ( long i = 0 ; i < numSlots ; i++ )
docIdPtrs[i] = NULL; // explicitBits[i] = 0;
// . use numSlots to get first docid upper bound
// . this is just the top 4 bytes of docids allowed to hash...
// . since hash table is so small we can only hash docids below
// "maxDocId"
// . therefore, we have to do several iterations
// . i found a slight increase in speed changing to 8/20 instead of
// 1/2 on the 'the .. sex' test now commented out in Msg39.cpp
// . if rat (require all terms) is true we only have two lists and
// only the smallest one (first one) gets hashed..
unsigned long step = numSlots * 8 / 20 ; // >> 1 ;
unsigned long dd = numDocIds ;
if ( dd <= 0 ) dd = 1;
// max it out if numDocIds is smaller than numSlots/2
if ( dd <= step ) step = 0xffffffff;
else step *= ((unsigned long)0xffffffff / dd) ;
unsigned long maxDocId = step;
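// e.g. with numSlots = 1024, step starts out as 1024*8/20 = 409 slots'
// worth of docid space; after scaling by 0xffffffff/numDocIds roughly 409
// of the docids should fall under the first maxDocId, leaving the table
// about 40% full before chaining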
// we overwrite m_topDocIdPtrs2/m_topScores2
long newTopDocIds2 = 0;
// these two guys save us on memory
long lastGuy2 = -10000;
long lastGuy = -10000;
// save the last maxDocId in case we have to rollback for a panic
unsigned long lastMaxDocId = 0;
// used for hashing
long nn = 0;
long nnstart;
// these vars are specific to each list
char sign = 0;
long *scoreTablePtr = NULL;
qvec_t ebits = 0;
char *listEnd = NULL;
bool hashIt = true;
char hc = 0;
// if maxDocId is too big step down by this much / 2
unsigned long downStep = step;
unsigned long oldDocId ;
long printed = -1;
long weakest = -1;
// reset these counts in addLists_r() now so rat=1 can count right
// count # of panics
//m_numPanics = 0;
// and # of collisions
//m_numCollisions = 0;
// save # of ptrs for stats
m_numPtrs = numPtrs;
// count # loops we do
//m_numLoops = 0;
// save for debug
m_numSlots = numSlots;
//t1 = gettimeofdayInMilliseconds();
/////////////////////////////////////////////////
// begin the hashing loop
// this is where we need SUPER SPEED!!!!
/////////////////////////////////////////////////
// should we hash topDocIds2? only if we got some
bool hashThem = ( rat && m_topDocIdPtrs2 && numBaseLists == 0 );
// these vars help us change the list-specific vars
short point ;
short next ;
// . don't do anything if we're rat and list is negative and empty
// . positive lists are always before negative and they are sorted
// by list size, so if a positive is ever empty, we add it first
if ( rat && numLists == 0 ) {
// if this is the last round we must set m_topDocIds et al
if ( ! lastRound ) return;
newTopDocIds2=m_numTopDocIds2;
goto done;
}
// . if we have none, intersection is empty, no results found
// . this is now done in addLists_r()
//if ( ! firstRound && hashThem && m_numTopDocIds2 <= 0 ) return ;
top:
m_numLoops++;
// these vars help us change the list-specific vars
point = 0;
next = 0;
// reset ptrs for datedb since we scan the whole thing each loop
//if (useDateLists) for ( i = 0; i < numPtrs; i++ ) ptrs[i]=pstarts[i];
// . if this is a successive rat call, m_topDocIdPtrs2 should be hashed
// and then we hash the one list we should have
// . this is good when rat is 0 now because we're adding the phrases
// so it's like partial rat going on
// . TODO: if m_numTopDocIds2 is 1, that is just the stored maxDocId!!!
if ( hashThem && m_numTopDocIds2 > 0 )
hashTopDocIds2 ( &maxDocId ,
docIdPtrs ,
scores ,
explicitBits ,
hardCounts ,
mask ,
numSlots );
// loop over the ptrs
for ( i = 0 ; i < numPtrs ; i++ ) {
// when i reaches this break point we've switched lists
if ( i == point ) {
sign = listSigns [ next ];
scoreTablePtr = listScoreTablePtrs [ next ];
ebits = listExplicitBits [ next ];
listEnd = listEnds [ next ];
hashIt = listHash [ next ];
hc = listHardCount [ next ];
// if that was the last list, then we'll exit the
// loop by i hitting numPtrs, so don't worry
next++;
if ( next < numLists ) point = listPoints [ next ];
else point = -1;
}
// skip if emptied (end of that list fragment)
if ( ! ptrs[i] ) continue;
addMore:
// . date lists are not divided into sublists based on
// the one byte score, they are just one big list...
// . this at "top:" we must reset each whole termlist to
// point to the start, thus, do not re-hash docids from
// the last time (lastMaxDocId)
/*
if ( useDateLists ) {
// we expect to skip over a lot of docids in a row
// because we usually only have 32k slots and millions
// of docids, so we're talking doing hundreds of loops
// (calls to top:) which means there's like >99% chance
// we end up skipping this docid here...
// TODO: make a linked list for each docid "section"..
// call this a jump table... to avoid having to
// scan over the whole list every loop. this
// might make us 2x faster, and sort by date is
// currently ~10x slower than regular search for
// the same number of docids.
pp = ptrs[i]+1;
tightloop:
if ( *(unsigned long *)pp > maxDocId ||
*(unsigned long *)pp <= lastMaxDocId ) {
pp += 10;
if ( pp < listEnd ) goto tightloop;
oldptrs[i] = pp-1;
ptrs[i] = NULL;
continue;
}
// set it back
ptrs[i] = pp-1;
}
*/
// if the top 4 bytes of his docid is > maxDocId,
// then skip to next ptr
//else if( *(unsigned long *)(ptrs[i]+1) > maxDocId ) continue;
if ( *(unsigned long *)(ptrs[i]+1) > maxDocId ) continue;
// otherwise, hash him, use the top 32 bits of his docid
nn = (*(unsigned long *)(ptrs[i]+1) )& mask ;
// removing the mix table reduces time by about 10%
//^ (s_mixtab[ptrs[i][2]])) & mask;
// save start position so we can see if we chain too much
nnstart = nn;
// debug point
//long long ddd ;
//memcpy ( &ddd , ptrs[i] , 6 );
//ddd >>= 2;
//ddd &= DOCID_MASK;
//if ( ddd == 261380478983LL )
// log("got it");
/*
if ( ddd ==
unsigned char ss = (unsigned char)ptrs[i][5];
long sss = scoreTablePtr[ss];
logf(LOG_DEBUG,
"nt=%li i=%li max=%llu sc=%hhu sc2=%lu d=%llu",
(long)numTiers,
(long)i,
(long long)(((long long)maxDocId)<<6) | 0x3fLL,
255-ss,
sss,
(long long)ddd );
}
*/
chain:
// . if empty, take right away
// . this is the most common case so we put it first
if ( docIdPtrs[nn] == NULL ) { // explicitBits[nn] == 0 ) {
// . did we miss it?
// . if rat is true, then advance to next right now
if ( ! hashIt ) {
if ( ! useDateLists ) goto advance;
else if(sortByDate) goto dateAdvance;
else goto dateAdvance1;
}
// hold ptr to our stuff
docIdPtrs [ nn ] = ptrs[i];
// set pure bits
explicitBits [ nn ] = ebits;
hardCounts [ nn ] = hc;
// if we're not exclusive or date search, do it quick
if ( sign == '\0' || sign == '+' || sign == '*' )
scores[nn]=scoreTablePtr[((unsigned char)ptrs[i][5])];
// deal with excluded terms
else if ( sign == '-' ) scores[nn] = -500000000;
else if (sign == 'e') {
if ( scores[nn]<0 ) {
scores[nn] = 0x7fffffff;
}
// replicate the code below for speed
dateAdvance1:
ptrs[i] += 10;
if (ptrs[i] < ptrEnds[i] ) {
goto addMore;
}
oldptrs[i] = ptrs[i];
ptrs[i] = NULL;
continue;
}
// deal with date search, sign is 'd'
//else scores[nn] =
// ((unsigned char)(~ptrs[i][5]))*DATE_WEIGHT;
// date sort
else {
scores[nn]=~(*((unsigned long *)&ptrs[i][6]));
scores[nn]+=
scoreTablePtr[((unsigned char)ptrs[i][5])];
if ( scores[nn]<0 ) scores[nn] = 0x7fffffff;
// replicate the code below for speed
dateAdvance:
ptrs[i] += 10;
if (ptrs[i] < ptrEnds[i] ) goto addMore;
oldptrs[i] = ptrs[i];
ptrs[i] = NULL;
continue;
}
advance:
// debug msg
//log("added score=%05li totalscore=%05li to "
// "slotnum=%04li ptrList=%li",
// scoreTablePtr[((unsigned char)ptrs[i][5])],
// scores[nn],nn,i);
// advance ptr to point to next score/docid 6 bytes
//ptrs[i] += 6;
//ptrs[i] += hks; // half key size
ptrs[i] += 6;
// if he points to his lists end or a different score
// then remove him from the list of ptrs
//if (ptrs[i] >= listEnd || ptrs[i][5] != ptrs[i][-1]){
//if (ptrs[i] >= listEnd || (ptrs[i][5]!=ptrs[i][-1]&&
// !useDateLists)) {
if (ptrs[i] >= listEnd || ptrs[i][5] != ptrs[i][-1] ){
// if no more, we're done
//if ( numPtrs == 1 ) goto done;
// replace him with last ptr
//ptrs[i] = ptrs[--numPtrs];
// save him in case we have to roll back
oldptrs[i] = ptrs[i];
// signify his demise
ptrs[i] = NULL;
continue;
}
// otherwise, try to add more from this ptr list
goto addMore;
}
// if docid bits don't match, chain to next bucket
if ( *(long *)(ptrs[i]+1) != *(long *)(docIdPtrs[nn]+1) ||
(*ptrs[i] & 0xfd) != (*docIdPtrs[nn] & 0xfd) ) {
if ( ++nn >= numSlots ) nn = 0;
// if we wrapped back, table is FULL!!
if ( nn == nnstart ) goto panic;
// count the number of times we miss
m_numCollisions++;
goto chain;
}
// got dup docid for the same termid due to index corruption?
if ( /*! rat &&*/ explicitBits[nn] & ebits ) {
// no point in logging since in thread!
//long long dd ;
//memcpy ( &dd , ptrs[i] , 6 );
//dd >>= 2;
//dd &= DOCID_MASK;
//fprintf(stderr,"got dup score for docid=%lli\n",dd);
if ( ! useDateLists ) goto advance;
else if(sortByDate) goto dateAdvance;
else goto dateAdvance1;
}
// sometimes the first list has dups!
//if ( rat && hashIt ) goto advance;
// if docIdBits match OR in the pure bits
explicitBits[nn] |= ebits;
// . else if we are using hardCounts for *many* query terms...
// . terms not explicitly required will still have ebits > 0
// in order to support boolean expressions along side of
// hard required terms
// . non-required phrases can all share the same ebit when we
// have a lot of query terms, so they will not be
// able to imply hard-required, single-word terms but at
// least they can add to scores[]. they have ebits of 0.
// . an ebits of 0x80000000 means it's a hard required term
if ( hc ) hardCounts[nn]++;
// and add the score
if ( sign == '\0' || sign == '+' || sign == '*' )
scores[nn] += scoreTablePtr[(unsigned char)ptrs[i][5]];
// excluded this term
else if (sign == '-') scores[nn] -= 500000000;
// apply a date search weight if sign is 'd'
//else scores[nn] += (unsigned char)ptrs[i][5] * DATE_WEIGHT;
// date sort
else {
//scores[nn] =~(*((unsigned long *)&ptrs[i][6]));
scores[nn]+=scoreTablePtr[((unsigned char)ptrs[i][5])];
if ( scores[nn]<0 ) scores[nn] = 0x7fffffff;
}
// advance our ptr
if ( ! useDateLists ) goto advance;
else if(sortByDate) goto dateAdvance;
else goto dateAdvance1;
}
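// at this point every docid at or below maxDocId from the hashed lists is
// in the table (lists with hashIt false only merge into docids already
// present), each with its accumulated score, explicitBits and hardCount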
// get the winners
goto getWinners;
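// . panic path: the hash table filled up before we reached maxDocId, so
//   shrink the docid window by half of (maxDocId - lastMaxDocId), wipe the
//   table, roll each list ptr back over the already-consumed docids above
//   lastMaxDocId, and retry from top: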
panic:
// . keep cutting the maxDocId in half until all docids below
// it fit into our little hash table
// . "downStep" will be reset to "step" on our next round of hash
//downStep >>= 1;
// . scan hash table to find a good maxDocId/downstep
// . compute the average of all the docids
downStep = maxDocId - lastMaxDocId;
// and then halve it for good measure
downStep >>= 1;
// sanity test, log if not sane
//log(LOG_LOGIC,"query: last=%lu downstep=%lu max=%lu",
// lastMaxDocId,downStep,maxDocId);
// debug msg
//log("panicing");
// count
m_numPanics++;
// . if it is zero we're fucked! how can this happen really?
// . very very rare
// . TODO: look into this more
if ( downStep == 0 || downStep >= maxDocId ) {
log(LOG_LOGIC,"query: indextable: Major panic. "
"downstep=%lu maxDocId=%lu numPanics=%li",
downStep,maxDocId,m_numPanics);
goto done;
}
// why is this still maxed out after a panic?
if ( maxDocId == 0xffffffff && m_numPanics > 1 ) {
log(LOG_LOGIC,"query: indextable: Logic panic. "
"downstep=%lu maxDocId=%lu numPanics=%li",
downStep,maxDocId,m_numPanics);
goto done;
}
// decrease docid ceiling by half a step each time this is called
maxDocId -= downStep ;
// clear the hash table
//for ( long i = 0 ; i < numSlots ; i++ )
// docIdPtrs[i] = NULL ; // explicitBits[i] = 0;
memset ( docIdPtrs , 0 , numSlots * 4 );
// roll back m_nexti so hashTopDocIds2() works again
m_nexti = m_oldnexti;
// . now move each ptrs[i] backwards if we need to
// . if the docid is in this hash table that panicked, move it backwards
j = 0;
for ( long i = 0 ; i < numPtrs ; i++ ) {
char *p = ptrs[i];
// rollback special for date lists
if ( useDateLists ) {
// since we only have one sublist, as opposed to many
// sublists defined by a one byte score, just roll
// it all the way back
//while ( p - hks >= pstarts[i] &&
// *(unsigned long *)((p-hks)+1) > lastMaxDocId){
// p -= hks;
// j++;
//}
// re-assign
//ptrs[i] = p;
if ( p ) j = (ptrs[i] - pstarts[i])/hks;
ptrs[i] = pstarts[i];
continue;
}
// was it emptied? if so we just negate the ptr and go back 6
if ( ! p ) {
// get last value
p = oldptrs[i];
// back him up to the last docid for this score
//p -= 6;
p -= hks;
// if hashed long ago, continue
if ( *(unsigned long *)(p+1) <= lastMaxDocId )
continue;
j++;
}
// get his score
unsigned char score = (unsigned char)(p[5]);
// was previous guy in this hash table? if so, rollback
while ( p - 6 >= pstarts[i] &&
*(unsigned long *)((p-6)+1) > lastMaxDocId &&
(unsigned char)(p-6)[5] == score ) {
p -= 6;
j++;
}
// re-assign
ptrs[i] = p;
}
if ( m_isDebug || g_conf.m_logDebugQuery )
logf(LOG_DEBUG,"query: indextable: Rolled back over "
"%li docids.",j);
// try to fit all docids into it from the beginning
goto top;
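	// . a minimal sketch of the docid-ceiling loop that the top:,
	//   panic: and done: labels implement above, written as a plain
	//   loop for clarity. "tableFilledUp" is an illustrative stand-in
	//   for the condition that jumps to panic:, and the initial
	//   maxDocId is whatever was chosen before the top: label.
	//
	//     for ( ;; ) {
	//             // hash all docids in (lastMaxDocId,maxDocId] ...
	//             if ( tableFilledUp ) {
	//                     // come halfway back down and retry the range
	//                     downStep  = (maxDocId - lastMaxDocId) >> 1;
	//                     maxDocId -= downStep;
	//                     continue;
	//             }
	//             // ... scrape off the winners (getWinners:) ...
	//             if ( maxDocId == 0xffffffffUL ) break; // covered all
	//             lastMaxDocId = maxDocId;
	//             maxDocId    += step;                   // raise ceiling
	//             if ( maxDocId <= lastMaxDocId )        // wrapped
	//                     maxDocId = 0xffffffffUL;
	//     }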
/////////////////////////////////////
// scrape off any winners
/////////////////////////////////////
getWinners:
	// . rat (requireAllTerms) needs to know the primary ebit mask we need
// . this should be the ebits of the non-base lists ORed together
// . call it ebitMask
long tn ;
TopNode *t ;
for ( long i = 0 ; i < numSlots ; i++ ) {
// skip empty slots
//if ( explicitBits[i] == 0 ) continue;
if ( docIdPtrs[i] == NULL ) continue;
// . to be a winner it must pass the min mask test
// . that is, any terms with '+' signs must be accounted for
// . THIS has been replaced by bit #7, the hard-required bit
//if ((explicitBits[i] & minMask) != minMask) {
// // clear the slot
// explicitBits[i] = 0;
// continue;
//}
// add to tree if we're returning a *lot* of docids
if ( rat && lastRound && m_topTree ) {
// skip if zero or negative (negative query term)
if ( scores[i] <= 0 ) { docIdPtrs[i] = NULL; continue;}
// get implied bits from explicit bits
qvec_t ibits = bmap [ explicitBits[i] ];
// . we need to have the required term count to make it
// . this is increased by one with each call to
			//   addLists_r() and starts with a value of 1 or 2
if ( (ibits & ebitMask) != ebitMask ) {
docIdPtrs[i] = NULL; continue; }
// we now also check the hard count
if ( hardCounts[i] < minHardCount ) {
docIdPtrs[i] = NULL; continue; }
// . WE GOT A WINNER, ADD IT TO THE TREE
// . bscore's bit #6 (0x40) is on if he has all terms
// explicitly, use this to disallow a high-tiered
// node from topping a node that has bit #6 on
tn = m_topTree->getEmptyNode();
t = &m_topTree->m_nodes[tn];
//t->m_bscore = bscore;
t->m_tier = tier;
t->m_score = scores[i];
t->m_docIdPtr = docIdPtrs[i];
// we don't have a cluster rec for this guy yet, -3
t->m_clusterLevel = -3;
			// it only matters whether we have them all explicitly
			// or implicitly when doing rat
if ( (explicitBits[i] & ebitMask) == ebitMask) {
explicitCount++;
t->m_bscore = 0x60; // 0x40 + 0x20
}
else {
implicitCount++;
t->m_bscore = 0x20;
}
// debug
//if ( t->getDocId() == 071325612500LL )
// log("hey");
			// . nodes that have all the terms explicitly
// cannot be passed by higher-tiered nodes,
// otherwise they are game
// . we can have dup docids in here from multiple tiers
// . de-duping is done at display time
// . this will not add if tree is full and it is
// less than the m_lowNode in score
// . if it does get added to a full tree, lowNode will
// be removed
//if ( ! m_topTree->checkTree ( 1 ) || tn == 4 )
// log("hey");
m_topTree->addNode ( t , tn );
//if ( ! m_topTree->checkTree ( 1 ) ) {
// char *xx = NULL; *xx = 0; }
docIdPtrs[i] = NULL;
continue;
}
// add it right away if we're rat, it did not get nuked
// by a negative
if ( rat ) {
// skip if zero or negative (negative query term)
if ( scores[i] <= 0 ) {
docIdPtrs[i] = NULL; continue; }
// get implied bits from explicit bits
qvec_t ibits = bmap [ explicitBits[i] ];
// . we need to have the required term count to make it
// . this is increased by one with each call to
			//   addLists_r() and starts with a value of 1 or 2
if ( (ibits & ebitMask) != ebitMask ) {
docIdPtrs[i] = NULL; continue; }
// we now also check the hard count
if ( hardCounts[i] < minHardCount ) {
docIdPtrs[i] = NULL; continue; }
// mark it
//log("adding %li",newTopDocIds2);
// sanity check
if ( newTopDocIds2 >= m_maxTopDocIds2 ) {
log(LOG_LOGIC,"query: bad newTopDocIds2.");
char *xx = NULL; *xx = 0; }
// . store in the buffer
// . we now save ebits because a phrase in the base
// block may imply a term we add later on, but do
// not have explicitly. this way we still get it.
m_topDocIdPtrs2 [ newTopDocIds2 ] = docIdPtrs [i];
m_topScores2 [ newTopDocIds2 ] = scores [i];
m_topExplicits2 [ newTopDocIds2 ] = explicitBits[i];
m_topHardCounts2[ newTopDocIds2 ] = hardCounts [i];
newTopDocIds2++;
docIdPtrs[i] = NULL;
continue;
}
// . get his bit score
// . pass in the EXplicit bits... will map to # of implicit
// single, unignored word bits on. that is bscore.
// . so a phrase will turn on bits for its single word
// constituents
// . TODO: do not use a map for this, instead OR in the
// implicit bits from each QueryTerm. that way we
// are not limited to 16 query terms for the map's sake
// MDW: how can we tell if we have all terms implicitly
// without the bitScores[] array? getting an explicit
// count is easy, just add one for each term you have
// to the hash bucket.
// MDW: we still need to keep the bitScores[] array for
// boolean queries, however...
// MDW: we could support boolean queries with long
// sequences of ORs by making all the ORs to one bit!
// MDW: we could use the long long bit vector, one per bucket
// but we should probably OR in the implicits at
// intersection time, AND we should keep a separate count
// for the number of explicits. but all of this would
// be in addition to the LORs (Long ORs) or UORs
// (Unified ORs)
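		// . to summarize the bscore bit layout relied on below (the
		//   real map is built in Query.cpp::setBitScore):
		//     0x80  set iff the docid has ALL HARD-required terms
		//     0x40  set iff it has all required terms EXplicitly
		//     0x20  set iff it has all required terms IMplicitly
		//     0x1f  how many query terms it has implicitly (the
		//           "imb" value in the debug logs)
		//   so a docid is dropped below unless 0x80 is set, and
		//   0x40/0x20 feed the explicit/implicit counts.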
unsigned char bscore = bitScores [ explicitBits[i] ] ;
// if requireAllTerms is true, only accept docs that have ALL
// terms either explicitly or implicitly
//if ( requireAllTerms2 && ! (bscore & 0x60) ) continue;
		// . bit #7 is SET iff he has ALL HARD-required terms, if any
// . hard-required terms typically have + signs
// . boolean logic also works with hard required terms
// . if we don't meet that standard, skip this guy
// . TODO: make bit vector a long long so we can have 64
// query terms. then AND this with the hardRequired
// bit vector to make sure we got it all. Or better
		//   yet, have a hardCount that gets inc'd every time
// a docid matches a hard required term.
if ( ! (bscore & 0x80) ) {
// clear the slot
//explicitBits[i] = 0;
docIdPtrs[i] = NULL;
continue;
}
// must have all hard counts, too (for boolean queries)
if ( hardCounts[i] < minHardCount ) {
docIdPtrs[i] = NULL;
continue;
}
// . count EXplicit matches
// . bit #6 is on iff he has all "required terms" EXplicitly
// . "required terms" just means all the terms that we would
// need to have if we were a default AND engine
// . now he must have all phrase bits, too, where
// phrases start and end with non-ignored single words
// . if ((explicitBits[i] & requiredBits) == requiredBits )
// explicitCount++;
if ( (bscore & 0x40) )
explicitCount++;
// . count IMplicit matches
		// . bit #5 is on if he has all required terms IMplicitly
// . if the doc has the phrase "A B" then he has term A and
// term B IMplicitly
// . if ( bscore >= maxbscore ) implicitCount++;
if ( bscore & 0x20 )
implicitCount++;
// . don't let explicits always sort above the implicits
// . but guys with all required terms should always sort
// above guys without, so we DO NOT clear 0x20 (bit #5)
// . might as well clear bit #7, too, it should always be set
// . REMOVE THIS line if you want any doc that has
// all the terms EXplicitly to always outrank any
// doc that only has them IMplicitly
bscore &= ~0xc0;
// add in our hard count (for boolean queries)
bscore += hardCounts[i];
// add to tree if we're returning a *lot* of docids
if ( m_topTree ) {
// . WE GOT A WINNER, ADD IT TO THE TREE
// . bscore's bit #6 (0x40) is on if he has all terms
// explicitly, use this to disallow a high-tiered
// node from topping a node that has bit #6 on
tn = m_topTree->getEmptyNode();
t = &m_topTree->m_nodes[tn];
t->m_bscore = bscore;
t->m_tier = tier;
t->m_score = scores[i];
t->m_docIdPtr = docIdPtrs[i];
// we don't have a cluster rec for this guy yet, -3
t->m_clusterLevel = -3;
			// . nodes that have all the terms explicitly
// cannot be passed by higher-tiered nodes,
// otherwise they are game
// . we can have dup docids in here from multiple tiers
// . de-duping is done at display time
// . this will not add if tree is full and it is
// less than the m_lowNode in score
// . if it does get added to a full tree, lowNode will
// be removed
m_topTree->addNode ( t , tn );
docIdPtrs[i] = NULL;
continue;
}
// empty the slot for next round of hashing
//explicitBits[i] = 0;
// branch on the bit score
if ( bscore < minTopBitScore ) {
// clear the slot
//explicitBits[i] = 0;
docIdPtrs[i] = NULL;
continue;
}
// automatically add to top if our bscore is the highest so far
if ( bscore > minTopBitScore ) goto addIt;
		// if the bit score tied, the docId must have a better score
if ( scores[i] < minTopScore ) {
// clear the slot
//explicitBits[i] = 0;
docIdPtrs[i] = NULL;
continue;
}
if ( scores[i] > minTopScore ) goto addIt;
// continue if docId is too big
if ( *(unsigned long *)(docIdPtrs[i]+1) >
*(unsigned long *)(minTopDocIdPtr+1) ) {
// clear the slot
//explicitBits[i] = 0;
docIdPtrs[i] = NULL;
continue;
}
// if top is equal, compare lower 6 bits
if ( (*(unsigned long *)(docIdPtrs[i] +1) ==
*(unsigned long *)(minTopDocIdPtr+1)) &&
(*(unsigned char *)(docIdPtrs[i] ) & 0xfc) >=
(*(unsigned char *)(minTopDocIdPtr) & 0xfc) ) {
// clear the slot
//explicitBits[i] = 0;
docIdPtrs[i] = NULL;
continue;
}
addIt:
// if we have less than the max, add right away
if ( weakest == -1 ) j = numTopDocIds++ ;
else j = weakest;
// debug msg
/*
if ( weakest >= 0 )
log("bscore=%04lu score=%04li topp+1=%08lx "
"replacing "
"#%02li bscore=%04lu score=%04li topp+1=%08lx",
bscore,scores[i],*(long *)(docIdPtrs[i]+1),
j,topb[j],tops[j],*(long *)(topp[j]+1));
else
log("bscore=%04lu score=%04li topp+1=%08lx adding #%li",
bscore,scores[i],*(long *)(docIdPtrs[i]+1),j);
*/
// now we got in the top
tops [j] = scores[i] ;
// just use the pure bits to set this bit score
topb [j] = bscore ;
// do not store actual docid, just a pointer to the low 6 bytes
topp [j] = docIdPtrs[i];
// . save number of terms we have EXplicitly
// . only used for tier integration code in filterTopDocIds()
tope [j] = getNumBitsOn ( (qvec_t)
(explicitBits[i] & requiredBits));
// clear the slot
//explicitBits[i] = 0;
docIdPtrs[i] = NULL;
// debug msg
//log("added #%li docid = %llu minPtr=%lu",
//j,topd[j],(long)minPtr);
// don't get new weakest parms if we're still under limit
if ( numTopDocIds >= docsWanted )
weakest = getWeakestTopDocId ( topp ,
tops ,
topb ,
numTopDocIds ,
&minTopBitScore ,
&minTopScore ,
&minTopDocIdPtr );
}
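	// . a minimal sketch of the ordering that the addIt: branch above
	//   and getWeakestTopDocId() impose on the non-toptree top list:
	//   higher bscore wins, then higher score, then the SMALLER docid
	//   (upper 4 key bytes first, then the top 6 bits of the low byte).
	//   this is an illustrative comparator, not the real code:
	//
	//     static bool outranks ( unsigned char ab, long as, char *ap,
	//                            unsigned char bb, long bs, char *bp ) {
	//             if ( ab != bb ) return ab > bb;     // bit score first
	//             if ( as != bs ) return as > bs;     // then raw score
	//             unsigned long ad = *(unsigned long *)(ap+1);
	//             unsigned long bd = *(unsigned long *)(bp+1);
	//             if ( ad != bd ) return ad < bd;     // smaller docid wins
	//             return ((unsigned char)*ap & 0xfc) <
	//                    ((unsigned char)*bp & 0xfc);
	//     }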
// adjustStep:
// if rat then store the maxDocId for re-hashing m_topDocIdPtrs2
if ( rat ) {
// sanity check
if ( newTopDocIds2 >= m_maxTopDocIds2 ) {
log(LOG_LOGIC,"query: bad newTopDocIds2b.");
char *xx = NULL; *xx = 0; }
// . if 2nd last guy is within 50 of us then bury the last guy
// . this guarantees all maxDocIds stored in m_topDocIdPtrs2
// are at least 50 units away from each other, with the
// exception of the last one added, of course, and this
// allows us to save memory (see IndexTable::alloc() above)
if ( lastGuy2 >= 0 && newTopDocIds2 - lastGuy2 < 50 ) {
newTopDocIds2--;
m_topDocIdPtrs2 [ lastGuy ] =
m_topDocIdPtrs2 [ newTopDocIds2 ];
m_topScores2 [ lastGuy ] =
m_topScores2 [ newTopDocIds2 ];
m_topExplicits2 [ lastGuy ] =
m_topExplicits2 [ newTopDocIds2 ];
m_topHardCounts2[ lastGuy ] =
m_topHardCounts2[ newTopDocIds2 ];
// and replace him...
lastGuy = lastGuy2;
}
//log("adding marker %li",newTopDocIds2);
// overwrite last entry if no actual docids were added since
// then, this allows us to save memory and fixes the seg fault
//if ( newTopDocIds2 > 0 &&
// m_topScores2 [ newTopDocIds2 - 1 ] == 0 )
// newTopDocIds2--;
m_topDocIdPtrs2 [ newTopDocIds2 ] = (char *)maxDocId;
m_topScores2 [ newTopDocIds2 ] = 0;
newTopDocIds2++;
// update second last guy
lastGuy2 = lastGuy ;
lastGuy = newTopDocIds2 - 1 ;
}
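	// . a sketch of the resulting m_topDocIdPtrs2/m_topScores2 layout,
	//   for reference when the lastRound pass below re-reads it:
	//
	//     [ docidPtr docidPtr ... (char *)maxDocId docidPtr ... ]
	//        score>0  score>0        score == 0     score>0
	//
	//   entries with a score of 0 are maxDocId markers (one appended
	//   per round), and markers are kept at least 50 entries apart so
	//   IndexTable::alloc() can bound the buffer. the lastRound loop
	//   below skips anything with m_topScores2[i] <= 0.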
// . reset our step down function
	// . do not reset this if we have panicked too many times (5 or more)
// . globalspec's single term query termlist's docids were all
// essentially sequential thus requiring VERY small steps of like
// 100 or something... it seemed to block forever.
if ( m_numPanics < 5 ) downStep = step ;
// log it the first time it happens
else if ( m_numPanics >= 5 && printed != m_numPanics ) {
printed = m_numPanics;
// it should be impossible that downStep be 0 or small like
// that because we have a lot of slots in the hash table
//char *qq = "";
//if ( m_q && m_q->m_orig && m_q->m_orig[0] ) qq = m_q->m_orig;
log(LOG_INFO,"query: Got %li panics. Using small steps of "
"%li",m_numPanics,downStep);
		// set step to downStep, otherwise maxDocId will not be
		// able to be decreased close to lastMaxDocId because
		// it is only decremented by downStep above but
		// incremented by step below. i think this matters
// more for date termlists cuz those use lastMaxDocId
step = downStep;
}
//#else
//if ( m_numPanics > 0 && printed != m_numPanics ) {
// printed = m_numPanics;
// // only print every 20 panics
// if ( (printed % 20) == 0 )
// log(LOG_INFO,"query: Got %li panics.",m_numPanics);
//}
//downStep = step;
//#endif
// save it to check for wraps
oldDocId = maxDocId;
	// if we were maxed out, we're done
if ( maxDocId == (unsigned long)0xffffffff ) goto done;
// save the last maxDocId in case we have to rollback for a panic
lastMaxDocId = maxDocId;
// now advance the ceiling
maxDocId += step;
// if wrapped, set to max
if ( maxDocId <= oldDocId ) {
maxDocId = (unsigned long)0xffffffff;
// . if we panic after this, come down "half the distance to
// the goal line" and try to hash all the docIds in the
// range: (lastMaxDocId,maxDocId-downStep]
// . downStep is divided by 2 (right shifted) in panic: above
// . otherwise, downStep can be too big and mess us up!
downStep = (maxDocId - lastMaxDocId);
}
// sanity check
if ( maxDocId - (downStep / 2) <= lastMaxDocId+1 )
log("query: Got infinite loop criteria.");
// . do another round
// . TODO: start with bottom of ptrs to take advantage of cache more!
goto top;
done:
long long *topd;
bool didSwap ;
// . if we're rat and we've hashed all the termlists for this tier
// then let's compute ex/implicitCount
// . we need to store the explicit bits to compute the bscore!!
// . if we're rat then we MUST have all terms in every doc...
// . if we're using TopTree and done for this tier, set the counts
if ( m_topTree ) { // && ((rat && lastRound) || ! rat) ) {
// set the number of exact matches
m_numExactExplicitMatches [ tier ]=explicitCount;
// implicit matches
m_numExactImplicitMatches [ tier ]=implicitCount+explicitCount;
// . set our # of top docids member var
// . CAUTION: THIS COUNT INCLUDES DUPS FROM OTHER TIERS
m_numTopDocIds [ tier ] = m_topTree->m_numUsedNodes;
}
// update the new count now
m_numTopDocIds2 = newTopDocIds2;
// . if we use top tree, TRY to estimate the hits now
// . will only estimate if this is the first round
if ( m_topTree ) goto estimateHits;
// point to straight docids
topd = m_topDocIds [ tier ];
// . if we're rat, then store the m_topDocIdPtrs2/m_topScores2 info
// into topd/tops... but only if this is the last list...
	// . the non-rat method keeps its list of winners as it goes, but
	//   for rat we only compute them here at the very end.
if ( rat && lastRound ) {
// now get the top docsWanted docids from
// m_topDocIdPtrs2/Scores2 and store in m_topDocIds/m_topScores
//unsigned char minb = 0 ;
short minb = 0 ;
long mins = 0 ;
long long mind = 0 ;
char *minp = NULL ;
long mini = -1 ;
long i = 0 ;
// point to final docids and scores
long long *topd = m_topDocIds [ tier ];
long *tops = m_topScores [ tier ];
char **tdp2 = m_topDocIdPtrs2 ;
long count = 0;
//unsigned char bscore;
short bscore;
// count our explicits in this loop
explicitCount = 0;
for ( ; i < m_numTopDocIds2 ; i++ ) {
// skip if it is a stored maxDocId
if ( m_topScores2[i] <= 0 ) continue;
// count hits with all terms implied
count++;
// get the bit score, # terms implied
bscore = bitScores [ m_topExplicits2[i] ] ;
// count it if it has all terms EXplicitly
if ( (bscore & 0x40) &&
m_topHardCounts2[i] >= minHardCount )
explicitCount++;
// get the number of terms we got either EX or IMplicit
//bscore &= ~0xc0 ;
// REMOVE THIS line if you want any doc that has
// all the terms EXplicitly to always outrank any
// doc that only has them IMplicitly
bscore &= ~0xffc0 ; // turn off 0x20? (0xa0)
// add in the hard count, will be 0 most of the time
bscore += m_topHardCounts2[i];
// if we're one of the first "docsWanted" add right now
if ( count <= docsWanted ) {
mini = count - 1;
goto gotIt;
}
// this was taken from IndexTable.h::getWeakest...()
if ( bscore < minb ) continue;
if ( bscore > minb ) goto gotIt;
if ( m_topScores2[i] < mins ) continue;
if ( m_topScores2[i] > mins ) goto gotIt;
if ( *(unsigned long *)(tdp2[i]+1 ) >
*(unsigned long *)(minp+1) ) continue;
if ( *(unsigned long *)(tdp2[i]+1 ) <
*(unsigned long *)(minp+1) ) goto gotIt;
if ( (*(unsigned char *)(tdp2[i] ) & 0xfc) >
(*(unsigned char *)(minp) & 0xfc) ) continue;
gotIt:
memcpy ( &topd[mini] , tdp2[i] , 6 );
topd[mini] >>= 2;
topd[mini] &= DOCID_MASK;
topp[mini] = tdp2 [i];
tops[mini] = m_topScores2[i];
// bit score
topb[mini] = bscore ;
// debug msg
//log("d=%lli takes slot #%li (score=%li bscore=%lu)",
// topd[mini],mini,tops[mini],(long)topb[mini]);
// . save number of terms we have EXplicitly
			// . used for the tier integration code in
			//   filterTopDocIds()
tope[mini] = getNumBitsOn ( (qvec_t)
(m_topExplicits2[i] & requiredBits));
// add in the hard count, usually this is 0
tope[mini] = tope[mini] + m_topHardCounts2[i];
// are we still filling up the list? continue, if so.
if ( count < docsWanted ) continue;
// find new min in the list
//minb = 0xff;
minb = 0x7fff;
mins = 0x7fffffff;
mind = 0LL;
mini = -1000000; // sanity check
for ( long j = 0 ; j < docsWanted ; j++ ) {
if ( topb[j] > minb ) continue;
if ( topb[j] < minb ) goto gotMin;
if ( tops[j] > mins ) continue;
if ( tops[j] < mins ) goto gotMin;
if ( topd[j] < mind ) continue;
gotMin:
minb = topb[j];
mins = tops[j];
mind = topd[j];
minp = topp[j];
mini = j;
}
//log("mini=%li minb=%li mins=%li mind=%lli",
// mini,(long)minb,(long)mins,(long long )mind);
}
// how many top docids do we have? don't exceed "docsWanted"
numTopDocIds = count;
if ( numTopDocIds > docsWanted ) numTopDocIds = docsWanted;
implicitCount = count;
}
// . fill in the m_topDocIds array
// . the m_topDocIdPtrs array is score/docid 6 byte combos
else if ( ! rat ) {
for ( long i = 0 ; i < numTopDocIds ; i++ ) {
memcpy ( &topd[i] , topp[i] , 6 );
topd[i] >>= 2;
topd[i] &= DOCID_MASK;
}
}
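	// . a minimal sketch of the docid extraction used above, assuming
	//   the 6-byte score/docid keys and a little-endian machine: copy
	//   the 6 key bytes into the low end of a long long, shift off the
	//   low 2 bits and mask down to the docid (names are illustrative):
	//
	//     long long d = 0;
	//     memcpy ( &d , keyPtr , 6 ); // 6-byte score/docid key
	//     d >>= 2;                    // drop the low 2 bits
	//     d &= DOCID_MASK;            // keep just the docid
	//
	//   this is exactly what the memcpy/>>=2/&=DOCID_MASK lines above
	//   do into topd[].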
// set the number of exact matches
m_numExactExplicitMatches [ tier ] = explicitCount;
// implicit matches
m_numExactImplicitMatches [ tier ] = implicitCount;
// set our # of top docids member var
m_numTopDocIds [ tier ] = numTopDocIds;
	// . now sort by m_topBitScores/m_topScores/m_topDocIds
	// . gbsort's callback cannot be given a THIS ptr, so just do
	//   a quick bubble sort in place
keepSorting:
// assume no swap will happen
didSwap = false;
for ( long i = 1 ; i < numTopDocIds ; i++ ) {
// continue if no switch needed
if ( topb [i-1] > topb [i] ) continue;
if ( topb [i-1] < topb [i] ) goto doSwap;
if ( tops [i-1] > tops [i] ) continue;
if ( tops [i-1] < tops [i] ) goto doSwap;
if ( topd [i-1] <= topd [i] ) continue;
doSwap:
long tmpScore = tops [i-1];
unsigned char tmpBitScore = topb [i-1];
long long tmpDocId = topd [i-1];
char tmpe = tope [i-1];
tops [i-1] = tops [i ];
topb [i-1] = topb [i ];
topd [i-1] = topd [i ];
tope [i-1] = tope [i ];
tops [i ] = tmpScore;
topb [i ] = tmpBitScore;
topd [i ] = tmpDocId;
tope [i ] = tmpe;
didSwap = true;
}
	// if we swapped anything, do another pass until it is fully sorted
if ( didSwap ) goto keepSorting;
estimateHits:
// . now extrapolate the total # of hits from what we got
// . use default AND for this
// . use this for boolean queries, too
// . only set for the first call, so this number is consistent!!
// . little graph: X's are matching terms, O's are unmatched
// and the hyphens separate stages. see the combinations.
// . list #1 list #2
// . X -------> X
// . X -------> X
// . X -------> X
// . O O
// . O O
// . O O
// . ----------------
// . O O
// . O O
// . O O
// . O
// . O
// . O
// . we can re-compute if we didn't get many hits the first time!
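	// . a small worked example of the extrapolation below, with made-up
	//   numbers: two terms with termFreqs tf1=1000, tf2=2000, of which
	//   we actually read total1=400 and total2=500 docids so far, and
	//   got totalHits=20 explicit matches:
	//
	//     tried   = 400 * 500                            = 200,000
	//     noretry = (400-20)  * (500-20)                 = 182,400
	//     untried = (1000-20) * (2000-20) - noretry      = 1,758,000
	//     percent = 20 / 200,000                         = 0.0001
	//     m_estimatedTotalHits = 20 + 1,758,000 * 0.0001 = 195
	//                                           (fraction truncated)
	//
	//   the estimate is then clamped to [totalHits, mintf].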
if ( m_estimatedTotalHits == -1 || m_doRecalc ) {
		// # of tried and untried combinations respectively
double tried = 1.0;
double untried = 1.0;
double noretry = 1.0;
// minimum term frequency of the eligible query terms
long long mintf = 0x7fffffffffffffffLL;
// . total hits we got now
// . we use explicit, because we're only taking combinations
// of non-negative terms and positive phrase terms, using
// implicit matches would mess our count up
// . furthermore, re-arranging query words would change the
// hit count because it would change the implicit count
long totalHits = explicitCount;
// . use combinatorics, NOT probability theory for this cuz
// we're quite discrete
// . MOST of the error in this is from inaccurate term freqs
// because i think this logic is PERFECT!!!
// . how many tuple combinations did we have?
// . do not use imap here
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
// component lists are merged into one compound list
if ( m_componentCodes[i] >= 0 ) continue;
// skip if negative or unsigned phrase
QueryTerm *qt = &m_q->m_qterms[i];
if ( qt->m_termSign == '-' ) continue;
if ( qt->m_termSign =='\0' && qt->m_isPhrase) continue;
			// if it's a boolean query term under a NOT sign
if ( qt->m_underNOT ) continue;
// get current total size of list #i(combine all tiers)
long total = 0;
for ( long j = 0 ; j < m_numTiers ; j++ ) {
long size = lists[j][i].getListSize();
if ( size >= 12 ) size -= 6;
size /= 6;
total += size;
}
// how many docs have this term?
long long tf = m_q->m_termFreqs[i];
// . multiply to get initial # of combinations of terms
// . "tried" means we tried these combinations to
// produce the "totalHits" search results
tried *= (double)total;
// get # of untried combinations
untried *= (double) ( tf - totalHits );
// get # of combinations we tried amongst STAGE0 guys
// so we don't retry in untried
noretry *= (double) ( total - totalHits );
// count required terms
//nn++;
			// if we only have one required term, nothing is untried
// record the min tf as a safety catch
if ( tf < mintf ) mintf = tf;
}
// don't retry combos tried in "tried" count (intersection)
untried -= noretry;
// don't go negative... if tf == total
if ( untried < 0.0 ) untried = 0.0;
// dont divide by 0
if ( tried < 1.0 ) tried = 1.0;
// . out of "tried" combinations we got "totalHits" hits!
// . what's the prob. a particular combination is a hit?
double percent = (double)totalHits / (double)tried;
// out of the untried combinations,how many hits can we expect?
m_estimatedTotalHits = totalHits +
(long long) (untried * percent);
// don't exceed the max tf of any one list (safety catch)
if ( m_estimatedTotalHits > mintf )
m_estimatedTotalHits = mintf;
// make it at least what we got for sure
if ( m_estimatedTotalHits < totalHits )
m_estimatedTotalHits = totalHits;
// if we get a re-call, replace the total hits calc
if ( explicitCount <= 0 ) m_doRecalc = true;
else m_doRecalc = false;
}
}
// . set all the m_final* member vars from the m_top* member vars
// . this essentially merges the results we got from each tier
/*void IndexTable::filterTopDocIds ( ) {
// return now if no tiers
if ( m_numTiers <= 0 ) return;
// what tier are we?
char tier = m_numTiers - 1;
long count = 0;
uint64_t d;
unsigned long h;
unsigned long numBuckets = 0 ;
long bufSize = 0 ;
char *buf = NULL ;
uint64_t *htable = NULL;
TopNode **vtable = NULL;
long explicitCount;
long implicitCount;
long docCount = 0;
bool dedup;
long currTier = 0;
// initialize this cuz we now count final results per tier
for ( long i = 0; i < MAX_TIERS; i++ )
m_numDocsInTier[i] = 0;
// . the TopTree is already sorted correctly so does not need filtering
// . it is also immune to adding dups
if ( ! m_topTree ) goto skip;
numBuckets = m_topTree->m_numUsedNodes * 2;
bufSize = 12 * numBuckets;
explicitCount = 0; // m_numExactExplicitMatches [ tier ];
implicitCount = 0; // m_numExactImplicitMatches [ tier ] ;
// set up the hash table for deduping the docids
// since we may have multiple tiers in here
dedup = (m_numTiers > 1);
if ( dedup ) {
// TODO: move this alloc into the TopTree allocation?
buf = (char *)mmalloc ( bufSize , "Msg39" );
if ( ! buf ) {
log("query: Hash alloc of %li failed.",
bufSize);
return;
}
htable = (uint64_t *)buf;
vtable = (TopNode **)(buf + numBuckets * 8) ;
memset ( htable , 0 , numBuckets * 8 );
}
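	// . a minimal sketch of the linear-probe dedup done below: hash on
	//   the docid, walk forward on collision, and treat a repeat as a
	//   dup (this just restates the loop below, it is not extra logic):
	//
	//     unsigned long h = (unsigned long)d % numBuckets;
	//     while ( htable[h] && htable[h] != d )
	//             if ( ++h >= numBuckets ) h = 0;
	//     if ( htable[h] == d ) t->m_clusterLevel = 5; // dup, just mark
	//     else { htable[h] = d; vtable[h] = t; }       // first sighting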
// loop over all results
for ( long ti = m_topTree->getHighNode() ; ti >= 0 ;
ti = m_topTree->getPrev(ti) ) {
// get the guy
TopNode *t = &m_topTree->m_nodes[ti];
// skip if no dedup needed
if ( ! dedup ) goto skip2;
// get his docid
d = t->getDocId();
//log("deduping d=%llu",d);
// dedup him in the hash table
h = (unsigned long)d % numBuckets;
while ( htable[h] && htable[h] != d )
if ( ++h >= numBuckets ) h = 0;
		// if we got one, it's a dup
if ( htable[h] ) {
// debug
//log("%llu is a dup",d);
// give site/content hash info to it
// to save Msg38 a look up
if ( t->m_siteHash ) {
				TopNode *f = vtable[h]; // node already stored for this docid
if ( f->m_siteHash ) continue;
f->m_siteHash = t->m_siteHash;
f->m_contentHash=t->m_contentHash;
}
// . mark him as a dup
// . we cannot delete because TopTree will not update
// emptyNode correctly
t->m_clusterLevel = 5;
continue;
}
// otherwise, if not a dup, add to hash table
htable[h] = d;
vtable[h] = t;
skip2:
docCount++;
if ( t->m_bscore & 0x40 ) {
explicitCount++;
implicitCount++;
}
else if ( t->m_bscore & 0x020 ) {
implicitCount++;
}
// increment tier count, we can actually just do this in Msg39
// since toptree stores the tiers
//if ( t->m_tier > currTier )
// currTier = t->m_tier;
//m_numDocsInTier[currTier]++;
}
// free hash table mem
if ( buf ) { mfree ( buf , bufSize , "Msg39" ); buf = NULL; }
// nodedup:
m_finalNumTopDocIds = docCount;
// use the last tier for this info, it should be right
m_finalNumExactExplicitMatches = explicitCount;
m_finalNumExactImplicitMatches = implicitCount;
// debug print
//if ( ! m_isDebug && ! g_conf.m_logDebugQuery ) return;
if ( ! m_isDebug && ! g_conf.m_logDebugQuery ) return;
// force log it cuz m_isDebug might be true
logf(LOG_DEBUG,"query: indextable: Tier %li has %li deduped explicit "
"matches. dedup=%li", (long)tier, explicitCount, dedup);
logf(LOG_DEBUG,"query: indextable: Tier %li has %li deduped implicit "
"matches.",(long)tier,implicitCount);
for ( long ti = m_topTree->getHighNode() ; ti >= 0 ;
ti = m_topTree->getPrev(ti) ) {
TopNode *t = &m_topTree->m_nodes[ti];
logf(LOG_DEBUG,"query: indextable: [%lu] %03li) docId=%012llu "
"sum=%li tier=%lu bs=0x%hhx cl=%li",
m_logstate,
count++ ,
t->getDocId() ,
t->m_score ,
t->m_tier ,
t->m_bscore,
(long)t->m_clusterLevel);
}
return;
skip:
// debug msg
//if ( m_isDebug || g_conf.m_logDebugQuery ) {
if ( m_isDebug || g_conf.m_logDebugQuery ) {
for ( long j = m_numTiers-1 ; j < m_numTiers ; j++ ) {
// force log it even if debug turned off
logf(LOG_DEBUG,"query: indextable: [%lu] tier #%li has "
"%li top docIds and %li exact explicit matches. "
"Note: not all explicit matches may have scores "
"high enough to be in this list of winners:",
m_logstate,
j ,
m_numTopDocIds[j],
m_numExactExplicitMatches[j]) ;
for ( long i = 0 ; i < m_numTopDocIds[j]; i++)
logf(LOG_DEBUG,"query: indextable: [%lu] "
"%03li) docId=%012llu "
"sum=%li imb=%lu [%lu] exb=%li",
m_logstate,
i,
m_topDocIds [j][i] ,
m_topScores [j][i] ,
// bit #7,#6 & #5 are special
(long)(m_topBitScores [j][i] & 0x1f),
(long)(m_topBitScores [j][i] ),
(long)m_topExplicits [j][i] );
}
}
// . if we were only called once we don't need to filter
// . no, we might have docid dups due to errors
// . we need to set m_numFiltered, too now
//if ( m_numTiers <= 1 ) return m_topDocIds[0];
// reset this
long nn = 0;
// combine all docIds and bit scores from m_topDocIds[X] into
// these 2 arrays:
long long docIds [ MAX_RESULTS * MAX_TIERS ];
unsigned char bitScores [ MAX_RESULTS * MAX_TIERS ];
char explicits [ MAX_RESULTS * MAX_TIERS ];
long scores [ MAX_RESULTS * MAX_TIERS ];
char tiers [ MAX_RESULTS * MAX_TIERS ];
// use memcpy for speed reasons
for ( long i = 0 ; i < m_numTiers ; i++ ) {
// how many top docIds in this one?
long nt = m_numTopDocIds[i];
// bitch and skip if we don't have enough room to store
if ( nn + nt > MAX_RESULTS * MAX_TIERS ) {
log(LOG_LOGIC,"query: indextable: "
"filterTopDocIds had issue.");
continue;
}
// . if he's got less than 10 exact matches, ignore him,
// unless he's the last tier
// . no! now we don't allow tier jumping whatsoever
//if ( m_numExactExplicitMatches[i] < 10 && i != m_numTiers-1)
// continue;
// store all top docIds from all tiers into "docIds"
memcpy ( & docIds [ nn ] , &m_topDocIds [i] , nt * 8 );
// also save the bit scores, for sorting
memcpy ( & scores [ nn ] , &m_topScores [i] , nt * 4 );
memcpy ( & bitScores [ nn ] , &m_topBitScores[i] , nt );
memcpy ( & explicits [ nn ] , &m_topExplicits[i] , nt );
memset ( & tiers [ nn ] , i , nt );
// inc the count
nn += nt;
}
// . now only bubble up a docId if it has a higher termBitScore!!!!
	// . a call stage auto advances to the next if ANY one of its docIds
// does not have ALL terms in it! (including phrase termIds)
// . convenience ptrs
long long *topd = docIds;
long *tops = scores;
unsigned char *topb = bitScores;
char *tope = explicits;
char *topt = tiers;
long long tmpd;
long tmps;
unsigned char tmpb;
char tmpe;
char tmpt;
// max # of required terms we can have
qvec_t requiredBits = m_q->m_requiredBits ;
long maxe = getNumBitsOn ( (qvec_t) requiredBits );
	// . now sort by m_topBitScores only
	// . gbsort's callback cannot be given a THIS ptr, so just do
	//   a quick bubble sort in place
bool didSwap = false;
keepSorting:
// reset flag
didSwap = false;
// do a bubble pass
for ( long i = 1 ; i < nn ; i++ ) {
// . only allow guys to propagate upwards if they have
// more explicit bits
// . this is only to ensure that the "Next 10" line up with
// the first 10
// . if defender has more explicit bits he always wins
//if ( tope [i-1] > tope [i] ) continue;
// . if defender has all explicit bits, he cannot be moved
// aside because he may have been used for a previous page of
// results
// . for instance, tier #0 had 550 explicit hits, those docids
// are sorted so that the top 550 (at least) will have
// all terms IMplicitly or EXplicitly. so tier #1's winners
// cannot bubble past a full EXplicit hit in tier #0, but
		//   they may bubble past a full IMplicit hit in tier #0,
		//   which is OK, as long as we don't muck with tier #0's
		//   top 550 hits, otherwise we'd mess up the Next/Prev
// serps flow. this next line guarantees we won't muck it up.
if ( tope [i-1] == maxe ) continue;
// . if defender has more bits IMplicitly, he wins
// . bit #5 is set if he has all terms implicitly
if ( topb [i-1] > topb[i] ) continue;
// . before switching, though, you must have higher scores,etc.
// . otherwise, what's the point? and you'll mess up the
// search results ordering from page to page, which is the
// whole point of tracking the explicit bits... so we know
// if the previous tier fulfilled the request completely
if ( topb [i-1] == topb [i] ) {
if ( tops [i-1] > tops [i] ) continue;
if ( tops [i-1] == tops [i] &&
topd [i-1] <= topd [i] ) continue;
}
tmpd = topd [i-1];
tmps = tops [i-1];
tmpb = topb [i-1];
tmpe = tope [i-1];
tmpt = topt [i-1];
topd [i-1] = topd [i ];
tops [i-1] = tops [i ];
topb [i-1] = topb [i ];
tope [i-1] = tope [i ];
topt [i-1] = topt [i ];
topd [i ] = tmpd;
tops [i ] = tmps;
topb [i ] = tmpb;
tope [i ] = tmpe;
topt [i ] = tmpt;
didSwap = true;
}
// if we did a swap then do another bubble pass until it's sorted
if ( didSwap ) goto keepSorting;
// debug point
if ( m_isDebug || g_conf.m_logDebugQuery ) {
logf(LOG_DEBUG,"query: indextable: imb is how many query "
"terms the result has IMPLICITLY. "
"(phrase terms imply single word terms)");
logf(LOG_DEBUG,"query: indextable: exb is how many query "
"terms the result has EXPLICITLY.");
logf(LOG_DEBUG,"query: indextable: the last number in []'s "
"is a bitmap. See Query.cpp::setBitScore to see where "
"those bits are defined.");
for ( long i = 0 ; i < nn ; i++)
logf(LOG_DEBUG,"query: indextable: [%lu] %03li) final "
"docId=%012llu "
"sum=%li imb=%lu [0x%lx] exb=%li tier=%li",
m_logstate,
i,
topd [i] ,
tops [i] ,
// bit #7,#6 & #5 are special
(long)(topb [i] & 0x1f),
(long)(topb [i] & 0xe0),
(long)tope[i] ,
(long)topt[i] );
}
// we have nothing in m_finalTop* right now
long nf = 0;
// tmp vars
long long docId;
long j;
// uniquify docIds from docIds[] into m_filtered
for ( long i = 0 ; i < nn ; i++ ) {
// get the docId
docId = docIds [ i ];
// is this docId already in m_filtered?
for ( j = 0 ; j < nf ; j++)
if ( docId == m_finalTopDocIds [ j ] ) break;
// skip if docId already in m_finalTopDocIds[]
if ( j < nf ) continue;
// otherwise, store it
m_finalTopDocIds [ nf ] = docIds[i];
m_finalTopScores [ nf ] = scores[i];
nf++;
// increment the tier count
if ( tiers[i] > currTier )
currTier = tiers[i];
m_numDocsInTier[currTier]++;
// break if we have enough
if ( nf >= MAX_RESULTS ) break;
}
// . sanity check
// . IndexTable::getTopDocIds: bad engineer. nf=17 ntd[1]=19
// . this happens when we got lots of docid dups from errors
// . the last time was same docid score but different punish bits
long min = m_numTopDocIds [ m_numTiers - 1 ];
if ( nf < min ) {
//errno = EBADENGINEER;
//log("IndexTable::getTopDocIds: bad engineer. "
// "nf=%li ntd[%li]=%li",
// nf,m_numTiers-1,m_numTopDocIds[m_numTiers-1]);
log("query: Got %li duplicate docids.",min - nf );
//sleep(50000);
// just count as an error for now
//return NULL;
		// pad it with the docid of the last one, maybe nobody will notice
long long lastDocId = 0;
if ( nf > 0 ) lastDocId = m_finalTopDocIds [ nf - 1 ];
while ( nf < min && nf < MAX_RESULTS )
m_finalTopDocIds [ nf++ ] = lastDocId;
}
m_finalNumTopDocIds = nf;
// . often the exact count is bigger than the docids we got
// . adjust it down so it's no longer bigger in case we had dup docids
m_finalNumExactExplicitMatches=m_numExactExplicitMatches[m_numTiers-1];
	// store implicit matches too so we can put the gray bar separating
// results that have all terms from those that don't
m_finalNumExactImplicitMatches=m_numExactImplicitMatches[m_numTiers-1];
}
*/