open-source-search-engine/Msg39.cpp
2014-07-29 19:32:27 -07:00

1598 lines
48 KiB
C++

#include "gb-include.h"
#include "Msg39.h"
#include "Stats.h"
#include "Threads.h"
#include "TopTree.h"
#include "UdpServer.h"
//#include "CollectionRec.h"
#include "SearchInput.h"
// . forward declaration of the file-local reply helper defined below
// . sends either the reply buffer or, when g_errno is set, an error reply
//   for "slot", and deletes "msg39" unless msg39 has a callback set
// . "hadError" asserts that g_errno is set; the definition cores otherwise
// . NOTE: the definition below names the 4th parameter "replyLen"
static void sendReply ( UdpSlot *slot ,
Msg39 *msg39 ,
char *reply ,
long replySize ,
long replyMaxSize ,
bool hadError );
// called when Msg2 has got all the termlists (disabled; Msg2 is now given
// controlLoopWrapper as its callback in Msg39::getLists())
//static void gotListsWrapper ( void *state ) ;
// . thread wrappers
// . addListsWrapper is the routine Msg39::intersectLists() hands to
//   g_threads.call(); "state" is the Msg39
static void *addListsWrapper ( void *state , ThreadEntry *t ) ;
//static void threadDoneWrapper ( void *state , ThreadEntry *t ) ;
// . hook handleRequest39() up to the udp server for msg type 0x39
// . returns false if the registration failed, true otherwise
bool Msg39::registerHandler ( ) {
	const bool registered =
		g_udpServer.registerHandler ( 0x39 , handleRequest39 );
	return registered;
}
// . m_inUse has to be cleared before reset() runs because reset() cores
//   when it sees the object still marked as in use
Msg39::Msg39 () : m_inUse ( false ) {
	reset();
}
// . full reset: clears the per-query state and then the per-docid-range
//   state via reset2()
// . cores if called while a request is still being serviced
void Msg39::reset() {
	// sanity: never recycle a Msg39 that is still in use
	if ( m_inUse ) { char *xx=NULL;*xx=0; }
	// per-query state that survives across docid range splits
	m_tmpq.reset();
	m_numTotalHits   = 0;
	m_gotClusterRecs = 0;
	m_allocedTree    = false;
	//m_numDocIdSplits = 1;
	// and the per-range state
	reset2();
}
// . partial reset used between docid ranges (see controlLoop())
// . frees the termlists and resets msg2/posdb table, but leaves the
//   parsed query (m_tmpq) and the hit count alone
void Msg39::reset2() {
	// release each termlist that msg2 says it has
	long listNum = 0;
	while ( listNum < m_msg2.m_numLists ) {
		m_lists[listNum].freeList();
		listNum++;
	}
	m_msg2.reset();
	m_posdbTable.reset();
	// forget the local-call callback info
	m_state    = NULL;
	m_callback = NULL;
	m_tmp      = NULL;
	m_blocked  = false;
}
// . handle a request to get the search results, a list of docids only
// . registered with the udp server for msg type 0x39 by
//   Msg39::registerHandler()
// . allocates a Msg39 to service the request; sendReply() deletes it
//   once the reply has gone out
// . if the Msg39 cannot be allocated we send back an ENOMEM error reply
// . "netnice" is not used by this handler
void handleRequest39 ( UdpSlot *slot , long netnice ) {
	// use Msg39 to get the lists and intersect them
	Msg39 *THIS ;
	try { THIS = new ( Msg39 ); }
	catch ( ... ) {
		g_errno = ENOMEM;
		// sizeof() yields a size_t, so cast it to match the "%i"
		// conversion; passing it raw is a varargs type mismatch
		log("msg39: new(%i): %s",
		    (int)sizeof(Msg39),mstrerror(g_errno));
		// no Msg39 to hand over, sendReply() copes with NULL
		sendReply ( slot , NULL , NULL , 0 , 0 ,true);
		return;
	}
	// account for the allocation
	mnew ( THIS , sizeof(Msg39) , "Msg39" );
	// clear it
	g_errno = 0;
	// . get the resulting docIds, usually blocks
	// . sets g_errno on error
	// . the reply (or error reply) is sent from within
	THIS->getDocIds ( slot ) ;
}
// . this must always be called sometime AFTER handleRequest39() is called
// . "msg39" may be NULL (e.g. ENOMEM in handleRequest39())
// . "hadError" is the caller's claim that g_errno is set; we core if it
//   is not
// . if msg39 has a callback set then this was a local call: we invoke the
//   callback only if we had blocked, and neither send a udp reply nor
//   delete msg39
// . otherwise send the reply, or an error reply if g_errno is set, and
//   delete msg39
void sendReply ( UdpSlot *slot , Msg39 *msg39 , char *reply , long replyLen ,
		 long replyMaxSize , bool hadError ) {
	// debug msg
	const bool wantDebug =
		g_conf.m_logDebugQuery || ( msg39 && msg39->m_debug );
	if ( wantDebug )
		logf(LOG_DEBUG,"query: msg39: [%lu] Sending reply len=%li.",
		     (long)msg39,replyLen);
	// sanity
	if ( hadError && ! g_errno ) { char *xx=NULL;*xx=0; }
	if ( msg39 ) {
		// no longer in use
		msg39->m_inUse = false;
		// . entered from a local call rather than a udp slot?
		// . then hand control back to the caller, who also owns
		//   the msg39 and may re-use it
		if ( msg39->m_callback ) {
			if ( msg39->m_blocked )
				msg39->m_callback ( msg39->m_state );
			return;
		}
	}
	// grab and clear the error before talking to the udp server
	const long savedErr = g_errno;
	g_errno = 0;
	if ( savedErr )
		g_udpServer.sendErrorReply ( slot , savedErr ) ;
	else
		g_udpServer.sendReply_ass ( reply        ,
					    replyLen     ,
					    reply        ,
					    replyMaxSize ,
					    slot         );
	// always delete ourselves when done handling the request
	if ( ! msg39 ) return;
	mdelete ( msg39 , sizeof(Msg39) , "Msg39" );
	delete (msg39);
}
// . entry point for a request that arrived in a udp slot
// . validates and deserializes the Msg39Request held in the slot's read
//   buffer, then hands it to getDocIds2()
// . returns nothing; on a bad request size it sets g_errno to
//   EBADREQUESTSIZE and sends an error reply via sendReply()
void Msg39::getDocIds ( UdpSlot *slot ) {
	// remember the slot
	m_slot = slot;
	// reset this
	m_errno = 0;
	// get the request
	m_r = (Msg39Request *) m_slot->m_readBuf;
	long requestSize = m_slot->m_readBufSize;
	// . ensure its size is ok
	// . deserializeMsg() below reads the size_*/ptr_* members of the
	//   fixed-size Msg39Request header, so anything shorter than that
	//   header would be read out of bounds
	if ( requestSize < (long)sizeof(Msg39Request) ) {
		g_errno = EBADREQUESTSIZE;
		log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
		    mstrerror(g_errno) );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return ;
	}
	// deserialize it before we do anything else
	long finalSize = deserializeMsg ( sizeof(Msg39Request) ,
					  &m_r->size_readSizes ,
					  &m_r->size_whiteList,//coll ,
					  &m_r->ptr_readSizes,
					  m_r->m_buf );
	// sanity check: the sizes in the header must add up to what we read
	if ( finalSize != requestSize ) {char *xx=NULL;*xx=0; }
	getDocIds2 ( m_r );
}
// . the main function to get the docids for the provided query in "req"
// . marks us in use, parses the query, checks it against what the
//   requester parsed, then starts the phase machine in controlLoop()
// . every failure path sends an error reply through sendReply()
// . do not touch "this" after controlLoop() returns; it may already
//   have sent the reply and been destroyed
void Msg39::getDocIds2 ( Msg39Request *req ) {
	// flag it as in use
	m_inUse = true;
	// store it, might be redundant if called from getDocIds() above
	m_r = req;
	// debug if the request or either conf switch asks for it
	m_debug = ( m_r->m_debug              ||
		    g_conf.m_logDebugQuery    ||
		    g_conf.m_logTimingQuery   );
	// the collection must exist
	if ( ! g_collectiondb.getRec ( m_r->m_collnum ) ) {
		g_errno = ENOCOLLREC;
		log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
		    mstrerror(g_errno) );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return ;
	}
	// . set our m_tmpq query class
	// . m_boolFlag is either 1 or 0 in this case, the caller did the
	//   auto-detect (boolFlag of 2) before calling us
	// . this now calls Query::addCompoundTerms() for us
	const bool parsed = m_tmpq.set2 ( m_r->ptr_query          ,
					  m_r->m_language         ,
					  m_r->m_queryExpansion   ,
					  m_r->m_useQueryStopWords );
	if ( ! parsed ) {
		log("query: msg39: setQuery: %s." ,
		    mstrerror(g_errno) );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return ;
	}
	// a successful parse must not leave g_errno set
	if ( g_errno ) { char *xx=NULL;*xx=0; }
	QUICKPOLL ( m_r->m_niceness );
	// set m_errno
	if ( m_tmpq.m_truncated ) m_errno = EQUERYTRUNCATED;
	// ensure matches with the msg3a sending us this request
	if ( m_tmpq.getNumTerms() != m_r->m_nqt ) {
		g_errno = EBADENGINEER;
		log("query: Query parsing inconsistency for q=%s. "
		    "langid=%li. Check langids and m_queryExpansion parms "
		    "which are the only parms that could be different in "
		    "Query::set2(). You probably have different mysynoyms.txt "
		    "files on two different hosts! check that!!"
		    ,m_tmpq.m_orig
		    ,(long)m_r->m_language
		    );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return ;
	}
	// debug
	if ( m_debug )
		logf(LOG_DEBUG,"query: msg39: [%lu] Got request "
		     "for q=%s", (long) this,m_tmpq.m_orig);
	// start with an empty top tree
	m_tt.reset();
	QUICKPOLL ( m_r->m_niceness );
	// . set up the docid range cursor to cover everything
	// . controlLoop() walks it one split at a time to avoid oom
	m_ddd    = 0;
	m_dddEnd = MAX_DOCID;
	m_phase  = 0;
	// . returns false if it blocked, true otherwise; either way there
	//   is nothing left for us to do here
	// . it sends the reply when done, which can destroy "this"
	controlLoop();
}
void controlLoopWrapper2 ( void *state , ThreadEntry *t ) {
Msg39 *THIS = (Msg39 *)state;
THIS->controlLoop();
}
void controlLoopWrapper ( void *state ) {
Msg39 *THIS = (Msg39 *)state;
THIS->controlLoop();
}
// . returns false if it blocked, true otherwise
// . steps the query through the phases recorded in m_phase so that the
//   wrappers above can re-enter and pick up where we left off:
//   0. read all termlists for the next docid range
//   1. intersect termlists to get the intersecting docids
//   2. sum up stats, then go back to phase 0 if docid ranges remain
//   3. get the cluster recs of the winning docids
// . when done, or on error, it sends the reply via sendReply() or
//   estimateHitsAndSendReply(); sendReply() may delete "this"
bool Msg39::controlLoop ( ) {

 loop:
	// error?
	if ( g_errno ) {
	hadError:
		log(LOG_LOGIC,"query: msg39: controlLoop: %s." ,
		    mstrerror(g_errno) );
		sendReply ( m_slot , this , NULL , 0 , 0 , true );
		return true;
	}

	if ( m_phase == 0 ) {
		// next phase
		m_phase++;
		// the starting docid...
		long long d0 = m_ddd;
		// . the split count comes straight from the request
		// . zero would divide by zero below and a negative count
		//   would walk the cursor backwards so m_ddd never reaches
		//   m_dddEnd, so treat anything below 1 as a single split
		long long numSplits = (long long)m_r->m_numDocIdSplits;
		if ( numSplits < 1 ) numSplits = 1;
		// shortcut
		long long delta = MAX_DOCID / numSplits;
		// advance to point to the exclusive endpoint
		m_ddd += delta;
		// ensure this is exclusive of ddd since it will be
		// inclusive in the following iteration.
		long long d1 = m_ddd;
		// fix rounding errors
		if ( d1 + 20LL > MAX_DOCID ) {
			d1    = MAX_DOCID;
			m_ddd = MAX_DOCID;
		}
		// fix it
		m_r->m_minDocId = d0;
		m_r->m_maxDocId = d1; // -1; // exclude d1
		// allow posdbtable re-initialization each time to set
		// the msg2 termlist ptrs anew, otherwise we core in
		// call to PosdbTable::init() below
		//m_posdbTable.m_initialized = false;
		// reset ourselves, partially, anyway, not tmpq etc.
		reset2();
		// debug log
		if ( ! m_r->m_forSectionStats )
			log("msg39: docid split phase %lli-%lli",d0,d1);
		// wtf?
		//if ( d0 >= d1 ) break;
		// load termlists for these docid ranges using msg2 from posdb
		if ( ! getLists() ) return false;
	}

	if ( m_phase == 1 ) {
		m_phase++;
		// intersect the lists we loaded using a thread
		if ( ! intersectLists() ) return false;
		// error?
		if ( g_errno ) goto hadError;
	}

	// sum up some stats
	if ( m_phase == 2 ) {
		m_phase++;
		if ( m_posdbTable.m_t1 ) {
			// . measure time to add the lists in bright green
			// . use darker green if rat is false (default OR)
			long color;
			//char *label;
			color = 0x0000ff00 ;
			//label = "termlist_intersect";
			g_stats.addStat_r ( 0 ,
					    m_posdbTable.m_t1 ,
					    m_posdbTable.m_t2 , color );
		}
		// accumulate total hits count over each docid split
		m_numTotalHits += m_posdbTable.m_docIdVoteBuf.length() / 6;
		// minus what we filtered out because of gbminint/gbmaxint/
		// gbmin/gbmax/gbsortby/gbrevsortby/gbsortbyint/gbrevsortbyint
		m_numTotalHits -= m_posdbTable.m_filtered;
		// error?
		if ( m_posdbTable.m_errno ) {
			// we do not need to store the intersection i guess..??
			m_posdbTable.freeMem();
			g_errno = m_posdbTable.m_errno;
			log("query: posdbtable had error = %s",
			    mstrerror(g_errno));
			sendReply ( m_slot , this , NULL , 0 , 0 ,true);
			return true;
		}
		// if we have more docid ranges remaining do more
		if ( m_ddd < m_dddEnd ) {
			m_phase = 0;
			goto loop;
		}
	}

	// ok, we are done, get cluster recs of the winning docids
	if ( m_phase == 3 ) {
		m_phase++;
		// . this loads them using msg51 from clusterdb
		// . if m_r->m_doSiteClustering is false it just returns true
		// . this sets m_gotClusterRecs to true if we get them
		if ( ! setClusterRecs ( ) ) return false;
		// error setting clusterrecs?
		if ( g_errno ) goto hadError;
	}

	// process the cluster recs if we got them
	if ( m_gotClusterRecs && ! gotClusterRecs() )
		goto hadError;

	// . all done! set stats and send back reply
	// . only sends back the cluster recs if m_gotClusterRecs is true
	estimateHitsAndSendReply();

	return true;
}
/*
// . returns false if blocked, true if done
// . only come here if m_numDocIdSplits > 1
// . to avoid running out of memory, generate the search results for
// multiple smaller docid-ranges, one range at a time.
bool Msg39::doDocIdSplitLoop ( ) {
long long delta = MAX_DOCID / (long long)m_numDocIdSplits;
for ( ; m_ddd < m_dddEnd ; ) {
// the starting docid...
long long d0 = m_ddd;
// advance to point to the exclusive endpoint
m_ddd += delta;
// ensure this is exclusive of ddd since it will be
// inclusive in the following iteration.
long long d1 = m_ddd;
// fix rounding errors
if ( d1 + 20LL > MAX_DOCID ) {
d1 = MAX_DOCID;
m_ddd = MAX_DOCID;
}
// fix it
m_r->m_minDocId = d0;
m_r->m_maxDocId = d1; // -1; // exclude d1
// allow posdbtable re-initialization each time to set
// the msg2 termlist ptrs anew, otherwise we core in
// call to PosdbTable::init() below
//m_posdbTable.m_initialized = false;
// reset ourselves, partially, anyway, not tmpq etc.
reset2();
// debug log
log("msg39: docid split phase %lli-%lli",d0,d1);
// wtf?
if ( d0 >= d1 ) break;
// use this
//m_debug = true;
//log("call1");
// . get the lists
// . i think this always should block!
// . it will also intersect the termlists to get the search
// results and accumulate the winners into the "tree"
if ( ! getLists() ) return false;
//log("call2 g_errno=%li",(long)g_errno);
// if there was an error, stop!
if ( g_errno ) break;
}
// return error reply if we had an error
if ( g_errno ) {
log("msg39: Had error3: %s.", mstrerror(g_errno));
sendReply (m_slot,this,NULL,0,0 , true);
return true;
}
if ( m_debug )
log("msg39: done with all docid range splits");
// all done. this will send reply back
//estimateHitsAndSendReply();
//addedLists();
// should we put cluster recs in the tree?
//m_gotClusterRecs = ( g_conf.m_fullSplit && m_r->m_doSiteClustering );
m_gotClusterRecs = ( m_r->m_doSiteClustering );
// . before we send the top docids back, lookup their site hashes
// in clusterdb so we can do filtering at this point.
// BUT only do this if we are in a "full split" config, because that
// way we can guarantee all clusterdb recs are local (on this host)
// and should be in the page cache. the page cache should do ultra
// quick lookups and no memcpy()'s for this operation. it should
// be <<1ms to lookup thousands of docids.
// . when doing innerLoopSiteClustering we always use top tree now
// because our number of "top docids" can be somewhat unpredictably
// large due to having a ton of results with the same "domain hash"
// (see the "vcount" in IndexTable2.cpp)
// . do NOT do if we are just "getting weights", phr and aff weights
if ( m_gotClusterRecs ) {
// . set the clusterdb recs in the top tree
// . this calls estimateHits() in its reply wrapper when done
return setClusterRecs ( ) ;
}
// if we did not call setClusterRecs, go on to estimate the hits
estimateHitsAndSendReply();
// no block, we are done
return true;
}
*/
// void tryAgainWrapper ( int fd , void *state ) {
// Msg39 *THIS = (Msg39 *)state;
// g_loop.unregisterSleepCallback ( state , tryAgainWrapper );
// THIS->getLists();
// }
// . loads the termlists for every query term in m_tmpq, restricted to the
//   docid range currently in m_r->m_minDocId/m_r->m_maxDocId
// . returns false if msg2 blocked (controlLoopWrapper is called when the
//   lists arrive), true otherwise
// . sets g_errno on error
// . called from controlLoop() phase 0, once per docid range split
//   (the doDocIdSplitLoop() mentioned in older comments is commented out)
bool Msg39::getLists () {
// only needed for the timing logs, which are printed in debug mode
if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
// . ask Indexdb for the IndexLists we need for these termIds
// . each rec in an IndexList is a termId/score/docId tuple
//
// restrict to docid range?
//
// . get the docid start and end
// . do docid paritioning so we can send to all hosts
// in the network, not just one stripe
long long docIdStart = 0;
long long docIdEnd = MAX_DOCID;
// . restrict to this docid?
// . will really make gbdocid:| searches much faster!
long long dr = m_tmpq.m_docIdRestriction;
if ( dr ) {
docIdStart = dr;
docIdEnd = dr + 1;
}
// . override
// . controlLoop() sets these before calling us so the search results
// are computed in stages, so that we do not load massive
// termlists into memory and go OOM (out of memory)
// . -1 means "not set"
if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
if ( m_r->m_maxDocId != -1 ) docIdEnd = m_r->m_maxDocId+1;
// if we have twins, then make sure the twins read different
// pieces of the same docid range to make things 2x faster
//bool useTwins = false;
//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
//if ( useTwins ) {
// long long delta2 = ( docIdEnd - docIdStart ) / 2;
// if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
// else docIdStart = docIdStart + delta2;
//}
// . new striping logic:
// . cut the range into numStripes equal pieces and keep only the piece
// whose index is this host's stripe number
long numStripes = g_hostdb.getNumStripes();
long long delta2 = ( docIdEnd - docIdStart ) / numStripes;
long stripe = g_hostdb.getMyHost()->m_stripe;
docIdStart += delta2 * stripe; // is this right?
docIdEnd = docIdStart + delta2;
// add 1 to be safe so we don't lose a docid
docIdEnd++;
// TODO: add triplet support later for this to split the
// read 3 ways. 4 ways for quads, etc.
//if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
// do not go over MAX_DOCID because it gets masked and
// ends up being 0!!! and we get empty lists
if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
// remember so Msg2.cpp can use them to restrict the termlists
// from "whiteList" as well
m_docIdStart = docIdStart;
m_docIdEnd = docIdEnd;
//
// set startkey/endkey for each term/termlist
//
for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
// breathe
QUICKPOLL ( m_r->m_niceness );
// shortcuts
QueryTerm *qterm = &m_tmpq.m_qterms[i];
char *sk = qterm->m_startKey;
char *ek = qterm->m_endKey;
// get the term id
long long tid = m_tmpq.getTermId(i);
// if only 1 stripe
//if ( g_hostdb.getNumStripes() == 1 ) {
// docIdStart = 0;
// docIdEnd = MAX_DOCID;
//}
// store now in qterm
g_posdb.makeStartKey ( sk , tid , docIdStart );
g_posdb.makeEndKey ( ek , tid , docIdEnd );
qterm->m_ks = sizeof(POSDBKEY);//key144_t);
}
// debug msg: dump one log line per query term
if ( m_debug || g_conf.m_logDebugQuery ) {
for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
// get the term in utf8
//char bb[256];
QueryTerm *qt = &m_tmpq.m_qterms[i];
//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
// . temporarily NUL-terminate the term in place so it can be
// printed with %s
// . the overwritten char is saved in "tmp" and put back below
char *tpc = qt->m_term + qt->m_termLen;
char tmp = *tpc;
*tpc = '\0';
// a sign of 0 is printed as the character '0'
char sign = qt->m_termSign;
if ( sign == 0 ) sign = '0';
QueryWord *qw = qt->m_qword;
long wikiPhrId = qw->m_wikiPhraseId;
// phrase terms are always logged with a wiki phrase id of 0
if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
char leftwikibigram = 0;
char rightwikibigram = 0;
if ( qt->m_leftPhraseTerm &&
qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
leftwikibigram = 1;
if ( qt->m_rightPhraseTerm &&
qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
rightwikibigram = 1;
/*
char c = m_tmpq.getTermSign(i);
char tt[512];
long ttlen = m_tmpq.getTermLen(i);
if ( ttlen > 254 ) ttlen = 254;
if ( ttlen < 0 ) ttlen = 0;
// old:painful: convert each term from unicode to ascii
memcpy ( tt , m_tmpq.getTerm(i) , ttlen );
*/
// non-NULL m_synonymOf means this term is a synonym of term "st"
long isSynonym = 0;
QueryTerm *st = qt->m_synonymOf;
if ( st ) isSynonym = true;
SafeBuf sb;
// now we can display it
//tt[ttlen]='\0';
//if ( c == '\0' ) c = ' ';
// NOTE: the arguments below are positional and must stay in the same
// order as the conversions in the format string
sb.safePrintf(
"query: msg39: [%lu] query term #%li \"%s\" "
"phr=%li termId=%llu rawTermId=%llu "
//"estimatedTermFreq=%lli (+/- ~16000) "
"tfweight=%.02f "
"sign=%c "
"numPlusses=%hhu "
"required=%li "
"fielcode=%li "
"ebit=0x%0llx "
"impBits=0x%0llx "
"wikiphrid=%li "
"leftwikibigram=%li "
"rightwikibigram=%li "
//"range.startTermNum=%hhi range.endTermNum=%hhi "
//"minRecSizes=%li "
"readSizeInBytes=%li "
//"ebit=0x%llx "
//"impBits=0x%llx "
"hc=%li "
"component=%li "
"otermLen=%li "
"isSynonym=%li "
"querylangid=%li " ,
(long)this ,
i ,
qt->m_term,//bb ,
(long)m_tmpq.isPhrase (i) ,
m_tmpq.getTermId (i) ,
m_tmpq.getRawTermId (i) ,
((float *)m_r->ptr_termFreqWeights)[i] ,
sign , //c ,
0 ,
(long)qt->m_isRequired,
(long)qt->m_fieldCode,
(long long)qt->m_explicitBit ,
(long long)qt->m_implicitBits ,
wikiPhrId,
(long)leftwikibigram,
(long)rightwikibigram,
((long *)m_r->ptr_readSizes)[i] ,
//(long long)m_tmpq.m_qterms[i].m_explicitBit ,
//(long long)m_tmpq.m_qterms[i].m_implicitBits ,
(long)m_tmpq.m_qterms[i].m_hardCount ,
(long)m_tmpq.m_componentCodes[i],
(long)m_tmpq.getTermLen(i) ,
isSynonym,
(long)m_tmpq.m_langId ); // ,tt
// put it back
*tpc = tmp;
// extra info if this term is a synonym of another term
if ( st ) {
// index of the base term within m_tmpq.m_qterms
long stnum = st - m_tmpq.m_qterms;
sb.safePrintf("synofterm#=%li",stnum);
//sb.safeMemcpy(st->m_term,st->m_termLen);
sb.pushChar(' ');
sb.safePrintf("synwid0=%lli ",qt->m_synWids0);
sb.safePrintf("synwid1=%lli ",qt->m_synWids1);
sb.safePrintf("synalnumwords=%li ",
qt->m_numAlnumWordsInSynonym);
// like for synonym "nj" it's base,
// "new jersey" has 2 alnum words!
sb.safePrintf("synbasealnumwords=%li ",
qt->m_numAlnumWordsInBase);
}
logf(LOG_DEBUG,"%s",sb.getBufStart());
}
m_tmpq.printBooleanTree();
}
// timestamp log
if ( m_debug )
log(LOG_DEBUG,"query: msg39: [%lu] Getting %li index lists ",
(long)this,m_tmpq.getNumTerms());
// . now get the index lists themselves
// . return if it blocked
// . not doing a merge (last parm) means that the lists we receive
// will be an appending of a bunch of lists so keys won't be in order
// . merging is uneccessary for us here because we hash the keys anyway
// . and merging takes up valuable cpu time
// . caution: the index lists returned from Msg2 are now compressed
// . now i'm merging because it's 10 times faster than hashing anyway
// and the reply buf should now always be <= minRecSizes so we can
// pre-allocate one better, and, 3) this should fix the yahoo.com
// reindex bug
char rdbId = RDB_POSDB;
// . TODO: MDW: fix
// . partap says there is a bug in this??? we can't cache UOR'ed lists?
bool checkCache = false;
// split is us????
//long split = g_hostdb.m_myHost->m_group;
long split = g_hostdb.m_myHost->m_shardNum;
// . call msg2
// . "this" and controlLoopWrapper are the state/callback pair, so when
// msg2 blocks, controlLoop() is re-entered once the lists are in
if ( ! m_msg2.getLists ( rdbId ,
m_r->m_collnum,//m_r->ptr_coll ,
m_r->m_maxAge ,
m_r->m_addToCache ,
//m_tmpq.m_qterms ,
&m_tmpq,
m_r->ptr_whiteList,
// we need to restrict docid range for
// whitelist as well! this is the range
// computed above
m_docIdStart,
m_docIdEnd,
// how much of each termlist to read in bytes
(long *)m_r->ptr_readSizes ,
//m_tmpq.getNumTerms() , // numLists
// 1-1 with query terms
m_lists ,
this ,
controlLoopWrapper,//gotListsWrapper ,
m_r ,
m_r->m_niceness ,
true , // do merge?
m_debug ,
NULL , // best hostids
m_r->m_restrictPosdbForQuery ,
split ,
checkCache )) {
// remember we blocked so sendReply() knows to call m_callback
m_blocked = true;
return false;
}
// error?
//if ( g_errno ) {
// log("msg39: Had error getting termlists2: %s.",
// mstrerror(g_errno));
// // don't bail out here because we are in docIdSplitLoop()
// //sendReply (m_slot,this,NULL,0,0,true);
// return true;
//}
//return gotLists ( true );
// did not block; controlLoop() moves on to its next phase
return true;
}
/*
void gotListsWrapper ( void *state ) {
Msg39 *THIS = (Msg39 *) state;
// save this
long numDocIdSplits = THIS->m_numDocIdSplits;
// . hash the lists into our index table
// . this will send back a reply or recycle and read more list data
// . this may call addedLists() which may call
// estimateHitsAndSendReply() which nukes "THIS" msg39 but
// it only does that if m_numDocIdSplits is 1
// . this make nuke msg39
if ( ! THIS->gotLists ( true ) ) return;
// . if he did not block and there was an errno we send reply
// otherwise if there was NO error he will have sent the reply
// . if gotLists() was called in the ABOVE function and it returns
// true then the docIdLoop() function will send back the reply.
if ( g_errno ) {
log("msg39: sending back error reply = %s",mstrerror(g_errno));
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
}
// no, block? call the docid split loop
// . but if we only had one split msg39 will have been nuked
//if ( numDocIdSplits <= 1 ) return;
// if we get the lists and processed them without blocking, repeat!
if ( ! THIS->doDocIdSplitLoop() ) return;
// send back reply
estimateHitsAndSendReply();
}
*/
// . now come here when we got the necessary index lists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::intersectLists ( ) { // bool updateReadInfo ) {
// bail on error
if ( g_errno ) {
hadError:
log("msg39: Had error getting termlists: %s.",
mstrerror(g_errno));
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
// timestamp log
if ( m_debug ) {
log(LOG_DEBUG,"query: msg39: [%lu] Got %li lists in %lli ms"
, (long)this,m_tmpq.getNumTerms(),
gettimeofdayInMilliseconds() - m_startTime);
m_startTime = gettimeofdayInMilliseconds();
}
// breathe
QUICKPOLL ( m_r->m_niceness );
// ensure collection not deleted from under us
CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
if ( ! cr ) {
g_errno = ENOCOLLREC;
goto hadError;
}
// . set the IndexTable so it can set it's score weights from the
// termFreqs of each termId in the query
// . this now takes into account the special termIds used for sorting
// by date (0xdadadada and 0xdadadad2 & TERMID_MASK)
// . it should weight them so much so that the summation of scores
// from other query terms cannot make up for a lower date score
// . this will actually calculate the top
// . this might also change m_tmpq.m_termSigns
// . this won't do anything if it was already called
m_posdbTable.init ( &m_tmpq ,
m_debug ,
this ,
&m_tt ,
m_r->m_collnum,//ptr_coll ,
&m_msg2 , // m_lists ,
//m_tmpq.m_numTerms , // m_numLists
m_r );
// breathe
QUICKPOLL ( m_r->m_niceness );
// . we have to do this here now too
// . but if we are getting weights, we don't need m_tt!
// . actually we were using it before for rat=0/bool queries but
// i got rid of NO_RAT_SLOTS
if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply ( m_slot , this , NULL , 0 , 0 , true);
return true;
}
// we have to allocate this with each call because each call can
// be a different docid range from doDocIdSplitLoop.
if ( ! m_posdbTable.allocWhiteListTable() ) {
log("msg39: Had error allocating white list table: %s.",
mstrerror(g_errno));
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
// do not re do it if doing docid range splitting
m_allocedTree = true;
// . now we must call this separately here, not in allocTopTree()
// . we have to re-set the QueryTermInfos with each docid range split
// since it will set the list ptrs from the msg2 lists
if ( ! m_posdbTable.setQueryTermInfo () ) return true;
// print query term bit numbers here
for ( long i = 0 ; m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
QueryTerm *qt = &m_tmpq.m_qterms[i];
//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
char *tpc = qt->m_term + qt->m_termLen;
char tmp = *tpc;
*tpc = '\0';
SafeBuf sb;
sb.safePrintf("query: msg39: BITNUM query term #%li \"%s\" "
"bitnum=%li ", i , qt->m_term, qt->m_bitNum );
// put it back
*tpc = tmp;
logf(LOG_DEBUG,"%s",sb.getBufStart());
}
// timestamp log
if ( m_debug ) {
log(LOG_DEBUG,"query: msg39: [%lu] Preparing to intersect "
"took %lli ms",
(long)this, gettimeofdayInMilliseconds() - m_startTime );
m_startTime = gettimeofdayInMilliseconds();
}
// time it
long long start = gettimeofdayInMilliseconds();
long long diff;
// . don't bother making a thread if lists are small
// . look at STAGE? in IndexReadInfo.cpp to see how we read in stages
// . it's always saying msg39 handler is hogging cpu...could this be it
//if ( m_msg2.getTotalRead() < 2000*8 ) goto skipThread;
// debug
//goto skipThread;
// . NOW! let's do this in a thread so we can continue to service
// incoming requests
// . don't launch more than 1 thread at a time for this
// . set callback when thread done
// breathe
QUICKPOLL ( m_r->m_niceness );
// . create the thread
// . only one of these type of threads should be launched at a time
if ( ! m_debug &&
g_threads.call ( INTERSECT_THREAD , // threadType
m_r->m_niceness ,
this , // top 4 bytes must be cback
controlLoopWrapper2,//threadDoneWrapper ,
addListsWrapper ) ) {
m_blocked = true;
return false;
}
// if it failed
//log(LOG_INFO,"query: Intersect thread creation failed. Doing "
// "blocking. Hurts performance.");
// check tree
if ( m_tt.m_nodes == NULL ) {
log(LOG_LOGIC,"query: msg39: Badness.");
char *xx = NULL; *xx = 0; }
// sometimes we skip the thread
//skipThread:
// . addLists() should never have a problem
// . g_errno should be set by prepareToAddLists() above if there is
// going to be a problem
//if ( m_r->m_useNewAlgo )
m_posdbTable.intersectLists10_r ( );
//else
// m_posdbTable.intersectLists9_r ( );
// time it
diff = gettimeofdayInMilliseconds() - start;
if ( diff > 10 ) log("query: Took %lli ms for intersection",diff);
// returns false if blocked, true otherwise
//return addedLists ();
return true;
}
void *addListsWrapper ( void *state , ThreadEntry *t ) {
// we're in a thread now!
Msg39 *THIS = (Msg39 *)state;
// . do the add
// . addLists() returns false and sets errno on error
// . hash the lists into our table
// . this returns false and sets g_errno on error
// . Msg2 always compresses the lists so be aware that the termId
// has been discarded
//THIS->m_posdbTable.intersectLists9_r ();
//if ( THIS->m_r->m_useNewAlgo )
THIS->m_posdbTable.intersectLists10_r ( );
//else
// THIS->m_posdbTable.intersectLists9_r ( );
// . exit the thread
// . top 4 bytes of "state" ptr should be our done callback
// . threadDoneWrapper will be called by g_loop when he gets the
// thread's termination signal, sig niceness is m_niceness
// . bogus return
return NULL;
}
/*
// we come here after thread exits
void threadDoneWrapper ( void *state , ThreadEntry *t ) {
// get this class
Msg39 *THIS = (Msg39 *)state;
// sanity check
if ( ! THIS->m_blocked ) { char *xx=NULL;*xx=0; }
// addedLists() could send reply and destroy "THIS" so save this.
// it will only sendReply back if it calls estimateHits() which
// is only called if numDocIdSplits <= 1...
long numDocIdSplits = THIS->m_numDocIdSplits;
char debug = THIS->m_debug;
// just return if it blocked
if ( ! THIS->addedLists () ) {
// this can't block
if ( numDocIdSplits >= 2 ) { char *xx=NULL;*xx=0; }
if ( debug ) log("msg39: addedLists blocked");
return;
}
if ( debug ) log("msg39: addedLists no block. i guess reply sent");
// . if he did not block and there was an errno we send reply
// otherwise if there was NO error he will have sent the reply
// . if gotLists() was called in the ABOVE function and it returns
// true then the docIdLoop() function will send back the reply.
if ( g_errno ) {
log("msg39: sending back error reply = %s",mstrerror(g_errno));
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
}
// no, block? call the docid split loop
// . but if we only had one split msg39 will have been nuked
//if ( numDocIdSplits <= 1 ) return;
// if we get the lists and processed them without blocking, repeat!
if ( ! THIS->doDocIdSplitLoop() ) return;
// send back reply
estimateHitsAndSendReply();
// no, block? call the docid split loop
//if ( numDocIdSplits <= 1 ) return;
// . just re-do the whole she-bang but do not reset m_tt top tree!!!
// . it returns false if it blocks
//THIS->doDocIdSplitLoop();
}
*/
/*
// return false if blocked, true otherwise
bool Msg39::addedLists ( ) {
if ( m_posdbTable.m_t1 ) {
// . measure time to add the lists in bright green
// . use darker green if rat is false (default OR)
long color;
//char *label;
color = 0x0000ff00 ;
//label = "termlist_intersect";
g_stats.addStat_r ( 0 ,
m_posdbTable.m_t1 ,
m_posdbTable.m_t2 , color );
}
// accumulate total hits count over each docid split
m_numTotalHits += m_posdbTable.m_docIdVoteBuf.length() / 6;
// before wrapping up, complete our docid split loops!
// so do not send the reply back yet... send reply back from
// the docid loop function... doDocIdSplitLoop()
//if ( m_numDocIdSplits >= 2 ) return true;
// . save some memory,free m_topDocIdPtrs2,m_topScores2,m_topExplicits2
// . the m_topTree should have been filled from the call to
// IndexTable2::fillTopDocIds() and it no longer has ptrs to the
// docIds, but has the docIds themselves
//m_posdbTable.freeMem();
// error?
if ( m_posdbTable.m_errno ) {
// we do not need to store the intersection i guess...??
m_posdbTable.freeMem();
g_errno = m_posdbTable.m_errno;
log("query: posdbtable had error = %s",mstrerror(g_errno));
sendReply ( m_slot , this , NULL , 0 , 0 ,true);
return true;
}
// should we put cluster recs in the tree?
//m_gotClusterRecs = ( g_conf.m_fullSplit && m_r->m_doSiteClustering );
//m_gotClusterRecs = ( m_r->m_doSiteClustering );
// . before we send the top docids back, lookup their site hashes
// in clusterdb so we can do filtering at this point.
// BUT only do this if we are in a "full split" config, because that
// way we can guarantee all clusterdb recs are local (on this host)
// and should be in the page cache. the page cache should do ultra
// quick lookups and no memcpy()'s for this operation. it should
// be <<1ms to lookup thousands of docids.
// . when doing innerLoopSiteClustering we always use top tree now
// because our number of "top docids" can be somewhat unpredictably
// large due to having a ton of results with the same "domain hash"
// (see the "vcount" in IndexTable2.cpp)
// . do NOT do if we are just "getting weights", phr and aff weights
// if ( m_gotClusterRecs ) {
// // . set the clusterdb recs in the top tree
// return setClusterRecs ( ) ;
// }
// if we did not call setClusterRecs, go on to estimate the hits
// estimateHitsAndSendReply();
// return true;
return true;
}
*/
// . set the clusterdb recs in the top tree
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool Msg39::setClusterRecs ( ) {
if ( ! m_r->m_doSiteClustering ) return true;
// make buf for arrays of the docids, cluster levels and cluster recs
long nodeSize = 8 + 1 + 12;
long numDocIds = m_tt.m_numUsedNodes;
m_bufSize = numDocIds * nodeSize;
m_buf = (char *)mmalloc ( m_bufSize , "Msg39docids" );
// on error, return true, g_errno should be set
if ( ! m_buf ) {
log("query: msg39: Failed to alloc buf for clustering.");
sendReply(m_slot,this,NULL,0,0,true);
return true;
}
// assume we got them
m_gotClusterRecs = true;
// parse out the buf
char *p = m_buf;
// docIds
m_clusterDocIds = (long long *)p; p += numDocIds * 8;
m_clusterLevels = (char *)p; p += numDocIds * 1;
m_clusterRecs = (key_t *)p; p += numDocIds * 12;
// sanity check
if ( p > m_buf + m_bufSize ) { char *xx=NULL; *xx=0; }
// loop over all results
long nd = 0;
for ( long ti = m_tt.getHighNode() ; ti >= 0 ;
ti = m_tt.getPrev(ti) , nd++ ) {
// get the guy
TopNode *t = &m_tt.m_nodes[ti];
// get the docid
//long long docId = getDocIdFromPtr(t->m_docIdPtr);
// store in array
m_clusterDocIds[nd] = t->m_docId;
// assume not gotten
m_clusterLevels[nd] = CR_UNINIT;
// assume not found, make the whole thing is 0
m_clusterRecs[nd].n1 = 0;
m_clusterRecs[nd].n0 = 0LL;
}
// store number
m_numClusterDocIds = nd;
// sanity check
if ( nd != m_tt.m_numUsedNodes ) { char *xx=NULL;*xx=0; }
// . ask msg51 to get us the cluster recs
// . it should read it all from the local drives
// . "maxAge" of 0 means to not get from cache (does not include disk)
if ( ! m_msg51.getClusterRecs ( m_clusterDocIds ,
m_clusterLevels ,
m_clusterRecs ,
m_numClusterDocIds ,
m_r->m_collnum ,
0 , // maxAge
false , // addToCache
this ,
//gotClusterRecsWrapper ,
controlLoopWrapper,
m_r->m_niceness ,
m_debug ) )
// did we block? if so, return
return false;
// ok, process the replies
//gotClusterRecs();
// the above never blocks
return true;
}
// void gotClusterRecsWrapper ( void *state ) {
// // get this class
// Msg39 *THIS = (Msg39 *)state;
// // be on our way
// THIS->gotClusterRecs ();
// }
// . folds the cluster recs requested by setClusterRecs() back into the
//   top tree: computes a cluster level for every docid, copies the level
//   and the cluster rec onto each TopNode, and counts the nodes whose
//   level is CR_OK into m_numVisible
// . does nothing and returns true if m_gotClusterRecs is not set
// . returns false on error with m_errno set from g_errno. no reply is
//   sent from here.
// . releases m_buf (the arrays set up by setClusterRecs) on the success
//   path AND on the error path, so a failed setClusterLevels() no longer
//   leaks it
bool Msg39::gotClusterRecs ( ) {

	if ( ! m_gotClusterRecs ) return true;

	// now tell msg5 to set the cluster levels
	if ( ! setClusterLevels ( m_clusterRecs          ,
				  m_clusterDocIds        ,
				  m_numClusterDocIds     ,
				  2                      , // maxdocidsperhostname
				  m_r->m_doSiteClustering ,
				  m_r->m_familyFilter    ,
				  // turn this off, not needed now that
				  // we have the langid in every posdb key
				  0                      , //m_r->m_language ,
				  m_debug                ,
				  m_clusterLevels        )) {
		// save the error before doing anything else
		m_errno = g_errno;
		// nothing in this function reads the lookup arrays past this
		// point, so free them here. the success path below is the only
		// other place that releases m_buf.
		if ( m_buf ) {
			mfree ( m_buf , m_bufSize , "Msg39cluster");
			m_buf = NULL;
		}
		// send back an error reply
		//sendReply ( m_slot , this , NULL , 0 , 0 ,true);
		return false;
	}

	// count this
	m_numVisible = 0;

	// now put the info back into the top tree. this walks the tree in
	// the same order setClusterRecs() used to fill the arrays, so slot
	// "nd" lines up with the nd-th node visited.
	long nd = 0;
	for ( long ti = m_tt.getHighNode() ; ti >= 0 ;
	      ti = m_tt.getPrev(ti) , nd++ ) {
		// get the guy
		TopNode *t = &m_tt.m_nodes[ti];
		// get the docid
		//long long docId = getDocIdFromPtr(t->m_docIdPtr);
		// sanity check: the tree must not have changed since the
		// arrays were filled
		if ( t->m_docId != m_clusterDocIds[nd] ) {char *xx=NULL;*xx=0;}
		// set it
		t->m_clusterLevel = m_clusterLevels[nd];
		t->m_clusterRec   = m_clusterRecs  [nd];
		// visible?
		if ( t->m_clusterLevel == CR_OK ) m_numVisible++;
	}

	log(LOG_DEBUG,"query: msg39: %li docids out of %li are visible",
	    m_numVisible,nd);

	// free this junk now
	mfree ( m_buf , m_bufSize , "Msg39cluster");
	m_buf = NULL;

	// accumulate total hit count over each docid split!
	//m_numTotalHits += m_posdbTable.m_docIdVoteBuf.length() / 6;

	// before wrapping up, complete our docid split loops!
	// so do not send the reply back yet... send reply back from
	// the docid loop function... doDocIdSplitLoop()
	//if ( m_numDocIdSplits >= 2 ) return;

	// finish up and send back the reply
	//estimateHitsAndSendReply ();
	return true;
}
void Msg39::estimateHitsAndSendReply ( ) {
// no longer in use
m_inUse = false;
// now this for the query loop on the QueryLogEntries.
m_topDocId50 = 0LL;
m_topScore50 = 0.0;
// a little hack for the seo pipeline in xmldoc.cpp
m_topDocId = 0LL;
m_topScore = 0.0;
m_topDocId2 = 0LL;
m_topScore2 = 0.0;
long ti = m_tt.getHighNode();
if ( ti >= 0 ) {
TopNode *t = &m_tt.m_nodes[ti];
m_topDocId = t->m_docId;
m_topScore = t->m_score;
}
// try the 2nd one too
long ti2 = -1;
if ( ti >= 0 ) ti2 = m_tt.getNext ( ti );
if ( ti2 >= 0 ) {
TopNode *t2 = &m_tt.m_nodes[ti2];
m_topDocId2 = t2->m_docId;
m_topScore2 = t2->m_score;
}
// convenience ptrs. we will store the docids/scores into these arrays
long long *topDocIds;
double *topScores;
key_t *topRecs;
// numDocIds counts docs in all tiers when using toptree.
long numDocIds = m_tt.m_numUsedNodes;
// the msg39 reply we send back
long replySize;
char *reply;
//m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;
// make the reply?
Msg39Reply mr;
// this is what you want to look at if there is no seo.cpp module...
if ( ! m_callback ) {
// if we got clusterdb recs in here, use 'em
if ( m_gotClusterRecs ) numDocIds = m_numVisible;
// don't send more than the docs that are asked for
if ( numDocIds > m_r->m_docsToGet) numDocIds =m_r->m_docsToGet;
// # of QueryTerms in query
long nqt = m_tmpq.m_numTerms;
// start setting the stuff
mr.m_numDocIds = numDocIds;
// copy # estiamted hits into 8 bytes of reply
//long long est = m_posdbTable.m_estimatedTotalHits;
// ensure it has at least as many results as we got
//if ( est < numDocIds ) est = numDocIds;
// or if too big...
//if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;
// . total estimated hits
// . this is now an EXACT count!
mr.m_estimatedHits = m_numTotalHits;
// sanity check
mr.m_nqt = nqt;
// the m_errno if any
mr.m_errno = m_errno;
// shortcut
PosdbTable *pt = &m_posdbTable;
// the score info, in no particular order right now
mr.ptr_scoreInfo = pt->m_scoreInfoBuf.getBufStart();
mr.size_scoreInfo = pt->m_scoreInfoBuf.length();
// that has offset references into posdbtable::m_pairScoreBuf
// and m_singleScoreBuf, so we need those too now
mr.ptr_pairScoreBuf = pt->m_pairScoreBuf.getBufStart();
mr.size_pairScoreBuf = pt->m_pairScoreBuf.length();
mr.ptr_singleScoreBuf = pt->m_singleScoreBuf.getBufStart();
mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();
// save some time since seo.cpp gets from posdbtable directly,
// so we can avoid serializing/copying this stuff at least
if ( ! m_r->m_makeReply ) {
mr.size_scoreInfo = 0;
mr.size_pairScoreBuf = 0;
mr.size_singleScoreBuf = 0;
}
//mr.m_sectionStats = pt->m_sectionStats;
// reserve space for these guys, we fill them in below
mr.ptr_docIds = NULL;
mr.ptr_scores = NULL;
mr.ptr_clusterRecs = NULL;
// this is how much space to reserve
mr.size_docIds = 8 * numDocIds; // long long
mr.size_scores = sizeof(double) * numDocIds; // float
// if not doing site clustering, we won't have these perhaps...
if ( m_gotClusterRecs )
mr.size_clusterRecs = sizeof(key_t) *numDocIds;
else
mr.size_clusterRecs = 0;
#define MAX_FACETS 20000
/////////////////
//
// FACETS
//
/////////////////
// We can have multiple gbfacet: terms in a query so
// serialize all the QueryTerm::m_facetHashTables into
// Msg39Reply::ptr_facetHashList.
//
// combine the facet hash lists of each query term into
// a list of lists. each lsit is preceeded by the query term
// id of the query term (like gbfacet:xpathsitehash12345)
// followed by a 4 byte length of the following 32-bit
// facet values
long need = 0;
for ( long i = 0 ; i < m_tmpq.m_numTerms; i++ ) {
QueryTerm *qt = &m_tmpq.m_qterms[i];
// skip if not facet
if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
qt->m_fieldCode != FIELD_GBFACETINT &&
qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
HashTableX *ft = &qt->m_facetHashTable;
if ( ft->m_numSlotsUsed == 0 ) continue;
long used = ft->m_numSlotsUsed;
// limit for memory
if ( used > (long)MAX_FACETS ) {
log("msg39: truncating facet list to 20000 "
"from %li for %s",used,qt->m_term);
used = (long)MAX_FACETS;
}
// store query term id 64 bit
need += 8;
// then size
need += 4;
// then buckets. keys and counts
need += (4+sizeof(FacetEntry)) * used;
}
// allocate
SafeBuf tmp;
if ( ! tmp.reserve ( need ) ) {
log("query: Could not allocate memory "
"to hold reply facets");
sendReply(m_slot,this,NULL,0,0,true);
return;
}
// point to there
char *p = tmp.getBufStart();
for ( long i = 0 ; i < m_tmpq.m_numTerms ; i++ ) {
QueryTerm *qt = &m_tmpq.m_qterms[i];
// skip if not facet
if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
qt->m_fieldCode != FIELD_GBFACETINT &&
qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
// get all the facet hashes and their counts
HashTableX *ft = &qt->m_facetHashTable;
// skip if none
if ( ft->m_numSlotsUsed == 0 ) continue;
// store query term id 64 bit
*(long long *)p = qt->m_termId;
p += 8;
long used = ft->getNumSlotsUsed();
if ( used > (long)MAX_FACETS ) used = (long)MAX_FACETS;
// store count
*(long *)p = used;
p += 4;
long count = 0;
// for sanity check
char *pend = p + (used * (4+sizeof(FacetEntry)));
// serialize the key/val pairs
for ( long k = 0 ; k < ft->m_numSlots ; k++ ) {
// skip empty buckets
if ( ! ft->m_flags[k] ) continue;
// store key. the hash of the facet value.
*(long *)p = ft->getKey32FromSlot(k); p += 4;
// then store count
//*(long *)p = ft->getVal32FromSlot(k); p += 4;
// now this has a docid on it so we can
// lookup the text of the facet in Msg40.cpp
FacetEntry *fe;
fe = (FacetEntry *)ft->getValFromSlot(k);
memcpy ( p , fe , sizeof(FacetEntry) );
p += sizeof(FacetEntry);
// do not breach
if ( ++count >= (long)MAX_FACETS ) break;
}
// sanity check
if ( p != pend ) { char *xx=NULL;*xx=0; }
// do the next query term
}
// now point to that so it can be serialized below
mr.ptr_facetHashList = tmp.getBufStart();
mr.size_facetHashList = p - tmp.getBufStart();//tmp.length();
/////////////
//
// END FACETS
//
/////////////
// . that is pretty much it,so serialize it into buffer,"reply"
// . mr.ptr_docIds, etc., will point into the buffer so we can
// re-serialize into it below from the tree
// . returns NULL and sets g_errno on error
// . "true" means we should make mr.ptr_* reference into the
// newly serialized buffer.
reply = serializeMsg ( sizeof(Msg39Reply), // baseSize
&mr.size_docIds, // firstSizeParm
&mr.size_clusterRecs,//lastSizePrm
&mr.ptr_docIds , // firstStrPtr
&mr , // thisPtr
&replySize ,
NULL ,
0 ,
true ) ;
if ( ! reply ) {
log("query: Could not allocated memory "
"to hold reply of docids to send back.");
sendReply(m_slot,this,NULL,0,0,true);
return;
}
topDocIds = (long long *) mr.ptr_docIds;
topScores = (double *) mr.ptr_scores;
topRecs = (key_t *) mr.ptr_clusterRecs;
}
long docCount = 0;
// loop over all results in the TopTree
for ( long ti = m_tt.getHighNode() ; ti >= 0 ;
ti = m_tt.getPrev(ti) ) {
// get the guy
TopNode *t = &m_tt.m_nodes[ti];
// skip if clusterLevel is bad!
if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK )
continue;
// if not sending back a reply... we were called from seo.cpp
// State3f logic to evaluate a QueryLogEntry, etc.
if ( m_callback ) {
// skip results past #50
if ( docCount > 50 ) continue;
// set this
m_topScore50 = t->m_score;
m_topDocId50 = t->m_docId;
// that's it
continue;
}
// get the docid ptr
//char *diptr = t->m_docIdPtr;
//long long docId = getDocIdFromPtr(diptr);
// sanity check
if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
//add it to the reply
topDocIds [docCount] = t->m_docId;
topScores [docCount] = t->m_score;
if ( m_tt.m_useIntScores )
topScores[docCount] = (double)t->m_intScore;
// supply clusterdb rec? only for full splits
if ( m_gotClusterRecs )
topRecs [docCount] = t->m_clusterRec;
//topExplicits [docCount] =
// getNumBitsOn(t->m_explicits)
docCount++;
// 50th score? set this for seo.cpp. if less than 50 results
// we want the score of the last doc then.
if ( docCount <= 50 ) m_topScore50 = t->m_score;
if ( m_debug ) {
log(LOG_DEBUG,"query: msg39: [%lu] "
"%03li) docId=%012llu sum=%.02f",
(long)this, docCount,
t->m_docId,t->m_score);
}
//don't send more than the docs that are wanted
if ( docCount >= numDocIds ) break;
}
if ( docCount > 300 && m_debug )
log("query: Had %li nodes in top tree",docCount);
// this is sensitive info
if ( m_debug ) {
log(LOG_DEBUG,
"query: msg39: [%li] Intersected lists took %lli (%lli) "
"ms "
"docIdsToGet=%li docIdsGot=%li "
"q=%s",
(long)this ,
m_posdbTable.m_addListsTime ,
gettimeofdayInMilliseconds() - m_startTime ,
m_r->m_docsToGet ,
numDocIds ,
m_tmpq.getQuery() );
}
// if we blocked because we used a thread then call callback if
// summoned from a msg3f handler and not a msg39 handler
if ( m_callback ) {
// if we blocked call user callback
if ( m_blocked ) m_callback ( m_state );
// if not sending back a udp reply, return now
return;
}
// now send back the reply
sendReply(m_slot,this,reply,replySize,replySize,false);
return;
}