#include "gb-include.h"
|
|
|
|
#include "Msg39.h"
|
|
#include "Stats.h"
|
|
#include "Threads.h"
|
|
#include "TopTree.h"
|
|
#include "UdpServer.h"
|
|
//#include "CollectionRec.h"
|
|
#include "SearchInput.h"
|
|
|
|
// called to send back the reply
|
|
static void sendReply ( UdpSlot *slot ,
|
|
Msg39 *msg39 ,
|
|
char *reply ,
|
|
long replySize ,
|
|
long replyMaxSize ,
|
|
bool hadError );
|
|
// called when Msg2 has got all the termlists
|
|
static void gotListsWrapper ( void *state ) ;
|
|
// thread wrappers
|
|
static void *addListsWrapper ( void *state , ThreadEntry *t ) ;
|
|
static void threadDoneWrapper ( void *state , ThreadEntry *t ) ;
|
|
static void gotClusterRecsWrapper ( void *state ) ;
|
|
|
|
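// . overview of a Msg39 request (as wired up in this file):
// .   handleRequest39() news a Msg39 and calls getDocIds()
// .   getDocIds() deserializes the Msg39Request and calls getDocIds2()
// .   getDocIds2() sets up the query and either calls getLists() directly
//     or iterates docid ranges via doDocIdSplitLoop()
// .   getLists() asks Msg2 for the termlists, gotLists() intersects them
//     with PosdbTable (possibly in a thread), addedLists() accumulates
//     winners, setClusterRecs()/gotClusterRecs() do site clustering, and
//     estimateHits() builds the Msg39Reply and calls sendReply()
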
bool Msg39::registerHandler ( ) {
        // . register ourselves with the udp server
        // . it calls our callback when it receives a msg of type 0x39
        if ( ! g_udpServer.registerHandler ( 0x39, handleRequest39 ))
                return false;
        return true;
}

Msg39::Msg39 () {
        m_inUse = false;
        reset();
}

void Msg39::reset() {
        if ( m_inUse ) { char *xx=NULL;*xx=0; }
        m_allocedTree = false;
        m_numDocIdSplits = 1;
        m_tmpq.reset();
        m_numTotalHits = 0;

        reset2();
}

void Msg39::reset2() {
        // reset lists
        for ( long j = 0 ; j < m_msg2.m_numLists ; j++ )
                m_lists[j].freeList();
        m_msg2.reset();
        m_posdbTable.reset();
        m_callback = NULL;
        m_state    = NULL;
        m_blocked  = false;
        m_tmp      = NULL;
}

// . handle a request to get the search results, a list of docids only
// . returns false if slot should be nuked and no reply sent
// . sometimes sets g_errno on error
void handleRequest39 ( UdpSlot *slot , long netnice ) {
        // use Msg39 to get the lists and intersect them
        Msg39 *THIS ;
        try { THIS = new ( Msg39 ); }
        catch ( ... ) {
                g_errno = ENOMEM;
                log("msg39: new(%i): %s", (int)sizeof(Msg39),
                    mstrerror(g_errno));
                sendReply ( slot , NULL , NULL , 0 , 0 ,true);
                return;
        }
        mnew ( THIS , sizeof(Msg39) , "Msg39" );
        // clear it
        g_errno = 0;
        // . get the resulting docIds, usually blocks
        // . sets g_errno on error
        THIS->getDocIds ( slot ) ;
}

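// . ownership note: the Msg39 allocated in handleRequest39() is freed in
//   sendReply() below (mdelete + delete) once the udp reply or error reply
//   has been sent
// . the exception is a local, in-process call (m_callback set, e.g. from the
//   seo pipeline): then sendReply() just invokes the callback and the caller
//   keeps ownership of the Msg39
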
// this must always be called sometime AFTER handleRequest() is called
void sendReply ( UdpSlot *slot , Msg39 *msg39 , char *reply , long replyLen ,
                 long replyMaxSize , bool hadError ) {
        // debug msg
        if ( g_conf.m_logDebugQuery || (msg39&&msg39->m_debug) )
                logf(LOG_DEBUG,"query: msg39: [%lu] Sending reply len=%li.",
                     (long)msg39,replyLen);

        // sanity
        if ( hadError && ! g_errno ) { char *xx=NULL;*xx=0; }

        // no longer in use. msg39 will be NULL if ENOMEM or something
        if ( msg39 ) msg39->m_inUse = false;

        // . if we enter from a local call and not from handling a udp slot
        //   then execute this logic here to return control to caller.
        // . do not delete ourselves because we will be re-used probably and
        //   caller handles that now.
        if ( msg39 && msg39->m_callback ) {
                // if we blocked call user callback
                if ( msg39->m_blocked ) msg39->m_callback ( msg39->m_state );
                // if not sending back a udp reply, return now
                return;
        }

        // . now we can free the lists before sending
        // . may help a little bit...
        //if ( msg39 ) {
        //        for ( long j = 0 ; j < msg39->m_msg2.m_numLists ; j++ )
        //                msg39->m_lists[j].freeList();
        //}
        // get the appropriate UdpServer for this niceness level
        UdpServer *us = &g_udpServer;
        // i guess clear this
        long err = g_errno;
        g_errno = 0;
        // send an error reply if g_errno is set
        if ( err ) us->sendErrorReply ( slot , err ) ;
        else       us->sendReply_ass ( reply ,
                                       replyLen ,
                                       reply ,
                                       replyMaxSize ,
                                       slot );
        // always delete ourselves when done handling the request
        if ( msg39 ) {
                mdelete ( msg39 , sizeof(Msg39) , "Msg39" );
                delete (msg39);
        }
}

// . returns false if blocked, true otherwise
// . sets g_errno on error
// . calls gotDocIds to send a reply
void Msg39::getDocIds ( UdpSlot *slot ) {
        // remember the slot
        m_slot = slot;
        // reset this
        m_errno = 0;
        // get the request
        m_r = (Msg39Request *) m_slot->m_readBuf;
        long requestSize = m_slot->m_readBufSize;
        // ensure its size is ok
        if ( requestSize < 8 ) {
                g_errno = EBADREQUESTSIZE;
                log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
                    mstrerror(g_errno) );
                sendReply ( m_slot , this , NULL , 0 , 0 , true );
                return ;
        }

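        // . the request arrives as one flat buffer: the fixed Msg39Request
        //   struct followed by its variable-length strings
        // . deserializeMsg() walks the size_* fields (size_readSizes through
        //   size_whiteList) and repoints the corresponding ptr_* members back
        //   into m_r->m_buf, so nothing is copied out of the udp read buffer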
        // deserialize it before we do anything else
        long finalSize = deserializeMsg ( sizeof(Msg39Request) ,
                                          &m_r->size_readSizes ,
                                          &m_r->size_whiteList,//coll ,
                                          &m_r->ptr_readSizes,
                                          m_r->m_buf );

        // sanity check
        if ( finalSize != requestSize ) {char *xx=NULL;*xx=0; }

        getDocIds2 ( m_r );
}

void Msg39::getDocIds2 ( Msg39Request *req ) {

        // flag it as in use
        m_inUse = true;

        // store it, might be redundant if called from getDocIds() above
        m_r = req;

        // a handy thing
        m_debug = false;
        if ( m_r->m_debug            ) m_debug = true;
        if ( g_conf.m_logDebugQuery  ) m_debug = true;
        if ( g_conf.m_logTimingQuery ) m_debug = true;

        // ensure its size is ok
        /*
        if ( m_r->size_whiteList <= 0 ) {
                g_errno = ENOCOLLREC;
                log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
                    mstrerror(g_errno) );
                sendReply ( m_slot , this , NULL , 0 , 0 , true );
                return ;
        }
        */

        CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
        if ( ! cr ) {
                g_errno = ENOCOLLREC;
                log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
                    mstrerror(g_errno) );
                sendReply ( m_slot , this , NULL , 0 , 0 , true );
                return ;
        }

        // . set our m_q class
        // . m_boolFlag is either 1 or 0 in this case, the caller did the
        //   auto-detect (boolFlag of 2) before calling us
        // . this now calls Query::addCompoundTerms() for us
        if ( ! m_tmpq.set2 ( m_r->ptr_query ,
                             m_r->m_language ,
                             m_r->m_queryExpansion ,
                             m_r->m_useQueryStopWords ) ) {
                log(LOG_LOGIC,"query: msg39: setQuery: %s." ,
                    mstrerror(g_errno) );
                sendReply ( m_slot , this , NULL , 0 , 0 , true );
                return ;
        }

        QUICKPOLL ( m_r->m_niceness );

        // set m_errno
        if ( m_tmpq.m_truncated ) m_errno = EQUERYTRUNCATED;
        // ensure matches with the msg3a sending us this request
        if ( m_tmpq.getNumTerms() != m_r->m_nqt ) {
                g_errno = EBADENGINEER;
                log("query: Query parsing inconsistency for q=%s. "
                    "langid=%li. Check langids and m_queryExpansion parms "
                    "which are the only parms that could be different in "
                    "Query::set2(). You probably have different "
                    "mysynonyms.txt files on two different hosts! check that!!"
                    ,m_tmpq.m_orig
                    ,(long)m_r->m_language
                    );
                sendReply ( m_slot , this , NULL , 0 , 0 , true );
                return ;
        }
        // debug
        if ( m_debug )
                logf(LOG_DEBUG,"query: msg39: [%lu] Got request "
                     "for q=%s", (long) this,m_tmpq.m_orig);

        // reset this
        m_tt.reset();

        QUICKPOLL ( m_r->m_niceness );

        // assume not doing special docid splitting
        m_numDocIdSplits = 1;

        // . do not do splits if caller is already specifying a docid range
        //   like for gbdocid: queries i guess.
        // . make sure m_msg2 is non-NULL, because if it is NULL we are
        //   evaluating a query for a single docid for seo tools
        if ( m_r->m_minDocId == -1 ) { // && m_msg2 ) {
                long nt = m_tmpq.getNumTerms();
                m_numDocIdSplits = nt / 2;
                if ( m_numDocIdSplits == 0 ) m_numDocIdSplits = 1;
        }

        //if ( ! g_conf.m_doDocIdRangeSplitting )
        //        m_numDocIdSplits = 1;

        // limit to 10
        if ( m_numDocIdSplits > 10 )
                m_numDocIdSplits = 10;

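        // . example: an 8-term query gives m_numDocIdSplits = 4, so the full
        //   docid space is evaluated in 4 consecutive ranges; a 30-term query
        //   would want 15 splits but is capped at 10 above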
        // . if caller already specified a docid range, then be loyal to that!
        // . or if we do not have enough query terms to warrant splitting
        if ( m_numDocIdSplits == 1 ) {
                getLists();
                return;
        }

        // . set up docid range cursor
        // . do twin splitting
        // . we do not do it this way anymore... we subsplit each split
        //   into two halves...!!! see logic in getLists() below!!!
        //if ( m_r->m_stripe == 1 ) {
        //        m_ddd    = MAX_DOCID / 2LL;
        //        m_dddEnd = MAX_DOCID + 1LL;
        //}
        //else if ( m_r->m_stripe == 0 ) {
        //        m_ddd    = 0;
        //        m_dddEnd = MAX_DOCID / 2LL;
        //}
        // support triplets, etc. later
        //else {
        //        char *xx=NULL;*xx=0;
        //}

        // do not do twin splitting if only one host per group
        //if ( g_hostdb.getNumStripes() == 1 ) {
        m_ddd    = 0;
        m_dddEnd = MAX_DOCID;
        //}


        // . otherwise, to prevent oom, split up docids into ranges
        //   and get winners of each range.
        if ( ! doDocIdSplitLoop() ) return;

        // error?
        if ( g_errno ) {
                log(LOG_LOGIC,"query: msg39: doDocIdSplitLoop: %s." ,
                    mstrerror(g_errno) );
                sendReply ( m_slot , this , NULL , 0 , 0 , true );
                return ;
        }
        // it might not have blocked! if all lists in tree and used no thread
        // it will come here after sending the reply and destroying "this"
        return;
}

// . returns false if blocked, true if done
// . to avoid running out of memory, generate the search results for
//   multiple smaller docid-ranges, one range at a time.
bool Msg39::doDocIdSplitLoop ( ) {
        long long delta = MAX_DOCID / (long long)m_numDocIdSplits;
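        // . example: with m_numDocIdSplits = 4 the loop below runs getLists()
        //   over roughly [0,MAX_DOCID/4), [MAX_DOCID/4,MAX_DOCID/2), etc.,
        //   one range per iteration
        // . reset2() clears the msg2 lists and the posdb table between ranges
        //   but leaves m_tt alone, so the top tree accumulates winners across
        //   all of the ranges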
for ( ; m_ddd < m_dddEnd ; ) {
|
|
// the starting docid...
|
|
long long d0 = m_ddd;
|
|
// advance to point to the exclusive endpoint
|
|
m_ddd += delta;
|
|
// ensure this is exclusive of ddd since it will be
|
|
// inclusive in the following iteration.
|
|
long long d1 = m_ddd;
|
|
// fix rounding errors
|
|
if ( d1 + 20LL > MAX_DOCID ) {
|
|
d1 = MAX_DOCID;
|
|
m_ddd = MAX_DOCID;
|
|
}
|
|
// fix it
|
|
m_r->m_minDocId = d0;
|
|
m_r->m_maxDocId = d1; // -1; // exclude d1
|
|
// allow posdbtable re-initialization each time to set
|
|
// the msg2 termlist ptrs anew, otherwise we core in
|
|
// call to PosdbTable::init() below
|
|
//m_posdbTable.m_initialized = false;
|
|
// reset ourselves, partially, anyway, not tmpq etc.
|
|
reset2();
|
|
// debug log
|
|
log("msg39: docid split phase %lli-%lli",d0,d1);
|
|
// wtf?
|
|
if ( d0 >= d1 ) break;
|
|
// use this
|
|
//m_debug = true;
|
|
//log("call1");
|
|
// . get the lists
|
|
// . i think this always should block!
|
|
// . it will also intersect the termlists to get the search
|
|
// results and accumulate the winners into the "tree"
|
|
if ( ! getLists() ) return false;
|
|
//log("call2 g_errno=%li",(long)g_errno);
|
|
// if there was an error, stop!
|
|
if ( g_errno ) break;
|
|
}
|
|
|
|
// return error reply if we had an error
|
|
if ( g_errno ) {
|
|
log("msg39: Had error3: %s.", mstrerror(g_errno));
|
|
sendReply (m_slot,this,NULL,0,0 , true);
|
|
return true;
|
|
}
|
|
|
|
if ( m_debug )
|
|
log("msg39: done with all docid range splits");
|
|
|
|
// all done. this will send reply back
|
|
//estimateHits();
|
|
//addedLists();
|
|
|
|
// should we put cluster recs in the tree?
|
|
//m_gotClusterRecs = ( g_conf.m_fullSplit && m_r->m_doSiteClustering );
|
|
m_gotClusterRecs = ( m_r->m_doSiteClustering );
|
|
|
|
// . before we send the top docids back, lookup their site hashes
|
|
// in clusterdb so we can do filtering at this point.
|
|
// BUT only do this if we are in a "full split" config, because that
|
|
// way we can guarantee all clusterdb recs are local (on this host)
|
|
// and should be in the page cache. the page cache should do ultra
|
|
// quick lookups and no memcpy()'s for this operation. it should
|
|
// be <<1ms to lookup thousands of docids.
|
|
// . when doing innerLoopSiteClustering we always use top tree now
|
|
// because our number of "top docids" can be somewhat unpredictably
|
|
// large due to having a ton of results with the same "domain hash"
|
|
// (see the "vcount" in IndexTable2.cpp)
|
|
// . do NOT do if we are just "getting weights", phr and aff weights
|
|
if ( m_gotClusterRecs ) {
|
|
// . set the clusterdb recs in the top tree
|
|
// . this calls estimateHits() in its reply wrapper when done
|
|
return setClusterRecs ( ) ;
|
|
}
|
|
|
|
// if we did not call setClusterRecs, go on to estimate the hits
|
|
estimateHits();
|
|
|
|
// no block, we are done
|
|
return true;
|
|
}
|
|
|
|
void tryAgainWrapper ( int fd , void *state ) {
|
|
Msg39 *THIS = (Msg39 *)state;
|
|
g_loop.unregisterSleepCallback ( state , tryAgainWrapper );
|
|
THIS->getLists();
|
|
}
|
|
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno on error
|
|
bool Msg39::getLists () {
|
|
|
|
if ( m_debug ) m_startTime = gettimeofdayInMilliseconds();
|
|
// . ask Indexdb for the IndexLists we need for these termIds
|
|
// . each rec in an IndexList is a termId/score/docId tuple
|
|
|
|
//
|
|
// restrict to docid range?
|
|
//
|
|
// . get the docid start and end
|
|
// . do docid paritioning so we can send to all hosts
|
|
// in the network, not just one stripe
|
|
long long docIdStart = 0;
|
|
long long docIdEnd = MAX_DOCID;
|
|
// . restrict to this docid?
|
|
// . will really make gbdocid:| searches much faster!
|
|
long long dr = m_tmpq.m_docIdRestriction;
|
|
if ( dr ) {
|
|
docIdStart = dr;
|
|
docIdEnd = dr + 1;
|
|
}
|
|
// . override
|
|
// . this is set from Msg39::doDocIdSplitLoop() to compute
|
|
// search results in stages, so that we do not load massive
|
|
// termlists into memory and got OOM (out of memory)
|
|
if ( m_r->m_minDocId != -1 ) docIdStart = m_r->m_minDocId;
|
|
if ( m_r->m_maxDocId != -1 ) docIdEnd = m_r->m_maxDocId+1;
|
|
|
|
// if we have twins, then make sure the twins read different
|
|
// pieces of the same docid range to make things 2x faster
|
|
//bool useTwins = false;
|
|
//if ( g_hostdb.getNumStripes() == 2 ) useTwins = true;
|
|
//if ( useTwins ) {
|
|
// long long delta2 = ( docIdEnd - docIdStart ) / 2;
|
|
// if ( m_r->m_stripe == 0 ) docIdEnd = docIdStart + delta2;
|
|
// else docIdStart = docIdStart + delta2;
|
|
//}
|
|
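        // . example of the striping below: with 2 stripes and the full range,
        //   stripe 0 reads docids [0, MAX_DOCID/2] and stripe 1 reads
        //   [MAX_DOCID/2, MAX_DOCID], so twin hosts split the read work
        //   instead of both reading the whole range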
        // new striping logic:
        long numStripes = g_hostdb.getNumStripes();
        long long delta2 = ( docIdEnd - docIdStart ) / numStripes;
        long stripe = g_hostdb.getMyHost()->m_stripe;
        docIdStart += delta2 * stripe; // is this right?
        docIdEnd = docIdStart + delta2;
        // add 1 to be safe so we don't lose a docid
        docIdEnd++;
        // TODO: add triplet support later for this to split the
        // read 3 ways. 4 ways for quads, etc.
        //if ( g_hostdb.getNumStripes() >= 3 ) { char *xx=NULL;*xx=0;}
        // do not go over MAX_DOCID because it gets masked and
        // ends up being 0!!! and we get empty lists
        if ( docIdEnd > MAX_DOCID ) docIdEnd = MAX_DOCID;
        // remember so Msg2.cpp can use them to restrict the termlists
        // from "whiteList" as well
        m_docIdStart = docIdStart;
        m_docIdEnd   = docIdEnd;


        //
        // set startkey/endkey for each term/termlist
        //
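        // . posdb keys embed the docid, so the makeStartKey()/makeEndKey()
        //   calls below give each query term a key range that covers only
        //   docids in [docIdStart, docIdEnd]; Msg2 then reads just that slice
        //   of each termlist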
for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
|
|
// breathe
|
|
QUICKPOLL ( m_r->m_niceness );
|
|
// shortcuts
|
|
QueryTerm *qterm = &m_tmpq.m_qterms[i];
|
|
char *sk = qterm->m_startKey;
|
|
char *ek = qterm->m_endKey;
|
|
// get the term id
|
|
long long tid = m_tmpq.getTermId(i);
|
|
// if only 1 stripe
|
|
//if ( g_hostdb.getNumStripes() == 1 ) {
|
|
// docIdStart = 0;
|
|
// docIdEnd = MAX_DOCID;
|
|
//}
|
|
// store now in qterm
|
|
g_posdb.makeStartKey ( sk , tid , docIdStart );
|
|
g_posdb.makeEndKey ( ek , tid , docIdEnd );
|
|
qterm->m_ks = sizeof(POSDBKEY);//key144_t);
|
|
}
|
|
|
|
// debug msg
|
|
if ( m_debug || g_conf.m_logDebugQuery ) {
|
|
for ( long i = 0 ; i < m_tmpq.getNumTerms() ; i++ ) {
|
|
// get the term in utf8
|
|
//char bb[256];
|
|
QueryTerm *qt = &m_tmpq.m_qterms[i];
|
|
//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
|
|
char *tpc = qt->m_term + qt->m_termLen;
|
|
char tmp = *tpc;
|
|
*tpc = '\0';
|
|
char sign = qt->m_termSign;
|
|
if ( sign == 0 ) sign = '0';
|
|
QueryWord *qw = qt->m_qword;
|
|
long wikiPhrId = qw->m_wikiPhraseId;
|
|
if ( m_tmpq.isPhrase(i) ) wikiPhrId = 0;
|
|
char leftwikibigram = 0;
|
|
char rightwikibigram = 0;
|
|
if ( qt->m_leftPhraseTerm &&
|
|
qt->m_leftPhraseTerm->m_isWikiHalfStopBigram )
|
|
leftwikibigram = 1;
|
|
if ( qt->m_rightPhraseTerm &&
|
|
qt->m_rightPhraseTerm->m_isWikiHalfStopBigram )
|
|
rightwikibigram = 1;
|
|
/*
|
|
char c = m_tmpq.getTermSign(i);
|
|
char tt[512];
|
|
long ttlen = m_tmpq.getTermLen(i);
|
|
if ( ttlen > 254 ) ttlen = 254;
|
|
if ( ttlen < 0 ) ttlen = 0;
|
|
// old:painful: convert each term from unicode to ascii
|
|
memcpy ( tt , m_tmpq.getTerm(i) , ttlen );
|
|
*/
|
|
long isSynonym = 0;
|
|
QueryTerm *st = qt->m_synonymOf;
|
|
if ( st ) isSynonym = true;
|
|
SafeBuf sb;
|
|
// now we can display it
|
|
//tt[ttlen]='\0';
|
|
//if ( c == '\0' ) c = ' ';
|
|
sb.safePrintf(
|
|
"query: msg39: [%lu] query term #%li \"%s\" "
|
|
"phr=%li termId=%llu rawTermId=%llu "
|
|
//"estimatedTermFreq=%lli (+/- ~16000) "
|
|
"tfweight=%.02f "
|
|
"sign=%c "
|
|
"numPlusses=%hhu "
|
|
"required=%li "
|
|
"fielcode=%li "
|
|
|
|
"ebit=0x%0llx "
|
|
"impBits=0x%0llx "
|
|
|
|
"wikiphrid=%li "
|
|
"leftwikibigram=%li "
|
|
"rightwikibigram=%li "
|
|
//"range.startTermNum=%hhi range.endTermNum=%hhi "
|
|
//"minRecSizes=%li "
|
|
"readSizeInBytes=%li "
|
|
//"ebit=0x%llx "
|
|
//"impBits=0x%llx "
|
|
"hc=%li "
|
|
"component=%li "
|
|
"otermLen=%li "
|
|
"isSynonym=%li "
|
|
"querylangid=%li " ,
|
|
(long)this ,
|
|
i ,
|
|
qt->m_term,//bb ,
|
|
(long)m_tmpq.isPhrase (i) ,
|
|
m_tmpq.getTermId (i) ,
|
|
m_tmpq.getRawTermId (i) ,
|
|
((float *)m_r->ptr_termFreqWeights)[i] ,
|
|
sign , //c ,
|
|
0 ,
|
|
(long)qt->m_isRequired,
|
|
(long)qt->m_fieldCode,
|
|
|
|
(long long)qt->m_explicitBit ,
|
|
(long long)qt->m_implicitBits ,
|
|
|
|
wikiPhrId,
|
|
(long)leftwikibigram,
|
|
(long)rightwikibigram,
|
|
((long *)m_r->ptr_readSizes)[i] ,
|
|
//(long long)m_tmpq.m_qterms[i].m_explicitBit ,
|
|
//(long long)m_tmpq.m_qterms[i].m_implicitBits ,
|
|
(long)m_tmpq.m_qterms[i].m_hardCount ,
|
|
(long)m_tmpq.m_componentCodes[i],
|
|
(long)m_tmpq.getTermLen(i) ,
|
|
isSynonym,
|
|
(long)m_tmpq.m_langId ); // ,tt
|
|
// put it back
|
|
*tpc = tmp;
|
|
if ( st ) {
|
|
long stnum = st - m_tmpq.m_qterms;
|
|
sb.safePrintf("synofterm#=%li",stnum);
|
|
//sb.safeMemcpy(st->m_term,st->m_termLen);
|
|
sb.pushChar(' ');
|
|
sb.safePrintf("synwid0=%lli ",qt->m_synWids0);
|
|
sb.safePrintf("synwid1=%lli ",qt->m_synWids1);
|
|
sb.safePrintf("synalnumwords=%li ",
|
|
qt->m_numAlnumWordsInSynonym);
|
|
// like for synonym "nj" it's base,
|
|
// "new jersey" has 2 alnum words!
|
|
sb.safePrintf("synbasealnumwords=%li ",
|
|
qt->m_numAlnumWordsInBase);
|
|
}
|
|
logf(LOG_DEBUG,"%s",sb.getBufStart());
|
|
|
|
}
|
|
m_tmpq.printBooleanTree();
|
|
}
|
|
        // timestamp log
        if ( m_debug )
                log(LOG_DEBUG,"query: msg39: [%lu] Getting %li index lists ",
                    (long)this,m_tmpq.getNumTerms());
        // . now get the index lists themselves
        // . return if it blocked
        // . not doing a merge (last parm) means that the lists we receive
        //   will be an appending of a bunch of lists so keys won't be in order
        // . merging is unnecessary for us here because we hash the keys anyway
        // . and merging takes up valuable cpu time
        // . caution: the index lists returned from Msg2 are now compressed
        // . now i'm merging because it's 10 times faster than hashing anyway
        //   and the reply buf should now always be <= minRecSizes so we can
        //   pre-allocate one better, and, 3) this should fix the yahoo.com
        //   reindex bug
        char rdbId = RDB_POSDB;

        // . TODO: MDW: fix
        // . partap says there is a bug in this??? we can't cache UOR'ed lists?
        bool checkCache = false;
        // split is us????
        //long split = g_hostdb.m_myHost->m_group;
        long split = g_hostdb.m_myHost->m_shardNum;
        // call msg2
        if ( ! m_msg2.getLists ( rdbId ,
                                 m_r->m_collnum,//m_r->ptr_coll ,
                                 m_r->m_maxAge ,
                                 m_r->m_addToCache ,
                                 //m_tmpq.m_qterms ,
                                 &m_tmpq,
                                 m_r->ptr_whiteList,
                                 // we need to restrict docid range for
                                 // whitelist as well! this is from
                                 // doDocIdSplitLoop()
                                 m_docIdStart,
                                 m_docIdEnd,
                                 // how much of each termlist to read in bytes
                                 (long *)m_r->ptr_readSizes ,
                                 //m_tmpq.getNumTerms() , // numLists
                                 m_lists ,
                                 this ,
                                 gotListsWrapper ,
                                 m_r ,
                                 m_r->m_niceness ,
                                 true , // do merge?
                                 m_debug ,
                                 NULL , // best hostids
                                 m_r->m_restrictPosdbForQuery ,
                                 split ,
                                 checkCache )) {
                m_blocked = true;
                return false;
        }

        // error?
        if ( g_errno ) {
                log("msg39: Had error getting termlists2: %s.",
                    mstrerror(g_errno));
                // don't bail out here because we are in docIdSplitLoop()
                //sendReply (m_slot,this,NULL,0,0,true);
                return true;
        }

        return gotLists ( true );
}

void gotListsWrapper ( void *state ) {
        Msg39 *THIS = (Msg39 *) state;
        // . hash the lists into our index table
        // . this will send back a reply or recycle and read more list data

        if ( ! THIS->gotLists ( true ) ) return;

        // . if he did not block and there was an errno we send reply
        //   otherwise if there was NO error he will have sent the reply
        // . if gotLists() was called in the ABOVE function and it returns
        //   true then the docIdLoop() function will send back the reply.
        if ( g_errno ) {
                log("msg39: sending back error reply = %s",mstrerror(g_errno));
                sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
        }

        // no, block? call the docid split loop
        //if ( numDocIdSplits <= 1 ) return;

        // if we got the lists and processed them without blocking, repeat!
        THIS->doDocIdSplitLoop();
}

// . now come here when we got the necessary index lists
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg39::gotLists ( bool updateReadInfo ) {
        // bail on error
        if ( g_errno ) {
        hadError:
                log("msg39: Had error getting termlists: %s.",
                    mstrerror(g_errno));
                if ( ! g_errno ) { char *xx=NULL;*xx=0; }
                //sendReply (m_slot,this,NULL,0,0,true);
                return true;
        }
        // timestamp log
        if ( m_debug ) {
                log(LOG_DEBUG,"query: msg39: [%lu] Got %li lists in %lli ms"
                    , (long)this,m_tmpq.getNumTerms(),
                    gettimeofdayInMilliseconds() - m_startTime);
                m_startTime = gettimeofdayInMilliseconds();
        }

        // breathe
        QUICKPOLL ( m_r->m_niceness );

        // ensure collection not deleted from under us
        CollectionRec *cr = g_collectiondb.getRec ( m_r->m_collnum );
        if ( ! cr ) {
                g_errno = ENOCOLLREC;
                goto hadError;
        }

        // . set the IndexTable so it can set its score weights from the
        //   termFreqs of each termId in the query
        // . this now takes into account the special termIds used for sorting
        //   by date (0xdadadada and 0xdadadad2 & TERMID_MASK)
        // . it should weight them so much so that the summation of scores
        //   from other query terms cannot make up for a lower date score
        // . this will actually calculate the top
        // . this might also change m_tmpq.m_termSigns
        // . this won't do anything if it was already called
        m_posdbTable.init ( &m_tmpq ,
                            m_debug ,
                            this ,
                            &m_tt ,
                            m_r->m_collnum,//ptr_coll ,
                            &m_msg2 , // m_lists ,
                            //m_tmpq.m_numTerms , // m_numLists
                            m_r );

        // breathe
        QUICKPOLL ( m_r->m_niceness );

        // . we have to do this here now too
        // . but if we are getting weights, we don't need m_tt!
        // . actually we were using it before for rat=0/bool queries but
        //   i got rid of NO_RAT_SLOTS
        if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
                if ( ! g_errno ) { char *xx=NULL;*xx=0; }
                //sendReply ( m_slot , this , NULL , 0 , 0 , true);
                return true;
        }

        // we have to allocate this with each call because each call can
        // be a different docid range from doDocIdSplitLoop.
        if ( ! m_posdbTable.allocWhiteListTable() ) {
                log("msg39: Had error allocating white list table: %s.",
                    mstrerror(g_errno));
                if ( ! g_errno ) { char *xx=NULL;*xx=0; }
                //sendReply (m_slot,this,NULL,0,0,true);
                return true;
        }


        // do not re-do it if doing docid range splitting
        m_allocedTree = true;


        // . now we must call this separately here, not in allocTopTree()
        // . we have to re-set the QueryTermInfos with each docid range split
        //   since it will set the list ptrs from the msg2 lists
        if ( ! m_posdbTable.setQueryTermInfo () ) return true;

        // print query term bit numbers here
        for ( long i = 0 ;
              m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
                QueryTerm *qt = &m_tmpq.m_qterms[i];
                //utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
                char *tpc = qt->m_term + qt->m_termLen;
                char  tmp = *tpc;
                *tpc = '\0';
                SafeBuf sb;
                sb.safePrintf("query: msg39: BITNUM query term #%li \"%s\" "
                              "bitnum=%li ", i , qt->m_term, qt->m_bitNum );
                // put it back
                *tpc = tmp;
                logf(LOG_DEBUG,"%s",sb.getBufStart());
        }


        // timestamp log
        if ( m_debug ) {
                log(LOG_DEBUG,"query: msg39: [%lu] Preparing to intersect "
                    "took %lli ms",
                    (long)this, gettimeofdayInMilliseconds() - m_startTime );
                m_startTime = gettimeofdayInMilliseconds();
        }

        // time it
        long long start = gettimeofdayInMilliseconds();
        long long diff;

        // . don't bother making a thread if lists are small
        // . look at STAGE? in IndexReadInfo.cpp to see how we read in stages
        // . it's always saying msg39 handler is hogging cpu...could this be it
        //if ( m_msg2.getTotalRead() < 2000*8 ) goto skipThread;

        // debug
        //goto skipThread;

        // . NOW! let's do this in a thread so we can continue to service
        //   incoming requests
        // . don't launch more than 1 thread at a time for this
        // . set callback when thread done

        // breathe
        QUICKPOLL ( m_r->m_niceness );
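        // . when m_debug is set we skip the thread below and run
        //   intersectLists10_r() inline (presumably so the debug log lines
        //   come out in order); otherwise the intersection runs in an
        //   INTERSECT_THREAD via addListsWrapper() and threadDoneWrapper()
        //   picks things up when it finishes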
        // . create the thread
        // . only one of these type of threads should be launched at a time
        if ( ! m_debug &&
             g_threads.call ( INTERSECT_THREAD  , // threadType
                              m_r->m_niceness   ,
                              this              , // top 4 bytes must be cback
                              threadDoneWrapper ,
                              addListsWrapper   ) ) {
                m_blocked = true;
                return false;
        }
        // if it failed
        //log(LOG_INFO,"query: Intersect thread creation failed. Doing "
        //    "blocking. Hurts performance.");
        // check tree
        if ( m_tt.m_nodes == NULL ) {
                log(LOG_LOGIC,"query: msg39: Badness.");
                char *xx = NULL; *xx = 0; }

        // sometimes we skip the thread
        //skipThread:
        // . addLists() should never have a problem
        // . g_errno should be set by prepareToAddLists() above if there is
        //   going to be a problem
        //if ( m_r->m_useNewAlgo )
        m_posdbTable.intersectLists10_r ( );
        //else
        //        m_posdbTable.intersectLists9_r ( );

        // time it
        diff = gettimeofdayInMilliseconds() - start;
        if ( diff > 10 ) log("query: Took %lli ms for intersection",diff);

        // returns false if blocked, true otherwise
        return addedLists ();
}

void *addListsWrapper ( void *state , ThreadEntry *t ) {
        // we're in a thread now!
        Msg39 *THIS = (Msg39 *)state;
        // . do the add
        // . addLists() returns false and sets errno on error
        // . hash the lists into our table
        // . this returns false and sets g_errno on error
        // . Msg2 always compresses the lists so be aware that the termId
        //   has been discarded
        //THIS->m_posdbTable.intersectLists9_r ();
        //if ( THIS->m_r->m_useNewAlgo )
        THIS->m_posdbTable.intersectLists10_r ( );
        //else
        //        THIS->m_posdbTable.intersectLists9_r ( );
        // . exit the thread
        // . top 4 bytes of "state" ptr should be our done callback
        // . threadDoneWrapper will be called by g_loop when he gets the
        //   thread's termination signal, sig niceness is m_niceness
        // . bogus return
        return NULL;
}

// we come here after thread exits
void threadDoneWrapper ( void *state , ThreadEntry *t ) {
        // get this class
        Msg39 *THIS = (Msg39 *)state;
        // sanity check
        if ( ! THIS->m_blocked ) { char *xx=NULL;*xx=0; }

        // addedLists() could send reply and destroy "THIS" so save this.
        // it will only sendReply back if it calls estimateHits() which
        // is only called if numDocIdSplits <= 1...
        long numDocIdSplits = THIS->m_numDocIdSplits;
        char debug          = THIS->m_debug;
        // just return if it blocked
        if ( ! THIS->addedLists () ) {
                // this can't block
                if ( numDocIdSplits >= 2 ) { char *xx=NULL;*xx=0; }
                if ( debug ) log("msg39: addedLists blocked");
                return;
        }
        if ( debug ) log("msg39: addedLists no block. i guess reply sent");
        // no, block? call the docid split loop
        if ( numDocIdSplits <= 1 ) return;
        // . just re-do the whole she-bang but do not reset m_tt top tree!!!
        // . it returns false if it blocks
        THIS->doDocIdSplitLoop();
}

// return false if blocked, true otherwise
bool Msg39::addedLists ( ) {

        if ( m_posdbTable.m_t1 ) {
                // . measure time to add the lists in bright green
                // . use darker green if rat is false (default OR)
                long color;
                //char *label;
                color = 0x0000ff00 ;
                //label = "termlist_intersect";
                g_stats.addStat_r ( 0 ,
                                    m_posdbTable.m_t1 ,
                                    m_posdbTable.m_t2 , color );
        }


        // accumulate total hits count over each docid split
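        // . each entry in m_docIdVoteBuf appears to be 6 bytes (a packed
        //   docid plus a vote byte), so length()/6 below is the number of
        //   matching docids found in this docid range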
        m_numTotalHits += m_posdbTable.m_docIdVoteBuf.length() / 6;

        // before wrapping up, complete our docid split loops!
        // so do not send the reply back yet... send reply back from
        // the docid loop function... doDocIdSplitLoop()
        if ( m_numDocIdSplits >= 2 ) return true;


        // . save some memory,free m_topDocIdPtrs2,m_topScores2,m_topExplicits2
        // . the m_topTree should have been filled from the call to
        //   IndexTable2::fillTopDocIds() and it no longer has ptrs to the
        //   docIds, but has the docIds themselves
        //m_posdbTable.freeMem();

        // error?
        if ( m_posdbTable.m_errno ) {
                // we do not need to store the intersection i guess...??
                m_posdbTable.freeMem();
                g_errno = m_posdbTable.m_errno;
                log("query: posdbtable had error = %s",mstrerror(g_errno));
                sendReply ( m_slot , this , NULL , 0 , 0 ,true);
                return true;
        }


        // should we put cluster recs in the tree?
        //m_gotClusterRecs = ( g_conf.m_fullSplit && m_r->m_doSiteClustering );
        m_gotClusterRecs = ( m_r->m_doSiteClustering );

        // . before we send the top docids back, lookup their site hashes
        //   in clusterdb so we can do filtering at this point.
        //   BUT only do this if we are in a "full split" config, because that
        //   way we can guarantee all clusterdb recs are local (on this host)
        //   and should be in the page cache. the page cache should do ultra
        //   quick lookups and no memcpy()'s for this operation. it should
        //   be <<1ms to lookup thousands of docids.
        // . when doing innerLoopSiteClustering we always use top tree now
        //   because our number of "top docids" can be somewhat unpredictably
        //   large due to having a ton of results with the same "domain hash"
        //   (see the "vcount" in IndexTable2.cpp)
        // . do NOT do if we are just "getting weights", phr and aff weights
        if ( m_gotClusterRecs ) {
                // . set the clusterdb recs in the top tree
                return setClusterRecs ( ) ;
        }

        // if we did not call setClusterRecs, go on to estimate the hits
        estimateHits();
        return true;
}

// . set the clusterdb recs in the top tree
// . returns false if blocked, true otherwise
// . returns true and sets g_errno on error
bool Msg39::setClusterRecs ( ) {
        // make buf for arrays of the docids, cluster levels and cluster recs
        long nodeSize  = 8 + 1 + 12;
        long numDocIds = m_tt.m_numUsedNodes;
        m_bufSize      = numDocIds * nodeSize;
        m_buf = (char *)mmalloc ( m_bufSize , "Msg39docids" );
        // on error, return true, g_errno should be set
        if ( ! m_buf ) {
                log("query: msg39: Failed to alloc buf for clustering.");
                sendReply(m_slot,this,NULL,0,0,true);
                return true;
        }

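        // . layout of m_buf: numDocIds 8-byte docids, then numDocIds 1-byte
        //   cluster levels, then numDocIds 12-byte (key_t) clusterdb recs
        // . e.g. 300 used top tree nodes -> 300 * (8+1+12) = 6300 bytes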
        // parse out the buf
        char *p = m_buf;
        // docIds
        m_clusterDocIds = (long long *)p; p += numDocIds * 8;
        m_clusterLevels = (char      *)p; p += numDocIds * 1;
        m_clusterRecs   = (key_t     *)p; p += numDocIds * 12;
        // sanity check
        if ( p > m_buf + m_bufSize ) { char *xx=NULL; *xx=0; }

        // loop over all results
        long nd = 0;
        for ( long ti = m_tt.getHighNode() ; ti >= 0 ;
              ti = m_tt.getPrev(ti) , nd++ ) {
                // get the guy
                TopNode *t = &m_tt.m_nodes[ti];
                // get the docid
                //long long docId = getDocIdFromPtr(t->m_docIdPtr);
                // store in array
                m_clusterDocIds[nd] = t->m_docId;
                // assume not gotten
                m_clusterLevels[nd] = CR_UNINIT;
                // assume not found, make the whole thing 0
                m_clusterRecs[nd].n1 = 0;
                m_clusterRecs[nd].n0 = 0LL;
        }

        // store number
        m_numClusterDocIds = nd;

        // sanity check
        if ( nd != m_tt.m_numUsedNodes ) { char *xx=NULL;*xx=0; }

        // . ask msg51 to get us the cluster recs
        // . it should read it all from the local drives
        // . "maxAge" of 0 means to not get from cache (does not include disk)
        if ( ! m_msg51.getClusterRecs ( m_clusterDocIds ,
                                        m_clusterLevels ,
                                        m_clusterRecs ,
                                        m_numClusterDocIds ,
                                        m_r->m_collnum ,
                                        0 , // maxAge
                                        false , // addToCache
                                        this ,
                                        gotClusterRecsWrapper ,
                                        m_r->m_niceness ,
                                        m_debug ) )
                // did we block? if so, return
                return false;

        // ok, process the replies
        gotClusterRecs();
        // the above never blocks
        return true;
}

void gotClusterRecsWrapper ( void *state ) {
        // get this class
        Msg39 *THIS = (Msg39 *)state;
        // be on our way
        THIS->gotClusterRecs ();
}

void Msg39::gotClusterRecs ( ) {

        // now tell msg5 to set the cluster levels
        if ( ! setClusterLevels ( m_clusterRecs ,
                                  m_clusterDocIds ,
                                  m_numClusterDocIds ,
                                  2 , // maxdocidsperhostname
                                  m_r->m_doSiteClustering ,
                                  m_r->m_familyFilter ,
                                  // turn this off, not needed now that
                                  // we have the langid in every posdb key
                                  0,//m_r->m_language ,
                                  m_debug ,
                                  m_clusterLevels )) {
                m_errno = g_errno;
                // send back an error reply
                sendReply ( m_slot , this , NULL , 0 , 0 ,true);
                return;
        }

        // count this
        m_numVisible = 0;

        // now put the info back into the top tree
        long nd = 0;
        for ( long ti = m_tt.getHighNode() ; ti >= 0 ;
              ti = m_tt.getPrev(ti) , nd++ ) {
                // get the guy
                TopNode *t = &m_tt.m_nodes[ti];
                // get the docid
                //long long docId = getDocIdFromPtr(t->m_docIdPtr);
                // sanity check
                if ( t->m_docId != m_clusterDocIds[nd] ) {char *xx=NULL;*xx=0;}
                // set it
                t->m_clusterLevel = m_clusterLevels[nd];
                t->m_clusterRec   = m_clusterRecs  [nd];
                // visible?
                if ( t->m_clusterLevel == CR_OK ) m_numVisible++;
        }

        log(LOG_DEBUG,"query: msg39: %li docids out of %li are visible",
            m_numVisible,nd);

        // free this junk now
        mfree ( m_buf , m_bufSize , "Msg39cluster");
        m_buf = NULL;

        // accumulate total hit count over each docid split!
        //m_numTotalHits += m_posdbTable.m_docIdVoteBuf.length() / 6;

        // before wrapping up, complete our docid split loops!
        // so do not send the reply back yet... send reply back from
        // the docid loop function... doDocIdSplitLoop()
        //if ( m_numDocIdSplits >= 2 ) return;

        // finish up and send back the reply
        estimateHits ();
}

void Msg39::estimateHits ( ) {

        // no longer in use
        m_inUse = false;

        // now this for the query loop on the QueryLogEntries.
        m_topDocId50 = 0LL;
        m_topScore50 = 0.0;

        // a little hack for the seo pipeline in xmldoc.cpp
        m_topDocId  = 0LL;
        m_topScore  = 0.0;
        m_topDocId2 = 0LL;
        m_topScore2 = 0.0;
        long ti = m_tt.getHighNode();
        if ( ti >= 0 ) {
                TopNode *t = &m_tt.m_nodes[ti];
                m_topDocId = t->m_docId;
                m_topScore = t->m_score;
        }
        // try the 2nd one too
        long ti2 = -1;
        if ( ti  >= 0 ) ti2 = m_tt.getNext ( ti );
        if ( ti2 >= 0 ) {
                TopNode *t2 = &m_tt.m_nodes[ti2];
                m_topDocId2 = t2->m_docId;
                m_topScore2 = t2->m_score;
        }

        // convenience ptrs. we will store the docids/scores into these arrays
        long long *topDocIds;
        double    *topScores;
        key_t     *topRecs;

        // numDocIds counts docs in all tiers when using toptree.
        long numDocIds = m_tt.m_numUsedNodes;

        // the msg39 reply we send back
        long  replySize;
        char *reply;

        //m_numTotalHits = m_posdbTable.m_docIdVoteBuf.length() / 6;

        // make the reply?
        Msg39Reply mr;

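        // . Msg39Reply uses the same ptr_*/size_* convention as the request:
        //   serializeMsg() below allocates one flat buffer holding the base
        //   struct followed by each variable-length section and (because of
        //   the final "true" argument) repoints mr.ptr_docIds, mr.ptr_scores
        //   and mr.ptr_clusterRecs into that buffer, so the TopTree loop
        //   further down writes docids/scores directly into the serialized
        //   reply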
        if ( ! m_callback ) {
                // if we got clusterdb recs in here, use 'em
                if ( m_gotClusterRecs ) numDocIds = m_numVisible;

                // don't send more than the docs that are asked for
                if ( numDocIds > m_r->m_docsToGet) numDocIds =m_r->m_docsToGet;

                // # of QueryTerms in query
                long nqt = m_tmpq.m_numTerms;
                // start setting the stuff
                mr.m_numDocIds = numDocIds;
                // copy # estimated hits into 8 bytes of reply
                //long long est = m_posdbTable.m_estimatedTotalHits;
                // ensure it has at least as many results as we got
                //if ( est < numDocIds ) est = numDocIds;
                // or if too big...
                //if ( numDocIds < m_r->m_docsToGet ) est = numDocIds;
                // . total estimated hits
                // . this is now an EXACT count!
                mr.m_estimatedHits = m_numTotalHits;
                // sanity check
                mr.m_nqt = nqt;
                // the m_errno if any
                mr.m_errno = m_errno;
                // shortcut
                PosdbTable *pt = &m_posdbTable;
                // the score info, in no particular order right now
                mr.ptr_scoreInfo  = pt->m_scoreInfoBuf.getBufStart();
                mr.size_scoreInfo = pt->m_scoreInfoBuf.length();
                // that has offset references into posdbtable::m_pairScoreBuf
                // and m_singleScoreBuf, so we need those too now
                mr.ptr_pairScoreBuf    = pt->m_pairScoreBuf.getBufStart();
                mr.size_pairScoreBuf   = pt->m_pairScoreBuf.length();
                mr.ptr_singleScoreBuf  = pt->m_singleScoreBuf.getBufStart();
                mr.size_singleScoreBuf = pt->m_singleScoreBuf.length();
                // save some time since seo.cpp gets from posdbtable directly,
                // so we can avoid serializing/copying this stuff at least
                if ( ! m_r->m_makeReply ) {
                        mr.size_scoreInfo      = 0;
                        mr.size_pairScoreBuf   = 0;
                        mr.size_singleScoreBuf = 0;
                }
                // and now the sitehash list if it exists
                mr.ptr_siteHashList  = pt->m_siteHashList.getBufStart();
                mr.size_siteHashList = pt->m_siteHashList.length();
                mr.m_sectionStats    = pt->m_sectionStats;
                // reserve space for these guys, we fill them in below
                mr.ptr_docIds      = NULL;
                mr.ptr_scores      = NULL;
                mr.ptr_clusterRecs = NULL;
                // this is how much space to reserve
                mr.size_docIds = 8 * numDocIds; // long long
                mr.size_scores = sizeof(double) * numDocIds; // float
                // if not doing site clustering, we won't have these perhaps...
                if ( m_gotClusterRecs )
                        mr.size_clusterRecs = sizeof(key_t) *numDocIds;
                else
                        mr.size_clusterRecs = 0;
                // . that is pretty much it, so serialize it into buffer,"reply"
                // . mr.ptr_docIds, etc., will point into the buffer so we can
                //   re-serialize into it below from the tree
                // . returns NULL and sets g_errno on error
                // . "true" means we should make mr.ptr_* reference into the
                //   newly serialized buffer.
                reply = serializeMsg ( sizeof(Msg39Reply), // baseSize
                                       &mr.size_docIds, // firstSizeParm
                                       &mr.size_clusterRecs,//lastSizePrm
                                       &mr.ptr_docIds , // firstStrPtr
                                       &mr , // thisPtr
                                       &replySize ,
                                       NULL ,
                                       0 ,
                                       true ) ;
                if ( ! reply ) {
                        log("query: Could not allocate memory "
                            "to hold reply of docids to send back.");
                        sendReply(m_slot,this,NULL,0,0,true);
                        return ;
                }
                topDocIds = (long long *) mr.ptr_docIds;
                topScores = (double    *) mr.ptr_scores;
                topRecs   = (key_t     *) mr.ptr_clusterRecs;
        }

        long docCount = 0;
        // loop over all results in the TopTree
        for ( long ti = m_tt.getHighNode() ; ti >= 0 ;
              ti = m_tt.getPrev(ti) ) {
                // get the guy
                TopNode *t = &m_tt.m_nodes[ti];
                // skip if clusterLevel is bad!
                if ( m_gotClusterRecs && t->m_clusterLevel != CR_OK )
                        continue;

                // if not sending back a reply... we were called from seo.cpp
                // State3f logic to evaluate a QueryLogEntry, etc.
                if ( m_callback ) {
                        // skip results past #50
                        if ( docCount > 50 ) continue;
                        // set this
                        m_topScore50 = t->m_score;
                        m_topDocId50 = t->m_docId;
                        // that's it
                        continue;
                }

                // get the docid ptr
                //char *diptr = t->m_docIdPtr;
                //long long docId = getDocIdFromPtr(diptr);
                // sanity check
                if ( t->m_docId < 0 ) { char *xx=NULL; *xx=0; }
                // add it to the reply
                topDocIds [docCount] = t->m_docId;
                topScores [docCount] = t->m_score;
                if ( m_tt.m_useIntScores )
                        topScores[docCount] = (double)t->m_intScore;
                // supply clusterdb rec? only for full splits
                if ( m_gotClusterRecs )
                        topRecs [docCount] = t->m_clusterRec;
                //topExplicits [docCount] =
                //        getNumBitsOn(t->m_explicits)
                docCount++;

                // 50th score? set this for seo.cpp. if less than 50 results
                // we want the score of the last doc then.
                if ( docCount <= 50 ) m_topScore50 = t->m_score;

                if ( m_debug ) {
                        log(LOG_DEBUG,"query: msg39: [%lu] "
                            "%03li) docId=%012llu sum=%.02f",
                            (long)this, docCount,
                            t->m_docId,t->m_score);
                }
                // don't send more than the docs that are wanted
                if ( docCount >= numDocIds ) break;
        }
        if ( docCount > 300 && m_debug )
                log("query: Had %li nodes in top tree",docCount);

        // this is sensitive info
        if ( m_debug ) {
                log(LOG_DEBUG,
                    "query: msg39: [%li] Intersected lists took %lli (%lli) "
                    "ms "
                    "docIdsToGet=%li docIdsGot=%li "
                    "q=%s",
                    (long)this ,
                    m_posdbTable.m_addListsTime ,
                    gettimeofdayInMilliseconds() - m_startTime ,
                    m_r->m_docsToGet ,
                    numDocIds ,
                    m_tmpq.getQuery() );
        }


        // if we blocked because we used a thread then call callback if
        // summoned from a msg3f handler and not a msg39 handler
        if ( m_callback ) {
                // if we blocked call user callback
                if ( m_blocked ) m_callback ( m_state );
                // if not sending back a udp reply, return now
                return;
        }

        // now send back the reply
        sendReply(m_slot,this,reply,replySize,replySize,false);
        return;
}