// open-source-search-engine/Msg40.cpp
#ifndef _BUZZLOGIC_
#include "gb-include.h"
#endif
#include "Msg40.h"
#include "Stats.h" // for timing and graphing time to get all summaries
//#include "CollectionRec.h"
#include "Collectiondb.h"
//#include "TitleRec.h" // containsAdultWords ()
#include "LanguageIdentifier.h"
#include "sort.h"
#include "matches2.h"
#include "XmlDoc.h" // computeSimilarity()
//#include "Facebook.h" // msgfb
#include "Speller.h"
#include "Wiki.h"
#include "HttpServer.h"
#include "PageResults.h"
// increasing this doesn't seem to improve performance any on a single
// node cluster....
#define MAX_OUTSTANDING_MSG20S 200
bool printHttpMime ( class State0 *st ) ;
//static void handleRequest40 ( UdpSlot *slot , int32_t netnice );
//static void gotExternalReplyWrapper ( void *state , void *state2 ) ;
static void gotCacheReplyWrapper ( void *state );
static void gotDocIdsWrapper ( void *state );
static bool gotSummaryWrapper ( void *state );
//static void didTaskWrapper ( void *state );
//static void gotResults2 ( void *state );
// here's the GIGABIT knobs:
// sample radius in chars around each query term : 600 (line 212)
// max sample size, all excerpts, per document : 100k (line 213)
// map from distance to query term in words to score: (line 855)
// map from popularity to score weight : (lines 950 et al)
// the comments above are way out of date (aac, Jan 2008)
//
// QPOP multiplier params
#define QPOP_ZONE_0 10
#define QPOP_ZONE_1 30
#define QPOP_ZONE_2 80
#define QPOP_ZONE_3 100
#define QPOP_ZONE_4 300
#define QPOP_MULT_0 10
#define QPOP_MULT_1 8
#define QPOP_MULT_2 6
#define QPOP_MULT_3 4
#define QPOP_MULT_4 2
// QTR scoring params
#define MAX_SCORE_MULTIPLIER 3000 // orig: 3000
#define ALT_MAX_SCORE 12000 // orig: 12000
#define ALT_START_SCORE 1000
#define QTR_ZONE_0 4
#define QTR_ZONE_1 8
#define QTR_ZONE_2 12
#define QTR_ZONE_3 20
#define QTR_BONUS_0 1000
#define QTR_BONUS_1 800
#define QTR_BONUS_2 500
#define QTR_BONUS_3 200
#define QTR_BONUS_CW 1
#define MULTIPLE_HIT_BOOST 1000 // orig: 1000
// gigabit phrase scoring params
//#define SPARSE_MARK 0.34
//#define SPARSE_PENALTY 1000
#define FWC_PENALTY 500 // penalty for beginning with a common word
#define POP_ZONE_0 10 // 0.00001
#define POP_ZONE_1 30 //0.0001
#define POP_ZONE_2 80 // 0.001
#define POP_ZONE_3 300 // 0.01
#define POP_BOOST_0 4.0
#define POP_BOOST_1 3.0
#define POP_BOOST_2 2.0
#define POP_BOOST_3 1.0
#define POP_BOOST_4 0.1
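// An illustrative sketch (not part of the original build) of how a zone
// table like the one above is typically consulted: walk the zone cutoffs
// in order and return the value of the first zone the popularity falls
// under. The helper name getPopBoost() is hypothetical and unused, and
// the exact comparison semantics are an assumption; the QPOP_* and QTR_*
// tables above follow the same zone -> value pattern.
static float getPopBoost ( int32_t pop ) {
	if ( pop < POP_ZONE_0 ) return POP_BOOST_0; // rarest, biggest boost
	if ( pop < POP_ZONE_1 ) return POP_BOOST_1;
	if ( pop < POP_ZONE_2 ) return POP_BOOST_2;
	if ( pop < POP_ZONE_3 ) return POP_BOOST_3;
	return POP_BOOST_4; // most common terms, smallest boost
}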
bool isSubDom(char *s , int32_t len);
Msg40::Msg40() {
m_firstTime = true;
m_doneWithLookup = false;
m_socketHadError = 0;
m_buf = NULL;
m_buf2 = NULL;
m_cachedResults = false;
m_msg20 = NULL;
m_numMsg20s = 0;
m_msg20StartBuf = NULL;
m_numToFree = 0;
// new stuff for streaming results:
m_hadPrintError = false;
m_numPrinted = 0;
m_printedHeader = false;
m_printedTail = false;
m_sendsOut = 0;
m_sendsIn = 0;
m_printi = 0;
m_numDisplayed = 0;
m_numPrintedSoFar = 0;
m_lastChunk = false;
m_didSummarySkip = false;
m_omitCount = 0;
//m_numGigabitInfos = 0;
}
#define MAX2 50
void Msg40::resetBuf2 ( ) {
// remember num to free in reset() function
char *p = m_msg20StartBuf;
// msg20 destructors
for ( int32_t i = 0 ; i < m_numToFree ; i++ ) {
// skip if empty
//if ( ! m_msg20[i] ) continue;
// call destructor
//m_msg20[i]->destructor();
// cast it
Msg20 *m = (Msg20 *)p;
// free its stuff
m->destructor();
// advance
p += sizeof(Msg20);
}
// now free the msg20 ptrs and buffer space
if ( m_buf2 ) mfree ( m_buf2 , m_bufMaxSize2 , "Msg40b" );
m_buf2 = NULL;
// if we made a safebuf of MAX2 (50) Msg20s, destruct them now
if ( m_unusedBuf.length() <= 0 ) return;
Msg20 *ma = (Msg20 *)m_unusedBuf.getBufStart();
for ( int32_t i = 0 ; i < (int32_t)MAX2 ; i++ ) ma[i].destructor();
}
Msg40::~Msg40() {
if ( m_buf ) mfree ( m_buf , m_bufMaxSize , "Msg40" );
m_buf = NULL;
resetBuf2();
}
bool Msg40::registerHandler ( ) {
// . register ourselves with the udp server
// . it calls our callback when it receives a msg of type 0x40
//if ( ! g_udpServer.registerHandler ( 0x40, handleRequest40 ))
// return false;
return true;
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . uses Msg3a to get docIds
// . uses many msg20s to get title/summary/url/docLen for each docId
bool Msg40::getResults ( SearchInput *si ,
bool forward ,
void *state ,
void (* callback) ( void *state ) ) {
m_omitCount = 0;
// warning
//if ( ! si->m_coll2 ) log(LOG_LOGIC,"net: NULL collection. msg40.");
if ( si->m_collnumBuf.length() < (int32_t)sizeof(collnum_t) )
log(LOG_LOGIC,"net: NULL collection. msg40.");
m_lastProcessedi = -1;
m_didSummarySkip = false;
m_si = si;
m_state = state;
m_callback = callback;
m_msg3aRecallCnt = 0;
// we haven't allocated any Msg20s yet
m_numMsg20s = 0;
// reset our error keeper
m_errno = 0;
// we need this info for caching as well
//m_numGigabitInfos = 0;
m_lastHeartbeat = getTimeLocal();
//just get from searchinput
//.... m_catId = hr->getLong("catid",0);m_si->m_catId;
m_postQueryRerank.set1( this, si );
// take search parms i guess from first collnum
collnum_t *cp = (collnum_t *)m_si->m_collnumBuf.getBufStart();
// get the collection rec
CollectionRec *cr =g_collectiondb.getRec( cp[0] );
// g_errno should be set if not found
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// save that
m_firstCollnum = cr->m_collnum;
// what is our max docids ceiling?
//m_maxDocIdsToCompute = cr->m_maxDocIdsToCompute;
// topic similarity cutoff
m_topicSimilarCutoff = cr->m_topicSimilarCutoffDefault ;
m_gigabitBuf.reset();
m_factBuf.reset();
// reset this for family filter
m_queryCensored = false;
m_filterStats[CR_DIRTY] = 0; //m_numCensored = 0;
// . compute the min number of results to scan
// . it is the 3rd number in a topicGroupPtr string
// . the minimum number of docids to get for topic-clustering purposes
//m_docsToScanForTopics = 30;
//m_docsToScanForTopics = cr->m_docsToScanForTopics;
m_docsToScanForTopics = 0;
// we usually only have one TopicGroup but there can be multiple
// ones. each TopicGroup can derive its gigabits/topics from a
// different source, like the meta keywords tags only, for instance.
// This support was originally put in for a client.
for ( int32_t i = 0 ; i < m_si->m_numTopicGroups ; i++ ) {
int32_t x = m_si->m_topicGroups[i].m_docsToScanForTopics ;
if ( x > m_docsToScanForTopics ) m_docsToScanForTopics = x;
}
// . but only for first page!
// . no! the second page of results may not match with the first
// if they have different result increments
//if ( m_firstResultNum > 0 ) m_docsToScanForTopics = 0;
// . reset these
// . Next X Results links? yes or no?
m_moreToCome = false;
// set this to zero -- assume not in cache
m_cachedTime = 0;
// assume we are not taken from the serp cache
m_cachedResults = false;
// bail now if 0 requested!
if ( m_si->m_docsWanted == 0 ) return true;
// . do this now in case results were cached.
// . set SearchInput class instance, m_si
// . has all the input that we need to get the search results just
// the way the caller wants them
//m_msg1a.setSearchInput(m_si);
// how many docids do we need to get?
int32_t get = m_si->m_docsWanted + m_si->m_firstResultNum ;
// we get one extra so we can set m_moreToCome and know whether
// more docids can be gotten (i.e. show a "Next 10" link)
get++;
// make sure we get more than requested for various other tasks, like
// this one here is for gigabit generation. it likes to have 30 docids
// typically to generate gigabits from.
// NOTE: pqr needs gigabits for all pages
if ( /*m_si->m_firstResultNum == 0 && */get < m_docsToScanForTopics )
get = m_docsToScanForTopics;
// for alden's reranking. often this is 50!
//if ( get < m_si->m_docsToScanForReranking )
// get = m_si->m_docsToScanForReranking;
// for zak's reference pages
// if(get<m_si->m_refs_numToGenerate ) get=m_si->m_refs_numToGenerate;
// limit to this ceiling though for performance reasons
//if ( get > m_maxDocIdsToCompute ) get = m_maxDocIdsToCompute;
// ok, need some sane limit though to prevent malloc from
// trying to get 7800003 docids and going ENOMEM
if ( get > MAXDOCIDSTOCOMPUTE ) {
log("msg40: asking for too many docids. reducing to %"INT32"",
(int32_t)MAXDOCIDSTOCOMPUTE);
get = MAXDOCIDSTOCOMPUTE;
}
// this is how many visible results we need, after filtering/clustering
m_docsToGetVisible = get;
// if site clustering is on, get more than we should in anticipation
// that some docIds will be clustered.
// MDW: we no longer do this here for full splits because Msg39 does
// clustering on its end now!
//if ( m_si->m_doSiteClustering ) get = (get*150LL)/100LL;
//if ( m_si->m_doSiteClustering && ! g_conf.m_fullSplit )
// get = (get*150LL)/100LL;
// ip clustering is not really used now i don't think (MDW)
//if ( m_si->m_doIpClustering ) get = (get*150LL)/100LL;
// . get a little more since this usually doesn't remove many docIds
// . deduping is now done in Msg40.cpp once the summaries are gotten
if ( m_si->m_doDupContentRemoval ) get = (get*120LL)/100LL;
// . get 30% more for what reason? i dunno, just cuz...
// . well, for "missing query terms" filtering... errors (not founds)
//get = (get*130LL)/100LL;
// make it 10% because we are getting too many summaries some times
// no, this is bad when not doing site clustering or dup removal
// we need to skip directly to the 1000th result sometimes to show
// those results and we do not want to lookup the first 1000
// summaries, so we don't, and this makes us end up looking up 100
// more summaries. well, leave this in, just limit the max out
// for summaries below then to what we want to show.
// crap, Msg40::gotSummary() has a m_numRequests < m_numDocIds
// condition, so take this out...
//get = (get*110LL)/100LL;
// get at least 50 since we need a good sample that explicitly has all
// query terms in order to calculate reliable affinities
//if ( get < MIN_AFFINITY_SAMPLE ) get = MIN_AFFINITY_SAMPLE;
// now apply the multiplier. before it was not getting applied and
// we were constantly doing Msg3a recalls. you can set this multiplier
// dynamically in the "search controls"
// MDW: don't apply if in a full split though, i don't see why...
// MDW: just ignore this now, ppl will just mis-set it and seriously
// screw things up...
//if ( cr->m_numDocsMultiplier > 1.0 && ! g_conf.m_fullSplit )
// get = (int32_t) ((float)get * cr->m_numDocsMultiplier);
// limit to this ceiling though for performance reasons
//if ( get > m_maxDocIdsToCompute ) get = m_maxDocIdsToCompute;
// . ALWAYS get at least this many
// . this allows Msg3a to allow higher scoring docids in tier #1 to
// outrank lower-scoring docids in tier #0, even if such docids have
// all the query terms explicitly. and we can guarantee consistency
// as long as we only allow for this outranking within the first
// MIN_DOCS_TO_GET docids.
if ( get < MIN_DOCS_TO_GET ) get = MIN_DOCS_TO_GET;
// this is how many docids to get total, assuming that some will be
// filtered out for being dups, etc. and that we will have at least
// m_docsToGetVisible leftover that are unfiltered and visible. so
// we tell each msg39 split to get more docids than we actually want
// in anticipation some will be filtered out in this class.
m_docsToGet = get;
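// Illustrative arithmetic (hypothetical numbers): with m_docsWanted=10,
// m_firstResultNum=20 and dup content removal on:
//   get = 10 + 20 + 1 = 31  -> m_docsToGetVisible
//   get = (31*120)/100 = 37 -> bumped 20% for anticipated dedup losses
//   m_docsToGet = 37, or MIN_DOCS_TO_GET if 37 is below that floor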
// debug msg
if ( m_si->m_debug )
logf(LOG_DEBUG,"query: msg40 mapped %"INT32" wanted to %"INT32" to get",
m_docsToGetVisible,m_docsToGet );
// let's try using msg 0xfd like Proxy.cpp uses to forward an http
// request! then we just need to specify the ip of the proxy and we
// do not need hosts2.conf!
if ( forward ) { char *xx=NULL;*xx=0; }
// . forward to another *collection* and/or *cluster* if we should
// . this is used by Msg41 for importing results from another cluster
/*
if ( forward ) {
// serialize input
int32_t requestSize;
// CAUTION: m_docsToGet can be different on remote host!!!
char *request = m_si->serializeForMsg40 ( &requestSize );
if ( ! request ) return true;
// . set timeout based on docids requested!
// . the more docs requested the longer it will take to get
// . use 50ms per docid requested
int32_t timeout = (50 * m_docsToGet) / 1000;
// always wait at least 20 seconds
if ( timeout < 20 ) timeout = 20;
// . forward to another cluster
// . use the advanced composite query to make the key
uint32_t h = hash32 ( m_si->m_qbuf1 );
// get groupId from docId, if positive
int32_t groupNum = h % g_hostdb2.m_numGroups;
uint32_t groupId = g_hostdb2.getGroupId ( groupNum );
if ( ! m_mcast.send ( request ,
requestSize ,
0x40 , // msgType 0x40
false , // mcast own m_request?
groupId , //sendtogroup(groupKey)
false , // send to whole group?
h , // key for host in grp
this , // state data
NULL , // state data
gotExternalReplyWrapper ,
timeout , // to re-send to twin
m_si->m_niceness, // niceness ,
false , // real time udp?
-1 , // first hostid
NULL , // m_reply ,
0 , // m_replyMaxSize ,
false , // free reply buf?
false , // disk load balancing?
-1 , // max cache age
0 , // cacheKey
0 , // bogus rdbId
-1 , // minRecSizes(-1=ukwn)
true , // sendToSelf
false , // retry forever
&g_hostdb2 )) {
m_mcast.reset();
return true;
}
// always blocks
return false; // gotExternalReply();
}
*/
// time the cache lookup
if ( g_conf.m_logTimingQuery || m_si->m_debug )
m_startTime = gettimeofdayInMilliseconds();
// use cache?
bool useCache = m_si->m_rcache;
// turn it off for now until we cache the scoring tables
log("db: cache is disabled until we cache scoring tables");
useCache = false;
// if searching multiple collections do not cache for now
if ( m_si->m_collnumBuf.length() > (int32_t)sizeof(collnum_t) )
useCache=false;
// . try setting from cache first
// . cacher --> "do we READ from cache?"
if ( useCache ) {
// make the key based on query and other input parms in msg40
key_t key = m_si->makeKey ( );
// this should point to the cached rec, if any
m_cachePtr = NULL;
m_cacheSize = 0;
// this returns false if blocked, true otherwise
if ( ! m_msg17.getFromCache ( SEARCHRESULTS_CACHEID,
key ,
&m_cachePtr,
&m_cacheSize,
// use first collection #
m_si->m_firstCollnum,
this ,
gotCacheReplyWrapper ,
m_si->m_niceness ,
1 ) )
return false;
// reset g_errno, we're just a cache
g_errno = 0;
return gotCacheReply();
}
// keep going
return prepareToGetDocIds ( );
}
/*
void gotExternalReplyWrapper ( void *state , void *state2 ) {
Msg40 *THIS = (Msg40 *)state;
if ( ! THIS->gotExternalReply() ) return;
THIS->m_callback ( THIS->m_state );
}
bool Msg40::gotExternalReply ( ) {
if ( g_errno ) {
log("query: Trying to forward to another cluster "
"had error: %s.",mstrerror(g_errno));
return true;
}
// grab the reply from the multicast class
bool freeit;
int32_t bufSize , bufMaxSize;
char *buf = m_mcast.getBestReply ( &bufSize , &bufMaxSize , &freeit );
relabel( buf, bufMaxSize, "Msg40-mcastGBR" );
// sanity check
if ( freeit ) {
log(LOG_LOGIC,"query: msg40: gotReply: Bad engineer.");
char *xx = NULL; *xx = 0;
}
if ( bufSize != bufMaxSize ) {
log(LOG_LOGIC,"query: msg40: fix me.");
char *xx = NULL; *xx = 0;
}
// set ourselves from it
deserialize ( buf , bufSize );
return true;
}
*/
// msg17 calls this after it gets a reply
void gotCacheReplyWrapper ( void *state ) {
Msg40 *THIS = (Msg40 *)state;
// reset g_errno, we're just a cache
g_errno = 0;
// handle the reply
if ( ! THIS->gotCacheReply() ) return;
// otherwise, call callback
THIS->m_callback ( THIS->m_state );
}
bool Msg40::gotCacheReply ( ) {
// if not found, get the result the hard way
if ( ! m_msg17.wasFound() ) return prepareToGetDocIds ( );
// otherwise, get the deserialized stuff
int32_t nb = deserialize(m_cachePtr, m_cacheSize);
if ( nb <= 0 ) {
log ("query: Deserialization of cached search results "
"page failed." );
// free m_buf!
if ( m_buf )
mfree ( m_buf , m_bufMaxSize , "deserializeMsg40");
// get results the hard way!
return prepareToGetDocIds ( );
}
// log the time it took for cache lookup
if ( g_conf.m_logTimingQuery ) {
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_startTime;
log(LOG_TIMING,
"query: [%"PTRFMT"] found in cache. "
"lookup took %"INT64" ms.",(PTRTYPE)this,took);
}
m_cachedTime = m_msg17.getCachedTime();
m_cachedResults = true;
// if it was found, we return true, m_cachedTime should be set
return true;
}
bool Msg40::prepareToGetDocIds ( ) {
// log the time it took for cache lookup
if ( g_conf.m_logTimingQuery || m_si->m_debug ) {
int64_t now = gettimeofdayInMilliseconds();
int64_t took = now - m_startTime;
logf(LOG_TIMING,"query: [%"PTRFMT"] Not found in cache. "
"Lookup took %"INT64" ms.",(PTRTYPE)this,took);
m_startTime = now;
logf(LOG_TIMING,"query: msg40: [%"PTRFMT"] Getting up to %"INT32" "
"(docToGet=%"INT32") docids", (PTRTYPE)this,
m_docsToGetVisible, m_docsToGet);
}
//if ( m_si->m_compoundListMaxSize <= 0 )
// log("query: Compound list max size is %"INT32". That is bad. You "
// "will not get back some search results for UOR queries.",
// m_si->m_compoundListMaxSize );
// . if query has dirty words and family filter is on, set
// number of results to 0, and set the m_queryCensored flag to true
// . m_qbuf1 should be the advanced/composite query
if ( m_si->m_familyFilter &&
getDirtyPoints ( m_si->m_sbuf1.getBufStart() ,
m_si->m_sbuf1.length() ,
0 ,
NULL ) ) {
// make sure the m_numDocIds gets set to 0
m_msg3a.reset();
m_queryCensored = true;
return true;
}
return getDocIds( false );
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Msg40::getDocIds ( bool recall ) {
// . get the docIds
// . this sets m_msg3a.m_clusterLevels[] for us
//if(! m_msg3a.getDocIds ( &m_r, m_si->m_q, this , gotDocIdsWrapper))
// return false;
////
//
// NEW CODE FOR LAUNCHING one MSG3a per collnum to search a token
//
////
m_num3aReplies = 0;
m_num3aRequests = 0;
// how many are we searching? usually just one.
m_numCollsToSearch = m_si->m_collnumBuf.length() /sizeof(collnum_t);
// make enough for ptrs
int32_t need = sizeof(Msg3a *) * m_numCollsToSearch;
if ( ! m_msg3aPtrBuf.reserve ( need ) ) return true;
// cast the mem buffer
m_msg3aPtrs = (Msg3a **)m_msg3aPtrBuf.getBufStart();
// clear these out so we do not free them when destructing
for ( int32_t i = 0 ; i < m_numCollsToSearch ;i++ )
m_msg3aPtrs[i] = NULL;
// use the built-in m_msg3a if we are only searching one coll, the std case
if ( m_numCollsToSearch <= 1 )
m_msg3aPtrs[0] = &m_msg3a;
return federatedLoop();
}
bool Msg40::federatedLoop ( ) {
// search the provided collnums (collections)
collnum_t *cp = (collnum_t *)m_si->m_collnumBuf.getBufStart();
// we modified m_rcache above to be true if we should read from cache
int32_t maxAge = 0 ;
if ( m_si->m_rcache ) maxAge = g_conf.m_indexdbMaxIndexListAge;
// reset it
Msg39Request mr;
mr.reset();
//m_r.ptr_coll = m_si->m_coll2;
//m_r.size_coll = m_si->m_collLen2+1;
mr.m_maxAge = maxAge;
mr.m_addToCache = m_si->m_wcache;
mr.m_docsToGet = m_docsToGet;
mr.m_niceness = m_si->m_niceness;
mr.m_debug = m_si->m_debug ;
mr.m_getDocIdScoringInfo = m_si->m_getDocIdScoringInfo;
mr.m_doSiteClustering = m_si->m_doSiteClustering ;
mr.m_hideAllClustered = m_si->m_hideAllClustered;
mr.m_familyFilter = m_si->m_familyFilter;
//mr.m_useMinAlgo = m_si->m_useMinAlgo;
//mr.m_useNewAlgo = m_si->m_useNewAlgo;
mr.m_doMaxScoreAlgo = m_si->m_doMaxScoreAlgo;
//mr.m_fastIntersection = m_si->m_fastIntersection;
//mr.m_doIpClustering = m_si->m_doIpClustering ;
mr.m_doDupContentRemoval = m_si->m_doDupContentRemoval ;
//mr.m_restrictIndexdbForQuery = m_si->m_restrictIndexdbForQuery ;
mr.m_queryExpansion = m_si->m_queryExpansion;
//mr.m_compoundListMaxSize = m_si->m_compoundListMaxSize ;
mr.m_boolFlag = m_si->m_boolFlag ;
mr.m_familyFilter = m_si->m_familyFilter ;
mr.m_language = (unsigned char)m_si->m_queryLangId;
mr.ptr_query = m_si->m_q.m_orig;
mr.size_query = m_si->m_q.m_origLen+1;
//mr.ptr_whiteList = m_si->m_whiteListBuf.getBufStart();
//mr.size_whiteList = m_si->m_whiteListBuf.length()+1;
int32_t slen = 0; if ( m_si->m_sites ) slen=gbstrlen(m_si->m_sites)+1;
mr.ptr_whiteList = m_si->m_sites;
mr.size_whiteList = slen;
mr.m_timeout = -1; // auto-determine based on #terms
// make sure query term counts match in msg39
mr.m_maxQueryTerms = m_si->m_maxQueryTerms;
mr.m_realMaxTop = m_si->m_realMaxTop;
mr.m_minSerpDocId = m_si->m_minSerpDocId;
mr.m_maxSerpScore = m_si->m_maxSerpScore;
mr.m_sameLangWeight = m_si->m_sameLangWeight;
//
// how many docid splits should we do to avoid going OOM?
//
CollectionRec *cr = g_collectiondb.getRec(m_firstCollnum);
RdbBase *base = NULL;
if ( cr ) base = g_titledb.getRdb()->getBase(cr->m_collnum);
int64_t numDocs = 0;
if ( base ) numDocs = base->getNumTotalRecs();
// for every 5M docids per host, lets split up the docid range
// to avoid going OOM
int32_t mult = numDocs / 5000000;
if ( mult <= 0 ) mult = 1;
// . do not do splits if caller is already specifying a docid range
// like for gbdocid: queries i guess.
// . make sure m_msg2 is non-NULL, because if it is NULL we are
// evaluating a query for a single docid for seo tools
//if ( m_r->m_minDocId == -1 ) { // && m_msg2 ) {
int32_t nt = m_si->m_q.getNumTerms();
int32_t numDocIdSplits = nt / 2;
if ( numDocIdSplits <= 0 ) numDocIdSplits = 1;
// and mult based on index size
numDocIdSplits *= mult;
// prevent going OOM for type:article AND html
//if ( numDocIdSplits < 5 ) numDocIdSplits = 5;
//}
// special oom hack fix
if ( cr && cr->m_isCustomCrawl && numDocIdSplits < 4 )
numDocIdSplits = 4;
// for testing
//m_numDocIdSplits = 3;
//if ( ! g_conf.m_doDocIdRangeSplitting )
// m_numDocIdSplits = 1;
// limit to 15
if ( numDocIdSplits > 15 )
numDocIdSplits = 15;
// store it in the request now
mr.m_numDocIdSplits = numDocIdSplits;
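// Illustrative arithmetic (hypothetical numbers): a 4-term query over an
// index with numDocs = 12,000,000 per host gives mult = 12000000/5000000
// = 2 and nt/2 = 2, so numDocIdSplits = 2*2 = 4; the cap above keeps the
// final value <= 15.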
int32_t maxOutMsg3as = 1;
// create new ones if searching more than 1 coll
for ( int32_t i = m_num3aRequests ; i < m_numCollsToSearch ; i++ ) {
// do not have more than this many outstanding
if ( m_num3aRequests - m_num3aReplies >= maxOutMsg3as )
// wait for it to return before launching another
return false;
// get it
Msg3a *mp = m_msg3aPtrs[i];
// allocate a new Msg3a if we do not have one yet (multi-coll case)
if ( ! mp ) {
try { mp = new Msg3a; }
catch ( ... ) {
g_errno = ENOMEM;
return true;
}
mnew(mp,sizeof(Msg3a),"tm3ap");
}
// error?
if ( ! mp ) {
log("msg40: Msg40::getDocIds() had error: %s",
mstrerror(g_errno));
return true;
}
// assign it
m_msg3aPtrs[i] = mp;
// assign the request for it
memcpy ( &mp->m_rrr , &mr , sizeof(Msg39Request) );
// then customize it to just search this collnum
mp->m_rrr.m_collnum = cp[i];
// launch a search request
m_num3aRequests++;
// this returns false if it would block and will call callback
// m_si is actually contained in State0 in PageResults.cpp
// and Msg40::m_si points to that. so State0's destructor
// should call SearchInput's destructor which calls
// Query's destructor to destroy &m_si->m_q here when done.
if(!mp->getDocIds(&mp->m_rrr,&m_si->m_q,this,gotDocIdsWrapper))
continue;
if ( g_errno && ! m_errno )
m_errno = g_errno;
m_num3aReplies++;
}
// call again w/o parameters now
return gotDocIds ( );
}
// . uses parameters assigned to local member vars above
// . returns false if blocked, true otherwise
// . sets g_errno on error
void gotDocIdsWrapper ( void *state ) {
Msg40 *THIS = (Msg40 *) state;
// if this blocked, it returns false
//if ( ! checkTurnOffRAT ( state ) ) return;
THIS->m_num3aReplies++;
// try to launch more if there are more colls left to search
if ( THIS->m_num3aRequests < THIS->m_numCollsToSearch ) {
THIS->federatedLoop ( );
return;
}
// return if this blocked
if ( ! THIS->gotDocIds() ) return;
// now call callback, we're done
THIS->m_callback ( THIS->m_state );
}
// . return false if blocked, true otherwise
// . sets g_errno on error
bool Msg40::gotDocIds ( ) {
// return now if still waiting for a msg3a reply to get in
if ( m_num3aReplies < m_num3aRequests ) return false;
// if searching over multiple collections let's merge their docids
// into m_msg3a now before we go forward
// this will set g_errno on error, like oom
if ( ! mergeDocIdsIntoBaseMsg3a() )
log("msg40: error: %s",mstrerror(g_errno));
// log the time it took to get the docids
int64_t now = gettimeofdayInMilliseconds();
if ( g_conf.m_logTimingQuery || m_si->m_debug||g_conf.m_logDebugQuery){
int64_t took = now - m_startTime;
logf(LOG_DEBUG,"query: msg40: [%"PTRFMT"] Got %"INT32" docids in %"INT64" ms",
(PTRTYPE)this,m_msg3a.getNumDocIds(),took);
logf(LOG_DEBUG,"query: msg40: [%"PTRFMT"] Getting up to %"INT32" "
"summaries", (PTRTYPE)this,m_docsToGetVisible);
}
// save any covered up error
if ( ! m_errno && m_msg3a.m_errno ) m_errno = m_msg3a.m_errno;
//sanity check. we might not have allocated due to out of memory
if ( g_errno ) { m_errno = g_errno; return true; }
// . ok, do the actual clustering
// . sets m_clusterLevels[]
// . sets m_clusterLevels[i] to CR_OK if docid #i is not clustered
// . sets m_clusterLevels[i] to CR_OK if not doing any clustering or
// filtering, and then returns right away
// . allow up to "2" docids per hostname if site clustering is on
// . returns false and sets g_errno on error
// . this should get rid of cluster levels of CR_GOT_REC
// . if we had g_conf.m_fullSplit then Msg3a should have these
// already set...
/*
if ( ! g_conf.m_fullSplit &&
m_si->m_doSiteClustering &&
! setClusterLevels ( m_msg3a.m_clusterRecs ,
m_msg3a.m_docIds ,
m_msg3a.m_numDocIds ,
2 ,
m_si->m_doSiteClustering ,
m_si->m_familyFilter ,
m_si->m_language ,
// list of blacklisted site ids
NULL ,
m_si->m_debug ,
m_msg3a.m_clusterLevels )) {
m_errno = g_errno;
return true;
}
*/
// DEBUG HACK -- make most clustered!
//for ( int32_t i = 1 ; i < m_msg3a.m_numDocIds ; i++ )
// m_msg3a.m_clusterLevels[i] = CR_CLUSTERED;
// time this
m_startTime = gettimeofdayInMilliseconds();
// we haven't got any Msg20 responses as of yet or sent any requests
m_numRequests = 0;
m_numReplies = 0;
//m_maxiLaunched = -1;
// when returning search results in csv let's get the first 100
// results and use those to determine the most common column headers
// for the csv. for any results past those that have new json fields
// we will add a header, but that column will not be labelled with
// the header name, unfortunately.
m_needFirstReplies = 0;
if ( m_si->m_format == FORMAT_CSV ) {
m_needFirstReplies = m_msg3a.m_numDocIds;
if ( m_needFirstReplies > 100 ) m_needFirstReplies = 100;
}
// we have received m_numGood contiguous Msg20 replies!
//m_numContiguous = 0;
//m_visibleContiguous = 0;
// . do not uncluster more than 5 docids! it slows things down.
// . kind of a HACK until we do it right
m_unclusterCount = 5;
// assume we do not have enough visible docids
//m_gotEnough = false;
if ( ! m_urlTable.set ( m_msg3a.m_numDocIds * 2 ) ) {
m_errno = g_errno;
log("query: Failed to allocate memory for url deduping. "
"Not deduping search results.");
return true;
}
// if only getting docids, skip summaries,topics, and references
// if ( m_si->m_docIdsOnly ) return launchMsg20s ( false );
if ( m_si->m_docIdsOnly ) return true;
// . alloc buf to hold all m_msg20[i] ptrs and the Msg20s they point to
// . returns false and sets g_errno/m_errno on error
// . salvage any Msg20s that we can if we are being re-called
if ( ! reallocMsg20Buf() ) return true;
// these are just like for passing to Msg39 above
//int32_t maxAge = 0;
//if ( m_si->m_rcache ) maxAge = g_conf.m_titledbMaxCacheAge;
// . launch a bunch of task that depend on the docids we got
// . gigabits, reference pages and dmoz topics
// . keep track of how many are out
m_tasksRemaining = 0;
// debug msg
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf(LOG_DEBUG,"query: [%"PTRFMT"] Getting topics/gigabits, "
"reference pages and dir pages.",(PTRTYPE)this);
// . do not bother getting topics if we are passed first page
// . AWL NOTE: pqr needs topics on all pages
//if ( m_si->m_firstResultNum > 0 ) return launchMsg20s ( false );
// do not bother getting topics if we will be re-called below so we
// will be here again!
//if ( numVisible < m_docsToGet && // are we short?
// m_msg3a.m_tier+1 < MAX_TIERS && // do we have a tier to go to?
// ! m_msg3a.m_isDiskExhausted )// SOME more data on disk?
// return launchMsg20s ( false );
// . TODO: do this LAST, after we get all summaries and do PQR
// . TODO: give it all our Msg20s so it can just use those!!!
// . get the topics
// . returns true right away if m_docsToScanForTopics is <= 0
// if (!m_msg24.generateTopics ( m_si->m_coll ,
// m_si->m_collLen ,
// m_msg3a.m_q->m_orig ,// query
// m_msg3a.m_q->m_origLen ,// query
// m_msg3a.m_docIds ,
// m_msg3a.m_clusterLevels ,
// m_msg3a.m_numDocIds ,
// m_si->m_topicGroups ,
// m_si->m_numTopicGroups ,
// maxAge ,
// m_si->m_wcache ,//addToCache
// m_si->m_returnDocIdCount ,
// m_si->m_returnDocIds ,
// m_si->m_returnPops ,
// this ,
// didTaskWrapper ,
// m_si->m_niceness ))
// m_tasksRemaining++;
//generate reference and related pages.
// if ( ! m_msg1a.generateReferences(m_si,(void*)this,didTaskWrapper) )
// m_tasksRemaining++;
//
// call Msg2b to generate directory
//
// why is this here? it does not depend on the docids. (mdw 9/25/13)
// dissect it and fix it!!
//
//if ( m_si->m_catId &&
// ! m_msg2b.generateDirectory ( m_si->m_catId,
// (void*)this,
// didTaskWrapper ) )
// m_tasksRemaining++;
return launchMsg20s ( false );
}
bool Msg40::mergeDocIdsIntoBaseMsg3a() {
// only do this if we were searching multiple collections, otherwise
// all the docids are already in m_msg3a
if ( m_numCollsToSearch <= 1 ) return true;
// free any mem in use
m_msg3a.reset();
// count total docids into "td"
int32_t td = 0;
for ( int32_t i = 0 ; i < m_numCollsToSearch ; i++ ) {
Msg3a *mp = m_msg3aPtrs[i];
td += mp->m_numDocIds;
// reset cursor for list of docids from this collection
mp->m_cursor = 0;
// add up here too
m_msg3a.m_numTotalEstimatedHits += mp->m_numTotalEstimatedHits;
}
// set up to merge all msg3as into our one m_msg3a
int32_t need = 0;
need += td * 8;
need += td * sizeof(double);
need += td * sizeof(key_t);
need += td * 1;
need += td * sizeof(collnum_t);
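// Illustrative sizing (assuming gb's 12-byte key_t and a 2-byte
// collnum_t): with td = 100 merged docids, need = 100*(8+8+12+1+2) =
// 3100 bytes, carved below into the docid, score, cluster rec,
// cluster level and collnum arrays.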
// make room for the merged docids
m_msg3a.m_finalBuf = (char *)mmalloc ( need , "finalBuf" );
m_msg3a.m_finalBufSize = need;
// return true with g_errno set
if ( ! m_msg3a.m_finalBuf ) return true;
// parse the memory up into arrays
char *p = m_msg3a.m_finalBuf;
m_msg3a.m_docIds = (int64_t *)p; p += td * 8;
m_msg3a.m_scores = (double *)p; p += td * sizeof(double);
m_msg3a.m_clusterRecs = (key_t *)p; p += td * sizeof(key_t);
m_msg3a.m_clusterLevels = (char *)p; p += td * 1;
m_msg3a.m_scoreInfos = NULL;
m_msg3a.m_collnums = (collnum_t *)p; p += td * sizeof(collnum_t);
if ( p - m_msg3a.m_finalBuf != need ) { char *xx=NULL;*xx=0; }
m_msg3a.m_numDocIds = td;
//
// begin the collection merge
//
int32_t next = 0;
loop:
// get next biggest score
double max = -1000000000.0;
Msg3a *maxmp = NULL;
for ( int32_t i = 0 ; i < m_numCollsToSearch ; i++ ) {
// shortcut
Msg3a *mp = m_msg3aPtrs[i];
// get cursor
int32_t cursor = mp->m_cursor;
// skip if exhausted
if ( cursor >= mp->m_numDocIds ) continue;
// get his next score
double score = mp->m_scores[ cursor ];
if ( score <= max ) continue;
// got a new winner
max = score;
maxmp = mp;
}
// store him
if ( maxmp ) {
m_msg3a.m_docIds [next] = maxmp->m_docIds[maxmp->m_cursor];
m_msg3a.m_scores [next] = maxmp->m_scores[maxmp->m_cursor];
m_msg3a.m_collnums[next] = maxmp->m_rrr.m_collnum;
m_msg3a.m_clusterLevels[next] = CR_OK;
maxmp->m_cursor++;
next++;
goto loop;
}
// free tmp msg3as now
for ( int32_t i = 0 ; i < m_numCollsToSearch ; i++ ) {
if ( m_msg3aPtrs[i] == &m_msg3a ) continue;
mdelete ( m_msg3aPtrs[i] , sizeof(Msg3a), "tmsg3a");
delete ( m_msg3aPtrs[i] );
m_msg3aPtrs[i] = NULL;
}
return true;
}
// . returns false and sets g_errno/m_errno on error
// . makes m_msg3a.m_numDocIds ptrs to Msg20s.
// . does not allocate a Msg20 in the buffer if the m_msg3a.m_clusterLevels[i]
// is something other than CR_OK
bool Msg40::reallocMsg20Buf ( ) {
// if the user only requested docids, we have no summaries
if ( m_si->m_docIdsOnly ) return true;
// . allocate m_buf2 to hold all our Msg20 pointers and Msg20 classes
// . how much mem do we need?
// . need space for the msg20 ptrs
int32_t need = m_msg3a.m_numDocIds * sizeof(Msg20 *);
// need space for the classes themselves, only if "visible" though
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ )
if ( m_msg3a.m_clusterLevels[i] == CR_OK )
need += sizeof(Msg20);
// MDW: try to preserve the old Msg20s if we are being re-called
if ( m_buf2 ) {
// we do not do recalls when streaming yet
if ( m_si->m_streamResults ) { char *xx=NULL;*xx=0; }
// use these 3 vars for mismatch stat reporting
//int32_t mismatches = 0;
//int64_t mismatch1 = 0LL;
//int64_t mismatch2 = 0LL;
// make new buf
char *newBuf = (char *)mmalloc(need,"Msg40d");
// return false if it fails
if ( ! newBuf ) { m_errno = g_errno; return false; }
// fill it up
char *p = newBuf;
// point to our new array of Msg20 ptrs
Msg20 **tmp = (Msg20 **)p;
// skip over pointer array
p += m_msg3a.m_numDocIds * sizeof(Msg20 *);
// record start to set to m_msg20StartBuf
char *pstart = p;
// and count for m_numToFree
int32_t pcount = 0;
// fill in the actual Msg20s from the old buffer
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// assume empty, because clustered, filtered, etc.
tmp[i] = NULL;
// if clustered, keep it as a NULL ptr
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// point it to its memory
tmp[i] = (Msg20 *)p;
// point to the next Msg20
p += sizeof(Msg20);
// init it
tmp[i]->constructor();
// set this now
tmp[i]->m_owningParent = (void *)this;
tmp[i]->m_constructedId = 1;
// count it
pcount++;
// skip it if it is a new docid, we do not have a Msg20
// for it from the previous tier. IF it is from
// the current tier, THEN it is new.
//if ( m_msg3a.m_tiers[i] == m_msg3a.m_tier ) continue;
// see if we can find this docid from the old list!
int32_t k = 0;
for ( ; k < m_numMsg20s ; k++ ) {
// skip if NULL
if ( ! m_msg20[k] ) continue;
// if it never gave us a reply then skip it
if ( ! m_msg20[k]->m_gotReply ) continue;
//or if it had an error
if ( m_msg20[k]->m_errno ) continue;
// skip if no match
if ( m_msg3a .m_docIds[i] !=
m_msg20[k]->m_r->m_docId )//getDocId() )
continue;
// we got a match, grab its Msg20
break;
}
// . skip if we could not match it... strange...
// . no, because it may have been in the prev tier,
// from a split, but it was not in msg3a's final
// merged list made in Msg3a::mergeLists(), but now
// it is in there, with the previous tier, because
// we asked for more docids from msg3a.
// . NO! why did we go to the next tier unnecessarily
// THEN? no again, because we did a msg3a recall
// and asked for more docids which required us
// going to the next tier, even though some (but
// not enough) docids remained in the previous tier.
if ( k >= m_numMsg20s ) {
/*
logf(LOG_DEBUG,"query: msg40: could not match "
"docid %"INT64" (max=%"INT32") "
"to msg20. newBitScore=0x%hhx q=%s",
m_msg3a.m_docIds[i],
(char)m_msg3a.m_bitScores[i],
m_msg3a.m_q->m_orig);
*/
continue;
}
// it is from an older tier but never got the msg20
// for it? what happened? it got unclustered??
if ( ! m_msg20[k] ) continue;
/*
// . make sure they match!
// . they may get mismatched after the recall because
// a new doc gets added to the index!!!
// . also, if the re-call gets the termlist from a
// different twin and gets back different docids
// or scores, it will change this too!!
if ( tmp[i]->m_docId >= 0 &&
tmp[i]->m_docId != m_msg3a.m_docIds[i] ) {
// it should be rare!!!
mismatches++;
if ( ! mismatch1 )
mismatch1 = tmp[i]->m_docId;
if ( ! mismatch2 )
mismatch2 = m_msg3a.m_docIds[i];
continue;
//logf(LOG_DEBUG,"query: msg40: docid mismatch"
// " at #%"INT32". olddocid=%"INT64" newdocid=%"INT64"",i,
// tmp[i]->m_docId,m_msg3a.m_docIds[i]);
// core for testing on gb1d only!!!
//char *xx = NULL; *xx = 0;
}
*/
// . otherwise copy the memory if available
// . if m_msg20[i]->m_docId is set this will save us
// repeating a summary lookup
tmp[i]->copyFrom ( m_msg20[k] );
}
// sanity check
if ( p - (char *)tmp != need ) { char *xx = NULL; *xx = 0; }
resetBuf2();
// destroy all the old msg20s, this was mem leaking
//for ( int32_t i = 0 ; i < m_numMsg20s ; i++ ) {
// // assume empty, because clustered, filtered, etc.
// if ( ! m_msg20[i] ) continue;
// // call its destructor
// m_msg20[i]->destructor();
//}
// the new buf2 stuff
m_numToFree = pcount;
m_msg20StartBuf = pstart;
// re-assign the msg20 ptr to the ptrs
m_msg20 = tmp;
// update new count
m_numMsg20s = m_msg3a.m_numDocIds;
// free old buf
//mfree ( m_buf2 , m_bufMaxSize2 , "Msg40c");
// assign to new mem
m_buf2 = newBuf;
m_bufMaxSize2 = need;
// note it since this is inefficient for now
// . crap this is messing up m_nextMerged ptr!!
//log("query: msg40: rellocated msg20 buffer");
// show mismatch stats
//if ( mismatches )
// logf(LOG_DEBUG,"query: msg40: docid %"INT64" mismatched "
// "%"INT64". Total of %"INT32" mismathes. q=%s",
// mismatch1,mismatch2,mismatches,
// m_msg3a.m_q->m_orig );
// all done
return true;
}
m_numMsg20s = m_msg3a.m_numDocIds;
// when streaming because we can have hundreds of thousands of
// search results we recycle a few msg20s to save mem
if ( m_si->m_streamResults ) {
int32_t max = MAX_OUTSTANDING_MSG20S * 2;
if ( m_msg3a.m_numDocIds < max ) max = m_msg3a.m_numDocIds;
need = 0;
need += max * sizeof(Msg20 *);
need += max * sizeof(Msg20);
m_numMsg20s = max;
}
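// Illustrative bound: when streaming, at most 2*MAX_OUTSTANDING_MSG20S =
// 400 Msg20s (fewer if there are fewer docids) are allocated here and
// recycled via getAvailMsg20(), so summary memory stays bounded no
// matter how many docids m_msg3a returned.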
m_buf2 = NULL;
m_bufMaxSize2 = need;
// do the alloc
if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }
// point to the mem
char *p = m_buf2;
// point to the array, then make p point to the Msg20 buffer space
m_msg20 = (Msg20 **)p;
p += m_numMsg20s * sizeof(Msg20 *);
// start free here
m_msg20StartBuf = p;
// set the m_msg20[] array to use this memory, m_buf20
for ( int32_t i = 0 ; i < m_numMsg20s ; i++ ) {
// assume empty
m_msg20[i] = NULL;
// if clustered, do a NULL ptr
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// point it to its memory
m_msg20[i] = (Msg20 *)p;
// call its constructor
m_msg20[i]->constructor();
// set this now
m_msg20[i]->m_owningParent = (void *)this;
m_msg20[i]->m_constructedId = 2;
// point to the next Msg20
p += sizeof(Msg20);
// remember num to free in reset() function
m_numToFree++;
}
// remember how many we got in here in case we have to realloc above
//m_numMsg20s = m_msg3a.m_numDocIds;
return true;
}
void didTaskWrapper ( void* state ) {
Msg40 *THIS = (Msg40 *) state;
// one less task
THIS->m_tasksRemaining--;
// this returns false if blocked
if ( ! THIS->launchMsg20s ( false ) ) return;
// we are done, call the callback
THIS->m_callback ( THIS->m_state );
}
bool Msg40::launchMsg20s ( bool recalled ) {
// don't launch any more if client browser closed socket
if ( m_socketHadError ) { char *xx=NULL; *xx=0; }
// these are just like for passing to Msg39 above
int32_t maxAge = 0 ;
//if ( m_si->m_rcache ) maxAge = g_conf.m_titledbMaxCacheAge;
// make it somewhat jibe with the search results caching, otherwise
// it will tell me a search result was indexed like 3 days ago
// when it was just indexed 10 minutes ago because the
// titledbMaxCacheAge was set way too high
if ( m_si->m_rcache ) maxAge = g_conf.m_searchResultsMaxCacheAge;
/*
// "need" = how many more msg20 replies do we need to get back to
// get the required number of search results?
int32_t sample = 0;
int32_t good = 0;
int32_t gaps = 0;
int32_t goodAfterGaps = 0;
// loop up to the last msg20 request we actually launched
for ( int32_t i = 0 ; i <= m_maxiLaunched ; i++ ) {
// if Msg51 had initially clustered (CR_CLUSTERED) this away
// we never actually gave it a msg20 ptr, so it is NULL whenever
// m_msg3a.m_clusterLevels[i] != CR_OK.
if ( ! m_msg20[i] ) continue;
// do not count if reply not received yet. it is a gap.
if ( ! m_msg20[i]->m_gotReply ) { gaps++; continue; }
// ok, we had launched it and got a reply for it, it is
// therefore in our "sample", used to make the visibility ratio
sample++;
// . skip if not "good" (visible)
// . if msg20 has error, sets cluster level to CR_ERROR_SUMMARY
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// count as good. it is visible.
if ( gaps ) goodAfterGaps++;
else good++;
}
// how many MORE docs do we need to get? subtract what we already
// have that is visible, as long as it is before any gap, from what was desired
int32_t need = m_docsToGetVisible - good ;
// if we fill in the gaps, we get "goodAfterGaps" more visible results
if ( need >= gaps ) {
// so no need to get these then
need -= goodAfterGaps ;
// but watch out for flooding!
if ( need < gaps ) need = gaps;
}
// how many total good?
int32_t allGood = good + goodAfterGaps;
// get the visibility ratio from the replies we did get back
float ratio ;
if ( allGood > 0 ) ratio = (float)sample / (float)allGood;
else ratio = (float)sample / 1.0 ;
// give a 5% boost
ratio *= 1.05;
// assume some of what we "need" will be invisible, make up for that
if ( sample > 0 ) need = (int32_t)((float)need * ratio);
// . restrict "need" to no more than 50 at a time
// . we are using it for a "max outstanding" msg20s
// . do not overflow the udpservers
if ( need > 50 ) need = 50;
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf(LOG_DEBUG,"query: msg40: can launch %"INT32" more msg20s. "
"%"INT32" out. %"INT32" completed. %"INT32" visible. %"INT32" gaps. "
"%"INT32" contiguous. %"INT32" toGet. ",
need,m_numRequests-m_numReplies,sample,allGood,gaps,
m_numContiguous,m_docsToGet);
*/
int32_t bigSampleRadius = 0;
int32_t bigSampleMaxLen = 0;
// NOTE: pqr needs gigabits for all pages
if(m_docsToScanForTopics > 0 /*&& m_si->m_firstResultNum == 0*/) {
bigSampleRadius = 300;
//bigSampleMaxLen = m_si->m_topicGroups[0].m_topicSampleSize;
bigSampleMaxLen = 5000;
}
int32_t maxOut = (int32_t)MAX_OUTSTANDING_MSG20S;
if ( g_udpServer.getNumUsedSlots() > 500 ) maxOut = 10;
if ( g_udpServer.getNumUsedSlots() > 800 ) maxOut = 1;
// if not deduping or site clustering or getting gigabits, then
// just skip over docids for speed.
// don't bother with summaries we do not need
if ( m_si &&
! m_si->m_doDupContentRemoval &&
! m_si->m_doSiteClustering &&
// gigabits required the first X summaries to be computed
m_docsToScanForTopics <= 0 &&
m_lastProcessedi == -1 ) {
// start getting summaries with the result # they want
m_lastProcessedi = m_si->m_firstResultNum-1;
// assume we printed the summaries before
m_printi = m_si->m_firstResultNum;
m_numDisplayed = m_si->m_firstResultNum;
// fake this so Msg40::gotSummary() can let us finish
// because it checks m_numRequests < m_msg3a.m_numDocIds
m_numRequests = m_si->m_firstResultNum;
m_numReplies = m_si->m_firstResultNum;
m_didSummarySkip = true;
log("query: skipping summary generation of first %"INT32" docs",
m_si->m_firstResultNum);
}
// if not doing deduping or site clustering, let's not get like
// 100 summaries at a time when we only wanted 10 results
// for performance reasons
// if ( m_si &&
// ! m_si->m_doDupContentRemoval &&
// ! m_si->m_doSiteClustering &&
// maxOut > m_si->m_docsWanted )
// maxOut = m_si->m_docsWanted;
// . launch a msg20 getSummary() for each docid
// . m_numContiguous should precede any gap, see below
for ( int32_t i = m_lastProcessedi+1 ; i < m_msg3a.m_numDocIds ; i++ ) {
// if the user only requested docids, do not get the summaries
if ( m_si->m_docIdsOnly ) break;
// if we have enough visible then no need to launch more!
//if ( m_gotEnough ) break;
// limit it to this dynamic limit so we can check to
// see if we got enough visible each time we get one back!
// this prevents us from having to do like 50 msg20 lookups
// at a time.
//if ( m_numRequests-m_numReplies >= need ) break;
// hard limit
if ( m_numRequests-m_numReplies >= maxOut ) break;
// do not launch another until m_printi comes back because
// all summaries are bottlenecked on printing him out now
if ( m_si->m_streamResults &&
i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 )
break;
// if we have printed enough summaries then do not launch
// any more, wait for them to come back in.
/// this is causing problems because we have a bunch of
// m_printi < m_msg3a.m_numDocIds checks that kinda expect
// us to get all summaries for every docid. but when we
// do federated search we can get a ton of docids.
// if ( m_printi >= m_docsToGetVisible ) {
// logf(LOG_DEBUG,"query: got %"INT32" >= %"INT32" "
// "summaries. done. "
// "waiting on remaining "
// "%"INT32" to return."
// , m_printi
// , m_docsToGetVisible
// , m_numRequests-m_numReplies);
// // wait for all msg20 replies to come in
// if ( m_numRequests != m_numReplies ) break;
// // then let's hack fix this then so we can call
// // printSearchResultsTail()
// m_printi = m_msg3a.m_numDocIds;
// // set these to max so they do not launch another
// // summary request, just in case, below
// m_numRequests = m_msg3a.m_numDocIds;
// m_numReplies = m_msg3a.m_numDocIds;
// break;
// }
// do not double count!
//if ( i <= m_lastProcessedi ) continue;
// do not repeat for this i
m_lastProcessedi = i;
// if we have printed enough summaries then do not launch
// any more, wait for them to come back in.
/// this is causing problems because we have a bunch of
// m_printi < m_msg3a.m_numDocIds checks that kinda expect
// us to get all summaries for every docid. but when we
// do federated search we can get a ton of docids.
// if ( m_printi >= m_docsToGetVisible ) {
// logf(LOG_DEBUG,"query: got %"INT32" >= %"INT32" "
// "summaries. done. "
// "waiting on remaining "
// "%"INT32" to return."
// , m_printi
// , m_docsToGetVisible
// , m_numRequests-m_numReplies);
// m_numRequests++;
// m_numReplies++;
// continue;
// }
// start up a Msg20 to get the summary
Msg20 *m = NULL;
if ( m_si->m_streamResults ) {
// there can be hundreds of thousands of results
// when streaming, so recycle a few msg20s to save mem
m = getAvailMsg20();
// mark it so we know which docid it goes with
m->m_ii = i;
}
else
m = m_msg20[i];
// if msg20 ptr null that means the cluster level is not CR_OK
if ( ! m ) {
m_numRequests++;
m_numReplies++;
continue;
}
// . did we already TRY to get the summary for this docid?
// . we might be re-called from the refilter: below
// . if already did it, skip it
// . Msg20::getSummary() sets m_docId, first thing
if ( m_msg3a.m_docIds[i] == m->getRequestDocId() ) {
m_numRequests++;
m_numReplies++;
continue;
}
// get the LinkInfo class to set from it
//LinkInfo* li = NULL;
//if ( m_si->m_refs_numToGenerate != 0 &&
// m_si->m_refs_docsToScan > 0 )
// li = m_msg1a.getLinkInfo(i);
// ENGINEER NOTE -- use a single call to msg20 and use flags..
//if ( m_si->m_rp_useResultsAsReferences ) {
// if (!m_tmpMsg20[n].
// getSummary(m_msg3a.getQuery(),
// assume no error
g_errno = 0;
// debug msg
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf(LOG_DEBUG,"query: msg40: [%"PTRFMT"] Getting "
"summary #%"INT32" for docId=%"INT64"",
(PTRTYPE)this,i,m_msg3a.m_docIds[i]);
// launch it
m_numRequests++;
// keep for-loops shorter with this
//if ( i > m_maxiLaunched ) m_maxiLaunched = i;
// get the collection rec
CollectionRec *cr =g_collectiondb.getRec(m_firstCollnum);
//getRec(m_si->m_coll2,m_si->m_collLen2);
if ( ! cr ) {
log("msg40: missing coll");
g_errno = ENOCOLLREC;
if ( m_numReplies < m_numRequests ) return false;
return true;
}
// set the summary request then get it!
Msg20Request req;
Query *q = &m_si->m_q;
req.ptr_qbuf = q->getQuery();
req.size_qbuf = q->getQueryLen()+1;
req.m_langId = m_si->m_queryLangId;
// set highlight query
if ( m_si->m_highlightQuery &&
m_si->m_highlightQuery[0] ) {
req.ptr_hqbuf = m_si->m_highlightQuery;
req.size_hqbuf = gbstrlen(req.ptr_hqbuf)+1;
}
int32_t q3size = m_si->m_sbuf3.length()+1;
if ( q3size == 1 ) q3size = 0;
//req.ptr_q2buf = m_si->m_sbuf3.getBufStart();
//req.size_q2buf = q3size;
req.m_isMasterAdmin = m_si->m_isMasterAdmin;
//req.m_rulesetFilter = m_si->m_ruleset;
//req.m_getTitleRec = m_si->m_getTitleRec;
//req.m_isSuperTurk = m_si->m_isSuperTurk;
req.m_highlightQueryTerms = m_si->m_doQueryHighlighting;
//req.m_highlightDates = m_si->m_doDateHighlighting;
//req.ptr_coll = m_si->m_coll2;
//req.size_coll = m_si->m_collLen2+1;
req.m_isDebug = (bool)m_si->m_debug;
if ( m_si->m_displayMetas && m_si->m_displayMetas[0] ) {
int32_t dlen = gbstrlen(m_si->m_displayMetas);
req.ptr_displayMetas = m_si->m_displayMetas;
req.size_displayMetas = dlen+1;
}
req.m_docId = m_msg3a.m_docIds[i];
// if the msg3a was merged from other msg3as because we
// were searching multiple collections...
if ( m_msg3a.m_collnums )
req.m_collnum = m_msg3a.m_collnums[i];
// otherwise, just one collection
else
req.m_collnum = m_msg3a.m_rrr.m_collnum;
req.m_numSummaryLines = m_si->m_numLinesInSummary;
req.m_maxCacheAge = maxAge;
req.m_wcache = m_si->m_wcache; // addToCache
req.m_state = this;
req.m_callback = gotSummaryWrapper;
req.m_niceness = m_si->m_niceness;
req.m_summaryMode = m_si->m_summaryMode;
// need to see if it is banned, etc.
//req.m_checkSitedb = 1;
// 0 means not, 1 means is (should never be 2 at this point)
req.m_boolFlag = m_si->m_boolFlag;
req.m_allowPunctInPhrase = m_si->m_allowPunctInPhrase;
req.m_showBanned = m_si->m_showBanned;
//req.m_excludeLinkText = m_si->m_excludeLinkText ;
//req.m_excludeMetaText = m_si->m_excludeMetaText ;
req.m_includeCachedCopy = m_si->m_includeCachedCopy;//bigsmpl
req.m_getSectionVotingInfo = m_si->m_getSectionVotingInfo;
req.m_considerTitlesFromBody = m_si->m_considerTitlesFromBody;
if ( cr->m_considerTitlesFromBody )
req.m_considerTitlesFromBody = true;
req.m_expected = true;
req.m_getSummaryVector = true;
req.m_bigSampleRadius = bigSampleRadius;
req.m_bigSampleMaxLen = bigSampleMaxLen;
//req.m_titleMaxLen = 256;
req.m_titleMaxLen = m_si->m_titleMaxLen; // cr->
req.m_summaryMaxLen = cr->m_summaryMaxLen;
// Line means excerpt
req.m_summaryMaxNumCharsPerLine =
m_si->m_summaryMaxNumCharsPerLine;
// a special undocumented thing for getting <h1> tag
req.m_getHeaderTag = m_si->m_hr.getLong("geth1tag",0);
//req.m_numSummaryLines = cr->m_summaryMaxNumLines;
// let "ns" parm override
req.m_numSummaryLines = m_si->m_numLinesInSummary;
if(m_si->m_isMasterAdmin && m_si->m_format == FORMAT_HTML )
req.m_getGigabitVector = true;
else req.m_getGigabitVector = false;
req.m_flags = 0;
if ( m_postQueryRerank.isEnabled() ) {
req.m_flags |= REQ20FLAG1_PQRENABLED;
if (m_si->m_pqr_demFactLocSummary > 0)
req.m_flags |= REQ20FLAG1_PQRLOCENABLED;
}
if ( m_si->m_pqr_demFactCommonInlinks > 0.0 )
//req.m_getInlinks = true;
req.m_getLinkInfo = true;
// . buzz likes to do the &inlinks=1 parm to get inlinks
// . use "&inlinks=1" for realtime inlink info, use
// "&inlinks=2" to just get it from the title rec, which is
// more stale, but does not take extra time or resources
// . we "default" to the realtime stuff... i.e. since buzz
// is already using "&inlinks=1"
if ( m_si->m_displayInlinks == 1 )
req.m_computeLinkInfo = true;
if ( m_si->m_displayInlinks == 2 )
//req.m_getInlinks = true;
req.m_getLinkInfo = true;
if ( m_si->m_displayInlinks == 3 )
req.m_computeLinkInfo = true;
if ( m_si->m_displayInlinks == 4 )
req.m_computeLinkInfo = true;
if ( m_si->m_displayOutlinks )
req.m_getOutlinks = true;
// buzz still wants the SitePop, computed fresh from Msg25,
// even if they do not say "&inlinks=4" ... but they do
// seem to specify getsitepops, so use that too
//if ( m_si->m_getSitePops )
// req.m_computeLinkInfo = true;
if (m_si->m_queryMatchOffsets)
req.m_getMatches = true;
// it copies this using a serialize() function
if ( ! m->getSummary ( &req ) ) continue;
// got reply
m_numReplies++;
// . otherwise we got summary without blocking
// . deal with an error
if ( ! g_errno ) continue;
// log it
log("query: Had error getting summary: %s.",
mstrerror(g_errno));
// record g_errno
if ( ! m_errno ) m_errno = g_errno;
// reset g_errno
g_errno = 0;
}
// return false if still waiting on replies
if ( m_numReplies < m_numRequests ) return false;
// do not re-call gotSummary() to avoid a possible recursive stack
// explosion. this is only true if we are being called from
// gotSummary() already, so do not call it again!!
if ( recalled )
return true;
// if we got nothing, that's it
if ( m_msg3a.m_numDocIds <= 0 ) {
// but if in streaming mode we still have to stream the
// empty results back
if ( m_si->m_streamResults ) return gotSummary ( );
// otherwise, we're done
return true;
}
// . i guess crash here for now
// . seems like we can call reallocMsg20Buf() and the first 50
// can already be set, so we drop down to here... so don't core
logf(LOG_DEBUG,"query: Had all msg20s already.");
// . otherwise, we got everyone, so go right to the merge routine
// . returns false if not all replies have been received
// . returns true if done
// . sets g_errno on error
return gotSummary ( );
}
Msg20 *Msg40::getAvailMsg20 ( ) {
for ( int32_t i = 0 ; i < m_numMsg20s ; i++ ) {
// m_inProgress is set to false right before it
// calls Msg20::m_callback which is gotSummaryWrapper()
// so we should be ok with this
if ( m_msg20[i]->m_launched ) continue;
return m_msg20[i];
}
// how can this happen???
char *xx=NULL;*xx=0;
return NULL;
}
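// Recycling protocol when streaming (illustrative summary of the code
// around it): launchMsg20s() tags each recycled Msg20 with m->m_ii =
// result index, gotSummary() later retrieves the reply for result #ix
// via getCompletedSummary(ix), and once m20->reset() is called the slot
// becomes available here again (assuming reset() clears m_launched).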
Msg20 *Msg40::getCompletedSummary ( int32_t ix ) {
for ( int32_t i = 0 ; i < m_numMsg20s ; i++ ) {
if ( m_msg20[i]->m_ii != ix ) continue;
if ( m_msg20[i]->m_inProgress ) return NULL;
return m_msg20[i];
}
return NULL;
}
bool gotSummaryWrapper ( void *state ) {
Msg40 *THIS = (Msg40 *)state;
// inc it here
THIS->m_numReplies++;
// log every 1000 i guess
if ( (THIS->m_numReplies % 1000) == 0 )
log("msg40: got %"INT32" summaries out of %"INT32"",THIS->m_numReplies,
THIS->m_msg3a.m_numDocIds);
// it returns false if we're still awaiting replies
if ( ! THIS->gotSummary ( ) ) return false;
// lookup facets
if ( THIS->m_si &&
! THIS->m_si->m_streamResults &&
! THIS->lookupFacets() )
return false;
// now call callback, we're done
THIS->m_callback ( THIS->m_state );
return true;
}
void doneSendingWrapper9 ( void *state , TcpSocket *sock ) {
Msg40 *THIS = (Msg40 *)state;
// the send completed, count it
THIS->m_sendsIn++;
// socket error? if client closes the socket midstream we get one.
if ( g_errno ) {
THIS->m_socketHadError = g_errno;
log("msg40: streaming socket had error: %s",
mstrerror(g_errno));
// i guess destroy the socket here so we don't get called again?
}
// clear it so we don't think it was a msg20 error below
g_errno = 0;
// try to send more... returns false if blocked on something
if ( ! THIS->gotSummary() ) return;
// all done!!!???
THIS->m_callback ( THIS->m_state );
}
// . returns false if not all replies have been received (or timed/erroredout)
// . returns true if done (or an error finished us)
// . sets g_errno on error
bool Msg40::gotSummary ( ) {
// now m_linkInfo[i] (for some i, i dunno which) is filled
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf(LOG_DEBUG,"query: msg40: [%"PTRFMT"] Got summary. "
"Total got=#%"INT32".",
(PTRTYPE)this,m_numReplies);
// come back up here if we have to get more docids from Msg3a and
// it gives us more right away without blocking, then we need to
// re-filter them!
// refilter:
// did we have a problem getting this summary?
if ( g_errno ) {
// save it
m_errno = g_errno;
// log it
if ( g_errno != EMISSINGQUERYTERMS )
log("query: msg40: Got error getting summary: %s.",
mstrerror(g_errno));
// reset g_errno
g_errno = 0;
}
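// a note on the set() args below (assuming HashTableX's (keySize,
// dataSize, initialSlots, buf, bufSize, allowDups, niceness, label)
// signature): 4-byte keys for the 32-bit content hashes, no data per
// slot, 64 initial slots, no caller-supplied buffer, dups disallowed.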
// initialize dedup table if we haven't already
if ( ! m_dedupTable.isInitialized() &&
! m_dedupTable.set (4,0,64,NULL,0,false,m_si->m_niceness,"srdt") )
log("query: error initializing dedup table: %s",
mstrerror(g_errno));
State0 *st = (State0 *)m_state;
// keep socket alive if not streaming. like downloading csv...
// this fucks up HTTP replies by inserting a space before the "HTTP"
// it does not render properly on the browser...
/*
int32_t now2 = getTimeLocal();
if ( now2 - m_lastHeartbeat >= 10 && ! m_si->m_streamResults &&
// incase socket is closed and recycled for another connection
st->m_socket->m_numDestroys == st->m_numDestroys ) {
m_lastHeartbeat = now2;
int n = ::send ( st->m_socket->m_sd , " " , 1 , 0 );
log("msg40: sent heartbeat of %"INT32" bytes on sd=%"INT32"",
(int32_t)n,(int32_t)st->m_socket->m_sd);
}
*/
/*
// sanity check
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// stop as soon as we hit a gap breaking our contiguity...
Msg20 *m = m_msg20[i];
if ( ! m ) continue;
Msg20Reply *mr = m->m_r;
if ( ! mr ) continue;
char *cc = mr->ptr_content;
if ( ! cc ) continue;
//if ( ! strstr(cc,"Modern Marketing KF400032MA") ) continue;
//log("hey");
//fprintf(stderr,"msg %"INT32" = %s\n",i,cc );
if ( i == 48329 ) { char *xx=NULL;*xx=0; }
mr->ptr_content = NULL;
}
*/
doAgain:
SafeBuf *sb = &st->m_sb;
sb->reset();
// this is in PageResults.cpp
if ( m_si && m_si->m_streamResults && ! m_printedHeader ) {
// only print header once
m_printedHeader = true;
printHttpMime ( st );
printSearchResultsHeader ( st );
}
for ( ; m_si && m_si->m_streamResults&&m_printi<m_msg3a.m_numDocIds ;
m_printi++){
// if we are waiting on our previous send to complete... wait..
if ( m_sendsOut > m_sendsIn ) break;
// get summary for result #m_printi
Msg20 *m20 = getCompletedSummary ( m_printi );
// if printing csv we need the first 100 results back
// to get the most popular csv headers to print
// as the first row in the csv output. if we print a
// result with a column not in the header row then we
// augment the headers then and there, although the header
// row will be blank for the new column. we can put
// the updated header row at the end of the file i guess. this way
// we can immediately start streaming back the csv.
if ( m_needFirstReplies ) {
// need at least this many replies to process
if ( m_numReplies < m_needFirstReplies )
break;
// ensure we got the TOP needFirstReplies in order
// of their display to ensure consistency
int32_t k;
for ( k = 0 ; k < m_needFirstReplies ; k++ ) {
Msg20 *xx = getCompletedSummary(k);
if ( ! xx ) break;
if ( ! xx->m_r ) break;
}
// if not all have come back yet, wait longer...
if ( k < m_needFirstReplies ) break;
// now make the csv header and print it
printCSVHeaderRow ( sb );
// and no longer need to do this logic
m_needFirstReplies = 0;
}
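// a minimal sketch of the header-augmentation idea described above,
// using hypothetical names (the real header printing is
// printCSVHeaderRow(), presumably in PageResults.cpp like the other
// print routines): tally each json field name seen in the first
// batch of replies, then print the most popular names as row #1.
/*
HashTableX counts;
counts.set ( 8,4,256,NULL,0,false,m_si->m_niceness,"csvcnt" );
for ( int32_t k = 0 ; k < m_needFirstReplies ; k++ ) {
Msg20 *x = getCompletedSummary ( k );
if ( ! x || ! x->m_r ) continue;
// for each json field name f (length flen) in x->m_r->ptr_content:
//   int64_t h = hash64 ( f , flen );
//   bump the count for key h in "counts"
}
// then print the tallied field names, most popular first, as row #1
*/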
// otherwise, get the summary for result #m_printi
//Msg20 *m20 = m_msg20[m_printi];
//if ( ! m20 ) {
// log("msg40: m20 NULL #%"INT32"",m_printi);
// continue;
//}
// if result summary #i not yet in, wait...
if ( ! m20 )
break;
// wait if no reply for it yet
//if ( m20->m_inProgress )
// break;
if ( m20->m_errno ) {
log("msg40: sum #%"INT32" error: %s",
m_printi,mstrerror(m20->m_errno));
// make it available to be reused
m20->reset();
continue;
}
// get the next reply we are waiting on to print results in order
Msg20Reply *mr = m20->m_r;
if ( ! mr ) break;
//if ( ! mr ) { char *xx=NULL;*xx=0; }
// primitive deduping. for diffbot json exclude url's from the
// XmlDoc::m_contentHash32.. it will be zero if invalid i guess
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or success
// code
mr->m_contentType != CT_STATUS &&
m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
//if ( g_conf.m_logDebugQuery )
log("msg40: dup sum #%"INT32" (%"UINT32")(d=%"INT64")",m_printi,
mr->m_contentHash32,mr->m_docId);
// make it available to be reused
m20->reset();
continue;
}
// static int32_t s_bs = 0;
// if ( (s_bs++ % 5) != 0 ) {
// log("msg40: FAKE dup sum #%"INT32" (%"UINT32")(d=%"INT64")",m_printi,
// mr->m_contentHash32,mr->m_docId);
// // make it available to be reused
// m20->reset();
// continue;
// }
// return true with g_errno set on error
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or success
// code
mr->m_contentType != CT_STATUS &&
! m_dedupTable.addKey ( &mr->m_contentHash32 ) ) {
m_hadPrintError = true;
log("msg40: error adding to dedup table: %s",
mstrerror(g_errno));
}
// assume we show this to the user
m_numDisplayed++;
//log("msg40: numdisplayed=%"INT32"",m_numDisplayed);
// do not print it if before the &s=X start position though
if ( m_si && m_numDisplayed <= m_si->m_firstResultNum ){
log("msg40: hiding #%"INT32" (%"UINT32")(d=%"INT64")",
m_printi,mr->m_contentHash32,mr->m_docId);
m20->reset();
continue;
}
// . ok, we got it, so print it and stream it
// . this might set m_hadPrintError to true
printSearchResult9 ( m_printi , &m_numPrintedSoFar , mr );
//m_numPrintedSoFar++;
//log("msg40: printedsofar=%"INT32"",m_numPrintedSoFar);
// now free the reply to save memory since we could be
// streaming back 1M+. we call reset below, no need for this.
//m20->freeReply();
// return it so getAvailMsg20() can use it again
// this will set m_launched to false
m20->reset();
}
// set it to true on all but the last thing we send!
if ( m_si->m_streamResults )
st->m_socket->m_streamingMode = true;
// if streaming results, and too many results were clustered or
// deduped then try to get more by merging the docid lists that
// we already have from the shards. if this still does not provide
// enough docids then we will need to issue a new msg39 request to
// each shard to get even more docids from each shard.
if ( m_si && m_si->m_streamResults &&
// this is coring as well on multi collection federated searches
// so disable that for now too. it is because Msg3a::m_r is
// NULL.
m_numCollsToSearch == 1 &&
// must have no streamed chunk sends out
m_sendsOut == m_sendsIn &&
// if we did not ask for enough docids and they were mostly
// dups so they got deduped, then ask for more.
// m_numDisplayed includes results before the &s=X parm,
// and so does m_docsToGetVisible, so we can compare them.
m_numDisplayed < m_docsToGetVisible &&
// wait for us to have exhausted the docids we have merged
m_printi >= m_msg3a.m_numDocIds &&
// wait for us to have available msg20s to get summaries
m_numReplies == m_numRequests &&
// this is true if we can get more docids from merging
// more of the termlists from the shards together.
// otherwise, we will have to ask each shard for a
// higher number of docids.
m_msg3a.m_moreDocIdsAvail &&
// do not do this if client closed connection
! m_socketHadError ) { //&&
// doesn't work on multi-coll just yet, it cores.
// MAKE it.
//m_numCollsToSearch == 1 ) {
// can it cover us?
int32_t need = m_msg3a.m_docsToGet + 20;
// note it
log("msg40: too many summaries deduped. "
"getting more "
"docids from msg3a merge and getting summaries. "
"%"INT32" are visible, need %"INT32". "
"changing docsToGet from %"INT32" to %"INT32". "
"numReplies=%"INT32" numRequests=%"INT32"",
m_numDisplayed,
m_docsToGetVisible,
m_msg3a.m_docsToGet,
need,
m_numReplies,
m_numRequests);
// merge more docids from the shards' termlists
m_msg3a.m_docsToGet = need;
// sanity. the original msg39request must be there
if ( ! m_msg3a.m_r ) { char *xx=NULL;*xx=0; }
// this should increase m_msg3a.m_numDocIds
m_msg3a.mergeLists();
}
// if we've printed everything out and we are streaming, now
// get the facet text. when done this should print the tail
// like we do below. lookupFacets() should scan the facet values
// and each value should have a docid with it that we do the lookup
// on. and store the text into m_facetTextBuf safebuf, and make
// the facet table have the offset of it in that safebuf.
if ( m_si &&
m_si->m_streamResults &&
m_printi >= m_msg3a.m_numDocIds )
if ( ! lookupFacets () ) return false;
// . wrap it up with Next 10 etc.
// . this is in PageResults.cpp
if ( m_si &&
m_si->m_streamResults &&
! m_printedTail &&
m_printi >= m_msg3a.m_numDocIds ) {
m_printedTail = true;
printSearchResultsTail ( st );
if ( m_sendsIn < m_sendsOut ) { char *xx=NULL;*xx=0; }
// this will be our final send
st->m_socket->m_streamingMode = false;
}
TcpServer *tcp = &g_httpServer.m_tcp;
//g_conf.m_logDebugTcp = 1;
// . transmit the chunk in sb if non-zero length
// . steals the allocated buffer from sb and stores in the
// TcpSocket::m_sendBuf, which it frees when socket is
// ultimately destroyed or we call sendChunk() again.
// . when TcpServer is done transmitting, it does not close the
// socket but rather calls doneSendingWrapper() which can call
// this function again to send another chunk
// . when we are truly done sending all the data, then we set lastChunk
// to true and TcpServer.cpp will destroy m_socket when done
if ( sb->length() &&
// did client browser close the socket on us midstream?
! m_socketHadError &&
! tcp->sendChunk ( st->m_socket ,
sb ,
this ,
doneSendingWrapper9 ) )
// if it blocked, inc this count. we'll only call m_callback
// above when m_sendsIn equals m_sendsOut... and
// m_numReplies == m_numRequests
m_sendsOut++;
// writing on closed socket?
if ( g_errno ) {
m_socketHadError = g_errno;
log("msg40: got tcp error : %s",mstrerror(g_errno));
}
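// to summarize the chunked streaming handshake used above (just a
// restatement of the invariant, not new logic): each sendChunk()
// that blocks incs m_sendsOut, each doneSendingWrapper9() callback
// incs m_sendsIn, and the print loop above only advances when
// m_sendsOut == m_sendsIn, so at most one chunk is ever in flight:
//
// gotSummary()                   TcpServer
// |-- sendChunk(chunk #n) ------>|   m_sendsOut++
// |                              |   ...transmits...
// |<-- doneSendingWrapper9() ----|   m_sendsIn++
// |-- sendChunk(chunk #n+1) ---->|   ...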
// do we need to launch another batch of summary requests?
if ( m_numRequests < m_msg3a.m_numDocIds && ! m_socketHadError ) {
// . if we can launch another, do it
// . say "true" here so it does not call us, gotSummary() and
// do a recursive stack explosion
// . this returns false if still waiting on more to come back
if ( ! launchMsg20s ( true ) ) return false;
// it won't launch now if we are bottlenecked waiting for
// m_printi's summary to come in
if ( m_si->m_streamResults ) {
// it won't launch any if we printed out enough as well
// and it printed "waiting on remaining 0 to return".
// we shouldn't be waiting for more to come in b/c
// we are in gotSummary() so one just came in
// freeing up a msg20 to launch another, so assume
// this means we are basically done. and it
// set m_numRequests=m_msg3a.m_numDocIds etc.
//if ( m_numRequests == m_msg3a.m_numDocIds )
// goto printTail;
// otherwise, keep chugging
goto complete;
}
// maybe some were cached?
//goto refilter;
// it returned true, so m_numRequests == m_numReplies and
// we don't need to launch any more! but that does NOT
// make sense because m_numContiguous < m_msg3a.m_numDocIds
// . i guess the launch can fail because of oom... and
// end up returning true here... seen it happen, and
// we had full requests/replies for m_msg3a.m_numDocIds
log("msg40: got all replies i guess");
goto doAgain;
//char *xx=NULL; *xx=0;
}
complete:
// . ok, now i wait for everybody.
// . TODO: evaluate if this hurts us
if ( m_numReplies < m_numRequests )
return false;
// if streaming results, we are done
if ( m_si && m_si->m_streamResults ) {
// unless waiting for last transmit to complete
if ( m_sendsOut > m_sendsIn ) return false;
// delete everything! no, doneSendingWrapper9 does...
//mdelete(st, sizeof(State0), "msg40st0");
//delete st;
// otherwise, all done!
return true;
}
// save this before we increment m_numContiguous
//int32_t oldNumContiguous = m_numContiguous;
// . before launching more msg20s, first see if we got enough now
// . the first "m_numContiguous" of the m_msg20[] are valid!
// . save this for launchMsg20s() to look at so it will not keep
// launching just to keep m_maxOutstanding satisfied. otherwise, if
// msg20[0] is really really slow, we end up getting back *way* more
// summaries than we probably need!
// . this also lets us know how many of the m_msg3a.m_docIds[] and
// m_msg20[]s we can look at at this point to determine how many of
// the docids are actually "visible" (unclustered)
// . if enough are already visible we set m_gotEnough to true to
// prevent more msg20s being launched. but we must wait for all that
// have launched to come back.
// . visibleContiguous = of the contiguous guys, how many are good,
// i.e. visible/unclustered?
/*
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// stop as soon as we hit a gap breaking our contiguity...
if ( m_msg20[i] && ! m_msg20[i]->m_gotReply ) break;
// count every docid
m_numContiguous++;
// count if it is visible
if (m_msg3a.m_clusterLevels[i] == CR_OK) m_visibleContiguous++;
}
*/
// if there's no way we have enough, keep going
/*
if ( m_visibleContiguous<m_si->m_docsWanted+m_si->m_firstResultNum+1 &&
// there have to be more docids to get summaries for...
m_numContiguous < m_msg3a.m_numDocIds ) {
// . if we can launch another, do it
// . say "true" here so it does not call us, gotSummary() and
// do a recursive stack explosion
// . this returns false if still waiting on more to come back
if ( ! launchMsg20s ( true ) ) return false;
// maybe some were cached?
goto refilter;
}
*/
// MDW: put this back once we figure out how to prevent so many
// wasted summary lookups
// how many msg20s have we got back but filtered out?
// int32_t filtered = m_numContiguous - m_visibleContiguous;
// we don't want to over-launch msg20s if we end up getting what
// we wanted without any disappearing because of clustering, etc.
// BUT if we UNDER the hard count, launch more
//if ( m_numContiguous == oldNumContiguous && ! m_gotEnough ) {
// // launch more msg20 requests
// if ( ! launchMsg20s ( true ) ) return false;
// // maybe some were cached?
// //goto refilter;
// // it returned true, so m_numRequests == m_numReplies and
// // we don't need to launch any more! but that does NOT
// // make sense because the reply we just got did not increase
// // m_numContiguous, meaning there is a gap we are waiting on.
// char *xx=NULL; *xx=0;
//}
// this logic here makes us get the msg20s in chunks of 50, so the
// 51st msg20 request will have to wait for the first 50 replies to
// arrive before it can even be launched! that seriously slows us down,
// because we often have a summary that takes 200ms to get... but most
// take like 10-20ms or so. MDW: comment out later again
//if( (m_numContiguous == oldNumContiguous) &&
// (m_numReplies < m_numRequests ) )
// return false;
// so we have to set this to zero i guess
//oldNumContiguous = 0;
// and this
//m_numContiguous = m_numReplies;
int64_t startTime = gettimeofdayInMilliseconds();
int64_t took;
// shortcut
//Query *q = m_msg3a.m_q;
Query *q = &m_si->m_q;
//log(LOG_DEBUG, "query: msg40: deduping from %"INT32" to %"INT32"",
//oldNumContiguous, m_numContiguous);
// count how many are visible!
//int32_t visible = 0;
// loop over each clusterLevel and set it
for ( int32_t i = 0 ; i < m_numReplies ; i++ ) {
// did we skip the first X summaries because we were
// not deduping/siteclustering/gettingGigabits?
if ( m_didSummarySkip && i < m_si->m_firstResultNum )
continue;
// get current cluster level
char *level = &m_msg3a.m_clusterLevels[i];
// sanity check -- this is a transitional value; msg3a should
// set it to something else!
if ( *level == CR_GOT_REC ) { char *xx=NULL; *xx=0; }
if ( *level == CR_ERROR_CLUSTERDB ) { char *xx=NULL; *xx=0; }
// skip if already "bad"
if ( *level != CR_OK ) continue;
// if the user only requested docids, we have no summaries
if ( m_si->m_docIdsOnly ) break;
// convenient var
Msg20 *m = m_msg20[i];
// get the Msg20 reply
Msg20Reply *mr = m->m_r;
// if no reply, all hosts must have been dead i guess so
// filter out this guy
if ( ! mr && ! m->m_errno ) {
logf(LOG_DEBUG,"query: msg 20 reply was null.");
m->m_errno = ENOHOSTS;
}
// if any msg20 has m_errno set, then set ours so at least the
// xml feed will know there was a problem even though it may
// have gotten search results.
// the BIG HACK is done in Msg20. Msg20::m_errno is set to
// something like EMISSINGQUERYTERMS if the document really
// doesn't match the query, maybe because of indexdb corruption
if ( m->m_errno ) {
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf( LOG_DEBUG, "query: result %"INT32" (docid=%"INT64") had "
"an error (%s) and will not be shown.", i,
m_msg3a.m_docIds[i], mstrerror(m->m_errno));
*level = CR_ERROR_SUMMARY;
//m_visibleContiguous--;
// update our m_errno while here
if ( ! m_errno ) m_errno = m->m_errno;
continue;
}
// a special case
if ( mr && mr->m_errno == CR_RULESET_FILTERED ) {
*level = CR_RULESET_FILTERED;
//m_visibleContiguous--;
continue;
}
// this seems to be set too!
if ( mr && mr->m_errno == EDOCFILTERED ) {
*level = CR_RULESET_FILTERED;
//m_visibleContiguous--;
continue;
}
if ( ! m_si->m_showBanned && mr->m_isBanned ) {
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf ( LOG_DEBUG, "query: result %"INT32" (docid=%"INT64") is "
"banned and will not be shown.", i,
m_msg3a.m_docIds[i] );
*level = CR_BANNED_URL;
//m_visibleContiguous--;
continue;
}
// filter out urls with <![CDATA in them
if ( strstr(mr->ptr_ubuf, "<![CDATA[") ) {
*level = CR_BAD_URL;
//m_visibleContiguous--;
continue;
}
// also filter urls with ]]> in them
if ( strstr(mr->ptr_ubuf, "]]>") ) {
*level = CR_BAD_URL;
//m_visibleContiguous--;
continue;
}
if( ! mr->m_hasAllQueryTerms ) {
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf( LOG_DEBUG, "query: result %"INT32" (docid=%"INT64") is "
"missing query terms and will not be"
" shown.", i, m_msg3a.m_docIds[i] );
*level = CR_MISSING_TERMS;
//m_visibleContiguous--;
// uncluster any docids below this one
if ( m_unclusterCount-- > 0 ) uncluster ( i );
continue;
}
//visible++;
}
// . assume no dups removed
// . we print "click here to show omitted results" if this is true
m_removedDupContent = false;
// what is the deduping threshold? 0 means do not do deduping
int32_t dedupPercent = 0;
if ( m_si->m_doDupContentRemoval && m_si->m_percentSimilarSummary )
dedupPercent = m_si->m_percentSimilarSummary;
// icc=1 turns this off too i think
if ( m_si->m_includeCachedCopy ) dedupPercent = 0;
// if the user only requested docids, we have no summaries
if ( m_si->m_docIdsOnly ) dedupPercent = 0;
// filter out duplicate/similar summaries
for ( int32_t i = 0 ; dedupPercent && i < m_numReplies ; i++ ) {
// skip if already invisible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// start with the first docid we have not yet checked!
//int32_t m = oldNumContiguous;
// get it
Msg20Reply *mri = m_msg20[i]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or
// success code
if ( mri->m_contentType == CT_STATUS ) continue;
// never let it be i
//if ( m <= i ) m = i + 1;
// see if any result lower-scoring than #i is a dup of #i
for( int32_t m = i+1 ; m < m_numReplies ; m++ ) {
// get current cluster level
char *level = &m_msg3a.m_clusterLevels[m];
// skip if already invisible
if ( *level != CR_OK ) continue;
// get it
Msg20Reply *mrm = m_msg20[m]->m_r;
// do not dedup CT_STATUS results, those are
// spider reply "documents" that indicate the last
// time a doc was spidered and the error code or
// success code
if ( mrm->m_contentType == CT_STATUS ) continue;
// use gigabit vector to do topic clustering, etc.
int32_t *vi = (int32_t *)mri->ptr_vbuf;
int32_t *vm = (int32_t *)mrm->ptr_vbuf;
//char s = g_clusterdb.
// getSampleSimilarity (vi,vm,VECTOR_REC_SIZE );
float s ;
s = computeSimilarity(vi,vm,NULL,NULL,NULL,
m_si->m_niceness);
// skip if not similar
if ( (int32_t)s < dedupPercent ) continue;
// otherwise mark it as a summary dup
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf( LOG_DEBUG, "query: result #%"INT32" "
"(docid=%"INT64") is %.02f%% similar-"
"summary of #%"INT32" (docid=%"INT64")",
m, m_msg3a.m_docIds[m] ,
s, i, m_msg3a.m_docIds[i] );
*level = CR_DUP_SUMMARY;
//m_visibleContiguous--;
m_removedDupContent = true;
// uncluster the next clustered docid from this
// hostname below "m"
if ( m_unclusterCount-- > 0 ) uncluster ( m );
}
}
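// for reference, a minimal sketch of the percent-overlap idea behind
// computeSimilarity() (the real function lives in XmlDoc.cpp and
// also weights terms; this assumes each vector is simply a
// 0-terminated array of int32_t term hashes):
/*
static float percentOverlap ( int32_t *vi , int32_t *vm ) {
int32_t ni = 0; while ( vi && vi[ni] ) ni++;
int32_t nm = 0; while ( vm && vm[nm] ) nm++;
if ( ni == 0 || nm == 0 ) return 0.0;
int32_t common = 0;
for ( int32_t a = 0 ; a < ni ; a++ )
for ( int32_t b = 0 ; b < nm ; b++ )
if ( vi[a] == vm[b] ) { common++; break; }
// percent of the smaller vector that is shared
int32_t min = ( ni < nm ) ? ni : nm;
return ( 100.0 * common ) / min;
}
*/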
//
// BEGIN URL NORMALIZE AND COMPARE
//
// . ONLY DEDUP URL if it is explicitly enabled AND we are not performing
// a site: or suburl: query.
if(m_si->m_dedupURL &&
!q->m_hasPositiveSiteField &&
!q->m_hasSubUrlField) {
for(int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++) {
// skip if already invisible
if(m_msg3a.m_clusterLevels[i] != CR_OK) continue;
// get it
Msg20Reply *mr = m_msg20[i]->m_r;
// hash the URL all in lower case to catch wiki dups
char *url = mr->ptr_ubuf;
int32_t ulen = mr->size_ubuf - 1;
// since the redirect url is a more accurate
// representation of the content, use that if it exists.
if ( mr->ptr_rubuf ) {
url = mr->ptr_rubuf;
ulen = mr->size_rubuf - 1;
}
// fix for directories, sometimes they are indexed
// without a trailing slash, so let's normalize to
// this standard.
if(url[ulen-1] == '/')
ulen--;
Url u;
u.set(url,ulen);
url = u.getHost();
if(u.getPathLen() > 1) {
// . remove sub-domain to fix conflicts with
//   sites having www,us,en,fr,de,uk,etc. sub-domains
//   that redirect to the same page.
char *host = u.getHost();
char *mdom = u.getMidDomain();
if(mdom && host) {
int32_t hlen = mdom - host;
if (isSubDom(host, hlen-1))
url = mdom;
}
}
// adjust url string length
ulen -= url - u.getUrl();
uint64_t h = hash64Lower_a(url, ulen);
int32_t slot = m_urlTable.getSlot(h);
// if there is no slot, this url doesn't exist => add it
if(slot == -1) {
m_urlTable.addKey(h,mr->m_docId);
}
else {
// if there was a slot, denote with the
// cluster level that the URL already existed previously
char *level = &m_msg3a.m_clusterLevels[i];
if(m_si->m_debug || g_conf.m_logDebugQuery)
logf(LOG_DEBUG, "query: result #%"INT32" "
"(docid=%"INT64") is the "
"same URL as "
"(docid=%"INT64")",
i,m_msg3a.m_docIds[i],
m_urlTable.
getValueFromSlot(slot));
*level = CR_DUP_URL;
//m_visibleContiguous--;
m_removedDupContent = true;
}
}
}
//
// END URL NORMALIZE AND COMPARE
//
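// a worked example of the normalization above (hypothetical url):
//   http://en.example.com/dir/  original (or redirect) url
//   -> en.example.com/dir       start at u.getHost(), trailing '/' clipped
//   -> example.com/dir          "en" is in the s_subDoms table below
//   -> hash64Lower_a("example.com/dir",15) is the m_urlTable key
// so http://www.example.com/dir and http://en.example.com/dir/ end
// up with the same key and the lower-scoring one gets CR_DUP_URL.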
// how many docids are visible? (unfiltered)
//int32_t visible = m_filterStats[CR_OK];
m_omitCount = 0;
// count how many are visible!
int32_t visible = 0;
// loop over each clusterLevel and set it
for ( int32_t i = 0 ; i < m_numReplies ; i++ ) {
// get current cluster level
char *level = &m_msg3a.m_clusterLevels[i];
// on CR_OK
if ( *level == CR_OK ) visible++;
// otherwise count as omitted
else m_omitCount++;
}
// do we got enough search results now?
//if ( visible >= m_docsWanted )
// m_gotEnough = true;
// show time
took = gettimeofdayInMilliseconds() - startTime;
if ( took > 3 )
log(LOG_INFO,"query: Took %"INT64" ms to do clustering and dup "
"removal.",took);
// do we have enough visible at this point?
//if ( m_visibleContiguous >= m_docsToGetVisible ) m_gotEnough = true;
// . wait for all the replies to come in
// . no more should be launched in launchMsg20s() since we set
// m_gotEnough to true
// . MDW: i added "m_gotEnough &&" to this line...
//if ( m_gotEnough && m_numReplies < m_numRequests ) return false;
// . let's wait for the tasks to complete before even trying to launch
// more than the first MAX_OUTSTANDING msg20s
// . the msg3a re-call will end up re-doing our tasks as well! so we
// have to make sure they complete at this point
if ( m_tasksRemaining > 0 ) return false;
// debug
bool debug = (m_si->m_debug || g_conf.m_logDebugQuery);
for ( int32_t i = 0 ; debug && i < m_msg3a.m_numDocIds ; i++ ) {
//uint32_t sh;
//sh = g_titledb.getHostHash(*(key_t*)m_msg20[i]->m_vectorRec);
int32_t cn = (int32_t)m_msg3a.m_clusterLevels[i];
if ( cn < 0 || cn >= CR_END ) { char *xx=NULL;*xx=0; }
char *s = g_crStrings[cn];
if ( ! s ) { char *xx=NULL;*xx=0; }
logf(LOG_DEBUG, "query: msg40 final hit #%"INT32") d=%"UINT64" "
"cl=%"INT32" (%s)",
i,m_msg3a.m_docIds[i],(int32_t)m_msg3a.m_clusterLevels[i],s);
}
if ( debug )
logf (LOG_DEBUG,"query: msg40: firstResult=%"INT32", "
"totalDocIds=%"INT32", resultsWanted=%"INT32" "
"visible=%"INT32" toGet=%"INT32" recallCnt=%"INT32"",
m_si->m_firstResultNum, m_msg3a.m_numDocIds ,
m_docsToGetVisible, visible,
//m_numContiguous,
m_docsToGet , m_msg3aRecallCnt);
// if we do not have enough visible, try to get more
if ( visible < m_docsToGetVisible && m_msg3a.m_moreDocIdsAvail &&
// do not spin too long in this!
// TODO: fix this better somehow later
m_docsToGet <= 1000 &&
// doesn't work on multi-coll just yet, it cores
m_numCollsToSearch == 1 ) {
// can it cover us?
//int32_t need = m_msg3a.m_docsToGet + 20;
int32_t need = m_docsToGet + 20;
// increase by 25 percent as well
need *= 1.25;
// note it
log("msg40: too many summaries invisible. getting more "
"docids from msg3a merge and getting summaries. "
"%"INT32" are visible, need %"INT32". "
"%"INT32" to %"INT32". "
"numReplies=%"INT32" numRequests=%"INT32"",
visible, m_docsToGetVisible,
m_msg3a.m_docsToGet, need,
m_numReplies, m_numRequests);
// get more
//m_docsToGet = need;
// get more!
//m_msg3a.m_docsToGet = need;
m_docsToGet = need;
// reset this before launch
m_numReplies = 0;
m_numRequests = 0;
// reprocess all!
m_lastProcessedi = -1;
// let's do it all from the top!
return getDocIds ( true ) ;
//m_msg3a.mergeLists();
// realloc the msg20 array
//if ( ! reallocMsg20Buf() ) return true;
// reset this before launch
//m_numReplies = 0;
//m_numRequests = 0;
// reprocess all!
//m_lastProcessedi = -1;
// now launch!
//if ( ! launchMsg20s ( true ) ) return false;
// all done, call callback
//return true;
}
/*
// if we do not have enough visible, try to get more
if ( visible < m_docsToGet &&
// if we got some docids yet to get, from any tier...
//m_msg3a.m_moreDocIdsAvail &&
// do not recall until all done with the msg20s
//m_numContiguous >= m_msg3a.m_numDocIds &&
// and we had to have gotten all requested of use but just lost
// some docids due to clustering/filtering
m_msg3a.m_numDocIds >= m_docsToGet &&
// . only recall 3 times at most
// . this also prevents potential stack explosion since we
// re-call getDocIds() below!
m_msg3aRecallCnt < 3 &&
// do not recall if doing rerank
//m_si->m_rerankRuleset < 0 &&
// do not recall if we got the max to compute
m_msg3a.m_numDocIds < m_maxDocIdsToCompute ) {
// get the visibility ratio
float ratio ;
if ( m_visibleContiguous < 2 )
ratio = m_msg3a.m_numDocIds / 1;
else
ratio = m_msg3a.m_numDocIds / m_visibleContiguous;
// always boost by at least 50% more for good measure
ratio *= 1.5;
// keep stats on it
g_stats.m_msg3aRecallCnt++;
m_msg3aRecallCnt++;
// . re-call msg3a and ask for more docids because some of them
// are invisible/filtered and we need more
// . MDW: can we make Msg3a just re-do its merge if it can,
// rather than re-call Msg39 again? (TODO)
// . apply the ratio, to get more docids
int32_t get = (int32_t)((float)m_docsToGet * ratio);
// do not breach the limit
if ( get > m_maxDocIdsToCompute ) get = m_maxDocIdsToCompute;
// . if different, recall msg3a
// . if we are then we can start from msg3a.MergedocIds
if ( get > m_docsToGet ) {
// debug msg
//if ( g_conf.m_logDebugQuery || m_si->m_debug )
logf(LOG_DEBUG,"query: msg40: recalling msg3a "
"merge oldactual=%"INT32" newactual=%"INT32"",
m_docsToGet,get);
// ok, we got a new number to get now
m_docsToGet = get;
// let's do it all from the top!
return getDocIds ( true ) ;
// NOTE: we no longer do msg3a re-calls for simplicity
// so all re-calling is done from right here only
// MDW: hack it in msg3a too
//m_msg3a.m_docsToGet = get;
// . true = recalled?
// . this will re-merge the lists with a higher
// m_docsToGet and hopefully squeeze more docids out
// . this will block (return false) if it has to
// re-call the Msg39s to get more docids by calling
// Msg3a::fetchLists().
// . if it blocks it will eventually call our
// gotDocIdsWrapper() callback
//if ( ! m_msg3a.mergeLists ( true ) ) return false;
// hey, we got some more docids out of the merge,
// so check them out
//goto refilter;
}
}
*/
/*
// how many msg20::getSummary() calls did we do unnecessarily?
int32_t vcnt = 0;
for ( int32_t i = 0 ; i <= m_maxiLaunched ; i++ ) {
// skip if never launched and should have... a gap...
if ( m_msg20[i] && ! m_msg20[i]->m_gotReply ) continue;
// get cluster level
char level = m_msg3a.m_clusterLevels[i];
// sanity check
if ( level < 0 || level >= CR_END ) { char *xx=NULL; *xx =0; }
// add it up
g_stats.m_filterStats[(int32_t)level]++;
// skip if NOT visible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// count if visible
vcnt++;
// skip if not wasted
if ( vcnt <= m_docsToGetVisible ) continue;
// a special g_stat, means msg20 call was not necessary
g_stats.m_filterStats[CR_WASTED]++;
// discount from visible
g_stats.m_filterStats[CR_OK]--;
}
*/
// get time now
int64_t now = gettimeofdayInMilliseconds();
// . add the stat for how long to get all the summaries
// . use purple for time to get all summaries
// . THIS INCLUDES Msg3a/Msg39 RECALLS!!!
// . can we subtract that?
g_stats.addStat_r ( 0 ,
m_startTime ,
now ,
//"get_all_summaries",
0x008220ff );
// timestamp log
if ( g_conf.m_logTimingQuery || m_si->m_debug )
logf(LOG_DEBUG,"query: msg40: [%"PTRFMT"] Got %"INT32" summaries in "
"%"INT64" ms",
(PTRTYPE)this ,
visible, // m_visibleContiguous,
now - m_startTime );
//int32_t maxAge = 0;
//if ( m_si->m_rcache ) maxAge = g_conf.m_titledbMaxCacheAge;
/////////////
//
//
// prepare query term extra info for gigabits
//
////////////
//QueryTerm *qterms[MAX_QUERY_TERMS];
//int32_t nqt = 0;
//Query *q = m_si->m_q;
// english? TEST!
unsigned char lang = m_si->m_queryLangId;
// just print warning i guess
if ( lang == 0 ) {
log("query: queryLang is 0 for q=%s",q->m_orig);
//char *xx=NULL;*xx=0; }
}
// we gotta use query TERMS not words, because the query may be
// 'cd rom' and the phrase term will be 'cdrom' which is a good one
// to use for gigabits! plus we got synonyms now!
for ( int32_t i = 0 ; i < q->m_numTerms ; i++ ) {
// shortcut
QueryTerm *qt = &q->m_qterms[i];
// assume ignored
qt->m_popWeight = 0;
qt->m_hash64d = 0;
// skip if ignored query stop word etc.
if ( qt->m_ignored && qt->m_ignored != IGNORE_QUOTED )continue;
// get the word or phrase
char *s = qt->m_term;
int32_t slen = qt->m_termLen;
// use this special hash for looking up popularity in pop dict.
// i think it is just like hash64 but ignores spaces so we
// can hash 'cd rom' as "cdrom". but i think we do this
// now, so maybe use m_termId instead...
uint64_t qh = hash64d(s, slen);
//int64_t qh = qt->m_termId;
int32_t qpop;
qpop = g_speller.getPhrasePopularity(s, qh, true,lang);
int32_t qpopWeight;
if ( qpop < QPOP_ZONE_0 ) qpopWeight = QPOP_MULT_0;
else if ( qpop < QPOP_ZONE_1 ) qpopWeight = QPOP_MULT_1;
else if ( qpop < QPOP_ZONE_2 ) qpopWeight = QPOP_MULT_2;
else if ( qpop < QPOP_ZONE_3 ) qpopWeight = QPOP_MULT_3;
else if ( qpop < QPOP_ZONE_4 ) qpopWeight = QPOP_MULT_4;
else qpopWeight = 1;
// remember them in the query term
qt->m_hash64d = qh;
qt->m_popWeight = qpopWeight;
// store that queryterm ptrs into our array
//qterms[nqt] = qt;
//nqt++;
// debug it
if ( ! m_si->m_debugGigabits ) continue;
SafeBuf msg;
msg.safePrintf("gbits: qpop=%"INT32" qweight=%"INT32" "
"queryterm=",
qpop,qpopWeight);
msg.safeMemcpy(qt->m_term,qt->m_termLen);
msg.pushChar('\0');
logf(LOG_DEBUG,"%s",msg.getBufStart());
}
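// a quick illustration of the weighting above: a rare term like
// "parchment" has a tiny qpop, falls into the first zone and gets
// the big QPOP_MULT_0 weight, while a very common term overshoots
// every QPOP_ZONE_* cutoff and just gets a weight of 1. the
// m_popWeight stored here presumably lets the gigabit scoring favor
// the rarer, more informative query terms.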
/////////////
//
// make gigabits
//
/////////////
if ( m_docsToScanForTopics > 0 ) {
// time it
int64_t stt = gettimeofdayInMilliseconds();
// get the first one, just use that for now
TopicGroup *tg = &m_si->m_topicGroups[0];
// . this will not block
// . this code is in XmlDoc.cpp
// . samples are from XmlDoc::getSampleForGigabits(), generated
// for each titlerec in the search results
// . SHIT! let's go back to the old code since this was
// the new approach and didn't support single lower-case
// words, like "parchment" for the 'Magna Carta' query.
// or 'copies' for the 'Magna Carta' query, etc. which
// i think are very interesting, especially when displayed
// in sentences
// . set m_gigabitInfos[] to be the gigabits
if ( ! computeGigabits( tg ) ) {
// note it
log("gbits: general error: %s",mstrerror(g_errno));
// g_errno should be set on error here!
return true;
}
// now make the fast facts from the gigabits and the
// samples. these are sentences containing the query and
// a gigabit.
if ( ! computeFastFacts ( ) ) {
// note it
log("gbits: general error: %s",mstrerror(g_errno));
// g_errno should be set on error here!
return true;
}
/*
int32_t ng;
ng = intersectGigabits ( //m_msg3a.m_q->m_orig ,
//m_msg3a.m_q->m_origLen ,
m_msg20 ,
m_msg3a.m_numDocIds,//m_numContiguous
//m_msg3a.getClusterLevels(),
//m_si->m_topicGroups ,
//m_si->m_numTopicGroups ,
m_si->m_langHint ,
tg->m_maxTopics ,
tg->m_docsToScanForTopics ,
tg->m_minDocCount ,
m_gigabitInfos ,
m_si->m_niceness );
*/
// ng is -1 on error, g_errno should be set
//if ( ng == -1 ) return true;
// otherwise, it is legit!
//m_numGigabitInfos = ng;
// sanity check
//if ( ng > 50 ) { char *xx=NULL;*xx=0; }
// time it
int64_t took = gettimeofdayInMilliseconds() - stt;
if ( took > 5 )
logf(LOG_DEBUG,"query: make gigabits took %"INT64" ms",
took);
}
// take this out for now...
#ifdef GB_PQR
// run post query reranks for this query
int32_t wanted = m_si->m_docsWanted + m_si->m_firstResultNum + 1;
if ( m_postQueryRerank.isEnabled() &&
m_postQueryRerank.set2(wanted)){
if ( ! m_postQueryRerank.preRerank () ) {
log("query: PostQueryRerank::"
"preRerank() failed.");
m_postQueryRerank.rerankFailed();
}
else if ( ! m_postQueryRerank.rerank () ) {
log("query: PostQueryRerank::"
"rerank() failed.");
m_postQueryRerank.rerankFailed();
}
else if ( ! m_postQueryRerank.postRerank () ) {
log("query: PostQueryRerank::"
"postRerank() failed.");
m_postQueryRerank.rerankFailed();
}
}
#endif
// set m_moreToCome, if true, we print a "Next 10" link
m_moreToCome = (visible > //m_visibleContiguous >
m_si->m_docsWanted+m_si->m_firstResultNum);
if ( m_si->m_debug || g_conf.m_logDebugQuery )
logf ( LOG_DEBUG, "query: msg40: more? %d", m_moreToCome );
// alloc m_buf, which should be NULL
if ( m_buf ) { char *xx = NULL; *xx = 0; }
// . we need to collapse m_msg3a.m_docIds[], etc. into m_docIds[] etc
// to be just the docids we wanted.
// . at this point we should merge in all docids from all Msg40s from
// different clusters, etc.
// . now alloc space for "docsWanted" m_docIds[], m_scores[],
// m_bitScores[], m_clusterLevels[] and m_newMsg20[]
//
// HACK TIME
//
// . bury filtered/clustered docids from m_msg3a.m_docIds[]
// . also remove results not in the request window specified by &s=X&n=Y
// where "s" is m_si->m_firstResultNum (which starts at 0) and "n"
// is the number of results requested, m_si->m_docsWanted
// . this is a bit of a hack (MDW)
int32_t c = 0;
int32_t v = 0;
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// assume we got a valid docid
bool skip = false;
// must have a cluster level of CR_OK (visible)
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) skip = true;
// v is the visible count
else if ( v++ < m_si->m_firstResultNum )
skip = true;
// . if skipping a valid msg20, give it a chance to destruct
// . no longer do this because CR_SUMMARY_MERGED needs to keep
// the msg20 reply around so PageResults.cpp can merge the
// event descriptions
//if ( skip && m_msg20[i] ) m_msg20[i]->destructor();
// if skipping continue
if ( skip ) continue;
// we got a winner, save it
m_msg3a.m_docIds [c] = m_msg3a.m_docIds [i];
m_msg3a.m_scores [c] = m_msg3a.m_scores [i];
m_msg3a.m_clusterLevels [c] = m_msg3a.m_clusterLevels [i];
m_msg20 [c] = m_msg20 [i];
if ( m_msg3a.m_scoreInfos )
m_msg3a.m_scoreInfos [c] = m_msg3a.m_scoreInfos [i];
int32_t need = m_si->m_docsWanted;
// if done, bail
if ( ++c >= need ) break;
}
// reset the # of docids we got to how many we kept!
m_msg3a.m_numDocIds = c;
// debug
for ( int32_t i = 0 ; debug && i < m_msg3a.m_numDocIds ; i++ )
logf(LOG_DEBUG, "query: msg40 clipped hit #%"INT32") d=%"UINT64" "
"cl=%"INT32" (%s)",
i,m_msg3a.m_docIds[i],(int32_t)m_msg3a.m_clusterLevels[i],
g_crStrings[(int32_t)m_msg3a.m_clusterLevels[i]]);
//
// END HACK
//
// . uc = use cache?
// . store in cache now if we need to
bool uc = false;
if ( m_si->m_useCache ) uc = true;
if ( m_si->m_wcache ) uc = true;
// . do not store if there was an error
// . no, allow errors in cache since we often have lots of
// docid not founds and what not, due to index corruption and
// being out of sync with titledb
if ( m_errno &&
// forgive "Record not found" errors, they are quite common
m_errno != ENOTFOUND &&
m_errno != EMISSINGQUERYTERMS ) {
logf(LOG_DEBUG,"query: not storing in cache: %s",
mstrerror(m_errno));
uc = false;
}
if ( m_si->m_docIdsOnly ) uc = false;
// all done if not storing in cache
if ( ! uc ) return true;
// debug
if ( m_si->m_debug )
logf(LOG_DEBUG,"query: [%"PTRFMT"] Storing output in cache.",
(PTRTYPE)this);
// store in this buffer
char tmpBuf [ 64 * 1024 ];
// use that
char *p = tmpBuf;
// how much room?
int32_t tmpSize = getStoredSize();
// unless too small
if ( tmpSize > 64*1024 )
p = (char *)mmalloc(tmpSize,"Msg40Cache");
if ( ! p ) {
// this is just for caching, not critical... ignore errors
g_errno = 0;
logf ( LOG_INFO ,
"query: Size of cached search results page (and "
"all associated data) is %"INT32" bytes. Max is %i. "
"Page not cached.", tmpSize, 32*1024 );
return true;
}
// serialize into tmp
int32_t nb = serialize ( p , tmpSize );
// it must fit exactly
if ( nb != tmpSize || nb == 0 ) {
g_errno = EBADENGINEER;
log (LOG_LOGIC,
"query: Size of cached search results page (%"INT32") "
"does not match what it should be. (%"INT32")",
nb, tmpSize );
return true;
}
if ( ! m_msg3a.m_rrr.m_getDocIdScoringInfo ) {
// make key based on the hash of certain vars in SearchInput
key_t k = m_si->makeKey();
// cache it
m_msg17.storeInCache ( SEARCHRESULTS_CACHEID ,
k ,
p , // rec
tmpSize , // recSize
m_firstCollnum ,//m_si->m_coll2
m_si->m_niceness ,
3 ); //timeout=3secs
}
// free it, cache will copy it into its ring buffer
if ( p != tmpBuf ) mfree ( p , tmpSize , "Msg40Cache" );
// ignore errors
g_errno = 0;
return true;
}
// m_msg3a.m_docIds[m] was filtered because it was a dup or something so we
// must "uncluster" the *next* docid from the same hostname that is clustered
void Msg40::uncluster ( int32_t m ) {
// skip for now
return;
key_t crec1 = m_msg3a.m_clusterRecs[m];
int64_t sh1 = g_clusterdb.getSiteHash26 ( (char *)&crec1 );
for ( int32_t k = 0 ; k < m_msg3a.m_numDocIds ; k++ ) {
// skip docid #k if not from same hostname
key_t crec2 = m_msg3a.m_clusterRecs[k];
int64_t sh2 = g_clusterdb.getSiteHash26 ( (char *)&crec2 );
if ( sh2 != sh1 ) continue;
// skip if not OK or CLUSTERED
if ( m_msg3a.m_clusterLevels[k] != CR_CLUSTERED ) continue;
// UNHIDE IT
m_msg3a.m_clusterLevels[k] = CR_OK;
// we must UN-dedup anything after us because now that we are
// no longer clustered we could dedup a result below us, and a
// result that was previously deduped may no longer be a dup
// since the result it duplicated was this unclustered result's dup! ;)
for ( int32_t i = k+1 ; i < m_msg3a.m_numDocIds ; i++ ) {
// get current cluster level
char *level = &m_msg3a.m_clusterLevels[i];
// reset dupped guys, they will be re-done if needed!
if ( *level == CR_DUP_SUMMARY ) *level = CR_OK;
if ( *level == CR_DUP_TOPIC ) *level = CR_OK;
}
// . reset this so it gets re-computed
// . we are placing a gap at m_msg20[k] since
// m_msg20[k].m_gotReply = false
//m_numContiguous = 0;
//m_visibleContiguous = 0;
// debug note
logf(LOG_DEBUG,"query: msg40: unclustering docid #%"INT32" %"INT64". "
"(unclusterCount=%"INT32")",
k,m_msg3a.m_docIds[k],m_unclusterCount);
// . steal the msg20!
// . sanity check -- should have been NULL!
if ( m_msg20[k] ) { char *xx=NULL; *xx=0; }
// sanity check
if ( ! m_msg20[m] ) { char *xx=NULL; *xx=0; }
// sanity check
if ( k == m ) { char *xx=NULL; *xx=0; }
// for every one guy marked as a dup, we uncluster FIVE
//if ( ++count >= 5 ) break;
// grab it
m_msg20[k] = m_msg20[m];
// reset it, m_gotReply should be false now
m_msg20[k]->reset();
// the dup guy has a NULL ptr now
m_msg20[m] = NULL;
// . only have to unhide one at a time
// . one is a dup, so no more than one will
// become UNhidden
break;
}
}
int32_t Msg40::getStoredSize ( ) {
// moreToCome=1
int32_t size = 1;
// msg3a
size += m_msg3a.getStoredSize();
// add each summary
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds; i++ ) {
// do not store the big samples if we're not storing cached
// copy. if "includeCachedCopy" is true then the page itself
// will be the summary.
//if ( ! m_si->m_includeCachedCopy )
// m_msg20[i]->clearBigSample();
// getting rid of this makes it take up less room
m_msg20[i]->clearLinks();
m_msg20[i]->clearVectors();
// if not visible, do not store!
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// otherwise, store it
size += m_msg20[i]->getStoredSize();
}
// . the related topics, only those whose score is >= m_minTopicScore
// . nah, just re-intersect from the msg20 replies again! it's quick
//size += m_msg24.getStoredSize ( );
//size += m_msg1a.getStoredSize ( );
// cache msg2b if we have it
//size += m_msg2b.getStoredSize();
return size;
}
// . serialize ourselves for the cache
// . returns bytes written
// . returns -1 and sets g_errno on error
int32_t Msg40::serialize ( char *buf , int32_t bufLen ) {
// set the ptr stuff
char *p = buf;
char *pend = buf + bufLen;
// miscellaneous
*p++ = m_moreToCome;
// msg3a:
// m_numDocIds[]
// m_docIds[]
// m_scores[]
// m_clusterLevels[]
// m_totalHits (estimated)
int32_t nb = m_msg3a.serialize ( p , pend );
// return -1 on error
if ( nb < 0 ) return -1;
// otherwise, inc over it
p += nb;
// . then summary excerpts, keep them word aligned...
// . TODO: make sure empty Msg20s are very little space!
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// sanity check
if ( m_msg3a.m_clusterLevels[i] == CR_OK && ! m_msg20[i] ) {
char *xx = NULL; *xx = 0; }
// if null skip it
if ( ! m_msg20[i] ) continue;
// do not store the big samples if we're not storing cached
// copy. if "includeCachedCopy" is true then the page itself
// will be the summary.
//if ( ! m_si->m_includeCachedCopy )
// m_msg20[i]->clearBigSample();
// getting rid of this makes it take up less room
m_msg20[i]->clearLinks();
m_msg20[i]->clearVectors();
// if not visible, do not store!
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// return -1 on error, g_errno should be set
int32_t nb = m_msg20[i]->serialize ( p , pend - p ) ;
// count it
if ( m_msg3a.m_rrr.m_debug )
log("query: msg40 serialize msg20size=%"INT32"",nb);
//if ( m_r.m_debug ) {
// int32_t mcount = 0;
// Msg20Reply *mr = m_msg20[i]->m_r;
// for ( int32_t *mm = &mr->size_tbuf ;
// mm <= &mr->size_templateVector ;
// mm++ ) {
// log("query: msg20 #%"INT32" = %"INT32"",
// mcount,*mm);
// mcount++;
// }
//}
if ( nb == -1 ) return -1;
p += nb;
}
// nah, just re-intersect from the msg20 replies again! it's quick
//int32_t x = m_msg24.serialize ( p , pend - p );
//if ( x == -1 ) return -1;
//p += x;
//int32_t y = m_msg1a.serialize (p, pend - p);
//if ( y == -1 ) return -1;
//p += y;
//int32_t z = m_msg2b.serialize (p, pend - p);
//if ( z == -1 ) return -1;
//p += z;
if ( m_msg3a.m_rrr.m_debug )
log("query: msg40 serialize nd=%"INT32" "
"msg3asize=%"INT32" ",m_msg3a.m_numDocIds,nb);
// return bytes stored
return p - buf;
}
// . deserialize ourselves for the cache
// . returns bytes read
// . returns -1 and sets g_errno on error
int32_t Msg40::deserialize ( char *buf , int32_t bufSize ) {
// we OWN the buffer
m_buf = buf;
m_bufMaxSize = bufSize;
// set the ptr stuff
char *p = buf;
char *pend = buf + bufSize;
// miscellaneous
m_moreToCome = *p++;
// msg3a:
// m_numDocIds
// m_docIds[]
// m_scores[]
// m_clusterLevels[]
// m_totalHits (estimated)
int32_t nb = m_msg3a.deserialize ( p , pend );
// return -1 on error
if ( nb < 0 ) return -1;
// otherwise, inc over it
p += nb;
// . alloc buf to hold all m_msg20[i] ptrs and the Msg20s they point to
// . return -1 if this failed! it will set g_errno/m_errno already
if ( ! reallocMsg20Buf() ) return -1;
// MDW: then summary excerpts, keep them word aligned...
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// if flag is 0 that means a NULL msg20
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// return -1 on error, g_errno should be set
int32_t x = m_msg20[i]->deserialize ( p , pend - p ) ;
if ( x == -1 ) return -1;
p += x;
}
// msg2b
//int32_t z = m_msg2b.deserialize ( p , pend - p );
//if ( z == -1 ) return -1;
//p += z;
// return bytes read
return p - buf;
}
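// a minimal usage sketch of the cache round trip above (hedged; the
// real write side is the m_msg17.storeInCache() call in
// gotSummary()). note that deserialize() takes ownership of the
// buffer, which becomes m_buf:
/*
int32_t need = msg40->getStoredSize();
char *buf = (char *)mmalloc ( need , "ex40" );
int32_t nb = msg40->serialize ( buf , need ); // nb must equal need
Msg40 out;
out.deserialize ( buf , nb ); // "out" now owns buf
*/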
static char *s_subDoms[] = {
// Common Language sub-domains
"en" ,
"fr" ,
"es" ,
"ru" ,
"zz" ,
"ja" ,
"tw" ,
"cn" ,
"ko" ,
"de" ,
"nl" ,
"it" ,
"fi" ,
"sv" ,
"no" ,
"pt" ,
"vi" ,
"ar" ,
"he" ,
"id" ,
"el" ,
"th" ,
"hi" ,
"bn" ,
"pl" ,
"tl" ,
// Common Country sub-domains
"us" ,
"uk" ,
// Common web sub-domains
"www" };
static HashTable s_subDomTable;
static bool s_subDomInitialized = false;
static bool initSubDomTable(HashTable *table, char *words[], int32_t size ){
// set up the hash table
if ( ! table->set ( size * 2 ) )
return log(LOG_INIT,"build: Could not init sub-domain "
"table." );
// now add in all the sub-domains
int32_t n = (int32_t)size/ sizeof(char *);
for ( int32_t i = 0 ; i < n ; i++ ) {
char *sw = words[i];
int32_t swlen = gbstrlen ( sw );
int32_t h = hash32Lower_a(sw, swlen);
int32_t slot = table->getSlot(h);
// if there is no slot, this sub-domain isn't in the table => add it
if(slot == -1)
table->addKey(h,0);
else
log(LOG_INIT,"build: Sub-domain table has duplicates");
}
return true;
}
bool isSubDom(char *s , int32_t len) {
if ( ! s_subDomInitialized ) {
s_subDomInitialized =
initSubDomTable(&s_subDomTable, s_subDoms,
sizeof(s_subDoms));
if (!s_subDomInitialized) return false;
}
// get from table
int32_t h = hash32Lower_a(s, len);
if(s_subDomTable.getSlot(h) == -1)
return false;
return true;
}
//////////////////////////////////
//
// COMPUTE GIGABITS!!!
//
//////////////////////////////////
bool hashSample ( Query *q,
HashTableX *master,
TopicGroup *tg ,
SafeBuf *vecBuf,
Msg20 *thisMsg20 ,
HashTableX *repeatTable ,
bool debugGigabits ) ;
static int gigabitCmp ( const void *a, const void *b ) {
Gigabit *ga = *(Gigabit **)a;
Gigabit *gb = *(Gigabit **)b;
// put termlen =0 at end, that means it was nuked
//if ( ga->m_termLen == 0 && gb->m_termLen > 0 ) return 1; // swap
//if ( ga->m_termLen > 0 && gb->m_termLen == 0 ) return -1;
float sa = ga->m_gbscore * ga->m_numPages;
float sb = gb->m_gbscore * gb->m_numPages;
// "King John" on 6 pages should be "John" on 12!
sa *= ga->m_numWords;
sb *= gb->m_numWords;
// punish if only on one page
if ( ga->m_numPages <= 1 ) sa /= 4.0;
if ( gb->m_numPages <= 1 ) sb /= 4.0;
if ( sa < sb ) return 1; // swap!
if ( sa > sb ) return -1;
if ( ga->m_numPages < gb->m_numPages ) return 1; // swap
if ( ga->m_numPages > gb->m_numPages ) return -1;
if ( ga->m_termLen > gb->m_termLen ) return 1; // swap
if ( ga->m_termLen < gb->m_termLen ) return -1;
return 0;
}
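// a worked example of the comparator (made-up numbers): gigabit A
// with gbscore=2.0 on 6 pages and 2 words scores 2.0*6*2 = 24.0;
// gigabit B with gbscore=2.0 on 12 pages and 1 word also scores
// 2.0*12*1 = 24.0, so the tie falls through to the page-count test
// and B (12 pages) sorts first. a gigabit seen on only 1 page gets
// its score quartered before any of that.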
//#define MAXPOP 10000
#define MAXPOP 32000
//
// . set m_gigabitInfos[] array and return # of them we set
// . returns -1 with g_errno set on error
// . fills m_gigabitPtrs safebuf with ptrs to the Gigabit class
// and the ptrs are sorted by m_gbscore
//
bool Msg40::computeGigabits( TopicGroup *tg ) {
// not if we skipped the first X summaries
if ( m_didSummarySkip ) { char *xx=NULL;*xx=0; }
//return true;
//int64_t start = gettimeofdayInMilliseconds();
int32_t niceness = 0;
Query *q = &m_si->m_q;
// for every sample estimate the number of words so we know how big
// to make our repeat hash table
int32_t maxWords = 0;
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// skip if not visible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// get it
Msg20* thisMsg20 = m_msg20[i];
// must be there! wtf?
if ( ! thisMsg20 ) { char *xx=NULL;*xx=0; }
// make sure the summary is not in a foreign language (aac)
//if (thisMsg20) {
// unsigned char sLang;
// sLang = thisMsg20->m_r->m_summaryLanguage;
// if (language != langUnknown && sLang != language)
// continue;
//};
// . get the sample as provided by XmlDoc::getMsg20Reply()
// calling XmlDoc::getGigabitSample() for each docid in
// the search results
// . the sample is a bunch of text snippets surrounding the
// query terms in the doc in the search results
Msg20Reply *reply = thisMsg20->getReply();
char *sample = reply->ptr_gigabitSample;
int32_t slen = reply->size_gigabitSample;
// but if doing metas, get the display content as the sample
//char *next = thisMsg20->getDisplayBuf();
//if ( tg->m_meta[0] && next )
// sample = thisMsg20->getNextDisplayBuf(&slen,&next);
// set parser vars
char *p = sample;
char *pend = sample + slen;
int32_t sampleWords = 0;
//int32_t numExcerpts = 0;
while ( p < pend ) {
// buffer is \0 separated text snippets
int32_t plen = gbstrlen (p);
sampleWords += countWords( p,plen);
// advance to next excerpt
p += plen + 1;
//if ( debug ) numExcerpts++;
};
if (maxWords + sampleWords > 0x08000000) {
log("gbits: too many words in samples. "
"Discarding the remaining samples "
"(maxWords=%"INT32")", maxWords);
char *xx=NULL;*xx=0;
}
// the thing we are counting!!!!
maxWords += sampleWords;
}
//
// hash table for repeated fragment detection
//
// make it big enough so there are gaps, so chains are not too long
int32_t minBuckets = (int32_t)(maxWords * 1.5);
if(minBuckets < 512) minBuckets = 512;
int32_t numSlots = 2 * getHighestLitBitValue ( minBuckets ) ;
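// sizing example (made-up numbers): maxWords=1000 gives
// minBuckets=1500, the highest lit bit value of 1500 is 1024, so
// numSlots = 2*1024 = 2048... a power of two comfortably above the
// word count, keeping the hash chains short.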
// return -1 with g_errno set on error
HashTableX repeatTable;
if ( ! repeatTable.set(8,4,numSlots,NULL , 0, false,niceness,"gbbux"))
return false;
//
// only allow one gigabit sample per ip?
//
HashTableX iptable;
if ( tg->m_ipRestrict ) {
int32_t ns = m_msg3a.m_numDocIds * 4;
if ( ! iptable.set(4,0,ns,NULL,0,false,niceness,"gbit") )
return false;
}
//
// space for all vectors for deduping samples that are 80% similar
//
SafeBuf vecBuf;
int32_t vneed = m_msg3a.m_numDocIds * SAMPLE_VECTOR_SIZE;
if ( tg->m_dedupSamplePercent >= 0 && ! vecBuf.reserve ( vneed ) )
return false;
//
//
// . the master hash table for scoring gigabits
// . each slot is a class "Gigabit"
//
//
HashTableX master;
int32_t bs = sizeof(Gigabit);
// key is a 64-bit wordid hash from Words.cpp
if ( ! master.set ( 8 , bs , 20000,NULL,0,false,niceness,"mgbt") )
return false;
//
// now combine all the nouns and noun phrases into one big hash
// table and collect the top 10 topics
//
QUICKPOLL(niceness);
int32_t numDocsProcessed = 0;
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// skip if not visible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// get it
Msg20* thisMsg20 = m_msg20[i];
// must be there! wtf?
if ( ! thisMsg20 ) { char *xx=NULL;*xx=0; }
// make sure the summary is not in a foreign language (aac)
//if (thisMsg20) {
// unsigned char sLang;
// sLang = thisMsg20->m_r->m_summaryLanguage;
// if(language!=langUnknown && sLang != language) continue;
//};
Msg20Reply *reply = thisMsg20->getReply();
// skip if from an ip we already did
if ( tg->m_ipRestrict ) {
int32_t ipd = ipdom ( reply->m_firstIp );
// zero is invalid!
if ( ! ipd ) continue;
//log("url=%s",thisMsg20->getUrl());
if ( iptable.isInTable(&ipd) ) {
//log("dup=%s",thisMsg20->getUrl());
continue;
}
// now we also check domain
Url uu;
uu.set ( reply->ptr_ubuf, reply->size_ubuf-1);
// "mid dom" is the "ibm" part of ibm.com or ibm.de
char *dom = uu.getMidDomain();
int32_t dlen = uu.getMidDomainLen();
if ( dom && dlen > 0 ) {
int32_t h = hash32 ( dom , dlen );
if ( iptable.isInTable(&h) ) continue;
iptable.addKey (&h);
}
// add ip
iptable.addKey ( &ipd );
}
// continue; // mdw
// count it
numDocsProcessed++;
// . hash it into the master table
// . this may alloc st->m_mem, so be sure to free below
hashSample ( q,
&master,
tg ,
&vecBuf,
thisMsg20,
&repeatTable,
m_si->m_debugGigabits);
// ignore errors
g_errno = 0;
}
// debug msg
/*
for ( int32_t i = 0 ; i < nt ; i++ ) {
int32_t score = master->getScoreFromTermNum(i) ;
if ( ! score ) continue;
char *ptr = master->getTermPtr(i) ;
int32_t len = master->getTermLen(i);
char ff[1024];
if ( len > 1020 ) len = 1020;
memcpy ( ff , ptr , len );
ff[len] = '\0';
// we can have html entities in here now
//if ( ! is_alnum(ff[0]) ) { char *xx = NULL; *xx = 0; }
log("%08"INT32" %s",score,ff);
}
*/
// how many do we need?
//int32_t need = tg->m_maxTopics ;
SafeBuf gigabitPtrBuf;
int32_t need = master.m_numSlotsUsed * sizeof(Gigabit *);
if ( ! gigabitPtrBuf.reserve ( need ) ) return false;
//int32_t minScore = 0x7fffffff;
//int32_t minj = -1;
int32_t i ;
for ( i = 0 ; i < master.m_numSlots ; i++ ) {
// skip if empty
if ( master.isEmpty(i) ) continue;
// get it
Gigabit *gb = (Gigabit *)master.getValueFromSlot(i);
// skip term #i from "table" if it has 0 score
//int32_t score = master.m_scores[i]; // getScoreFromTermNum(i) ;
//if ( ! score ) continue;
// skip if 0 score i guess
//if ( ! gb->m_qrt ) continue;
// . make it higher the more popular a term is
// . these are based on a MAXPOP of 10000
//int32_t mdc = (int32_t)((((double)numDocsProcessed * 3.0 *
// (double)(gb->m_gbpop&0x7fffffff))+0.5)/
// MAXPOP);
//if ( mdc < tg->m_minDocCount ) mdc = tg->m_minDocCount;
// skip if does not meet the min doc count
if ( gb->m_numPages < tg->m_minDocCount ) continue;
// set the min of all in our list
//if ( score < minScore ) { minScore = score; minj = np; }
// i've seen this become NULL at line 753 on gb1 below for
// /search?code=mammaXbG&uip=12.41.126.39&n=15&raw=8&q=
// manhattan,+ny
// so let's try it again and try to find out why maybe
if ( gb->m_termLen <= 0 ) {
char *orig = "";
if ( q ) orig = q->m_orig;
log (LOG_LOGIC,"query: Got 0 length gigabit. q=%s",
orig);
continue;
}
// recalc the score
//double frac1 = ((MAXPOP-(pops[i]&0x7fffffff))*100.0)/MAXPOP;
//double frac2 = ((double)count * 100.0) / (double)sampled;
//score = (int32_t)((frac1 * frac2) / 100.0);
// we got a winner
gigabitPtrBuf.pushPtr(gb);
}
//
//
// sort the gigabit ptrs
//
//
Gigabit **ptrs = (Gigabit **)gigabitPtrBuf.getBufStart();
int32_t numPtrs = gigabitPtrBuf.length() / sizeof(Gigabit *);
gbqsort ( ptrs , numPtrs , sizeof(Gigabit *) , gigabitCmp , 0 );
// we are done if not deduping
if ( ! tg->m_dedup ) goto skipdedup;
// . scan the gigabits
// . now remove similar terms from the gigabits
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
// get it
Gigabit *gi = ptrs[i];
// skip if nuked already
if ( gi->m_termLen == 0 ) continue;
// scan down to this score, but not below
//int32_t minScore = scores[i] - 25;
// if we get replaced by a longer guy, remember him
//int32_t replacerj = -1;
// . a longer term that encapsulates us can eliminate us
// . or, if we're the longer, we eliminate the shorter
for ( int32_t j = i + 1 ; j < numPtrs ; j++ ) {
// get it
Gigabit *gj = ptrs[j];
// skip if nuked already
if ( gj->m_termLen == 0 ) continue;
// wtf?
if ( gj->m_termId64 == gi->m_termId64 ) {
char *xx=NULL; *xx=0; }
// if page count not the same let it coexist
if ( gi->m_numPages != gj->m_numPages )
continue;
// if we are the shorter, nuke the longer guy
// that contains us because we have a higher score
// since ptrs are sorted by score then length.
if ( gi->m_termLen < gj->m_termLen ) {
// just null term the longer
char c1 = gi->m_term[gi->m_termLen];
gi->m_term[gi->m_termLen] = '\0';
char c2 = gj->m_term[gj->m_termLen];
gj->m_term[gj->m_termLen] = '\0';
// if shorter is contained
char *s;
s = gb_strcasestr (gj->m_term, gi->m_term);
// un-null term longer
gi->m_term[gi->m_termLen] = c1;
gj->m_term[gj->m_termLen] = c2;
// even if he's longer, if his score is too
// low then he cannot nuke us
// MDW: try doing page count!
//if ( scores[j] < minScore ) continue;
// if we were NOT contained by someone below...
if ( ! s ) continue;
// just punish, and resort by score later.
// TODO: ensure cannot go negative!
//gj->m_numPages -= gi->m_numPages;
// he's gotta be on all of our pages, too
//if ( ! onSamePages(i,j,slots,heads,pages) )
// continue;
// debug it
if ( m_si->m_debugGigabits ) {
SafeBuf msg;
msg.safePrintf("gbits: gigabit \"");
msg.safeMemcpy(gi->m_term,
gi->m_termLen);
msg.safePrintf("\"[%.0f] *NUKES0* \"",
gi->m_gbscore);
msg.safeMemcpy(gj->m_term,
gj->m_termLen);
msg.safePrintf("\"[%.0f]",
gj->m_gbscore);
logf(LOG_DEBUG,"%s",msg.getBufStart());
}
// shorter gets our score (we need to sort)
// not yet! let him finish, then replace him!!
//replacerj = j;
gj->m_termLen = 0;
// see if we can nuke other guys at least
//continue;
}
else {
// just null term the longer
char c1 = gi->m_term[gi->m_termLen];
gi->m_term[gi->m_termLen] = '\0';
char c2 = gj->m_term[gj->m_termLen];
gj->m_term[gj->m_termLen] = '\0';
// . otherwise, we are the longer
// . we can nuke any shorter one below us, all
// scores
char *s;
s = gb_strcasestr ( gi->m_term,gj->m_term );
// un-null term
gi->m_term[gi->m_termLen] = c1;
gj->m_term[gj->m_termLen] = c2;
// keep going if no match
if ( ! s ) continue;
// just punish, and resort by score later.
// TODO: ensure cannot go negative!
//gj->m_numPages -= gi->m_numPages;
// debug it
if ( m_si->m_debugGigabits ) {
SafeBuf msg;
msg.safePrintf("gbits: gigabit \"");
msg.safeMemcpy(gi->m_term,
gi->m_termLen);
msg.safePrintf("\"[%.0f] *NUKES1* \"",
gi->m_gbscore);
msg.safeMemcpy(gj->m_term,
gj->m_termLen);
msg.safePrintf("\"[%.0f]",
gj->m_gbscore);
logf(LOG_DEBUG,"%s",msg.getBufStart());
}
// remove him if we contain him
gj->m_termLen = 0;
}
}
/*
// if we got replaced by a longer guy, he replaces us
// and takes our score
if ( replacerj >= 0 ) {
// gigabit #i is now gigabit #j
Gigabit *gj = ptrs[replacerj];
// debug it
SafeBuf msg;
msg.safePrintf("msg40: replacing gigabit \"");
msg.safeMemcpy(gi->m_term,gi->m_termLen);
msg.safePrintf("\"[%.0f] *WITH2* \"",gi->m_gbscore);
msg.safeMemcpy(gj->m_term,gj->m_termLen);
msg.safePrintf("\"[%.0f]",gj->m_gbscore);
logf(LOG_DEBUG,msg.getBufStart());
// make us longer then!
gi->m_termLen = gj->m_termLen;
// and nuke him
gj->m_termLen = 0;
}
*/
}
// remove common phrases
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
// get it
Gigabit *gi = ptrs[i];
// skip if nuked already
if ( gi->m_termLen == 0 ) continue;
// shortcut
char *s = gi->m_term;
int32_t slen = gi->m_termLen;
// compare
if (!strncasecmp(s, "all rights reserved",slen) ||
!strncasecmp(s, "rights reserved" ,slen) ||
!strncasecmp(s, "in addition" ,slen) ||
!strncasecmp(s, "for example" ,slen) ||
!strncasecmp(s, "in order" ,slen) ||
!strncasecmp(s, "in fact" ,slen) ||
!strncasecmp(s, "in general" ,slen) ||
!strncasecmp(s, "contact us" ,slen) ||
!strncasecmp(s, "at the same time" ,slen) ||
!strncasecmp(s, "http" ,slen) ||
!strncasecmp(s, "html" ,slen) ||
!strncasecmp(s, "s " ,slen) ||
!strncasecmp(s, "for more information",slen))
gi->m_termLen = 0;
}
	// now after longer topics replaced the shorter topics which they
	// contained, remove the longer topics if they have too many words
	// or stray punctuation
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
// get it
Gigabit *gi = ptrs[i];
// skip if nuked already
if ( gi->m_termLen == 0 ) continue;
// set the words to this gigabit
char *s = gi->m_term;
int32_t slen = gi->m_termLen;
Words w;
w.setx ( s , slen , 0 );
int32_t nw = w.getNumWords();
// . does it have comma? or other punct besides an apostrophe?
		// . we allow gigabit phrases to incorporate a long stretch
// of punct... only before the LAST word in the phrase,
// that way our overlap removal still works well.
bool hasPunct = false;
for ( int32_t k = 0 ; k < slen ; k++ ) {
if ( ! is_punct_a(s[k]) ) continue;
			// apostrophe is ok as long as alnum follows
if ( s[k] == '\'' &&
is_alnum_a(s[k+1]) ) continue;
			// . period ok, as long as space or alnum follows
// . if space follows, then an alnum must follow that
// . same goes for colon
QUICKPOLL(niceness);
// . for now, until we get abbreviations working,
// alnum must follow period
if ( (s[k] == '.' || s[k] == ':' ) &&
( is_alnum_a(s[k+1]) ||
			     // accept single initial before the period, too
(s[k+1] ==' ' && is_alnum_a(s[k+2])
&& k>=2 && s[k-2]==' ')))
continue;
// comma is ok if surrounded by digits
if ( (s[k] == ',' &&
is_digit(s[k-1]) &&
is_digit(s[k+1]) )) continue;
// percent is ok
if ( s[k] == '%' ) continue;
if ( s[k] == '&' ) continue;
if ( s[k] == '@' ) continue;
if ( s[k] == '-' ) continue;
//if ( s[k] == '(' ) continue;
//if ( s[k] == ')' ) continue;
hasPunct = true;
break;
}
// keep it if words are under limit
		// and has no disallowed punct
if ( nw <= 2 * tg->m_maxWordsPerTopic -1 && ! hasPunct )
continue;
// remove it!!!
gi->m_termLen = 0;
}
// resort!! put termLen = 0 at end!
//gbqsort ( ptrs , numPtrs , sizeof(Gigabit *) , gigabitCmp , 0 );
// fucking, done, just use the ptrs!!!
//m_gigabitPtrsValid = true;
// return ptr to the safebuf
//return &m_gigabitPtrs;
skipdedup:
int32_t stored = 0;
	// now copy the top winning gigabits into the safebuf
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
// get it
Gigabit *gi = ptrs[i];
// skip if nuked already
if ( gi->m_termLen == 0 ) continue;
// store it
m_gigabitBuf.safeMemcpy ( gi , sizeof(Gigabit) );
// stop at 100 i guess
if ( ++stored >= 100 ) break;
}
return true;
}
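//
// Illustrative sketch of the containment pass above (terms and scores
// are made up): suppose the sorted ptrs hold gi = "search engine" [900]
// at i and gj = "search" [400] at some j > i. gi is the longer term, so
// the else-branch null-terminates both and gb_strcasestr(gi->m_term,
// gj->m_term) finds "search" inside "search engine". gj->m_termLen is
// then set to 0, and the copy loop after skipdedup: drops any gigabit
// whose m_termLen is 0.
//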
void hashExcerpt ( Query *q ,
// local gigabits table
HashTableX *tt ,
// the excerpt
Words &ww,
TopicGroup *tg ,
HashTableX *repeatTable ,
Msg20 *thisMsg20 ,
bool debugGigabits );
// . returns false and sets g_errno on error
// . here's the tricky part
bool hashSample ( Query *q,
HashTableX *master,
TopicGroup *tg ,
SafeBuf *vecBuf,
Msg20 *thisMsg20 ,
HashTableX *repeatTable ,
bool debugGigabits ) {
// numTerms must be less than this
//if ( q && q->m_numTerms > MAX_QUERY_TERMS )
// return log("gbits: Too many query terms for "
// "topic generation.");
Msg20Reply *reply = thisMsg20->getReply();
// get the ith big sample
char *bigSampleBuf = reply->ptr_gigabitSample;
int32_t bigSampleLen = reply->size_gigabitSample;
// but if doing metas, get the display content
//char *next = thisMsg20->getDisplayBuf();
// but if doing metas, get the display content
//if ( tg->m_meta[0] && next)
// bigSampleBuf=thisMsg20->getNextDisplayBuf(&bigSampleLen,&next);
// skip if empty
if ( bigSampleLen<=0 || ! bigSampleBuf ) return true;
// the docid
int64_t docId = reply->m_docId;
//int64_t start = gettimeofdayInMilliseconds();
//
// termtable. for hashing all excerpts in a sample
//
HashTableX localGigabitTable;
int32_t bs = sizeof(Gigabit);
if ( ! localGigabitTable.set(8,bs,20000,NULL,0,false, 0,"gbtrmtbl") ) {
log("gbits: Had error allocating a table for topic "
"generation: %s.",mstrerror(g_errno));
return true;
}
	//---> a word next to both query terms should not be beaten by a
	//     word just next to one...
//---> weight by query popularity too!
//log("******** hashing doc *********");
HashTableX simTable;
char tmpBuf[41000];
simTable.set(4,0,8192,tmpBuf,41000,false,0,"simtbl");
// store our elements into here
//char vstack[10000];
//int32_t vneed = nw * 8;
//SafeBuf vbuf(vstack,10000);
//if ( ! vbuf.reserve ( vneed ) ) return true;
SafeBuf vbuf;
// TODO: make this better
if ( ! vbuf.reserve ( 10000 * 8 ) ) return true;
//
// NOTE: now we have only a sample and excerpts are separated
// with |'s
//
// hash each excerpt
char *p = bigSampleBuf;
	// most samples are under 5k, i've seen a 32k sample take 11ms!
char *pend = p + bigSampleLen;
while ( p < pend ) {
// debug
//log("docId=%"INT64" EXCERPT=%s",docId,p);
int32_t plen = gbstrlen(p);
// parse into words
Words ww;
ww.setx ( p, plen, 0);// niceness
// advance to next excerpt
p += plen + 1;
// p is only non-NULL if we are doing it the old way
hashExcerpt ( q,
&localGigabitTable,
ww,
tg,
repeatTable ,
thisMsg20 ,
debugGigabits );
// skip if not deduping
if ( tg->m_dedupSamplePercent <= 0 ) continue;
// make a vector out of words
int64_t *wids = ww.getWordIds();
int32_t nw = ww.getNumWords();
for ( int32_t i = 0 ; i < nw ; i++ ) {
// make it this
uint32_t widu = (uint64_t)(wids[i]);
			// do not allow this! zero is a vector terminator
if ( widu == 0 ) widu = 1;
// skip if already added to vector
if ( simTable.isInTable(&widu) ) continue;
// store that as a vector component
if ( ! vbuf.pushLong(widu) ) return false;
// make sure we do not dedup
if ( ! simTable.addKey(&widu) ) return false;
}
}
// sort 32-bit word ids from whole sample. niceness = 0
vbuf.sortLongs(0);
// make sure under (128-4) bytes...
vbuf.truncLen(((int32_t)SAMPLE_VECTOR_SIZE) - 4);
// make last int32_t a 0
vbuf.pushLong(0);
	// . compute the fingerprint/similarityVector from this table
// the same way we do for documents for deduping them at query time
// . or we could just wait for our dedup algo to kick in... (mdw)
// then comment this stuff out ...
if ( tg->m_dedupSamplePercent > 0 ) {
// store it there
//SafeBuf sampleVec;
//getSampleVector ( bigSample , bigSampleLen , &sampleVec );
// point to it
char *v1 = vbuf.getBufStart();
// get # stored so far
int32_t numVecs = vecBuf->length() / (int32_t)SAMPLE_VECTOR_SIZE;
char *v2 = vecBuf->getBufStart();
// see if our vector is too similar
for ( int32_t i = 0 ; i < numVecs ; i++ ) {
char ss;
ss = g_clusterdb.getSampleSimilarity(v1,v2,
SAMPLE_VECTOR_SIZE);
v2 += SAMPLE_VECTOR_SIZE;
// return true if too similar to another sample we did
if ( ss >= tg->m_dedupSamplePercent ) { // 80 ) {
localGigabitTable.reset();
log(LOG_DEBUG,"gbits: removed dup sample.");
return true;
}
}
// add our vector to the array
vecBuf->safeMemcpy(v1,(int32_t)SAMPLE_VECTOR_SIZE);
}
//log("TOOK %"INT64" ms plen=%"INT32"",gettimeofdayInMilliseconds()-start,
// bufLen);
//log("have %"INT32" terms in termtable. adding to master.",
// tt.getNumTermsUsed());
// . now hash the entries of this table, tt, into the master
// . the master contains entries from all the other tables
int32_t nt = localGigabitTable.getNumSlots();
//int32_t pop = 0 ;
for ( int32_t i = 0 ; i < nt ; i++ ) {
// skip if empty
if ( localGigabitTable.isEmpty(i) ) continue;
// get it
Gigabit *gc = (Gigabit *)localGigabitTable.getDataFromSlot(i);
// this should be indented
if ( ! gc->m_gbscore ) continue;//tt.m_scores[i] ) continue;
//int32_t ii = (int32_t)tt.getTermPtr(i);
// then divide by that
//int32_t score =gc->m_scoreFromTermNum;//tt.getScoreFromTermNum(i
// watch out for 0
//if ( score <= 0 ) continue;
// get termid
int64_t termId64 = *(int64_t *)localGigabitTable.getKey(i);
// . get the bucket
// . may be or may not be full (score is 0 if empty)
//int32_t n = master->getTermNum ( tt.getTermId(i) );
Gigabit *mg = (Gigabit *)master->getValue(&termId64);
// skip if 0, i've seen this happen before
//if ( tt.getTermId(i) == 0 ) continue;
//if ( returnPops ) pop = tt.m_pops[i];
// set hi bit of "pop" if in unicode
//if ( isUnicode ) pop |= 0x80000000;
//else pop &= 0x7fffffff;
//pop &= 0x7fffffff;
Gigabit gbit;
Gigabit *pg;
// if already there... inc the score i guess
if ( mg ) {
// if already seen it for this docid skip?
if ( mg->m_lastDocId == docId ) continue;
// first time for this docid
mg->m_numPages++;
mg->m_gbscore += gc->m_gbscore;
mg->m_lastDocId = docId;
pg = mg;
}
else {
// . add term to master table
gbit.m_term = gc->m_term;
gbit.m_termLen = gc->m_termLen;
gbit.m_numPages = 1;
gbit.m_gbscore = gc->m_gbscore;
gbit.m_lastDocId = docId;
gbit.m_termId64 = termId64;
gbit.m_minPop = gc->m_minPop;
gbit.m_numWords = gc->m_numWords;
// zero out
memset ( gbit.m_wordIds , 0 , MAX_GIGABIT_WORDS*8);
// sanity
if ( gc->m_numWords > MAX_GIGABIT_WORDS ) {
char*xx=NULL;*xx=0;}
memcpy((char *)gbit.m_wordIds,
(char *)gc->m_wordIds,
gc->m_numWords * 8 );
if ( ! master->addKey ( &termId64, &gbit ) )
return false;
pg = &gbit;
}
// debug msg
//if ( ! g_conf.m_logDebugQuery ) continue;
if ( ! debugGigabits ) continue;
char *ww = pg->m_term;
int32_t wwlen = pg->m_termLen;
char c = ww[wwlen];
ww[wwlen]='\0';
logf(LOG_DEBUG,"gbits: master "
"termId=%020"UINT64" "
"d=%018"INT64" "
"score=%7.1f "
"cumscore=%7.1f "
"pages=%"INT32" "
"len=%02"INT32" term=%s",
termId64,
docId,
gc->m_gbscore, // this time score
pg->m_gbscore, // cumulative score
pg->m_numPages,
wwlen,
ww);
ww[wwlen]=c;
}
//log("master has %"INT32" terms",master.getNumTermsUsed());
// clear any error
if ( g_errno ) {
log("gbits: Had error getting topic candidates from "
"document: "
"%s.",mstrerror(g_errno));
g_errno = 0;
}
//mfree ( buf , bufMaxLen , "Msg24" );
return true;
}
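//
// Worked example of the master-table merge above (docids and scores are
// hypothetical): say docid 123's local table yields "climate change"
// with m_gbscore 40, and the master already holds that termId64 from
// docid 77 with m_gbscore 55 and m_numPages 1. The merge finds mg, sees
// mg->m_lastDocId != 123, so it bumps m_numPages to 2, accumulates the
// score (95 total) and records m_lastDocId = 123 so a second excerpt
// from the same doc cannot double-count the page.
//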
class WordInfo {
public:
// popularity
int32_t m_wpop;
// is query term?
bool m_isQueryTerm;
	// rank from isCommonWord(); 0 if not common. int32_t rather than
	// bool so the 1-5 ranks tested in hashExcerpt() survive.
	// (do not let frags end in these words)
	int32_t m_isCommonWord;
// the raw QTR scores (aac)
float m_proxScore;//qtr;
// a hash for looking up in the popularity dictionary
//int64_t dwid64;
// . from 0 to 100. 100 means not repeated.
// . set in setRepeatScores() function
char m_repeatScore;
};
void setRepeatScores ( Words *ww ,
WordInfo *wis,
HashTableX *repeatTable ) ;
void hashExcerpt ( Query *q ,
HashTableX *localGigabitTable ,
Words &words,
TopicGroup *tg ,
HashTableX *repeatTable ,
Msg20 *thisMsg20 ,
bool debugGigabits ) {
// . bring it out
// . allow one more word per gigabit, then remove gigabits that
// are that length. this fixes the problem of having the same
// sentence repeated in different documents, which are fairly
// different as a whole, but have the same repeated sentence or
// paragraph.
// . by only adding one, if the next word is a common word then
// we would fail to make a larger gigabit, that's why i added
// the maxjend code below this.
int32_t maxWordsPerPhrase = tg->m_maxWordsPerTopic ;
if ( tg->m_topicRemoveOverlaps ) maxWordsPerPhrase += 2;
//char enforceQueryRadius = ! tg->m_meta[0];
char delimeter = tg->m_delimeter; // 0 means none (default)
//char idf = tg->m_useIdfForTopics;
// or if no query, no query radius
//if ( ! q || q->getNumNonFieldedSingletonTerms() == 0 )
// enforceQueryRadius = false;
// . now all the data is in buf/bufLen
// . parse it up into Words
// . now XmlDoc::getGigabitVector() calls us and it already has the
	//   Words parsed up, so it will use a NULL buf
int32_t nw = words.getNumWords();
	// don't breach our arrays man
//if ( nw > 10000 ) nw = 10000;
Msg20Reply *reply = thisMsg20->getReply();
unsigned char lang = reply->m_language;
//
//
// additional info for each word
//
//
SafeBuf wibuf;
int32_t need = nw * sizeof(WordInfo);
if ( ! wibuf.reserve ( need ) ) {
log("gigabits: could not allocate local buffer "
"(%"INT32" bytes required)", need);
return;
}
WordInfo *wis = (WordInfo *)wibuf.getBufStart();
//
//
// . where does each query term occur in the doc?
// . record each query term's word position into the PosInfo array
//
//
class PosInfo {
public:
int32_t m_pos[1000];
int32_t m_posLen;
int32_t m_posPtr;
};
SafeBuf posBuf;
int32_t need2 = MAX_QUERY_TERMS * sizeof(PosInfo);
posBuf.setLabel("m40posbuf");
if ( ! posBuf.reserve ( need2 ) ) {
log("gigabits: could not allocate 2 local buffer "
"(%"INT32" bytes required)", need2);
return;
}
PosInfo *pis = (PosInfo *)posBuf.getBufStart();
for (int32_t i = 0; i < q->m_numTerms ; i++) {
pis[i].m_posLen = 0;
pis[i].m_posPtr = 0;
}
// start parsing at word #0 in the excerpt
int32_t i = 0;
// skip punct at beginning of excerpt
if ( i < nw && words.isPunct(i) ) i++;
// this is aac's thing...
if ( i < nw ) wis[i].m_proxScore = 0.0;
// . now we keep a hash table to zero out repeated fragments
// . it uses a sliding window of 5 words
// . it stores the hash of those 5 words in the hash table
	// . it sees how many 5-word matches it gets in a row
// . the more matches it gets, the more it demotes the word scores
// in that span of 5 words
// . these are stored in the weights class
// . a repeatScore of 0 means to demote it out completely, 100 means
// it is not repeated at all
// . multiply the final gigabit score by the repeatScore/100.
// . this function sets WordInfo::m_repeatScore
// . each word in the excerpt is 1-1 with the WordInfos
setRepeatScores ( &words , wis, repeatTable );
// record the positions of all query words
char **wp = words.m_words;
int32_t *wlen = words.m_wordLens;
int64_t *wids = words.getWordIds();
// loop over the words in our EXCERPT
for ( ; i < nw ; i++ ) {
// get associated WordInfo class
WordInfo *wi = &wis[i];
// aac's thing
wi->m_proxScore = 0.0;
// skip if not indexable
if ( ! wids[i] ) continue;
// skip if repeated too much according to setRepeatScores()
if ( wi->m_repeatScore <= 20 ) continue;
// reset popularity
//if ( idf ) wi->m_wpop = -1;
// assume all same if not using idf
//else wi->m_wpop = 1;
// assume this word is not in the query
wi->m_isQueryTerm = 0;
// reset
wi->m_wpop = -1;
// debug point
//if ( strncmp( wp[i],"This",4) == 0 )
// log("hey");
// store the id
//wi->m_dwid64 = hash64d(wp[i], wlen[i] );
// . is it a common word?
// . it is if it is just one letter
// . what about X-windows coming up for a 'windows' query?
// or e-mail coming up for a query?
// . METALINCS likes to have 1 digit topics
if ( wlen[i] <= 1 && is_lower_a(wp[i][0]) )
wi->m_isCommonWord = 1;
// 2004 is common here but if it makes it in, don't remove it
// in the top topics list... no. loses 'atari 2600' then!
//else if ( is_digit(ww.getWord(i)[0]) )
// icw[i] = 1;
//#ifndef _METALINCS_
else wi->m_isCommonWord = isCommonWord ( wids[i] );
//#else
// always allow gigabits that start with numbers for metalincs
//else if ( ! is_digit(wp[i][0]))
// wi->m_isCommonWord = isCommonWord ( (int32_t)wids[i] );
//else
// wi->m_isCommonWord = 0;
//#endif
// debug msg
/*
char *s = ww.getWord(i);
int32_t slen = ww.getWordLen(i);
char c = s[slen];
s[slen]='\0';
log("icw=%"INT32" %s",icw[i],s);
s[slen]=c;
*/
		// is it a query term? if so, record its word # in "pos" array
int32_t nt = q->m_numTerms;
for ( int32_t j = 0 ; j < nt ; j++ ) {
// get query term #j
QueryTerm *qt = &q->m_qterms[j];
// does word #i match query word id #j? skip if not.
if ( wids[i] != qt->m_hash64d ) continue;
// get vector for query word #j
PosInfo *pi = &pis[j];
			// skip if already have 1000 occurrences of this term
if ( pi->m_posLen >= 1000 ) continue;
// add this query term # into our m_pos vector
pi->m_pos[pi->m_posLen] = i;
pi->m_posLen++;
// mark this word so if a phrase only has
// all query terms we do not hash it
wi->m_isQueryTerm = 1;
break;
}
}
//
//
// done scanning words in excerpt
//
//
// max score -- ONE max scoring hits per doc
//int32_t maxScore = nqi * MAX_SCORE_MULTIPLIER;
// this happens when generating the gigabit vector for a single doc
// so don't hamper it to such a small ceiling
//if ( nqi == 0 ) maxScore = ALT_MAX_SCORE;
// reset cursor to word #0 in excerpt again
i = 0;
// skip initial punct and spaces
if ( i < nw && words.isPunct(i) ) i++;
// score each word based on distance to query terms
//float score;
//
//
// loop through all the words again and set WordInfo::m_proxScore
// and WordInfo::m_wpop
//
//
for ( ; i < nw ; i++ ) {
// debug point
//if ( strncasecmp( wp[i],"Microsoft",9) == 0 )
// log("hey");
// do we have pre-supplied words and scores from XmlDoc.cpp?
//if ( wids ) {
// skip if not indexable
if ( ! wids[i] ) continue;
		// shortcut
WordInfo *wi = &wis[i];
// skip if in a repeat chunk of doc
if ( wi->m_repeatScore <= 20 ) continue;
// protect against misspelled html entities (aac)
if ( (wp[i][-1] == '&' && is_alnum_a(wp[i][0])) ||
(wp[i][0] == '&' && is_alnum_a(wp[i][1])) )
continue;
// no more one or two letter gigabits (aac)
if ( wlen[i] < 3 && (! is_digit(wp[i][0])) ) continue;
//continue; //mdw
// if we had a delimeter, previous word must have it
// or be the first punct word
if ( delimeter && i >= 2 && ! words.hasChar(i-1,delimeter) )
continue;
// skip if a query term, it's ineligible
//if ( ww.getWordLen(i) == 0 ) continue;
// if query is NULL, assume we are restricting to meta tags
// and query is not necessary
//if ( enforceQueryRadius ) score = 0;
//else score = ALT_START_SCORE;
int32_t j ;
// number of matches
int32_t nm = 0;
// how close is the word to the query terms? base
// the proxScore on that.
float proxScore = 0.0;
// loop over the # of matchable words in the query
for ( j = 0 ; j < q->m_numTerms ; j++ ) {
// get the vector that has what word #'s in the
// excerpt that query word #j matches
PosInfo *pe = &pis[j];
// skip query word #j if does not match ANY words
// in this excerpt...
if ( pe->m_posLen <= 0 ) continue;
			// get the jth query term we match then
QueryTerm *qt = &q->m_qterms[j];
// zero for this term
float score = 0.0;
// get distance in words
//int32_t d1 = i - pos[ 1000 * j + posPtr[j] ] ;
// . posPtr is like a cursor into our m_pos array
// that has the word #'s that this query word
// matches in the excerpt
// . "d1" is distance in words from word #i to
// the next closest query term
int32_t d1 = i - pe->m_pos[pe->m_posPtr];
// if word #i is BEFORE this matching word in the
// excerpt, flip the sign
if ( d1 < 0 ) d1 = d1 * -1;
//
			// if the matching word is the last occurrence
			// of that word...
//
if ( pe->m_posPtr + 1 >= pe->m_posLen ) {
// if too far apart, go to next query term
if (d1 >= QTR_ZONE_3) continue;
if ( wi->m_isQueryTerm ||
wi->m_isCommonWord ||
wlen[i] <= 3) {
				// common word, query terms, short words
// are all second class citizens when it
// comes to scoring: they get a small
// bonus, to ensure that they are
// considered in the next stage, but do not
// benefit from QPOP and multiple hit
// bonuses (aac)
//score = QTR_BONUS_CW;
//proxScore += score;
continue;
};
			// QTR_ZONE_0 is the tightest radius
if (d1 < QTR_ZONE_0)
score = QTR_BONUS_0;
else if (d1 < QTR_ZONE_1)
score = QTR_BONUS_1;
else if (d1 < QTR_ZONE_2)
score = QTR_BONUS_2;
else
score = QTR_BONUS_3;
// increment the # of matches
nm++;
// multiplier based on query word popularity
score *= qt->m_popWeight;//qpops[j];
proxScore += score;
continue;
}
//
// look at the following match
//
//int32_t d2 = pos[ 1000 * j + posPtr[j] + 1 ] - i ;
			// look at the next occurrence of query term #j
// in the excerpt and get dist from us to it
int32_t d2 = pe->m_pos[pe->m_posPtr + 1] - i;
// make it positive
if ( d2 < 0 ) d2 = d2 * -1;
// if we are closer to the current matching word
// then set score for that...
if ( d1 <= d2 ) {
// if ( d1 >=20 ) continue;
// if ( d1 < 4 ) score += 1000;
// else if ( d1 < 8 ) score += 800;
// else if ( d1 < 12 ) score += 500;
// else score += 200;
// nm++;
// score *= qpops[j];
// continue;
if (d1 >= QTR_ZONE_3) continue;
if ( wi->m_isQueryTerm ||
wi->m_isCommonWord ||
wlen[i] <= 3) {
				// common word, query terms, short words
// are all second class citizens when it
// comes to scoring: they get a small
// bonus, to ensure that they are
// considered in the next stage, but do not
// benefit from QPOP and multiple hit
// bonuses (aac)
//score = QTR_BONUS_CW;
continue;
};
if (d1 < QTR_ZONE_0)
score = QTR_BONUS_0;
else if (d1 < QTR_ZONE_1)
score = QTR_BONUS_1;
else if (d1 < QTR_ZONE_2)
score = QTR_BONUS_2;
else
score = QTR_BONUS_3;
nm++;
score *= qt->m_popWeight;//qpops[j];
proxScore += score;
continue;
}
//
//
			// otherwise, we are closer to the next occurrence!!
			// be sure to inc its posPtr cursor then
//
//
// i think it is safe to increment this here now
// because we are closer to the word position
			// m_pos[m_posPtr+1] than m_pos[m_posPtr].
pe->m_posPtr++;
// if radius is too big... no score increase
if (d2 >= QTR_ZONE_3)
continue;
if ( wi->m_isQueryTerm ||
wi->m_isCommonWord ||
wlen[i] <= 3) {
				// common word, query terms, short words
// are all second class citizens when it
// comes to scoring: they get a small
// bonus, to ensure that they are
// considered in the next stage, but do not
// benefit from QPOP and multiple hit
// bonuses (aac)
//score = QTR_BONUS_CW;
//proxScore += score;
continue;
}
// give out some score bonuses
if (d2 < QTR_ZONE_0) score = QTR_BONUS_0;
else if (d2 < QTR_ZONE_1) score = QTR_BONUS_1;
else if (d2 < QTR_ZONE_2) score = QTR_BONUS_2;
else score = QTR_BONUS_3;
// and match count.. why this?
nm++;
// multiply by query term pop weight that we are
			// closest to
score *= qt->m_popWeight;//qpops[j];
proxScore += score;
}
// skip if too far away from all query terms
if ( proxScore <= 0 ) continue;
// give a boost for multiple hits
// the more terms in range, the bigger the boost...
if ( nm > 1 ) {
//log("nm=%"INT32"",nm);
// hmmm... try to rely on more pages mentioning it!
//score += MULTIPLE_HIT_BOOST * nm;
};
// . save the raw QTR score (aac)
// . this is based on how close the word is to all query
// terms...
wi->m_proxScore = proxScore;
// no longer count closeness to query terms for score,
// just use # times topic is in doc(s) and popularity
//score = 1000;
// set pop if it is -1
if ( wi->m_wpop == -1 ) { // pops[i] == -1 ) {
wi->m_wpop = g_speller.
getPhrasePopularity( wp[i],wids[i], true,lang);
// decrease popularity by half if
// capitalized so Jack does not have
// same pop as "jack"
if ( is_upper_a (wp[i][0]) ) wi->m_wpop >>= 1;
if ( wi->m_wpop == 0 ) wi->m_wpop = 1;
}
// log that
if ( ! debugGigabits ) continue;
SafeBuf msg;
msg.safePrintf("gbits: wordpos=%3"INT32" "
"repeatscore=%3"INT32" "
"wordproxscore=%6.1f word=",
i,
(int32_t)wi->m_repeatScore,
proxScore);
msg.safeMemcpy(wp[i],wlen[i]);
msg.pushChar(0);
logf(LOG_DEBUG,"%s",msg.getBufStart());
}
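	// A hypothetical walk-through of the QTR scoring above: a word
	// whose distance d1 to the nearest occurrence of query term #j
	// lands in the second zone (QTR_ZONE_0 <= d1 < QTR_ZONE_1)
	// earns QTR_BONUS_1, which is scaled by qt->m_popWeight and
	// added into proxScore. A word farther than QTR_ZONE_3 from
	// every query term keeps proxScore at 0 and can never seed a
	// gigabit in the loops below.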
//int32_t mm = 0;
// reset word ptr again
i = 0;
// skip initial punct again
if ( i < nw && words.isPunct(i) ) i++;
int32_t wikiEnd = -1;
//
//
// scan words again and add GIGABITS to term table "localGigabitTable"
//
//
for ( ; i < nw ; i++ ) {
		// shortcut
WordInfo *wi = &wis[i];
// must start with a QTR-scoring word (aac)
if ( wi->m_proxScore <= 0.0 ) continue;
// do not split a phrase like "next generation" to just
// get the gigabit "generation" by itself.
// should also fix "search engine" from being split into
// "search" and "engine"
if ( i <= wikiEnd ) continue;
//if ( strncmp(words.m_words[i],"point",5) == 0 )
// log("hey");
// in a wikipedia title?
int32_t numWiki = g_wiki.getNumWordsInWikiPhrase ( i,&words );
wikiEnd = i + numWiki;
// point to the string of the word
char *ww = wp[i];
int32_t wwlen = wlen[i];
//int32_t ss;
//float ss;
if ( wi->m_isCommonWord ) {
// . skip this and all phrases if we're "to"
// . avoid "to use..." "to do..." "to make..." annoying
// . "to" has score 1, "and" has score 2, "of" is 3,
// . "the" is 4, "this" is 5
if ( wi->m_isCommonWord <= 5 ) continue;
// cannot start with any common word,unless capitalized
if ( is_lower_a(wp[i][0]) ) continue;
}
// if a hyphen is immediately before us, we cannot start
// a phrase... fu-ture, preven-tion
if ( i > 0 && wp[i][-1]=='-' ) continue;
// same for colon
if ( i > 0 && wp[i][-1]==':' ) continue;
// . if a "'s " is before us, we cannot start either
// . "valentine's day cards"
if ( i >= 3 &&
wp[i][-3]=='\'' &&
wp[i][-2 ]=='s' &&
is_wspace_a(wp[i][-1]) ) continue;
// or if our first char is a digit and a "digit," is before us
// because we don't want to break numbers with commas in them
if ( is_digit(wp[i][0]) && i >= 2 && wp[i][-1]==',' &&
is_digit(wp[i][-2]) ) continue;
// set initial popularity
//float gigabitPop = 1.0;
int32_t minPop = 0x7fffffff;
//if ( wi->m_wpop > 0) pop = ((float) wi->m_wpop) / MAXPOP;
//else pop = 1.0 / MAXPOP;
//
//
// set initial score and bonus resuming from above for loop
//
//
//float wordProxSum = 0;//wi->m_proxScore;
float wordProxMax = 0;
float bonus = 0;
uint64_t ph64 = 0;//wids[i]; // hash value
// if first letter is upper case, double the score
//if ( is_upper_a (ww.getWord(i)[0]) ) score <<= 1;
// . loop through all phrases that start with this word
// . up to 6 real words per phrase
		// . 'j' counts our 'words', where a run of punct also
		//   counts as a word
int32_t jend = i + maxWordsPerPhrase * 2; // 12;
int32_t maxjend = jend ;
if ( tg->m_topicRemoveOverlaps ) maxjend += 8;
if ( jend > nw ) jend = nw;
if ( maxjend > nw ) maxjend = nw;
int32_t count = 0;
int32_t nqc = 0; // # common/query words in our phrase
int32_t nhw = 0; // # of "hot words" (contribute to score)
//if ( wlen[i] == 8 && strncmp(wp[i],"Practice",8) == 0 )
// log("hey");
int32_t jWikiEnd = -1;
for ( int32_t j = i ; j < jend ; j++ ) {
// skip if not indexable
if ( ! wids[j] ) continue;
// . do not split a wiki title
//if ( j < wikiEnd-1 ) continue;
// . j starts at i, so we can pick up the wikiphrase
// from i
// . so if "search" is i and is @ 146 and "engine" @
// 148 then jWikiEnd will be 148, and j needs to be
// able to end on that so use wikiEnd-1
if ( j < jWikiEnd - 1) continue;
int32_t njw = g_wiki.getNumWordsInWikiPhrase ( j,&words );
jWikiEnd = j + njw;
// get word info
WordInfo *wj = &wis[j];
// skip if in a repeated fragment
if ( wj->m_repeatScore <= 20 ) continue;
			// no ending in "ing" on lowercase words
if ( wlen[j] > 3 &&
wp[j][wlen[j]-1 ]=='g' &&
wp[j][wlen[j]-2 ]=='n' &&
wp[j][wlen[j]-3]=='i' &&
is_lower_a(wp[j][0]) )
continue;
if (j == i) {
if ( wj->m_isCommonWord || wlen[j] < 3)
bonus -= FWC_PENALTY;
// if word is 4 letters or more and ends in ed, do
// not allow to be its own gigabit
if ( wlen[j] > 3 &&
wp[j][wlen[j]-1 ]=='d' &&
wp[j][wlen[j]-2]=='e' )
continue;
// no more "com" gigabits, please! (aac)
if ( wlen[j] == 3 &&
wp[j][0 ]=='c' &&
wp[j][1 ]=='o' &&
wp[j][2]=='m') continue;
};
// let's generalize even more! do not allow common
// single words as gigabits, with 250+ pop
//if ( pop > 100 && j == i && is_lower(wp[j][0]) )
//continue;
// the above assumes a MAX_POP of 10k (sanity check)
//if ( MAXPOP != 10000 ) { char *xx = NULL; *xx = 0; }
			// are we past the first word in the phrase?
if ( j > i ) {
// advance phrase length
wwlen += wlen[j-1] + wlen[j];
				// . cut phrase short if too much punct between
// the current word, j, and the last one, j-2
// . but allow for abbreviations or initials
// of single letters, like 'harry s. truman'.
// we do not want to break before 's.'
// . because the phrase "s. doesn't stand for
// anything." was unable to form. we only
// got "s." and "doesn't stand for anything."
// as possible gigabit candidates.
//if ( wlen[j-1] > 1 ) {
// if ( wlen[j-1] != 2 ) break;
// if ( wp [j-1][0] != '.' ) break;
// if ( wlen[j-2] > 1 ) break;
//}
// . we now allow most punct since it is
// filtered out above w/ hasPunct variable
// . this a little more than doubles the
// processing overhead going from 1 to 3
// . going from 1 to 2 we see that we take 60ms
// instead of 50ms *when removing overlaps*
// . at 1 we take about 48/45ms, not much
// different when removing overlaps
// . increasing this totally wipes out our
// overlap problem, but it is very expensive,
// so now i just halt after jumping one big
// string of punct below, and filter out
// those gigabits above with hasPunct.
// . i'd really like to NOT have this here
				//   because we get much better gigabits, but
// we need it as a speed saver...
if (wlen[j-1]>tg->m_topicMaxPunctLen) break;
// no phrasing across commas, etc.
/*
if ( wlen[j-1] == 2 ) {
// only allow " " or ": " or ". "
if ( wp[j-1][1]!=' ' ) break;
if ( wp[j-1][0]!=' ' &&
wp[j-1][0]!=':' &&
wp[j-1][0]!='\'' && // beatles'
// allow commas here, but we
// remove any gigabits with commas
// because we just use them to
// cancel out bad gigabits.
wp[j-1][0]!=',' &&
wp[j-1][0]!='.' ) break;
// . TODO: add in sgt. col. so that
// stuff can be in a gigabit
// . only allow ". " if prev word was
// abbreviation.
if ( wp[j-1][0]=='.' &&
j >= 2 &&
wlen[j-2] > 3) break; // != 1
}
*/
// or if we just skipped the delimeter,
// we are not allowed to phrase across that
// if one was provided
if ( delimeter &&words.hasChar(j-1,delimeter))
break;
// make sure we could phrase across last word
//if ( wlen[j-1] > 1 &&
// bits.getPunctuationBits(wp[j-1],wlen[j-1])
// == 0 ) break;
}
//
// accumulate the phrase's hash AND pop
//
ph64 = hash64 ( ph64 , wids[j] );
// set pop if it is -1
if ( wj->m_wpop == -1 ) { // pops[i] == -1 ) {
wj->m_wpop = g_speller.
getPhrasePopularity( wp[j],wids[j], true,lang);
// decrease popularity by half if
// capitalized so Jack does not have
// same pop as "jack"
if ( is_upper_a (wp[j][0]) ) wj->m_wpop >>= 1;
if ( wj->m_wpop == 0 ) wj->m_wpop = 1;
}
// adjust popularity
//gigabitPop = (gigabitPop* wj->m_wpop)/MAXPOP;
			// watch out for overflow
//if ( gigabitPop <= 0 ) gigabitPop = 1.0/MAXPOP;
if ( wj->m_wpop < minPop )
minPop = wj->m_wpop;
// get lowest of scores
//if(scores && scores[j] > mm ) mm = scores[j];
// accumulate wordproxscores
//wordProxSum += wj->m_proxScore;
if ( wj->m_proxScore > wordProxMax )
wordProxMax = wj->m_proxScore;
// keep track of words
count++;
if ( wj->m_isQueryTerm || wj->m_isCommonWord ) {
nqc++; // increment number of query/commoners
}
// do not count 1.0 cuz those are the query terms!
else if ( wj->m_proxScore > 1.0) {
// increment "hot word" counter
nhw++;
};
// keep phrasing until next punct word is delimeter
// or the end
if ( delimeter ) {
// if we end on a punct word, then hash
// our phrase, otherwise, only hash it if
// the next word has the delimeter
if ( j+2<jend &&!words.hasChar(j+1,delimeter))
continue;
}
// otherwise, ensure phrase is not ALL query terms
else {
// if phrase is all commoners & query skip it
if ( nqc == count ) {
// debug
//char saveChar = ww[wwlen];
//ww[wwlen] = '\0';
//log("gbits: "
//"phrase is all QT or CW; skipping"
//" phrase %s", ww);
//ww[wwlen] = saveChar;
continue;
};
}
// . skip if we're common, pair across common words
			// . BUT it is common for a meta tag's content to
			//   end in ".com"
// so we should not count that one as common
if ( wj->m_isCommonWord ) {
// allow for more words only for purposes
// of fixing the ABCD and BCDE overlap bug
// without having to raise jend for all cases
if ( jend < maxjend ) jend++;
continue;
}
// do not stop if - . or @ follows us right b4 alnum
if ( j+1 < nw && is_alnum_a(wp[j+1][1]) ) {
if ( wp[j+1][0]=='-' ) continue;
if ( wp[j+1][0]=='.' ) continue;
if ( wp[j+1][0]=='\'') continue;
if ( wp[j+1][0]=='@' ) continue;
// . do not split phrases between capitalized words
// . this should fix the Costa Rica, Costa Blah bug
// . it may decrease score of Belkin for query
// 'Belkin Omni Cube' but that's ok because if
// Belkin is important it will be used independently.
if ( is_upper_a(wp[j][0]) &&
j + 2 < nw &&
wp[j+1][0]==' ' &&
is_upper_a(wp[j+2][0]) &&
wlen[j+1] == 1 &&
tg->m_maxWordsPerTopic > 1 )
continue;
}
// do not mix caps
if ( is_upper_a(wp[i][0]) != is_upper_a(wp[j][0]) )
continue;
// . do not stop on a single capital letter
// . so we don't stop on "George W->" (george w. bush)
// . i added the " && j > i" so METALINCS can have
// single digit gigabits
if ( wlen[j] == 1 && j > i ) continue;
// . do not split after Mr. or St. or Ms. or Mt. ...
// . fixes 'st. valentines day'
if ( wlen[j] == 2 && is_upper_a(wp[j][0]) &&
wp[j][2]=='.' ) continue;
// sgt. or col.
if ( wlen[j] == 3 && wp[j][3]=='.' ){
if ( to_lower_a(wp[j][0 ])=='s' &&
to_lower_a(wp[j][1 ])=='g' &&
to_lower_a(wp[j][2])=='t' ) continue;
if ( to_lower_a(wp[j][0 ])=='c' &&
to_lower_a(wp[j][1 ])=='o' &&
to_lower_a(wp[j][2])=='l' ) continue;
if ( to_lower_a(wp[j][0 ])=='m' &&
to_lower_a(wp[j][1 ])=='r' &&
to_lower_a(wp[j][2])=='s' ) continue;
}
// . do not split commas in numbers
// . like 1,000,000,000
if ( j >= 2 &&
wp[j][-1 ]==',' &&
is_digit(wp[j][-2]) &&
wp[j][wlen[j]]==',' &&
is_digit(wp[j][wlen[j]+1]))
continue;
/*
if ( pop < 1 ) ;
else if ( pop < 2 ) ss = (score * 90) / 100;
else if ( pop < 5 ) ss = (score * 85) / 100;
else if ( pop < 10 ) ss = (score * 80) / 100;
else if ( pop < 20 ) ss = (score * 75) / 100;
else if ( pop < 30 ) ss = (score * 70) / 100;
else if ( pop < 40 ) ss = (score * 65) / 100;
else if ( pop < 50 ) ss = (score * 60) / 100;
else ss = (score * 40) / 100;
*/
//if ( tt->getScoreFromTermId((int64_t)h) > 0 )
// continue;
// debug msg
//char c = ww[wwlen];
//ww[wwlen]='\0';
//fprintf(stderr,"tid=%"UINT32" score=%"INT32" pop=%"INT32" len=%"INT32" "
// "repeat=%"INT32" term=%s\n",h,ss,pop,wwlen,
// repeatScores[i],ww);
//ww[wwlen]=c;
// include any ending or starting ( or )
if ( i > 0 && ww[-1] == '(' ) {
				// ensure we got a ')' somewhere before adding (
for ( int32_t r = 0 ; r <= wwlen ; r++ )
if ( ww[r]==')' ) {
ww--; wwlen++; break; }
}
if ( i < nw && ww[wwlen] == ')' ) {
// we need a '(' somewhere before adding the )
for ( int32_t r = 0 ; r <= wwlen ; r++ )
if ( ww[r]=='(' ) {
wwlen++; break; }
}
// now remove ('s if begin AND end in them
if ( ww[0] == '(' && ww[wwlen-1] == ')' ) {
ww++; wwlen -= 2; }
// base his score on this
float wordScore = wj->m_proxScore;
// now double score if capitalized, we need more
// proper nouns for topic clustering to work better,
// but it doesn't count if start of a sentence, so
// there must be some alnum word right before it.
if ( is_upper_a(ww[0]) &&
wwlen>=2 &&
j >= 2 && // do not breach!
is_alnum_a(ww[-2]))
wordScore *= 2; // <<= 1; // 1;
// adjust the gigabit score using the new scores array
//if ( scores && mm != NORM_WORD_SCORE )
// ss = (ss * mm) / NORM_WORD_SCORE;
// adjust the gigabit score using the new scores array
//if ( scores && mm != NORM_WORD_SCORE )
// ss = (ss * mm) / NORM_WORD_SCORE;
// only count the highest scoring guy once per page
//int32_t tn = tt->getTermNum((int64_t)h);
//maxScore = ss;
//if ( tn >= 0 ) {
// int32_t sc = tt->getScoreFromTermNum(tn);
// if ( sc > maxScore ) maxScore = sc;
//}
// . add it
// . now store the popularity, too, so we can display
// it for the winning gigabits
//if ( ! tt->addTerm ((int64_t)h,ss,maxScore,false,
// ww,wwlen,tn,NULL,pop) )
// . weight score by pop
// . lets try weighting more popular phrases more!
//ss = score;
// i guess average the > 0 prox scores
//if ( nhw > 0) wordScore /= nhw;
// i think a common word penalty is this bonus?
			// it is accumulated, so we can add it down here
//wordProxSum += bonus;
// penalty if not enough hot words
//if ( nhw < 3 )
// wordScore -= 100;
// accumulate proxScores of each word
			// involved in the gigabit, including the
// FIRST word!
//wordScoreSum += wordScore;
float boost;
if (minPop < POP_ZONE_0) boost = POP_BOOST_0;
else if (minPop < POP_ZONE_1) boost = POP_BOOST_1;
else if (minPop < POP_ZONE_2) boost = POP_BOOST_2;
else if (minPop < POP_ZONE_3) boost = POP_BOOST_3;
else boost = POP_BOOST_4;
// apply the boost
//float popModScore = wordProxSum * boost;
float popModScore = wordProxMax * boost;
if ( popModScore <= 0 ) popModScore = 1;
// average among the words with positive prox scores
//if ( nhw > 0 ) popModScore /= nhw;
// store it
//int32_t ipop = (int32_t)(pop * MAXPOP);
//
// ADD A GIGABIT CANDIDATE
//
Gigabit gc;
gc.m_term = ww;
gc.m_termLen = wwlen;
gc.m_gbscore = popModScore;
gc.m_minPop = minPop;
// how many words in the gigabit?
int32_t ngw = (j - i)/2 + 1;
gc.m_numWords = ngw;
// breach check. go to next gigabit beginning word?
if ( ngw > MAX_GIGABIT_WORDS ) break;
// record each word!
int32_t wcount = 0;
for ( int32_t k = i ; k <= j ; k++ ) {
if ( ! wids[k] ) continue;
gc.m_wordIds[wcount] = wids[k];
wcount++;
if ( wcount >= MAX_GIGABIT_WORDS ) break;
gc.m_wordIds[wcount] = 0LL;
}
if ( ! localGigabitTable->addKey ( &ph64 , &gc ) ) {
log("gbits: No memory to grow table.");
return;
}
// debug it
if ( debugGigabits ) {
SafeBuf msg;
msg.safePrintf("gbits: adding gigabit "
"d=%018"UINT64" "
"termId=%020"UINT64" "
"popModScore=%7.1f "
//"wordProxSum=%7.1f "
"wordProxMax=%7.1f "
"nhw=%2"INT32" "
"minWordPopBoost=%2.1f "
"minWordPop=%5"INT32" "
"term=\"",
reply->m_docId,
ph64,
popModScore,
wordProxMax,
nhw,
boost,
minPop);
msg.safeMemcpy(gc.m_term,gc.m_termLen);
msg.safePrintf("\"");
logf(LOG_DEBUG,"%s",msg.getBufStart());
}
			// stop after indexing a word after a long string of
// punct, this is the overlap bug fix without taking
// a performance hit. hasPunct above will remove it.
if ( j > i && wlen[j-1] > 2 ) break;
}
}
// report error
if ( g_errno )
log("gbits: Had error getting topic candidates from "
"document: %s.",mstrerror(g_errno));
// clear any error
g_errno = 0;
}
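//
// Minimal sketch of the incremental phrase hashing used above (comments
// only, not compiled in; the example words are hypothetical): a
// candidate phrase grows one alnum word at a time and its termId is
// extended in place, so the prefix never needs rehashing:
//
//   uint64_t ph64 = 0;
//   ph64 = hash64 ( ph64 , wids[i] ); // termId for "global"
//   ph64 = hash64 ( ph64 , wids[j] ); // termId for "global warming"
//
// Each intermediate ph64 is the key under which a Gigabit candidate
// lands in localGigabitTable, so "global" and "global warming" are
// scored as independent candidates.
//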
// taken from Weights.cpp's set3() function
void setRepeatScores ( Words *words ,
WordInfo *wis,
HashTableX *repeatTable ) {
int32_t nw = words->getNumWords();
// if no words, nothing to do
if ( nw == 0 ) return;
//char *ptr = repeatTable;
//int32_t numSlots = repeatTableNumSlots;
//int64_t *hashes = (int64_t *)ptr; ptr += numSlots * 8;
//int32_t *vals = (int32_t *)ptr; ptr += numSlots * 4;
int64_t ringWids [ 5 ];
int32_t ringPos [ 5 ];
int32_t ringi = 0;
int32_t count = 0;
int64_t h = 0;
//int32_t numSlots = repeatTable->getNumSlots();
// make the mask
//uint32_t mask = numSlots - 1;
// clear ring of hashes
memset ( ringWids , 0 , 5 * sizeof(int64_t) );
// for sanity check
//int32_t lastStart = -1;
// count how many 5-word sequences we match in a row
int32_t matched = 0;
int32_t matchStart = -1;
// reset
for ( int32_t i = 0 ; i < nw ; i++ )
wis[i].m_repeatScore = 100;
// return until we fix the infinite loop bug
//return;
int64_t *wids = words->getWordIds();
// . hash EVERY 5-word sequence in the document
// . if we get a match look and see what sequences it matches
// . we allow multiple instances of the same hash to be stored in
// the hash table, so keep checking for a matching hash until you
// chain to a 0 hash, indicating the chain ends
// . check each matching hash to see if more than 5 words match
// . get the max words that matched from all of the candidates
// . demote the word and phrase weights based on the total/max
// number of words matching
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if not alnum word
if ( ! wids[i] ) continue;
// reset
//repeatScores[i] = 100;
// add new to the 5 word hash
h ^= wids[i];
// . remove old from 5 word hash before adding new...
// . initial ring wids are 0, so should be benign at startup
h ^= ringWids[ringi];
// add to ring
ringWids[ringi] = wids[i];
// save our position
ringPos[ringi] = i;
// wrap the ring ptr if we need to, that is why we are a ring
if ( ++ringi >= 5 ) ringi = 0;
// this 5-word sequence starts with word # "start"
int32_t start = ringPos[ringi];
// need at least 5 words in the ring buffer to do analysis
if ( ++count < 5 ) continue;
// sanity check
//if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
// look up in the hash table
//int32_t n = h & mask;
// stop at new york times - debug
/*
if ( words->m_words[i][0] == 'A' &&
words->m_words[i][1] == 's' &&
words->m_words[i][2] == 'k' &&
words->m_words[i][3] == 'e' &&
words->m_words[i][4] == 'd' &&
words->m_words[i][5] == ' ' &&
words->m_words[i][6] == 'Q' &&
words->m_words[i][7] == 'u' )
log("hey");
*/
//loop:
// all done if empty
if ( ! repeatTable->isInTable(&h) ) {//! hashes[n] ) {
// add ourselves to the hash table now
//hashes[n] = h;
// this is where the 5-word sequence starts
//vals [n] = matchStart+1;
int32_t val = matchStart+1;
repeatTable->addKey(&h,&val);
			// do not demote any words if fewer than 3 matched
if ( matched < 3 ) { matched = 0; continue; }
// reset
matched = 0;
			// . how much should we demote
// . 10 matching words pretty much means 0 weights
//float demote = 1.0 - ((matched-5)*.10);
//if ( demote >= 1.0 ) continue;
//if ( demote < 0.0 ) demote = 0.0;
// demote the words involved
for ( int32_t j = matchStart ; j < i ; j++ )
wis[j].m_repeatScore = 0;
// get next word
continue;
}
// save start of matching sequence for demote loop
if ( matched == 0 ) matchStart = start;
// inc the match count
matched++;
}
	// if we ended mid-streak, demote the trailing matched run too
if ( matched < 3 ) return;
for ( int32_t j = matchStart ; j < nw ; j++ )
wis[j].m_repeatScore = 0;
}
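//
// Worked example of the rolling 5-word hash above (word ids are
// hypothetical): after word w4 the ring holds w0..w4 and
// h = wid0 ^ wid1 ^ wid2 ^ wid3 ^ wid4. On word w5 the loop XORs in
// wid5 and XORs out ringWids[ringi] (= wid0), leaving the hash of
// w1..w5 without rescanning the window. A span whose consecutive
// 5-word hashes hit repeatTable 3 or more times gets m_repeatScore
// zeroed for every word in the streak, which the scoring loops above
// treat as "skip this word".
//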
///////////////////
//
// FAST FACTS
//
// Sentences containing a gigabit and a lot or all of the query terms.
//
//
///////////////////
static int factCmp ( const void *a, const void *b ) {
Fact *fa = *(Fact **)a;
Fact *fb = *(Fact **)b;
float sa = fa->m_maxGigabitModScore * fa->m_queryScore;
float sb = fb->m_maxGigabitModScore * fb->m_queryScore;
// punish if more than one gigabit! just try to get all
	// query terms and ONE gigabit to keep things more targeted.
sa /= fa->m_numGigabits;
sb /= fb->m_numGigabits;
if ( sa < sb ) return 1; // swap!
if ( sa > sb ) return -1;
if ( fa->m_factLen > fb->m_factLen ) return 1; // swap
if ( fa->m_factLen < fb->m_factLen ) return -1;
// then based on docid
if ( fa->m_docId > fb->m_docId ) return 1; // swap
if ( fa->m_docId < fb->m_docId ) return -1;
// if same docid, base on doc position
if ( fa->m_fact > fb->m_fact ) return 1; // swap
if ( fa->m_fact < fb->m_fact ) return -1;
return 0;
}
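//
// Example of the resulting order (numbers are made up):
//
//   fa: m_maxGigabitModScore=200, m_queryScore=3, m_numGigabits=1 -> 600
//   fb: m_maxGigabitModScore=500, m_queryScore=2, m_numGigabits=2 -> 500
//
// factCmp() returns -1 for (fa,fb), so fa sorts ahead of fb; fact
// length, docid and in-doc position only break exact score ties.
//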
// . aka NUGGABITS
// . now make the fast facts from the gigabits and the samples.
// . these are sentences containing the query and a gigabit.
// . sets m_factBuf
bool Msg40::computeFastFacts ( ) {
// skip for now
//return true;
bool debugGigabits = m_si->m_debugGigabits;
//
// hash gigabits by first wordid and # words, and phrase hash
//
HashTableX gbitTable;
char gbuf[30000];
if ( ! gbitTable.set(8,sizeof(Gigabit *),1024,gbuf,30000,
false,0,"gbtbl") )
return false;
int32_t numGigabits = m_gigabitBuf.length()/sizeof(Gigabit);
Gigabit *gigabits = (Gigabit *)m_gigabitBuf.getBufStart();
for ( int32_t i = 0 ; i < numGigabits ; i++ ) {
// get the ith gigabit
Gigabit *gi = &gigabits[i];
// parse into words
Words ww;
ww.setx ( gi->m_term , gi->m_termLen , 0 );
int64_t *wids = ww.getWordIds();
		// FIXME here
//if ( ! wids[0] ) { char *xx=NULL;*xx=0; }
if ( ! wids[0] ) {
log("doc: wids[0] is null");
return true;
}
// . hash first word
// . so gigabit has # words in it so we can do a slower
// compare function to make sure entire gigabit is matched
// in the code below
if ( ! gbitTable.addKey ( &wids[0] , &gi ) ) return false;
}
//
// hash the query terms we need to match into table as well
//
Query *q = &m_si->m_q;
HashTableX queryTable;
char qbuf[10000];
if ( ! queryTable.set(8,sizeof(QueryTerm *),512,qbuf,
10000,false,0,"qrttbl") )
return false;
for ( int32_t i = 0 ; i < q->m_numTerms ; i++ ) {
		// shortcut
QueryTerm *qt = &q->m_qterms[i];
// skip if no weight!
if ( qt->m_popWeight <= 0.0 ) continue;
// use RAW termid
if ( ! queryTable.addKey ( &qt->m_rawTermId, &qt ) )
return false;
}
//
// store Facts (sentences) into this safebuf (nuggets)(nuggabits)
//
char ftmp[100000];
SafeBuf factBuf(ftmp,100000);
// scan docs in search results
for ( int32_t i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
// skip if not visible
if ( m_msg3a.m_clusterLevels[i] != CR_OK ) continue;
// get it
Msg20* thisMsg20 = m_msg20[i];
// must be there! wtf?
Msg20Reply *reply = thisMsg20->getReply();
		// get sample. sample uses \0 as delimiters between excerpts
char *p = reply-> ptr_gigabitSample;
char *pend = p + reply->size_gigabitSample; // includes \0
// skip if empty
if ( ! p ) continue;
// scan excerpts in sample, \0 separated
while ( p < pend ) {
//
// find the terminating " * " that delineates sections
// and sentences in this excerpt.
//
// NOW we delineate sentences and headings with |'s
//
char *pstart = p;
for ( ; *p ; p++ ) {
if ( p[0] == '|' )
break;
}
// mark that
char *pend = p;
// skip over delimeter if it was there so
// pstart points to the next section on the next
// iteration
if ( *p ) p += 1;
// otherwise, skip the \0
else p++;
// debug
//log("docId=%"INT64" EXCERPT=%s",docId,p);
// . add facts that have the query and a gigabit
// . set Fact::m_score based on gigabit it contains
// . limit to complete sentences, surrounded by *'s
// i guess...
if ( ! addFacts ( &queryTable,
&gbitTable,
pstart,
pend,
debugGigabits,
reply,
&factBuf ) )
return false;
}
}
//
// now sort the Facts by scores
//
int32_t numFacts = factBuf.getLength() / sizeof(Fact);
Fact *facts = (Fact *)factBuf.getBufStart();
SafeBuf ptrBuf;
if ( ! ptrBuf.reserve( numFacts * sizeof(Fact *) ) ) return false;
for ( int32_t i = 0 ; i < numFacts ; i++ ) {
Fact *fi = &facts[i];
ptrBuf.pushPtr ( fi );
}
Fact **ptrs = (Fact **)ptrBuf.getBufStart();
gbqsort ( ptrs , numFacts , sizeof(Fact *) , factCmp , 0 );
//
// now dedup and set m_gigabitModScore to 0 if a dup fact!
//
int32_t need = 0;
for ( int32_t i = 0 ; i < numFacts ; i++ ) {
// get it
Fact *fi = &facts[i];
char *v1 = fi->m_dedupVector;
int32_t vsize = SAMPLE_VECTOR_SIZE;
// compare its dedup vector to the facts before us
int32_t j; for ( j = 0 ; j < i ; j++ ) {
// get it
Fact *fj = &facts[j];
char *v2 = fj->m_dedupVector;
char ss = g_clusterdb.getSampleSimilarity(v1,v2,vsize);
if ( ss < 80 ) continue;
// damn, we're a dup sentence...
fi->m_gigabitModScore = 0.0;
fi->m_queryScore = 0.0;
break;
}
// otherwise we passed
if ( j >= i ) need += sizeof(Fact);
}
//
// now transcribe the non-dups over into permanent buf
//
if ( ! m_factBuf.reserve ( need ) ) return false;
for ( int32_t i = 0 ; i < numFacts ; i++ ) {
// get it
Fact *fi = &facts[i];
if ( fi->m_gigabitModScore == 0.0 ) continue;
// transcribe
m_factBuf.safeMemcpy ( fi , sizeof(Fact) );
}
return true;
}
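//
// Note on the dedup pass above: getSampleSimilarity() yields a
// percent-style similarity between two sorted wordid vectors, and any
// later fact scoring ss >= 80 against an earlier one has its scores
// zeroed so the transcription loop drops it. That 80 is hard-coded
// here, unlike the tunable tg->m_dedupSamplePercent used when deduping
// gigabit samples.
//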
bool Msg40::addFacts ( HashTableX *queryTable,
HashTableX *gbitTable ,
char *pstart,
char *pend,
bool debugGigabits ,
Msg20Reply *reply,
SafeBuf *factBuf ) {
// parse into words. 0 niceness.
Words ww;
if ( ! ww.set11 ( pstart,pend , 0 ) ) return false;
int32_t nw = ww.getNumWords();
int64_t *wids = ww.getWordIds();
// initialize the sentence/fact we might add to factBuf if score>0
Fact fact;
fact.m_queryScore = 0;
fact.m_gigabitModScore = 0;
fact.m_numGigabits = 0;
fact.m_printed = 0;
fact.m_numQTerms = 0;
fact.m_fact = pstart;
fact.m_factLen = pend - pstart;
fact.m_reply = reply;
fact.m_maxGigabitModScore = 0;
// . sentences end in periods.
	// . all sections delimited by **'s
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip punct words in the sentence/section
if ( ! wids[i] ) continue;
// does it match a query term?
QueryTerm **qtp = (QueryTerm **)queryTable->getValue(&wids[i]);
// yes?
if ( qtp ) {
// get the query term it matches
QueryTerm *qt = *qtp;
// add points for matching it!
fact.m_queryScore += qt->m_popWeight;
fact.m_numQTerms++;
// no need to add gigabit then
continue;
}
// match a gigabit?
Gigabit **gbp = (Gigabit **)gbitTable->getValue(&wids[i]);
if ( gbp ) {
// avoid overflow of ptrs!
if ( fact.m_numGigabits >= MAX_GIGABIT_PTRS )
continue;
// get the gigabit it might match
Gigabit *gb = *gbp;
// see if matches all words in the gigabit
int32_t x = i + 2;
int32_t k;
for ( k = 1 ; k < gb->m_numWords ; k++ ) {
// get next word id in sent
for ( ; x < nw && ! wids[x] ; x++ );
// all done? no match then
if ( x >= nw ) break;
// ok check it
if ( gb->m_wordIds[k] != wids[x]) break;
// advance x too
x++;
}
// it does NOT match the full gigabit! next word then.
if ( k < gb->m_numWords ) goto nomatch;
// . ok, it is a match
// . multiply gigabit score by # pages it is on
// to get the modified gigabit score
float gbModScore = gb->m_gbscore * gb->m_numPages;
fact.m_gigabitModScore += gbModScore;
if ( gbModScore > fact.m_maxGigabitModScore )
fact.m_maxGigabitModScore = gbModScore;
fact.m_gigabitPtrs[fact.m_numGigabits] = gb;
fact.m_numGigabits++;
continue;
}
nomatch:
// otherwise, it does not match a gigabit or query word
continue;
}
// ok, skip if missing either a gigabit or query term
if ( fact.m_gigabitModScore == 0 ) return true;
if ( fact.m_queryScore == 0 ) return true;
//
// make a vector out of words for deduping it!
//
HashTableX simTable;
char sbuf[5000];
simTable.set(4,0,256,sbuf,5000,false,0,"simtab3");
char vtmp[5000];
SafeBuf vbuf(vtmp,5000);
for ( int32_t j = 0 ; j < nw ; j++ ) {
// make it this
uint32_t widu;
widu = (uint64_t)(wids[j]);
		// don't allow this! zero is a vector terminator
if ( widu == 0 ) widu = 1;
// skip if already added to vector
if ( simTable.isInTable(&widu) ) continue;
// store that as a vector component
if ( ! vbuf.pushLong(widu) ) return false;
// make sure we do not dedup
if ( ! simTable.addKey(&widu) ) return false;
}
// sort 32-bit word ids from excerpt. niceness = 0
vbuf.sortLongs(0);
// make sure under (128-4) bytes...
vbuf.truncLen(((int32_t)SAMPLE_VECTOR_SIZE) - 4);
// make last int32_t a 0 so Clusterdb::getSimilarity() likes it
vbuf.pushLong(0);
// now store it in the Fact struct
memcpy ( fact.m_dedupVector , vbuf.getBufStart(), vbuf.length() );
// otherwise, add it
if ( ! factBuf->safeMemcpy ( &fact , sizeof(Fact) ) ) return false;
return true;
}
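//
// Illustrative match walk for the gigabit check above (the wordids are
// hypothetical): gbitTable keys each gigabit by its FIRST wordid, so
// for gb->m_wordIds = { W("global"), W("warming") }, hitting
// W("global") at word #i starts the inner loop at x = i + 2, which then
// requires the next alnum wid to equal W("warming"); any mismatch jumps
// to nomatch: and the sentence earns no m_gigabitModScore from this
// gigabit.
//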
// . printSearchResult into "sb"
bool Msg40::printSearchResult9 ( int32_t ix , int32_t *numPrintedSoFar ,
Msg20Reply *mr ) {
// . we stream results right onto the socket
// . useful for thousands of results... and saving mem
if ( ! m_si || ! m_si->m_streamResults ) { char *xx=NULL;*xx=0; }
// get state0
State0 *st = (State0 *)m_state;
//SafeBuf *sb = &st->m_sb;
// clear it since we are streaming
//sb->reset();
Msg40 *msg40 = &st->m_msg40;
// then print each result
// don't display more than docsWanted results
if ( m_numPrinted >= msg40->getDocsWanted() ) {
// i guess we can print "Next 10" link
m_moreToCome = true;
// hide if above limit
log("msg40: hiding above docsWanted #%"INT32" (%"UINT32")(d=%"INT64")",
m_printi,mr->m_contentHash32,mr->m_docId);
// do not exceed what the user asked for
return true;
}
// prints in xml or html
if ( m_si->m_format == FORMAT_CSV ) {
printJsonItemInCSV ( st , ix );
//log("print: printing #%"INT32" csv",(int32_t)ix);
}
// print that out into st->m_sb safebuf
else if ( ! printResult ( st , ix , numPrintedSoFar ) ) {
// oom?
if ( ! g_errno ) g_errno = EBADENGINEER;
log("query: had error: %s",mstrerror(g_errno));
m_hadPrintError = true;
}
log("msg40: printing #%"INT32" (%"UINT32")(d=%"INT64")",
m_printi,mr->m_contentHash32,mr->m_docId);
// count it
m_numPrinted++;
return true;
}
bool printHttpMime ( State0 *st ) {
SearchInput *si = &st->m_si;
// grab the query
//Msg40 *msg40 = &(st->m_msg40);
//char *q = msg40->getQuery();
//int32_t qlen = msg40->getQueryLen();
//char local[ 128000 ];
//SafeBuf sb(local, 128000);
SafeBuf *sb = &st->m_sb;
// reserve 1.5MB now!
if ( ! sb->reserve(1500000 ,"pgresbuf" ) ) // 128000) )
return true;
// just in case it is empty, make it null terminated
sb->nullTerm();
char *ct = "text/csv";
if ( si->m_format == FORMAT_JSON )
ct = "application/json";
if ( si->m_format == FORMAT_XML )
ct = "text/xml";
if ( si->m_format == FORMAT_HTML )
ct = "text/html";
//if ( si->m_format == FORMAT_TEXT )
// ct = "text/plain";
if ( si->m_format == FORMAT_CSV )
ct = "text/csv";
// . if we haven't yet sent an http mime back to the user
// then do so here, the content-length will not be in there
// because we might have to call for more spiderdb data
HttpMime mime;
	mime.makeMime ( -1, // total content-length is unknown!
0 , // do not cache (cacheTime)
0 , // lastModified
0 , // offset
-1 , // bytesToSend
NULL , // ext
false, // POSTReply
ct, // "text/csv", // contenttype
"utf-8" , // charset
-1 , // httpstatus
NULL ); //cookie
sb->safeMemcpy(mime.getMime(),mime.getMimeLen() );
return true;
}
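//
// Note: because results are streamed, makeMime() is handed -1 for the
// total content length, so the reply goes out without a Content-Length
// header; the Content-Type is picked off si->m_format (for example,
// application/json for FORMAT_JSON) with a utf-8 charset.
//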
/////////////////
//
// CSV LOGIC from PageResults.cpp
//
/////////////////
// return -1 if a should sort before b
static int csvPtrCmp ( const void *a, const void *b ) {
//JsonItem *ja = (JsonItem **)a;
//JsonItem *jb = (JsonItem **)b;
char *pa = *(char **)a;
char *pb = *(char **)b;
if ( strcmp(pa,"type") == 0 ) return -1;
if ( strcmp(pb,"type") == 0 ) return 1;
// force title on top
if ( strcmp(pa,"product.title") == 0 ) return -1;
if ( strcmp(pb,"product.title") == 0 ) return 1;
if ( strcmp(pa,"title") == 0 ) return -1;
if ( strcmp(pb,"title") == 0 ) return 1;
// otherwise string compare
int val = strcmp(pa,pb);
return val;
}
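//
// Example ordering (field names are hypothetical): given the column
// names { "url", "title", "type", "price" }, csvPtrCmp() yields
//
//   type, title, price, url
//
// i.e. "type" first, then any title field, then the rest by strcmp().
//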
#include "Json.h"
//
// print header row in csv
//
bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
//Msg40 *msg40 = &st->m_msg40;
//int32_t numResults = msg40->getNumResults();
char tmp1[1024];
SafeBuf tmpBuf (tmp1 , 1024);
char tmp2[1024];
SafeBuf nameBuf (tmp2, 1024);
char nbuf[27000];
HashTableX nameTable;
if ( ! nameTable.set ( 8,4,2048,nbuf,27000,false,0,"ntbuf") )
return false;
int32_t niceness = 0;
// . scan every fucking json item in the search results.
// . we still need to deal with the case when there are so many
// search results we have to dump each msg20 reply to disk in
// order. then we'll have to update this code to scan that file.
for ( int32_t i = 0 ; i < m_needFirstReplies ; i++ ) {
Msg20 *m20 = getCompletedSummary(i);
if ( ! m20 ) break;
if ( m20->m_errno ) continue;
if ( ! m20->m_r ) { char *xx=NULL;*xx=0; }
Msg20Reply *mr = m20->m_r;
// get content
char *json = mr->ptr_content;
// how can it be empty?
if ( ! json ) continue;
// parse it up
Json jp;
jp.parseJsonStringIntoJsonItems ( json , niceness );
// scan each json item
for ( JsonItem *ji = jp.getFirstItem(); ji ; ji = ji->m_next ){
// skip if not number or string
if ( ji->m_type != JT_NUMBER &&
ji->m_type != JT_STRING )
continue;
// if in an array, do not print! csv is not
// good for arrays... like "media":[....] . that
// one might be ok, but if the elements in the
// array are not simple types, like, if they are
// unflat json objects then it is not well suited
// for csv.
if ( ji->isInArray() )
continue;
// reset length of buf to 0
tmpBuf.reset();
			// . get the compound name of the item into tmpBuf
// . returns false with g_errno set on error
if ( ! ji->getCompoundName ( tmpBuf ) )
return false;
// skip the "html" column, strip that out now
if ( strcmp(tmpBuf.getBufStart(),"html") == 0 )
continue;
// is it new?
int64_t h64 = hash64n ( tmpBuf.getBufStart() );
if ( nameTable.isInTable ( &h64 ) ) continue;
// record offset of the name for our hash table
int32_t nameBufOffset = nameBuf.length();
// store the name in our name buffer
if ( ! nameBuf.safeStrcpy ( tmpBuf.getBufStart() ) )
return false;
if ( ! nameBuf.pushChar ( '\0' ) )
return false;
// it's new. add it
if ( ! nameTable.addKey ( &h64 , &nameBufOffset ) )
return false;
}
}
// . make array of ptrs to the names so we can sort them
// . try to always put title first regardless
char *ptrs [ 1024 ];
int32_t numPtrs = 0;
for ( int32_t i = 0 ; i < nameTable.m_numSlots ; i++ ) {
if ( ! nameTable.m_flags[i] ) continue;
int32_t off = *(int32_t *)nameTable.getValueFromSlot(i);
char *p = nameBuf.getBufStart() + off;
ptrs[numPtrs++] = p;
if ( numPtrs >= 1024 ) break;
}
	// sort them; element size must be pointer width, not 4, on 64-bit
	qsort ( ptrs , numPtrs , sizeof(char *) , csvPtrCmp );
// set up table to map field name to column for printing the json items
HashTableX *columnTable = &m_columnTable;
if ( ! columnTable->set ( 8,4, numPtrs * 4,NULL,0,false,0,"coltbl" ) )
return false;
// now print them out as the header row
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
if ( i > 0 && ! sb->pushChar(',') ) return false;
if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false;
// record the hash of each one for printing out further json
// objects in the same order so columns are aligned!
int64_t h64 = hash64n ( ptrs[i] );
if ( ! columnTable->addKey ( &h64 , &i ) )
return false;
}
m_numCSVColumns = numPtrs;
if ( ! sb->pushChar('\n') )
return false;
if ( ! sb->nullTerm() )
return false;
return true;
}
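//
// Sketch of the end-to-end effect (documents are hypothetical): if
// result #0 is {"title":"A","price":1.5} and result #1 is
// {"title":"B","brand":"x"}, the scan unions the compound names,
// csvPtrCmp() forces "title" to the front, and the header row printed
// into sb is:
//
//   title,brand,price
//
// m_columnTable then maps hash64n("title") -> 0, hash64n("brand") -> 1
// and hash64n("price") -> 2 so printJsonItemInCSV() keeps every row's
// cells aligned with this header.
//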
// returns false and sets g_errno on error
bool Msg40::printJsonItemInCSV ( State0 *st , int32_t ix ) {
int32_t niceness = 0;
//
// get the json from the search result
//
Msg20 *m20 = getCompletedSummary(ix);
if ( ! m20 ) return false;
if ( m20->m_errno ) return false;
if ( ! m20->m_r ) { char *xx=NULL;*xx=0; }
Msg20Reply *mr = m20->m_r;
// get content
char *json = mr->ptr_content;
// how can it be empty?
if ( ! json ) { char *xx=NULL;*xx=0; }
// parse the json
Json jp;
jp.parseJsonStringIntoJsonItems ( json , niceness );
HashTableX *columnTable = &m_columnTable;
int32_t numCSVColumns = m_numCSVColumns;
//SearchInput *si = m_si;
SafeBuf *sb = &st->m_sb;
// make buffer space that we need
char ttt[1024];
SafeBuf ptrBuf(ttt,1024);
int32_t maxCols = numCSVColumns;
	// allow for additional columns
maxCols += 100;
int32_t need = maxCols * sizeof(JsonItem *);
if ( ! ptrBuf.reserve ( need ) ) return false;
JsonItem **ptrs = (JsonItem **)ptrBuf.getBufStart();
// reset json item ptrs for csv columns. all to NULL
memset ( ptrs , 0 , need );
char tmp1[1024];
SafeBuf tmpBuf (tmp1 , 1024);
JsonItem *ji;
///////
//
// print json item in csv
//
///////
for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) {
// skip if not number or string
if ( ji->m_type != JT_NUMBER &&
ji->m_type != JT_STRING )
continue;
// skip if not well suited for csv (see above comment)
if ( ji->isInArray() ) continue;
		// . get the compound name of the item into tmpBuf
		// . returns false with g_errno set on error
if ( ! ji->getCompoundName ( tmpBuf ) )
return false;
// is it new?
int64_t h64 = hash64n ( tmpBuf.getBufStart() );
// ignore the "html" column
if ( strcmp(tmpBuf.getBufStart(),"html") == 0 ) continue;
int32_t slot = columnTable->getSlot ( &h64 ) ;
		// get col #. it should normally already be in the table
		// from the header pass
		int32_t column = -1;
		if ( slot >= 0 )
			column =*(int32_t *)columnTable->getValueFromSlot ( slot);
		// not in the header? append a new column for it. columns
		// are zero-based, so the next free index is numCSVColumns
		// itself; the old "+ 1" skipped a slot that the print
		// loop below could never reach.
		if ( column == -1 ) {
			// add a new column...
			int32_t newColnum = numCSVColumns;
			// silently drop it if we already have too many cols
			if ( newColnum >= maxCols ) continue;
			if ( ! columnTable->addKey ( &h64 , &newColnum ) )
				return false;
			column = newColnum;
			numCSVColumns++;
			// keep member in sync so later rows print this col
			m_numCSVColumns = numCSVColumns;
		}
// set ptr to it for printing when done parsing every field
// for this json item
ptrs[column] = ji;
}
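	// every printable field is now staged in ptrs[] by column
	// number; a NULL slot below just yields an empty cell so the
	// row stays aligned with the header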
// now print out what we got
for ( int32_t i = 0 ; i < numCSVColumns ; i++ ) {
		// comma delimited
		if ( i > 0 ) sb->pushChar(',');
// get it
ji = ptrs[i];
// skip if none
if ( ! ji ) continue;
// skip "html" field... too spammy for csv and > 32k causes
// libreoffice calc to truncate it and break its parsing
if ( ji->m_name &&
//! ji->m_parent &&
strcmp(ji->m_name,"html")==0)
continue;
//
// get value and print otherwise
//
		if ( ji->m_type == JT_NUMBER ) {
			// print numbers without double quotes. if the
			// double is exactly its integer value print it
			// as an integer. (the old code scaled both sides
			// by 1e7 first, which served no clear purpose,
			// so compare directly.)
			if ( ji->m_valueDouble == (double)ji->m_valueLong )
				sb->safePrintf("%"INT32"",ji->m_valueLong);
			else
				sb->safePrintf("%f",ji->m_valueDouble);
			continue;
		}
// print the value
sb->pushChar('\"');
// get the json item to print out
int32_t vlen = ji->getValueLen();
		// truncate overly long values
		char *truncStr = NULL;
		if ( vlen > 32000 ) {
			vlen = 32000;
			truncStr = " ... value truncated because "
				"Excel cannot handle it. Download the "
				"JSON to get untruncated data.";
		}
// print it out
sb->csvEncode ( ji->getValue() , vlen );
// print truncate msg?
if ( truncStr ) sb->safeStrcpy ( truncStr );
		// close the quoted csv field
sb->pushChar('\"');
}
sb->pushChar('\n');
sb->nullTerm();
return true;
}
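// . string facet values are stored in the index as hashes, so to
//   display them we must fetch the actual text from some docid that
//   had that value. a small fixed pool of Msg20s does those fetches.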
// this is a safebuf of msg20s for doing facet string lookups
Msg20 *Msg40::getUnusedMsg20 ( ) {
// make a safebuf of 50 of them if we haven't yet
if ( m_unusedBuf.getCapacity() <= 0 ) {
if ( ! m_unusedBuf.reserve ( (int32_t)MAX2 * sizeof(Msg20) ) ) {
return NULL;
}
Msg20 *ma = (Msg20 *)m_unusedBuf.getBufStart();
for ( int32_t i = 0 ; i < (int32_t)MAX2 ; i++ ) {
ma[i].constructor();
ma[i].m_owningParent = (void *)this;
ma[i].m_constructedId = 3;
// if we don't update length then Msg40::resetBuf2()
// will fail to call Msg20::destructor on them
m_unusedBuf.m_length += sizeof(Msg20);
}
}
Msg20 *ma = (Msg20 *)m_unusedBuf.getBufStart();
for ( int32_t i = 0 ; i < (int32_t)MAX2 ; i++ ) {
// m_inProgress is set to false right before it
// calls Msg20::m_callback which is gotSummaryWrapper()
// so we should be ok with this
if ( ma[i].m_inProgress ) continue;
return &ma[i];
}
	// should be impossible: lookupFacets2() caps outstanding
	// lookups at MAX2, so a free Msg20 must exist. crash loudly.
	char *xx=NULL;*xx=0;
return NULL;
}
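// Msg20 callback trampoline. m_hack smuggles the owning Msg40 through
// since the Msg20Request's state slot already holds the Msg20 itself.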
static bool gotFacetTextWrapper ( void *state ) {
Msg20 *m20 = (Msg20 *)state;
Msg40 *THIS = (Msg40 *)m20->m_hack;
THIS->gotFacetText(m20);
return true;
}
void Msg40::gotFacetText ( Msg20 *msg20 ) {
m_numMsg20sIn++;
//log("msg40: numin=%"INT32"",m_numMsg20sIn);
	if ( ! msg20->m_r ) {
		log("msg40: msg20 reply is NULL");
		// still try to launch more msg20s so the lookup
		// pipeline does not stall on one bad reply
		lookupFacets();
		return;
	}
char *buf = msg20->m_r->ptr_facetBuf;
// null as well?
if ( ! buf ) {
log("msg40: ptr_facetBuf is NULL");
// try to launch more msg20s
lookupFacets();
return;
}
	char *p = buf;
	// skip query term string
	p += gbstrlen(p) + 1;
	// then <valHash>,<text>
	FacetValHash_t fvh = atoll(p);
	char *text = strchr ( p , ',' );
	// bail if malformed, otherwise safeStrcpy(NULL) below crashes
	if ( ! text ) {
		log("msg40: facet buf missing text");
		lookupFacets();
		return;
	}
	// skip comma. text could be truncated/ellipsis-sized
	text++;
	int32_t offset = m_facetTextBuf.length();
	m_facetTextBuf.safeStrcpy ( text );
	m_facetTextBuf.pushChar('\0');
// initialize this if it needs it
if ( m_facetTextTable.m_ks == 0 )
m_facetTextTable.set(sizeof(FacetValHash_t),4,
64,NULL,0,false,0,"fctxtbl");
// store in buffer
m_facetTextTable.addKey ( &fvh , &offset );
// try to launch more msg20s
if ( ! lookupFacets() ) return;
}
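// . drives the facet text lookup loop. re-entered from gotFacetText()
//   each time a reply arrives, until the scan in lookupFacets2() has
//   finished and every outstanding reply is in.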
// return false if blocked, true otherwise
bool Msg40::lookupFacets ( ) {
if ( m_doneWithLookup ) return true;
if ( m_firstTime ) {
m_firstTime = false;
m_numMsg20sOut = 0;
m_numMsg20sIn = 0;
m_j = 0;
m_i = 0;
}
lookupFacets2();
// if not done return false
if ( m_numMsg20sOut > m_numMsg20sIn ) return false;
m_doneWithLookup = true;
// did nothing? return true so control resumes from where
// lookupFacets() was called
if ( m_numMsg20sOut == 0 ) return true;
// hack: dec since gotSummaryWrapper incs this
m_numReplies--;
	// . all the replies are in at this point, so call the callback
	// . pretend we just got another summary
	gotSummaryWrapper ( this );
return true;
}
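// . launch a Msg20 for every string facet value that still needs its
//   display text. m_i/m_j are member cursors so the scan resumes
//   exactly where it left off after hitting the MAX2 cap on
//   outstanding requests and returning to wait for replies.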
void Msg40::lookupFacets2 ( ) {
// scan each query term
for ( ; m_i < m_si->m_q.getNumTerms() ; m_i++ ) {
QueryTerm *qt = &m_si->m_q.m_qterms[m_i];
// skip if not STRING facet. we don't need to lookup
// numeric facets because we already have the # for compiling
// and presenting on the search results page.
if ( qt->m_fieldCode != FIELD_GBFACETSTR ) //&&
//qt->m_fieldCode != FIELD_GBFACETINT &&
//qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
HashTableX *fht = &qt->m_facetHashTable;
// scan every value this facet has
for ( ; m_j < fht->getNumSlots() ; m_j++ ) {
// skip empty slots
if ( ! fht->m_flags[m_j] ) continue;
			// get hash of the facet value. cast through
			// FacetValHash_t, not int32_t, since the hash
			// is 64-bit now (see note in printFacetsForTable)
			FacetValHash_t fvh =
			      *(FacetValHash_t *)fht->getKeyFromSlot(m_j);
//int32_t count = *(int32_t *)fht->getValFromSlot(j);
// get the docid as well
FacetEntry *fe =(FacetEntry *)fht->getValFromSlot(m_j);
			// how many docids in the results had this value?
//int32_t count = fe->m_count;
// one of the docids that had it
int64_t docId = fe->m_docId;
			// MAX2 (50) or more already outstanding?
if ( m_numMsg20sOut - m_numMsg20sIn >= MAX2 )
// wait for some to come back
return;
// lookup docid that has this to get text
Msg20 *msg20 = getUnusedMsg20();
			// NULL only if reserving the pool failed (oom)
if ( ! msg20 ) return;
// make the request
Msg20Request req;
req.m_docId = docId;
			// supply the query term so we know what to return.
			// it's either an xpath facet, a json/xml field facet
			// or a meta tag facet. "tmp" is a local but that
			// should be safe: Msg20::getSummary() serializes the
			// request into its own buffer before returning.
			SafeBuf tmp;
			tmp.safeMemcpy ( qt->m_term , qt->m_termLen );
			tmp.nullTerm();
			req.ptr_qbuf  = tmp.getBufStart();
			req.size_qbuf = tmp.length() + 1; // include \0
req.m_justGetFacets = true;
// need to supply the hash of the facet value otherwise
// if a doc has multiple values for a facet it always
// returns the first one. so tell it we want this one.
req.m_facetValHash = fvh;
msg20->m_hack = this;//(int32_t)this;
req.m_state = msg20;
req.m_callback = gotFacetTextWrapper;
// TODO: fix this
req.m_collnum = m_si->m_firstCollnum;
// get it
if ( ! msg20->getSummary ( &req ) ) {
m_numMsg20sOut++;
//log("msg40: numout=%"INT32"",m_numMsg20sOut);
continue;
}
			// getSummary() returning true here means it did
			// not block, which for this path means an error
			log("facet: error getting text: %s",
			    mstrerror(g_errno));
}
// done! reset scan of inner loop
m_j = 0;
}
}
// defined in the new PageResults.cpp; swaps or adds a cgi parm in the url
bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) ;
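// . print one facet block per string/int/float facet query term, in
//   whatever format (html, xml, json) the search input requested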
bool Msg40::printFacetTables ( SafeBuf *sb ) {
char format = m_si->m_format;
int32_t saved = sb->length();
for ( int32_t i = 0 ; i < m_si->m_q.getNumTerms() ; i++ ) {
// only for html for now i guess
//if ( m_si->m_format != FORMAT_HTML ) break;
QueryTerm *qt = &m_si->m_q.m_qterms[i];
// skip if not facet
if ( qt->m_fieldCode != FIELD_GBFACETSTR &&
qt->m_fieldCode != FIELD_GBFACETINT &&
qt->m_fieldCode != FIELD_GBFACETFLOAT )
continue;
		// if it had facet ranges, print them out
		printFacetsForTable ( sb , qt );
}
// if json, remove ending ,\n and make it just \n
if ( format == FORMAT_JSON && sb->length() != saved ) {
// remove ,\n
sb->m_length -= 2;
// make just \n
sb->pushChar('\n');
//sb->safePrintf("],\n");
// search results will follow so put a comma here if not
// streaming result. if we are streaming results we print
// the facets after the results so we can take advantage
// of the msg20 summary lookups we already did to get the
// facet text.
if ( ! m_si->m_streamResults )
sb->safePrintf(",\n");
}
return true;
}
HashTableX *g_fht = NULL;
QueryTerm *g_qt = NULL;
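// plain qsort comparators get no context pointer, hence these globals.
// this assumes only one facet table is sorted at a time, which seems
// safe since result printing is single-threaded.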
// sort facets by document counts before displaying
static int feCmp ( const void *a1, const void *b1 ) {
int32_t a = *(int32_t *)a1;
int32_t b = *(int32_t *)b1;
FacetEntry *fe1 = (FacetEntry *)g_fht->getValFromSlot(a);
FacetEntry *fe2 = (FacetEntry *)g_fht->getValFromSlot(b);
if ( fe2->m_count > fe1->m_count ) return 1;
if ( fe2->m_count < fe1->m_count ) return -1;
int32_t *k1 = (int32_t *)g_fht->getKeyFromSlot(a);
int32_t *k2 = (int32_t *)g_fht->getKeyFromSlot(b);
if ( g_qt->m_fieldCode == FIELD_GBFACETFLOAT )
return (int)( *(float *)k2 - *(float *)k1 );
// otherwise an int
return ( *k2 - *k1 );
}
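// . print every (value, doc count) entry of one facet term
// . entries are sorted by descending doc count, then each value's
//   display text is resolved: ints/floats are printed directly,
//   range matches get an "[A-B)" label, and string facets are looked
//   up by hash in m_facetTextTable, which gotFacetText() populated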
bool Msg40::printFacetsForTable ( SafeBuf *sb , QueryTerm *qt ) {
//QueryWord *qw = qt->m_qword;
//if ( qw->m_numFacetRanges > 0 )
HashTableX *fht = &qt->m_facetHashTable;
// first sort facetentries in hashtable by their key before
// we print them out
int32_t np = fht->getNumSlotsUsed();
SafeBuf pbuf;
if ( ! pbuf.reserve(np*4) ) return false;
int32_t *ptrs = (int32_t *)pbuf.getBufStart();
int32_t numPtrs = 0;
for ( int32_t j = 0 ; j < fht->getNumSlots() ; j++ ) {
if ( ! fht->m_flags[j] ) continue;
ptrs[numPtrs++] = j;
}
// use this as global for qsort
g_fht = fht;
g_qt = qt;
// use qsort
gbqsort ( ptrs , numPtrs , sizeof(int32_t) , feCmp , 0 );
// now scan the slots and print out
HttpRequest *hr = &m_si->m_hr;
bool firstTime = true;
bool isString = false;
if ( qt->m_fieldCode == FIELD_GBFACETSTR ) isString = true;
char format = m_si->m_format;
// a new table for each facet query term
bool needTable = true;
// print out the dumps
for ( int32_t x= 0 ; x < numPtrs ; x++ ) {
// skip empty slots
//if ( ! fht->m_flags[j] ) continue;
int32_t j = ptrs[x];
// this was originally 32 bit hash of the facet val
// but now it is 64 bit i guess
FacetValHash_t *fvh ;
fvh = (FacetValHash_t *)fht->getKeyFromSlot(j);
// we store how many docids had this value
//int32_t count = *(int32_t *)fht->getValFromSlot(j);
FacetEntry *fe;
fe = (FacetEntry *)fht->getValueFromSlot(j);
int32_t count = 0;
// could be empty if range had no values in it
if ( fe ) count = fe->m_count;
char *text = NULL;
char *termPtr = qt->m_term;
int32_t termLen = qt->m_termLen;
if ( termPtr[0] == ' ' ) { termPtr++; termLen--; }
if ( strncasecmp(termPtr,"gbfacetstr:",11)== 0 ) {
termPtr += 11; termLen -= 11; }
if ( strncasecmp(termPtr,"gbfacetint:",11)== 0 ) {
termPtr += 11; termLen -= 11; }
if ( strncasecmp(termPtr,"gbfacetfloat:",13)== 0 ) {
termPtr += 13; termLen -= 13; }
char tmpBuf[64];
SafeBuf termBuf(tmpBuf,64);
termBuf.safeMemcpy(termPtr,termLen);
termBuf.nullTerm();
char *term = termBuf.getBufStart();
char tmp[64];
QueryWord *qw= qt->m_qword;
if ( qt->m_fieldCode == FIELD_GBFACETINT ) {
sprintf(tmp,"%"INT32"",(int32_t)*fvh);
text = tmp;
}
		if ( qt->m_fieldCode == FIELD_GBFACETFLOAT ) {
			// use snprintf: "%f" of a huge float can
			// overflow a 64-byte buffer
			snprintf(tmp,64,"%f",*(float *)fvh);
			text = tmp;
		}
int32_t k2 = -1;
for ( int32_t k = 0 ; k < qw->m_numFacetRanges; k++ ) {
if ( qt->m_fieldCode != FIELD_GBFACETINT )
break;
if ( *(int32_t *)fvh < qw->m_facetRangeIntA[k])
continue;
if ( *(int32_t *)fvh >= qw->m_facetRangeIntB[k])
continue;
sprintf(tmp,"[%"INT32"-%"INT32")"
,qw->m_facetRangeIntA[k]
,qw->m_facetRangeIntB[k]
);
text = tmp;
k2 = k;
}
for ( int32_t k = 0 ; k < qw->m_numFacetRanges; k++ ) {
if ( qt->m_fieldCode != FIELD_GBFACETFLOAT )
break;
if ( *(float *)fvh < qw->m_facetRangeFloatA[k])
continue;
if ( *(float *)fvh >= qw->m_facetRangeFloatB[k])
continue;
sprintf(tmp,"[%f-%f)"
,qw->m_facetRangeFloatA[k]
,qw->m_facetRangeFloatB[k]
);
text = tmp;
k2 = k;
}
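		// k2 now holds the index of the matching range, or -1 if
		// the value matched no range; the drill-down links below
		// use it to build the gbmin/gbmax query terms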
// lookup the text representation, whose hash is *fvh
if ( qt->m_fieldCode == FIELD_GBFACETSTR ) {
int32_t *offset;
offset =(int32_t *)m_facetTextTable.getValue(fvh);
// wtf?
if ( ! offset ) {
log("msg40: missing facet text for "
"val32=%"UINT32"",
(uint32_t)*fvh);
continue;
}
text = m_facetTextBuf.getBufStart() + *offset;
}
if ( format == FORMAT_XML ) {
sb->safePrintf("\t<facet>\n"
"\t\t<field>%s</field>\n"
"\t\t<value>"
, term
);
if ( isString )
sb->safePrintf("<![CDATA[%"UINT32",",
(uint32_t)*fvh);
sb->cdataEncode ( text );
if ( isString )
sb->safePrintf("]]>");
sb->safePrintf("</value>\n"
"\t\t<docCount>%"INT32""
"</docCount>\n"
"\t</facet>\n",count);
continue;
}
if ( format == FORMAT_JSON && firstTime ) {
firstTime = false;
// if streaming results we may have hacked off
// the last ,\n so put it back
if ( m_si->m_streamResults ) {
//sb->m_length -= 1;
sb->safeStrcpy(",\n\n");
}
//sb->safePrintf("\"facets\":[\n");
}
// print that out
if ( needTable && format == FORMAT_HTML ) {
needTable = false;
sb->safePrintf("<div id=facets "
"style="
"padding:5px;"
"position:relative;"
"border-width:3px;"
"border-right-width:0px;"
"border-style:solid;"
"margin-left:10px;"
"border-top-left-radius:10px;"
"border-bottom-left-radius:10px;"
"border-color:blue;"
"background-color:white;"
"border-right-color:white;"
"margin-right:-3px;"
">"
"<table cellspacing=7>"
"<tr><td width=200px; "
"valign=top>"
"<center>"
"<img src=/facets40.jpg>"
"</center>"
"<br>"
);
sb->safePrintf("<font color=gray>"
"values for</font> "
"<b>%s</b></td></tr>\n",
term);
}
if ( needTable && format == FORMAT_JSON ) {
needTable = false;
sb->safePrintf("\"facets\":[");
}
if ( format == FORMAT_JSON ) {
sb->safePrintf("{\n"
"\t\"field\":\"%s\",\n"
"\t\"value\":\""
, term
);
if ( isString )
sb->safePrintf("%"UINT32","
, (uint32_t)*fvh);
sb->jsonEncode ( text );
//if ( isString )
// just use quotes for ranges like "[1-3)" now
sb->safePrintf("\"");
sb->safePrintf(",\n"
"\t\"docCount\":%"INT32"\n"
"}\n,\n", count);
continue;
}
// make the cgi parm to add to the original url
char nsbuf[128];
SafeBuf newStuff(nsbuf,128);
// they are all ints...
//char *suffix = "int";
//if ( qt->m_fieldCode == FIELD_GBFACETFLOAT )
// suffix = "float";
//newStuff.safePrintf("prepend=gbequalint%%3A");
		// k2 == -1 means no range matched, so guard against
		// reading facetRangeIntA[-1]
		if ( qt->m_fieldCode == FIELD_GBFACETINT &&
		     qw->m_numFacetRanges > 0 && k2 >= 0 ) {
			int32_t min = qw->m_facetRangeIntA[k2];
			int32_t max = qw->m_facetRangeIntB[k2];
if ( min == max )
newStuff.safePrintf("prepend="
"gbequalint%%3A%s%%3A%"UINT32"+"
,term
,(int32_t)*fvh);
else
newStuff.safePrintf("prepend="
"gbminint%%3A%s%%3A%"UINT32"+"
"gbmaxint%%3A%s%%3A%"UINT32"+"
,term
,min
,term
,max-1
);
}
		else if ( qt->m_fieldCode == FIELD_GBFACETFLOAT &&
			  qw->m_numFacetRanges > 0 && k2 >= 0 ) {
			// read the float range arrays, not the int ones
			float min = qw->m_facetRangeFloatA[k2];
			float max = qw->m_facetRangeFloatB[k2];
if ( min == max )
newStuff.safePrintf("prepend="
"gbequalfloat%%3A%s%%3A%f+"
,term
,*(float *)fvh);
else
newStuff.safePrintf("prepend="
"gbminfloat%%3A%s%%3A%f+"
"gbmaxfloat%%3A%s%%3A%f+"
,term
,min
,term
,max
);
}
else if ( qt->m_fieldCode == FIELD_GBFACETFLOAT )
newStuff.safePrintf("prepend="
"gbequalfloat%%3A%s%%3A%f",
term,
*(float *)fvh);
else if ( qt->m_fieldCode == FIELD_GBFACETINT )
newStuff.safePrintf("prepend="
"gbequalint%%3A%s%%3A%"UINT32"",
term,
(int32_t)*fvh);
else if ( qt->m_fieldCode == FIELD_GBFACETSTR &&
// in XmlDoc.cpp the gbxpathsitehash123456: terms
// call hashFacets2() separately with val32
// equal to the section inner hash which is not
// an exact hash of the string using hash32()
// unfortunately, so we can't use gbfieldmatch:
// which is case sensitive etc.
!strncmp(qt->m_term,
"gbfacetstr:gbxpathsitehash",26) )
newStuff.safePrintf("prepend="
"gbequalint%%3Agbfacetstr%%3A"
"%s%%3A%"UINT32"",
term,
(int32_t)*fvh);
else if ( qt->m_fieldCode == FIELD_GBFACETSTR ) {
newStuff.safePrintf("prepend="
"gbfieldmatch%%3A%s%%3A%%22"
,term
//"gbequalint%%3A%s%%3A%"UINT32""
//,(int32_t)*fvh
);
newStuff.urlEncode(text);
newStuff.safePrintf("%%22");
}
// get the original url and add
// &prepend=gbequalint:gbhopcount:1 type stuff to it
SafeBuf newUrl;
replaceParm ( newStuff.getBufStart(), &newUrl , hr );
// print the facet in its numeric form
// we will have to lookup based on its docid
// and get it from the cached page later
sb->safePrintf("<tr><td width=200px; valign=top>"
//"<a href=?search="//gbfacet%3A"
//"%s:%"UINT32""
// make a search to just show those
// docs from this facet with that
// value. actually gbmin/max would work
"<a href=\"%s\">"
, newUrl.getBufStart()
);
sb->safePrintf("%s (%"UINT32" documents)"
"</a>"
"</td></tr>\n"
,text
,count); // count for printing
}
if ( ! needTable && format == FORMAT_JSON ) {
sb->m_length -= 2; // hack off trailing comma
sb->safePrintf("],\n"); // close off json array
}
if ( ! needTable && format == FORMAT_HTML )
sb->safePrintf("</table></div><br>\n");
return true;
}