// open-source-search-engine/Msg24.cpp
#include "gb-include.h"
#include "Msg51.h"
//#include "Msg24.h"
#include "Query.h"
#include "Msg20.h"
//#include "TermTable.h"
#include "Words.h"
#include "Speller.h"
#include <math.h>
#include "StopWords.h"
#include "HashTable.h"
#include "Clusterdb.h"
#include "Scores.h"
#include "Stats.h"
#include "Words.h"
// here's the knobs:
// sample radius in chars around each query term : 600 (line 212)
// max sample size, all excerpts, per document : 100k (line 213)
// map from distance to query term in words to score: (line 855)
// map from popularity to score weight : (lines 950 et al)
// the comments above are way out of date (aac, Jan 2008)
//
// QPOP multiplier params
#define QPOP_ZONE_0 10
#define QPOP_ZONE_1 30
#define QPOP_ZONE_2 80
#define QPOP_ZONE_3 100
#define QPOP_ZONE_4 300
#define QPOP_MULT_0 10
#define QPOP_MULT_1 8
#define QPOP_MULT_2 6
#define QPOP_MULT_3 4
#define QPOP_MULT_4 2
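// . how the QPOP multiplier works (see the qpop mapping in hashSample()
//   below): a query word whose phrase popularity falls below QPOP_ZONE_n
//   gets multiplier QPOP_MULT_n, so rarer query words count for more.
//   e.g. popularity 50 lands below QPOP_ZONE_2 (80) and gets 6; anything
//   at or above QPOP_ZONE_4 (300) gets 1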
// QTR scoring params
#define MAX_SCORE_MULTIPLIER 3000 // orig: 3000
#define ALT_MAX_SCORE 12000 // orig: 12000
#define ALT_START_SCORE 1000
#define QTR_ZONE_0 4
#define QTR_ZONE_1 8
#define QTR_ZONE_2 12
#define QTR_ZONE_3 20
#define QTR_BONUS_0 1000
#define QTR_BONUS_1 800
#define QTR_BONUS_2 500
#define QTR_BONUS_3 200
#define QTR_BONUS_CW 1
#define MULTIPLE_HIT_BOOST 1000 // orig: 1000
// gigabit phrase scoring params
#define SPARSE_MARK 0.34
#define SPARSE_PENALTY 1000
#define FWC_PENALTY 500 // penalty for beginning with common word
#define POP_ZONE_0 0.00001
#define POP_ZONE_1 0.0001
#define POP_ZONE_2 0.001
#define POP_ZONE_3 0.01
#define POP_BOOST_0 3.0
#define POP_BOOST_1 1.5
#define POP_BOOST_2 1.0
#define POP_BOOST_3 0.3
#define POP_BOOST_4 0.1
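// . the POP_ZONE/POP_BOOST pairs map a gigabit phrase's popularity,
//   expressed as a fraction, to a score boost: the rarer the phrase the
//   bigger the boost, tapering from 3.0 down to 0.1. the mapping itself
//   is applied by the gigabit phrase scoring code further down this file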
//static bool onSamePages(int32_t i,int32_t j,int32_t *slots,int32_t *heads,int32_t *pages);
static void handleRequest24 ( UdpSlot *slot , int32_t netnice ) ;
static void setRepeatScores ( char *repeatScores ,
int64_t *wids ,
int32_t nw ,
char *repeatTable ,
int32_t repeatTableNumSlots ,
Words *words ) ;
Msg24::Msg24 ( ) {
m_numTopics = 0;
m_request = NULL;
m_reply = NULL;
m_topicPtrs = NULL;
m_topicLens = NULL;
m_topicScores = NULL;
m_topicGids = NULL;
m_topicPops = NULL;
m_topicDocIds = NULL;
m_topicNumDocIds = NULL;
m_isUnicode = false;
}
Msg24::~Msg24 ( ) { reset(); }
void Msg24::reset ( ) {
if ( m_request && m_request != m_requestBuf )
mfree ( m_request , m_requestSize , "Msg24" );
m_request = NULL;
// free reply if we should
if ( m_reply ) mfree ( m_reply , m_replySize , "Msg24" );
m_reply = NULL;
m_isUnicode = false;
}
bool Msg24::registerHandler ( ) {
// . register ourselves with the udp server
// . it calls our callback when it receives a msg of type 0x24
if ( ! g_udpServer.registerHandler ( 0x24, handleRequest24 ))
return false;
return true;
}
static void gotReplyWrapper24 ( void *state1 , void *state2 ) ;
bool Msg24::generateTopics ( char *coll ,
int32_t collLen ,
char *query ,
int32_t queryLen ,
//float termFreqWeights ,
//float phraseAffWeights ,
int64_t *docIds ,
char *clusterLevels ,
int32_t numDocIds ,
TopicGroup *topicGroups ,
int32_t numTopicGroups ,
//int32_t docsToScanForTopics ,
//int32_t minTopicScore ,
//int32_t maxTopics ,
//int32_t maxWordsPerPhrase ,
int32_t maxCacheAge ,
bool addToCache ,
bool returnDocIdCount ,
bool returnDocIds ,
bool returnPops ,
void *state ,
void (* callback) (void *state ),
int32_t niceness) {
// force it to be true, since hi bit is set in pops if topic is unicode
returnPops = true;
// warning
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg24.");
// force it
returnDocIdCount = true;
// if we don't get docids, then deserialize doesn't work because it
// expects the docids to be valid.
returnDocIds = true;
// reset
m_numTopics = 0;
//m_docsToScanForTopics = docsToScanForTopics;
//m_minTopicScore = minTopicScore;
//m_maxTopics = maxTopics;
m_numDocIds = numDocIds;
m_coll = coll;
m_collLen = collLen;
m_returnDocIdCount = returnDocIdCount;
m_returnDocIds = returnDocIds;
m_returnPops = returnPops;
// bail if no operations to do
if ( numTopicGroups <= 0 ) return true;
if ( numDocIds <= 0 ) return true;
int32_t numTopicsToGen = topicGroups->m_numTopics;
// get the min we have to scan
int32_t docsToScanForTopics = topicGroups[0].m_docsToScanForTopics;
for ( int32_t i = 1 ; i < numTopicGroups ; i++ ) {
int32_t x = topicGroups[i].m_docsToScanForTopics ;
if ( x > docsToScanForTopics ) docsToScanForTopics = x;
if ( topicGroups[i].m_numTopics > numTopicsToGen )
numTopicsToGen = topicGroups[i].m_numTopics;
}
// bail if none
if ( docsToScanForTopics <= 0 ) return true;
if ( numTopicsToGen == 0 ) return true;
m_state = state;
m_callback = callback;
m_startTime = gettimeofdayInMilliseconds();
// save, caller should not delete this!
m_topicGroups = topicGroups;
m_numTopicGroups = numTopicGroups;
// truncate
//if ( maxTopics > MAX_TOPICS ) maxTopics = MAX_TOPICS;
// truncate
//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
// numDocIds = MAX_DOCIDS_TO_SCAN ;
if ( numDocIds > docsToScanForTopics )
numDocIds = docsToScanForTopics ;
int32_t size = sizeof(TopicGroup) * numTopicGroups ;
if ( queryLen > MAX_QUERY_LEN ) queryLen = MAX_QUERY_LEN;
// how much space do we need?
int32_t need = 4+4+4+size+
queryLen+1+
numDocIds*8 +
numDocIds +collLen+1 + sizeof(niceness);
m_requestSize = need;
// make enough room for the request
if ( need < MSG24_REQUEST_SIZE ) m_request = m_requestBuf;
else {
m_request = (char *)mmalloc ( need , "Msg24a" );
if ( ! m_request ) {
log("topics: Failed to allocate %"INT32" bytes.",need);
return true;
}
}
char *p = m_request;
// store the cache parms
*(int32_t *)p = maxCacheAge ; p += 4;
*(char *)p = addToCache ; p += 1;
*(char *)p = returnDocIdCount ; p += 1;
*(char *)p = returnDocIds ; p += 1;
*(char *)p = returnPops ; p += 1;
*(int32_t *)p = niceness ; p += sizeof(int32_t);
// store minTopicScore
//*(int32_t *)p = minTopicScore ; p += 4;
//*(int32_t *)p = maxTopics ; p += 4;
//*(int32_t *)p = maxWordsPerPhrase ; p += 4;
// store topic group information
*(int32_t *)p = numTopicGroups; p += 4;
memcpy ( p , topicGroups , size ); p += size;
// then coll
memcpy ( p , coll , collLen ); p += collLen ;
*p++ = '\0';
// then query
memcpy ( p , query , queryLen ); p += queryLen;
*p++ = '\0';
// then docids
memcpy ( p , docIds , numDocIds * 8 ); p += numDocIds * 8;
// then cluster levels
memcpy ( p , clusterLevels , numDocIds ); p += numDocIds ;
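	// the request we just serialized is laid out as:
	//   maxCacheAge      (4 bytes)
	//   addToCache       (1)
	//   returnDocIdCount (1)
	//   returnDocIds     (1)
	//   returnPops       (1)
	//   niceness         (4)
	//   numTopicGroups   (4)
	//   topicGroups      (sizeof(TopicGroup) * numTopicGroups)
	//   coll             (collLen bytes + '\0')
	//   query            (queryLen bytes + '\0')
	//   docIds           (numDocIds * 8)
	//   clusterLevels    (numDocIds)
	// handleRequest24() below parses this same layout back out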
// how big is it?
//m_requestSize = p - m_request;
// sanity check
//if ( m_requestSize > 5+MAX_QUERY_LEN + 1 + MAX_DOCIDS_TO_SCAN * 9){
// char *xx = NULL ; *xx = 0; }
if ( p - m_request != m_requestSize ) {
log("Bad msg24 request size");
char *xx = NULL ; *xx = 0;
}
// . the groupId to handle... just pick randomly
int32_t groupId = ((uint32_t)docIds[0]) & g_hostdb.m_groupMask;
// . returns false and sets g_errno on error
// . reply should be stored in UdpSlot::m_tmpBuf
if ( ! m_mcast.send ( m_request ,
m_requestSize ,
0x24 , // msgType 0x24
false , // m_mcast own m_request?
groupId , // send to group (groupKey)
false , // send to whole group?
(int32_t)docIds[0] , // key is lower bits of docId
this , // state data
NULL , // state data
gotReplyWrapper24 ,
30 , // 30 second time out
niceness , // niceness
false , // realtime?
-1 , // first hostid
NULL,//m_reply , // store reply in here
0,//MAX_REPLY_LEN , // this is how big it can be
false , // free reply buf?
false , // do disk load balancing?
0 , // maxCacheAge
(key_t)0 , // cacheKey
RDB_NONE , // TITLEDB // rdbId of titledb
0 ) ){// minRecSizes avg
log("topics: Had error sending request for topics to host in "
"group #%"INT32": %s.",groupId,mstrerror(g_errno));
return true;
}
// otherwise, we blocked and gotReplyWrapper will be called
return false;
}
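// . a minimal caller sketch (hypothetical names, for illustration only):
//
//   Msg24 m;
//   TopicGroup tg;  // m_numTopics, m_docsToScanForTopics, etc. filled in
//   if ( m.generateTopics ( coll, collLen, q, qlen, docIds, levels,
//                           nd, &tg, 1,        // one topic group
//                           0,                 // maxCacheAge
//                           true, true, true, true,
//                           state, wrapper, 0 ) )
//           ... // done instantly (or error); see m_numTopics, etc.
//   else
//           ... // blocked; wrapper(state) is called when the reply is in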
void gotReplyWrapper24 ( void *state1 , void *state2 ) {
Msg24 *THIS = (Msg24 *)state1;
THIS->gotReply();
THIS->m_callback ( THIS->m_state );
}
void Msg24::gotReply ( ) {
// bail on error, multicast will free the reply buffer if it should
if ( g_errno ) {
log("topics: Had error getting topics: %s.",
mstrerror(g_errno));
return;
}
// get the reply
int32_t maxSize ;
bool freeIt ;
m_reply = m_mcast.getBestReply (&m_replySize, &maxSize, &freeIt);
relabel( m_reply, m_replySize, "Msg24-GBR" );
// sanity check
//if ( reply != m_reply ) { char *xx = NULL ; *xx = 0 ; }
// . parse the reply, it should be our m_reply buffer
// . topics are NULL terminated
deserialize ( m_reply , m_replySize );
int64_t now = gettimeofdayInMilliseconds();
g_stats.addStat_r ( 0 ,
m_startTime ,
now,
"get_gigabits",
0x00d1e1ff ,
STAT_QUERY );
/*
int32_t i = 0;
while ( p < pend && i < MAX_TOPICS ) {
m_topicScores[i] = *(int32_t *)p ; p += 4;
m_topicLens [i] = *(int32_t *)p ; p += 4;
m_topicGids [i] = *(char *)p ; p += 1;
m_topicPtrs [i] = p ; p += m_topicLens[i] + 1;
i++;
}
m_numTopics = i;
*/
}
// if this is too big we can run out of sockets to use to launch
#define MAX_OUTSTANDING 50
State24::State24 ( ) {
m_msg20 = NULL;
m_mem = NULL;
m_memPtr = NULL;
m_memEnd = NULL;
}
State24::~State24 ( ) {
	// free the individually allocated msg20s; when the stack buffer
	// m_buf20 was used there is nothing to free, but we still fall
	// through so m_mem gets released below
	if ( m_msg20 && m_msg20 != m_buf20 ) {
		for ( int32_t i = 0 ; i < m_numDocIds ; i++ )
			m_msg20[i].destructor();
		mfree ( m_msg20 , sizeof(Msg20) * m_numDocIds , "Msg24" );
		m_msg20 = NULL;
	}
	if ( m_mem ) {
		mfree ( m_mem, m_memEnd - m_mem, "Msg24" );
		m_mem = NULL;
		m_memEnd = NULL;
		m_memPtr = NULL;
	}
}
static void launchMsg20s ( State24 *st, bool callsample, int32_t sampleSize );
static void gotSampleWrapper ( void *state ) ;
void handleRequest24 ( UdpSlot *slot , int32_t netnice ) {
// if niceness is 0, use the higher priority udpServer
UdpServer *us = &g_udpServer;
//if ( niceness == 0 ) us = &g_udpServer2;
// make the state
State24 *st ;
try { st = new (State24); }
catch ( ... ) {
g_errno = ENOMEM;
log("topics: Could not allocate %i bytes for generating "
"topics. Replying with error.",sizeof(State24));
us->sendErrorReply ( slot , EBADREQUESTSIZE );
return;
}
mnew ( st , sizeof(State24) , "Msg24b" );
// get the request
char *request = slot->m_readBuf;
int32_t requestSize = slot->m_readBufSize;
char *requestEnd = request + requestSize;
// parse the request
char *p = request;
// get cache parms
//int32_t maxCacheAge = *(int32_t *)p ; p += 4;
//char addToCache = *(char *)p ; p += 1;
st->m_maxCacheAge = *(int32_t *)p ; p += 4;
st->m_addToCache = *(char *)p ; p += 1;
st->m_returnDocIdCount = *(char *)p ; p += 1;
st->m_returnDocIds = *(char *)p ; p += 1;
st->m_returnPops = *(char *)p ; p += 1;
st->m_niceness = *(int32_t *)p ; p += sizeof(int32_t);
// first is minTopicScore
//int32_t minTopicScore = *(int32_t *)p ; p += 4;
// until we roll to all hosts, lets keep the protocol standard
//int32_t maxTopics = *(int32_t *)p ; p += 4;
//int32_t maxWordsPerPhrase = *(int32_t *)p ; p += 4;
//int32_t maxTopics = 100;
//int32_t maxWordsPerPhrase = 6;
//st->m_minTopicScore = minTopicScore;
//st->m_maxTopics = maxTopics;
//st->m_maxWordsPerPhrase = maxWordsPerPhrase;
// get topic group information
st->m_numTopicGroups = *(int32_t *)p ; p += 4;
int32_t size = sizeof(TopicGroup) * st->m_numTopicGroups ;
memcpy ( st->m_topicGroups , p , size ); p += size;
// then coll
st->m_coll = p; p += strlen(p) + 1;
// . then the query, a NULL terminated string
// . store it in state
int32_t qlen = strlen ( p );
if ( qlen > MAX_QUERY_LEN ) qlen = MAX_QUERY_LEN;
memcpy ( st->m_query , p , qlen );
st->m_query [ qlen ] = '\0';
st->m_queryLen = qlen;
p += qlen + 1;
// then the docids
//int64_t *docIds = (int64_t *)p;
//int32_t numDocIds = (requestEnd - p) / 9;
//p += numDocIds * 8;
// cluster levels
//char *clusterLevels = p;
st->m_docIds = (int64_t *)p;
st->m_numDocIds = (requestEnd - p) / 9;
p += st->m_numDocIds * 8;
// cluster levels
st->m_clusterLevels = p;
// truncate
//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
// numDocIds = MAX_DOCIDS_TO_SCAN ;
// see if anyone blocks at all
//bool noBlock = true;
// we haven't got any responses as of yet or sent any requests
st->m_slot = slot;
//st->m_niceness = 0; // niceness;
st->m_numReplies = 0;
st->m_numRequests = 0;
// allocate enough msg20s
if ( st->m_numDocIds <= 50 )
st->m_msg20 = st->m_buf20;
else {
st->m_msg20=(Msg20 *)mmalloc(sizeof(Msg20)*
st->m_numDocIds,"Msg24c");
if ( ! st->m_msg20 ) {
log("Msg24: alloc of msg20s for %"INT32" bytes failed",
sizeof(Msg20)*st->m_numDocIds);
// prevent a core dump in Msg24::~Msg24
st->m_numDocIds = 0;
mdelete ( st , sizeof(State24) , "Msg24" );
delete ( st );
us->sendErrorReply ( slot , g_errno );
return;
}
for ( int32_t i = 0 ; i < st->m_numDocIds ; i++ )
st->m_msg20[i].constructor();
}
// set query if need be
//Query qq;
st->m_qq.set ( st->m_query , st->m_queryLen , NULL , 0, 2 , true );
	// make a display metas string to get content for our TopicGroups
//char dbuf[1024];
p = st->m_dbuf;
char *pend = st->m_dbuf + 1024;
for ( int32_t i = 0 ; i < st->m_numTopicGroups ; i++ ) {
TopicGroup *t = &st->m_topicGroups [ i ];
int32_t tlen = strlen ( t->m_meta );
if ( p + tlen + 1 >= pend ) break;
if ( i > 0 ) *p++ = ' ';
memcpy ( p , t->m_meta , tlen );
p += tlen;
}
//int32_t dbufLen = p - dbuf;
st->m_dbufLen = p - st->m_dbuf;
*p = '\0';
st->m_n = 0;
st->m_i = 0;
launchMsg20s ( st , true ,st->m_topicGroups[0].m_topicSampleSize );
}
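// . launchMsg20s() keeps at most MAX_OUTSTANDING Msg20 requests in flight
//   at once. each reply (or instant, non-blocking completion) bumps
//   m_numReplies; gotSampleWrapper() re-calls launchMsg20s() to top the
//   pipeline back up and only falls through to the merge phase once
//   m_numReplies catches up to m_numRequests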
void launchMsg20s ( State24 *st , bool callsample , int32_t sampleSize ) {
// launch all the msg20 to get big samples of each doc
//int32_t n = 0;
for ( ; st->m_i < st->m_numDocIds ; st->m_i++ ) {
// skip if clustered out
if ( st->m_clusterLevels[st->m_i] != CR_OK )
continue;
// wait for later if too many outstanding
if ( st->m_numRequests - st->m_numReplies >=
MAX_OUTSTANDING ) return;
// use the jth slot if we should
//if ( j >= 0 ) n = j;
// save the msg index
//st->m_msg20[n].m_n = n;
//st->m_msg20[n].m_parent = st;
// supply the display metas as the meta in our TopicGroups
// . start up a Msg20 to get the relevant doc text
// . this will return false if blocks
// . a 32k sample takes 11ms to hash in hashSample() and
// most samples are below 5k anyway...
Msg20 *mm = &st->m_msg20[st->m_n++];
// set the summary request then get it!
Msg20Request req;
Query *q = &st->m_qq;
//int32_t nt = q->m_numTerms;
req.ptr_qbuf = q->getQuery();
req.size_qbuf = q->getQueryLen()+1;
//req.ptr_termFreqs = (char *)m_msg3a.m_termFreqs;
//req.size_termFreqs = 8 * nt;
//req.ptr_affWeights = (char *)m_msg3a.m_affWeights;
//req.size_affWeights = 4 * nt; // 4 = sizeof(float)
req.ptr_coll = st->m_coll;
req.size_coll = strlen(st->m_coll)+1;
if ( st->m_dbufLen > 0 ) {
req.ptr_displayMetas = st->m_dbuf ;
req.size_displayMetas = st->m_dbufLen+1;
}
req.m_docId = st->m_docIds[st->m_i];
req.m_numSummaryLines = 0;
req.m_maxCacheAge = st->m_maxCacheAge;
req.m_wcache = st->m_addToCache;
req.m_state = st;
req.m_callback = gotSampleWrapper;
req.m_niceness = st->m_niceness;
//req.m_summaryMode = m_si->m_summaryMode;
req.m_boolFlag = q->m_isBoolean; // 2 means auto?
//req.m_allowPunctInPhrase = m_si->m_allowPunctInPhrase;
//req.m_showBanned = m_si->m_showBanned;
//req.m_excludeLinkText = m_si->m_excludeLinkText ;
//req.m_hackFixWords = m_si->m_hackFixWords ;
//req.m_hackFixPhrases = m_si->m_hackFixPhrases ;
//req.m_includeCachedCopy= m_si->m_includeCachedCopy;//bigsm
req.m_bigSampleRadius = 100;
req.m_bigSampleMaxLen = sampleSize;
if ( ! mm->getSummary ( &req )) {st->m_numRequests++;continue;}
#ifdef _OLDMSG20_
if ( ! mm->getSummary ( &st->m_qq ,
NULL , // term freqs
NULL , // aff weights
st->m_docIds[st->m_i] ,
1 , // clusterLevel
0 , // # sum lines
st->m_maxCacheAge ,
st->m_addToCache ,
st->m_coll , // coll
strlen(st->m_coll) ,
st , // state
gotSampleWrapper ,
st->m_niceness ,
false , // root?
st->m_dbuf , // dt metas
st->m_dbufLen , // dtmetalen
100 , // smpl radius
sampleSize )){// smpl max
st->m_numRequests++;
// if just launching one, bail if this blocked
//if ( j >= 0 ) return;
continue;
}
#endif
// deal with an error
if ( g_errno ) {
// log it
log("topics: Received error when getting "
"document with docId %"INT64": %s. Document will not "
"contribute to the topics generation.",
st->m_docIds[st->m_i],mstrerror(g_errno));
// reset g_errno
g_errno = 0;
}
// . otherwise we got summary without blocking
// . increment # of replies (instant reply) and results
st->m_numReplies++;
st->m_numRequests++;
// if we were just launching one and it did not block, return
//if ( j >= 0 ) return;
}
// did anyone block? if so, return false for now
if ( st->m_numReplies < st->m_numRequests ) return ;
// . otherwise, we got everyone, so go right to the merge routine
// . returns false if not all replies have been received
// . returns true if done
// . sets g_errno on error
if ( callsample ) gotSampleWrapper ( st );
}
static bool hashSample ( Query *q, char *sample , int32_t sampleLen ,
TermTable *master, int32_t *nqiPtr ,
TopicGroup *t ,
State24* st,
int64_t docId ,
char *vecs , int32_t *numVecs ,
class Words *wordsPtr , class Scores *scoresPtr ,
bool isUnicode ,
char *repeatTable , int32_t repeatTableNumSlots ,
char language );
void gotSampleWrapper ( void *state ) {
// get ptr to our state 24 class
State24 *st = (State24 *)state;
// if niceness is 0, use the higher priority udpServer
UdpServer *us = &g_udpServer;
//if ( st->m_niceness == 0 ) us = &g_udpServer2;
//else us = &g_udpServer ;
UdpSlot *slot = st->m_slot;
// just bitch if there was an error, then ignore it
if ( g_errno ) {
log("topics: Had error getting document: %s. Document will "
"not contribute to the topics generation.",
mstrerror(g_errno));
g_errno = 0;
}
// we got one
st->m_numReplies++;
// launch another request if we can
// return if all done
launchMsg20s ( st , false , st->m_topicGroups[0].m_topicSampleSize ) ;
// wait for all replies to get here
if ( st->m_numReplies < st->m_numRequests ) return;
// get time now
//int64_t now = gettimeofdayInMilliseconds();
// . add the stat
// . use purple for tie to get all summaries
//g_stats.addStat_r ( 0 ,
// m_startTime ,
// now ,
// 0x008220ff );
// timestamp log
//int64_t startTime = gettimeofdayInMilliseconds();
log(LOG_DEBUG,"topics: msg24: Got %"INT32" titleRecs.",// in %"INT64" ms",
st->m_numReplies );//, now - m_startTime );
// set query
//Query q;
//q.set ( st->m_query , st->m_queryLen , NULL , 0 , 2/*auto*/, true);
// . init table for up to about 5k total distinct pronouns & phrases
// . it automatically grows by like 20% if it runs out of space
// . only alloc space for linked lists if docid info is wanted
TermTable master;
if ( ! master.set ( 20000 , true , true ,
st->m_returnDocIdCount | st->m_returnDocIds ,
st->m_returnPops , true, false, NULL ) ) {
mdelete ( st , sizeof(State24) , "Msg24" );
delete ( st );
log("topics: Could not allocate memory for topic generation.");
us->sendErrorReply ( slot , ENOMEM );
return ;
}
// timestamp log
int64_t startTime = gettimeofdayInMilliseconds();
// debug
//char *pp = (char *)mmalloc ( 4 , "foo");
//*(int32_t *)pp = 0;
//us->sendReply_ass ( pp , 4 , pp , 4 , slot );
//delete(st);
//return;
// store all topics (scores/gids) in this buffer
//char buf [ 128*1024 ];
//char *p = buf;
//char *pend = buf + 128*1024;
char *buf = NULL;
int32_t bufSize = 0;
//for ( int32_t yyy = 0 ; yyy < 100 ; yyy++ ) { master.clear();//mdw
// loop over all topic groups
for ( int32_t i = 0 ; i < st->m_numTopicGroups ; i++ ) {
// get ith topic group descriptor
TopicGroup *t = &st->m_topicGroups[i];
// . generate topics for this topic group
// . serialize them into "p"
// . getTopics will realloc() this "buf" to exactly the size
// it needs
getTopics ( st , t , &master , &st->m_qq , i ,
// getTopics will realloc this buffer
&buf , &bufSize , NULL , NULL , NULL );
// clear master table each time
if ( i + 1 < st->m_numTopicGroups ) master.clear();
}
//}
// free mem now to avoid fragmentation
master.reset();
// if small enough, copy into slot's tmp buffer
char *reply = buf;
int32_t replySize = bufSize;
// launch it
us->sendReply_ass ( reply , replySize , reply , replySize , slot );
mdelete ( st , sizeof(State24) , "Msg24" );
delete ( st );
// . on host0, this is 21.3 ms with a std.dev. of 17.5 using dsrt=30
// measured on log[b-d] with the limit of 4 words per "giga bit".
// . now time with our new 6 word phrase maximum:
// sum = 1294.0 avg = 16.0 sdev = 10.8 ... our rewrite was faster!!
//if ( g_conf.m_timingDebugEnabled )
	// timing debug
	int64_t took = gettimeofdayInMilliseconds() - startTime ;
	log(LOG_TIMING,"topics: Took %"INT64" ms to parse out topics.",
	    took );
}
class DocIdLink {
public:
int64_t m_docId;
int32_t m_next; // offset into st->m_mem to DocIdLink
};
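// . the docid lists hang off the master TermTable: heads[i] and m_next
//   are byte offsets into the st->m_mem arena rather than pointers, so
//   the lists survive reallocs of the arena. a negative offset (-1)
//   marks the end of a list, which is why the walk loops below test
//   (char *)link >= st->m_mem to stop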
// returns false and set g_errno on error, true otherwise
bool getTopics ( State24 *st ,
TopicGroup *t ,
TermTable *master ,
Query *q ,
char gid ,
char **buf ,
int32_t *bufSize ,
// these ptrs are supplied by the spider when trying to
// generate the gigabit vector for a document it is indexing
class Words *wordsPtr ,
class Scores *scoresPtr ,
int32_t *hashes ,
unsigned char language ,
int32_t niceness ,
LinkInfo* linkInfo,
LinkInfo* linkInfo2) {
////////////////////////////////////////////
//
// GENERATE THE TOPICS
//
////////////////////////////////////////////
//int64_t start = gettimeofdayInMilliseconds();
// only allow one vote per ip
HashTable iptable;
	// return false and set g_errno if this alloc fails
if ( t->m_ipRestrict && ! iptable.set ( st->m_numRequests * 4 ) )
return false;
// space for all vectors for deduping samples that are 80% similar
char vbuf [ 64*1024 ];
char *vecs = vbuf;
int32_t numVecs = 0;
int32_t vneed = st->m_numRequests * SAMPLE_VECTOR_SIZE;
if ( t->m_dedupSamplePercent >= 0 && vneed > 64*1024 )
vecs = (char *)mmalloc ( vneed , "Msg24d" );
if ( ! vecs ) return false;
// hack, if words supplied, treat as one request
if ( wordsPtr ) st->m_numRequests = 1;
//
//
// . make the hash table used for repeated fragment detection
// . one slot per word, over all samples
//
//
// for every sample estimate the number of words so we know how big
// to make our repeat hash table
int32_t maxWords = 0;
Words tmpw;
// if getting a gigabit vector for a single doc, we know the # of words
if ( wordsPtr ) maxWords += wordsPtr->getNumWords();
// otherwise, get max # of words for each big sample via Msg20
int32_t numMsg20Used = 0;
for ( int32_t i = 0 ; ! wordsPtr && i < st->m_numRequests ; i++ ) {
Msg20* thisMsg20 = NULL;
if(wordsPtr) {}
else if(st->m_msg20) thisMsg20 = &st->m_msg20[i];
else {
thisMsg20 = st->m_msg20Ptrs[i];
if ( st->m_clusterLevels[i] != CR_OK ) continue;
}
		// continue if we've gotten no content
		if ( !wordsPtr && ( !thisMsg20 || thisMsg20->m_errno ) )
			continue;
// make sure the summary is not in a foreign language (aac)
if (thisMsg20) {
unsigned char sLang;
sLang = thisMsg20->m_r->m_summaryLanguage;
if (language != langUnknown && sLang != language) continue;
};
// get the ith big sample
char *sample = NULL;
int32_t slen = 0;
// but if doing metas, get the display content
char *next = NULL;
if(thisMsg20) next = thisMsg20->getDisplayBuf();
if ( t->m_meta[0] && next)
sample = thisMsg20->getNextDisplayBuf(&slen,&next);
// XmlDoc::getGigabitVector() provides us with the Words/Scores
// classes for the whole document. that is the "sample"
else {
sample = thisMsg20->getBigSampleBuf();
slen = thisMsg20->getBigSampleLen();
}
// are we unicode?
bool isUnicode = thisMsg20->isUnicode();
// set parser vars
char *p = sample;
char *pend = sample + slen;
// each sample consists of multiple \0 terminated excerpts
int32_t sampleWords = 0;
#ifdef DEBUG_MSG24
int32_t numExcerpts = 0;
#endif
while ( p < pend ) {
int32_t plen ;
if ( isUnicode ) plen = ucStrNLen (p,pend-p);
else plen = strlen (p);
if ( isUnicode ) sampleWords += countWords((UChar *)p,plen);
else sampleWords += countWords( p,plen);
// advance to next exerpt
p += plen + 1;
#ifdef DEBUG_MSG24
numExcerpts++;
#endif
};
#ifdef DEBUG_MSG24
if ( sampleWords > 2048 ) {
char *dbgBuf = NULL;
log("topics: Unusually int32_t sample in Msg24: "
"sampleWords=%"INT32" numExcerpts=%"INT32"",
sampleWords, numExcerpts);
if ( (dbgBuf = (char *)mmalloc(slen+1, "DEBUG_MSG24")) ) {
int jjStep = 1;
if (isUnicode) jjStep = 2;
int kk = 0;
for (int jj = 0; jj< slen; jj += jjStep) {
if (sample[jj]) {
dbgBuf[kk++] = sample[jj];
}
else {
dbgBuf[kk++] = '#';
};
};
dbgBuf[kk++] = '\0';
log("topics: \tsample was: %s", dbgBuf);
};
}
else {
log("topics: Reasonable sample in Msg24: "
"sampleWords=%"INT32" numExcerpts=%"INT32"",
sampleWords, numExcerpts);
};
#endif
if (maxWords + sampleWords > 0x08000000) {
log("topics: too many words in samples. "
"Discarding the remaining samples "
"(maxWords=%"INT32")", maxWords);
break;
}
else {
maxWords += sampleWords;
numMsg20Used++;
};
}
// make it big enough so there are gaps, so chains are not too long
int32_t minBuckets = (int32_t)(maxWords * 1.5);
if(minBuckets < 512) minBuckets = 512;
int32_t numSlots = 2 * getHighestLitBitValue ( minBuckets ) ;
int32_t need2 = numSlots * (8+4);
char *rbuf = NULL;
char tmpBuf2[13000];
// sanity check
if ( need2 < 0 ) {
g_errno = EBADENGINEER;
return log("query: bad engineer in Msg24.cpp. need2=%"INT32" "
"numSlots=%"INT32" maxWords=%"INT32" q=%s", need2,numSlots,maxWords,q->m_orig);
}
if ( need2 < 13000 ) rbuf = tmpBuf2;
else rbuf = (char *)mmalloc ( need2 , "WeightsSet3");
if ( ! rbuf ) return false;
// sanity check
if ( numSlots * 8 > need2 || numSlots * 8 < 0 ) {
g_errno = EBADENGINEER;
return log("query: bad engineer in Msg24.cpp. need2=%"INT32" "
"numSlots=%"INT32" q=%s", need2,numSlots,q->m_orig);
}
// clear the keys in the hash table (empty it out)
memset ( rbuf , 0 , numSlots * 8 );
// set the member var to this
char *repeatTable = rbuf;
int32_t repeatTableNumSlots = numSlots;
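	// sizing example: maxWords = 1000 gives minBuckets = 1500;
	// getHighestLitBitValue(1500) = 1024, so numSlots = 2048 and
	// need2 = 2048 * (8+4) = 24576 bytes -- roughly 2x the word count,
	// which keeps the chains short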
//
//
// end making the hash table for repeated fragment detection
//
//
// now combine all the pronouns and pronoun phrases into one big hash
// table and collect the top 10 topics
int32_t nqi = 0; // how many query terms actually used? for normalizing.
int32_t tcount = 0; // how many title recs did we process?
QUICKPOLL(niceness);
for ( int32_t i = 0 ; i < numMsg20Used ; i++ ) {
Msg20* thisMsg20 = NULL;
if(wordsPtr) {}
else if(st->m_msg20) thisMsg20 = &st->m_msg20[i];
else {
thisMsg20 = st->m_msg20Ptrs[i];
if ( st->m_clusterLevels[i] != CR_OK ) continue;
}
// make sure the summary is not in a foreign language (aac)
if (thisMsg20) {
unsigned char sLang;
sLang = thisMsg20->m_r->m_summaryLanguage;
if (language != langUnknown && sLang != language) continue;
};
		// continue if we've gotten no content
		if ( !wordsPtr && ( !thisMsg20 || thisMsg20->m_errno ) )
			continue;
// skip if from an ip we already did
if ( t->m_ipRestrict ) {
int32_t ipd = ipdom (thisMsg20->getIp() );
// zero is invalid!
if ( ! ipd ) continue;
//log("url=%s",thisMsg20->getUrl());
if ( iptable.getValue(ipd) ) {
//log("dup=%s",thisMsg20->getUrl());
continue;
}
// now we also check domain
Url uu;
uu.set ( thisMsg20->getUrl() ,
thisMsg20->getUrlLen() );
// "mid dom" is the "ibm" part of ibm.com or ibm.de
char *dom = uu.getMidDomain();
int32_t dlen = uu.getMidDomainLen();
if ( dom && dlen > 0 ) {
int32_t h = hash32 ( dom , dlen );
if ( iptable.getValue(h) ) continue;
iptable.addKey (h,1);
}
// add ip
iptable.addKey (ipd,1);
}
// get the ith big sample
char *bigSampleBuf = NULL;
int32_t bigSampleLen = 0;
// but if doing metas, get the display content
char *next = NULL;
if(thisMsg20) next = thisMsg20->getDisplayBuf();
// but if doing metas, get the display content
if ( t->m_meta[0] && next) {
bigSampleBuf =
thisMsg20->getNextDisplayBuf(&bigSampleLen,&next);
}
// XmlDoc::getGigabitVector() provides us with the Words/Scores
// classes for the whole document. that is the "sample"
else if ( ! wordsPtr ) {
bigSampleBuf = thisMsg20->getBigSampleBuf();
bigSampleLen = thisMsg20->getBigSampleLen();
}
// skip if empty
if ( !wordsPtr && (bigSampleLen<=0 ||!bigSampleBuf)) continue;
// otherwise count it
tcount++;
// the docid
int64_t docId = 0;
if ( ! wordsPtr ) docId = thisMsg20->getDocId();
// are we unicode?
bool isUnicode;
if ( ! wordsPtr ) isUnicode = thisMsg20->isUnicode();
else isUnicode = wordsPtr->isUnicode();
unsigned char lang = language;
if ( ! wordsPtr ) lang = thisMsg20->getLanguage();
// continue; // mdw
QUICKPOLL(niceness);
// . hash it into the master table
// . this may alloc st->m_mem, so be sure to free below
hashSample ( q, bigSampleBuf, bigSampleLen, master, &nqi , t ,
st, docId ,
vecs , &numVecs ,
wordsPtr , scoresPtr , isUnicode ,
repeatTable , repeatTableNumSlots , lang );
// ignore errors
g_errno = 0;
// hash the inlink texts and neighborhoods
for(Inlink *k=NULL;linkInfo&&(k=linkInfo->getNextInlink(k));){
char *s = k->ptr_linkText;
int32_t len = k->size_linkText - 1;
hashSample ( q, s, len, master, &nqi , t ,
st, docId , // 0
vecs , &numVecs ,
NULL , NULL , k->m_isUnicode,
repeatTable , repeatTableNumSlots ,
lang );
// and surrounding
s = k->ptr_surroundingText;
len = k->size_surroundingText - 1;
hashSample ( q, s, len, master, &nqi , t ,
st, docId , // 0
vecs , &numVecs ,
NULL , NULL , k->m_isUnicode,
repeatTable , repeatTableNumSlots ,
lang );
}
for(Inlink*k=NULL;linkInfo2&&(k=linkInfo2->getNextInlink(k));){
char *s = k->ptr_linkText;
int32_t len = k->size_linkText - 1;
hashSample ( q, s, len, master, &nqi, t ,
st, docId , // docId
vecs , &numVecs ,
			     NULL , NULL, k->m_isUnicode, // as in loop above
repeatTable, repeatTableNumSlots,
lang );
}
// ignore errors
g_errno = 0;
}
//hash meta keywords and meta description when generating the gigabit
//vector, mainly useful for docs which have all of their content in frames
if(st->m_dbufLen > 0 && wordsPtr) {
hashSample ( q, st->m_dbuf, st->m_dbufLen, master, &nqi , t ,
st, 0/*docId*/ ,
vecs , &numVecs ,
NULL , NULL , wordsPtr->isUnicode() ,
repeatTable , repeatTableNumSlots , language );
}
//log("did samples in %"INT64" ",gettimeofdayInMilliseconds()-start);
int32_t nt = master->getNumTerms();
// debug msg
/*
for ( int32_t i = 0 ; i < nt ; i++ ) {
int32_t score = master->getScoreFromTermNum(i) ;
if ( ! score ) continue;
char *ptr = master->getTermPtr(i) ;
int32_t len = master->getTermLen(i);
char ff[1024];
if ( len > 1020 ) len = 1020;
memcpy ( ff , ptr , len );
ff[len] = '\0';
// we can have html entities in here now
//if ( ! is_alnum(ff[0]) ) { char *xx = NULL; *xx = 0; }
log("%08"INT32" %s",score,ff);
}
*/
// how many do we need?
int32_t need = t->m_maxTopics ;
// get this many winners
int32_t maxWinners = need;
// double it in case some get deduped
if ( t->m_dedup ) maxWinners *= 2; // mdw
// count how many get removed, might have to recompute
int32_t removed ;
int32_t got = 0;
// now get the top MAX_TOPICS or maxWinners pronouns or pronoun phrases
//int32_t scores [ MAX_TOPICS ];
//char *ptrs [ MAX_TOPICS ];
//unsigned char lens [ MAX_TOPICS ];
int32_t *scores = NULL;
char **ptrs = NULL;
int32_t *lens = NULL;
char *isunis = NULL;
int32_t *slots = NULL;
int32_t *pages = NULL;
// these vars are used below
//char *ptrs2 [ MAX_TOPICS ];
//int32_t lens2 [ MAX_TOPICS ];
char **ptrs2 = NULL;
int32_t *lens2 = NULL;
char *tmpBuf = NULL;
int32_t tmpSize = 0;
//bool triedLinkInfo = false;
redo:
// ensure maxWinners not too big
//if ( maxWinners > MAX_TOPICS ) maxWinners = MAX_TOPICS;
// allocate enough space
int32_t newSize = maxWinners*(sizeof(char *)+4+4+4+4+sizeof(char *)+4+1);
char *newBuf = (char *)mrealloc(tmpBuf,tmpSize , newSize , "Msg24e" );
if ( ! newBuf ) {
if ( tmpBuf ) mfree ( tmpBuf , tmpSize , "Msg24" );
// free the links in the linked list, if any
if ( st->m_mem ) {
mfree ( st->m_mem, st->m_memEnd - st->m_mem, "Msg24" );
st->m_mem = NULL;
st->m_memEnd = NULL;
st->m_memPtr = NULL;
}
if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
return log("topics: realloc to %"INT32" failed.",newSize);
}
tmpBuf = newBuf;
tmpSize = newSize;
char *pp = tmpBuf;
ptrs = (char **)pp ; pp += sizeof(char *) * maxWinners;
scores = (int32_t *)pp ; pp += 4 * maxWinners;
lens = (int32_t *)pp ; pp += 4 * maxWinners;
isunis = pp ; pp += maxWinners;
slots = (int32_t *)pp ; pp += 4 * maxWinners;
pages = (int32_t *)pp ; pp += 4 * maxWinners;
ptrs2 = (char **)pp ; pp += sizeof(char *) * maxWinners;
lens2 = (int32_t *)pp ; pp += 4 * maxWinners;
int32_t *pops = master->m_pops;
QUICKPOLL(niceness);
int32_t np = 0;
int32_t minScore = 0x7fffffff;
int32_t minj = -1;
int32_t i ;
int32_t *heads = master->getHeads();
bool callRedo = true;
// total # of pages sampled
int32_t sampled = numMsg20Used;
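	// . selection is two passes: the loop below fills the first
	//   maxWinners qualifying terms while tracking the minimum score
	//   (minScore/minj); the second loop then scans the remaining terms
	//   and replaces the current minimum whenever it finds a better one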
for ( i = 0 ; i < nt && np < maxWinners ; i++ ) {
// skip term #i from "table" if it has 0 score
int32_t score = master->m_scores[i]; // getScoreFromTermNum(i) ;
if ( ! score ) continue;
// . make it higher the more popular a term is
// . these are based on a MAXPOP of 10000
int32_t mdc = (int32_t)((((double)sampled * 3.0 *
(double)(pops[i]&0x7fffffff))+0.5)/MAXPOP);
if ( mdc < t->m_minDocCount ) mdc = t->m_minDocCount;
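		// e.g. with sampled = 30 pages and a term popularity of
		// 3000 (out of MAXPOP = 10000):
		// mdc = (30 * 3.0 * 3000 + 0.5) / 10000 = 27, so a very
		// popular term must be on ~27 of the 30 pages to qualify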
// skip if does not meet the min doc count
int32_t count = 0;
//if ( mdc > 1 || st->m_returnDocIds ) {
if ( t->m_minDocCount > 1 || st->m_returnDocIds ) {
DocIdLink *link = (DocIdLink *)(st->m_mem+heads[i]);
while ( (char *)link >= st->m_mem ) {
count++;
link = (DocIdLink*)(st->m_mem + link->m_next);
}
if ( count < mdc ) continue;
}
// set the min of all in our list
if ( score < minScore ) { minScore = score; minj = np; }
// i've seen this become NULL at line 753 on gb1 below for
// /search?code=mammaXbG&uip=12.41.126.39&n=15&raw=8&q=
// manhattan,+ny
// so let's try it again and try to find out why maybe
if ( master->m_termLens[i] <= 0 ) {
char *orig = "";
if ( q ) orig = q->m_orig;
log (LOG_LOGIC,"query: Got 0 length gigabit. q=%s",
orig);
continue;
}
// recalc the score
//double frac1 = ((MAXPOP-(pops[i]&0x7fffffff))*100.0)/MAXPOP;
//double frac2 = ((double)count * 100.0) / (double)sampled;
//score = (int32_t)((frac1 * frac2) / 100.0);
// we got a winner
scores [ np ] = score;
ptrs [ np ] = master->m_termPtrs[i]; // getTermPtr(i) ;
lens [ np ] = master->m_termLens[i]; // getTermLen(i);
isunis [ np ] = master->m_isunis[i];
slots [ np ] = i;
pages [ np ] = count;
np++;
}
QUICKPOLL(niceness);
// if not enough no matter what, do not redo
if ( np < maxWinners ) callRedo = false;
// now do the rest
for ( ; i < nt ; i++ ) {
// skip term #i from "table" if it has 0 score
int32_t score = master->m_scores[i]; // getScoreFromTermNum(i) ;
// bail if empty
if ( score <= 0 ) continue;
// ignore if not a winner
if ( score <= minScore ) continue;
// . make it higher the more popular a term is
// . these are based on a MAXPOP of 10000
int32_t mdc = (int32_t)((((double)sampled * 3.0 *
(double)(pops[i]&0x7fffffff))+0.5)/MAXPOP);
if ( mdc < t->m_minDocCount ) mdc = t->m_minDocCount;
// skip if does not meet the min doc count
int32_t count = 0;
if ( t->m_minDocCount > 1 || st->m_returnDocIds ) {
DocIdLink *link = (DocIdLink *)(st->m_mem+heads[i]);
// m_next is -1 to indicate end
while ( (char *)link >= st->m_mem ) {
count++;
link = (DocIdLink *)(st->m_mem + link->m_next);
}
if ( count < mdc ) continue;
}
// find the score it will replace, the min one
//int32_t j ;
//for ( j = 0 ; j < np ; j++ )
// if ( scores [ j ] == minScore ) break;
// bad engineer?
//if ( j == np ) { char *xx = NULL; *xx = 0; }
// recalc the score
//double frac1 = ((MAXPOP-(pops[i]&0x7fffffff))*100.0)/MAXPOP;
//double frac2 = ((double)count * 100.0) / (double)sampled;
//int32_t newScore = (int32_t)((frac1 * frac2) / 100.0);
//int32_t oldminj = minj;
// replace jth guy
scores [ minj ] = score;
ptrs [ minj ] = master->m_termPtrs[i]; // getTermPtr(i) ;
lens [ minj ] = master->m_termLens[i]; // getTermLen(i);
isunis [ minj ] = master->m_isunis[i];
pages [ minj ] = count;
slots [ minj ] = i;
//log("ptrs[%"INT32"]=%"XINT32"",j,ptrs[j]);
// hopefully we increased the min score in our top set now
minScore = 0x7fffffff;
for ( int32_t j = 0 ; j < np ; j++ ) {
if ( scores[j] < minScore ) {
minScore = scores[j];
minj = j;
}
}
//scores [oldminj] = newScore;
}
// bubble sort the top winners
 again:
	bool flag = false;
for ( int32_t i = 1 ; i < np ; i++ ) {
if ( scores[i-1] >= scores[i] ) continue;
int32_t ts = scores[i];
char *tp = ptrs [i];
int32_t tl = lens [i];
char tu = isunis[i];
int32_t tc = pages [i];
int32_t tt = slots [i];
scores [i ] = scores[i-1];
ptrs [i ] = ptrs [i-1];
lens [i ] = lens [i-1];
isunis [i ] = isunis[i-1];
pages [i ] = pages [i-1];
slots [i ] = slots [i-1];
scores [i-1] = ts;
ptrs [i-1] = tp;
lens [i-1] = tl;
isunis [i-1] = tu;
pages [i-1] = tc;
slots [i-1] = tt;
		flag = true;
}
	if ( flag ) goto again;
QUICKPOLL(niceness);
	// . normalize all scores
	// . assume 20000 points per query term per page
	// . a topic term will get 20000 points for each query term it is
	//   close to
int32_t max = nqi * tcount * MAX_SCORE_MULTIPLIER ; //10000;
if ( nqi == 0 ) max = tcount * ALT_MAX_SCORE;
if ( max == 0 ) max = 1;
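	// e.g. with nqi = 2 query terms used and tcount = 30 docs processed,
	// max = 2 * 30 * 3000 = 180000, so a raw score of 90000 normalizes
	// to (90000 * 100) / 180000 = 50; everything clamps to [1,100]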
for ( i = 0 ; i < np ; i++ ) {
// skip if length is 0, it was a dup from above
//if ( lens[i] <= 0 ) continue;
scores[i] = (scores[i] * 100) / max;
if ( scores[i] <= 0 ) scores[i] = 1;
if ( scores[i] >= 100 ) scores[i] = 100; // add a log statement here? (aac)
}
// . now set ptrs2/lens2 to point to comparison string in each topic
// . skip it over stop words, don't compare those
// . this way we can do a more flexible strcasestr and ignore common
// words when comparing, they don't add much beyond repetition
// . "super bowl" + "the super bowl" --> "super bowl"
//char *ptrs2 [ MAX_TOPICS ];
//int32_t lens2 [ MAX_TOPICS ];
for ( i = 0 ; i < np ; i++ ) {
/*
Words w;
w.set ( false , ptrs[i] , lens[i] , false );
int32_t nw = w.getNumWords();
// skip if none
if ( nw <= 0 ) continue;
*/
// establish our new ptrs
ptrs2 [ i ] = ptrs[i];
lens2 [ i ] = lens[i];
// skip initial common words
//----> not if capitalized!! leave those in tact. like
// Michael Jackson's "Beat It"
/*
int32_t h;
int32_t j = 0;
if ( w.isPunct(j) ) j++;
for ( ; j < nw ; j += 2 ) {
char *ww = w.getWord (j);
int32_t wwlen = w.getWordLen(j);
// if capitlized, leave it
if ( is_upper(ww[0]) ) break;
// single letter lower case is common word
if ( wwlen <= 1 && is_alpha(ww[0]) ) goto gotone;
// leave it if not common
h= hash64d(w.getWord(j),w.getWordLen(j));
if ( ! isCommonWord ( h ) ) break;
// otherwise, scrub it off
gotone:
ptrs2 [i] = w.getWord(j+2);
}
// skip trailing common words
int32_t k = nw - 1 ;
if ( w.isPunct(k) ) k--;
for ( ; k >= j ; k -= 2 ) {
char *ww = w.getWord (k);
int32_t wwlen = w.getWordLen(k);
// if capitlized, leave it
if ( is_upper(ww[k]) ) break;
// single letter lower case is common word
if ( wwlen <= 1 && is_alpha(ww[0]) ) goto gotone;
// left off here!!
if ( w.getWordLen(j) <= 1&&is_alpha(w.getWord(j)[0]) )
continue;
h=hash64d(w.getWord(j),w.getWordLen(j));
if ( ! isCommonWord ( h ) ) break;
}
// set new length
char *end2 = w.getWord(k) + w.getWordLen(k);
lens2[i] = end2 - ptrs2[i];
*/
}
if ( ! t->m_dedup ) goto skipdedup;
//goto skipdedup; // mdw
removed = 0;
// now remove similar terms from the top topics
for ( int32_t i = 0 ; i < np - 1 ; i++ ) {
// skip if nuked already
if ( lens[i] == 0 ) continue;
// scan down to this score, but not below
//int32_t minScore = (scores[i] * 75) / 100 ;
int32_t minScore = scores[i] - 25;
// if we get replaced by a longer guy, remember him
int32_t replacerj = -1;
	// . a longer term that encapsulates us can eliminate us
	// . or, if we're the longer, we eliminate the shorter
for ( int32_t j = i + 1 ; j < np ; j++ ) {
// skip if nuked already
if ( lens[j] == 0 ) continue;
// null term both
char c1 = ptrs2[i][lens2[i]];
char c2 = ptrs2[j][lens2[j]];
ptrs2[i][lens2[i]] = '\0';
ptrs2[j][lens2[j]] = '\0';
			// if we are the shorter, and the longer contains us
			// then it nukes us... unless his score is too low
			if ( lens2[i] < lens2[j] ) {
				// if the shorter is contained
char *s;
if (isunis[j] == 0 && isunis[i] == 0)
s = gb_strcasestr (ptrs2[j],ptrs2[i]) ;
else if (isunis[j] == 0 && isunis[i] == 1)
s = ucStrNCaseStr(
ptrs2[j],
(UChar*)ptrs2[i], lens2[i]>>1);
else if (isunis[j] == 1 && isunis[i] == 0)
s = (char*)ucStrNCaseStr(
(UChar*)ptrs2[j], lens2[j]>>1,
ptrs2[i]);
else
s = (char*)ucStrNCaseStr(
(UChar*)ptrs2[j], lens2[j]>>1,
(UChar*)ptrs2[i], lens2[i]>>1);
// un-null term both
ptrs2[i][lens2[i]] = c1;
ptrs2[j][lens2[j]] = c2;
// even if he's longer, if his score is too
// low then he cannot nuke us
if ( scores[j] < minScore ) continue;
// if we were NOT contained by someone below...
if ( ! s ) continue;
// he's gotta be on all of our pages, too
//if ( ! onSamePages(i,j,slots,heads,pages) )
// continue;
				// shorter gets our score (we need to sort)
// not yet! let him finish, then replace him!!
replacerj = j;
// see if we can nuke other guys at least
continue;
}
			// . otherwise, we are the longer
			// . we can nuke any shorter below us, all scores
char *s;
if (isunis[i] == 0 && isunis[j] == 0)
s = gb_strcasestr (ptrs2[i],ptrs2[j]) ;
else if (isunis[i] == 0 && isunis[j] == 1)
s = ucStrNCaseStr(
ptrs2[i],
(UChar*)ptrs2[j], lens2[j]>>1);
else if (isunis[i] == 1 && isunis[j] == 0)
s = (char*)ucStrNCaseStr(
(UChar*)ptrs2[i], lens2[i]>>1,
ptrs2[j]);
else
s = (char*)ucStrNCaseStr(
(UChar*)ptrs2[i], lens2[i]>>1,
(UChar*)ptrs2[j], lens2[j]>>1);
// un-null term both
ptrs2[i][lens2[i]] = c1;
ptrs2[j][lens2[j]] = c2;
QUICKPOLL(niceness);
// keep going if no match
if ( ! s ) continue;
// remove him if we contain him
lens[j] = 0;
// count him
removed++;
// the redo flag
//rflag = 1;
}
// if we got replaced by a longer guy, he replaces us
// and takes our score
if ( replacerj >= 0 ) {
ptrs [i] = ptrs [replacerj];
lens [i] = lens [replacerj];
pages [i] = pages [replacerj];
slots [i] = slots [replacerj];
ptrs2 [i] = ptrs2 [replacerj];
lens2 [i] = lens2 [replacerj];
//scores[i] = scores[replacerj];
lens [replacerj] = 0;
i--;
// count him
removed++;
// the redo flag
//rflag = 1;
}
}
// . PROBLEM #2: often a phrase and the next phrase, +1, are in
// there... how to fix? the higher scoring one should swallow
// up the lower scoring one, even if only 3 of the 4 words match
// (do not count common words)
// . #3 or when all non-query, non-common terms match... pick the
// longer and remove the common words, but keep query words.
// again2:
//char rflag = 0;
	// if two terms are close in score, and one is a longer version
	// of the other, choose it and remove the shorter
for ( int32_t i = 0 ; i < np - 1 ; i++ ) {
// skip if nuked already
if ( lens[i] == 0 ) continue;
// scan down to this score, but not below
//int32_t minScore = (scores[i] * 75) / 100 ;
int32_t minScore = scores[i] - 15;
// if we get replaced by a longer guy, remember him
int32_t replacerj = -1;
	// . a longer term that encapsulates us can eliminate us
	// . or, if we're the longer, we eliminate the shorter
for ( int32_t j = i + 1 ; j < np ; j++ ) {
// skip if nuked already
if ( lens[j] == 0 ) continue;
// null term both
char c1 = ptrs[i][lens[i]];
char c2 = ptrs[j][lens[j]];
ptrs[i][lens[i]] = '\0';
ptrs[j][lens[j]] = '\0';
			// if we are the shorter, and the longer contains us
			// then it nukes us... unless his score is too low
			if ( lens[i] < lens[j] ) {
				// if the shorter is contained
char *s;
if (isunis[j] == 0 && isunis[i] == 0)
s = gb_strcasestr (ptrs2[j],ptrs2[i]) ;
else if (isunis[j] == 0 && isunis[i] == 1)
s = ucStrNCaseStr(
ptrs2[j],
(UChar*)ptrs2[i], lens2[i]>>1);
else if (isunis[j] == 1 && isunis[i] == 0)
s = (char*)ucStrNCaseStr(
(UChar*)ptrs2[j], lens2[j]>>1,
ptrs2[i]);
else
s = (char*)ucStrNCaseStr(
(UChar*)ptrs2[j], lens2[j]>>1,
(UChar*)ptrs2[i], lens2[i]>>1);
// un-null term both
ptrs[i][lens[i]] = c1;
ptrs[j][lens[j]] = c2;
// even if he's longer, if his score is too
// low then he cannot nuke us
if ( scores[j] < minScore ) continue;
// if we were NOT contained by someone below...
if ( ! s ) continue;
// if we are not on the same pages as the
			// shorter one, then we cannot absorb him
//if ( ! onSamePages(i,j,slots,heads,pages))
// continue;
			// shorter gets our score (we need to sort)
// not yet! let him finish, then replace him!!
replacerj = j;
// see if we can nuke other guys at least
continue;
}
			// . otherwise, we are the longer
			// . we can nuke any shorter below us, all scores
char *s;
if (isunis[i] == 0 && isunis[j] == 0)
s = gb_strcasestr (ptrs2[i],ptrs2[j]) ;
else if (isunis[i] == 0 && isunis[j] == 1)
s = ucStrNCaseStr(
ptrs2[i],
(UChar*)ptrs2[j], lens2[j]>>1);
else if (isunis[i] == 1 && isunis[j] == 0)
s = (char*)ucStrNCaseStr(
(UChar*)ptrs2[i], lens2[i]>>1,
ptrs2[j]);
else
s = (char*)ucStrNCaseStr(
(UChar*)ptrs2[i], lens2[i]>>1,
(UChar*)ptrs2[j], lens2[j]>>1);
// un-null term both
ptrs[i][lens[i]] = c1;
ptrs[j][lens[j]] = c2;
QUICKPOLL(niceness);
// keep going if no match
if ( ! s ) continue;
// if we are not on the same pages as the
			// shorter one, then we cannot absorb him
//if ( ! onSamePages(i,j,slots,heads,pages))
// continue;
// remove him if we contain him
lens[j] = 0;
// count him
removed++;
// the redo flag
//rflag = 1;
}
// if we got replaced by a longer guy, he replaces us
// and takes our score
if ( replacerj >= 0 ) {
ptrs [i] = ptrs [replacerj];
lens [i] = lens [replacerj];
pages [i] = pages [replacerj];
slots [i] = slots [replacerj];
//scores[i] = scores[replacerj];
lens [replacerj] = 0;
i--;
// count him
removed++;
// the redo flag
//rflag = 1;
}
}
// if someone got replaced, loop more
//if ( rflag ) goto again2;
// remove common phrases
for ( int32_t i = 0 ; i < np ; i++ ) {
// skip if nuked already
if ( lens[i] == 0 ) continue;
// compare
bool remove = false;
if ( isunis[i] == 0 ) { //com org dom xhtml html dtd
if (!strncasecmp(ptrs[i], "all rights reserved",lens[i]) ||
!strncasecmp(ptrs[i], "rights reserved" ,lens[i]) ||
!strncasecmp(ptrs[i], "in addition" ,lens[i]) ||
!strncasecmp(ptrs[i], "for example" ,lens[i]) ||
!strncasecmp(ptrs[i], "in order" ,lens[i]) ||
!strncasecmp(ptrs[i], "in fact" ,lens[i]) ||
!strncasecmp(ptrs[i], "in general" ,lens[i]) ||
!strncasecmp(ptrs[i], "contact us" ,lens[i]) ||
!strncasecmp(ptrs[i], "at the same time" ,lens[i]) ||
!strncasecmp(ptrs[i], "http" ,lens[i]) ||
!strncasecmp(ptrs[i], "html" ,lens[i]) ||
!strncasecmp(ptrs[i], "s " ,lens[i]) ||
!strncasecmp(ptrs[i], "for more information",lens[i]))
remove = true;
}
else {
if ( !ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"all rights reserved", 19) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"rights reserved", 15) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"in addition", 11) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"for example", 11) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"in order", 8) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"in fact", 7) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"in general", 10) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"contact us", 10) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"at the same time", 16) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"http", 4) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"s ", 2) ||
!ucStrCaseCmp( (UChar*)ptrs[i], lens[i] >> 1,
"for more information", 20) )
remove = true;
}
if ( remove ) {
lens[i] = 0;
// count him
removed++;
}
}
QUICKPOLL(niceness);
	// now after longer topics replaced the shorter topics which they
	// contained, remove the longer topics if they have too many words
	// or contain disallowed punctuation
for ( int32_t i = 0 ; i < np ; i++ ) {
// skip if nuked already
if ( lens[i] == 0 ) continue;
if ( ! ptrs[i] ) continue;
Words w;
w.set ( false , false, ptrs[i] , lens[i] , TITLEREC_CURRENT_VERSION,
false, false, niceness );
int32_t nw = w.getNumWords();
// . does it have comma? or other punct besides an apostrophe?
		// . we allow gigabit phrases to incorporate a long stretch
// of punct... only before the LAST word in the phrase,
// that way our overlap removal still works well.
bool hasPunct = false;
for ( int32_t k = 0 ; k < lens[i] ; k++ ) {
if ( ! is_punct(ptrs[i][k]) ) continue;
			// apostrophe is ok as long as alnum follows
if ( ptrs[i][k] == '\'' &&
is_alnum(ptrs[i][k+1]) ) continue;
			// . period ok, as long as space or alnum follows
// . if space follows, then an alnum must follow that
// . same goes for colon
QUICKPOLL(niceness);
// . for now, until we get abbreviations working,
// alnum must follow period
if ( (ptrs[i][k] == '.' || ptrs[i][k] == ':' ) &&
( is_alnum(ptrs[i][k+1]) ||
			      // accept single initial before the period, too
(ptrs[i][k+1] ==' ' && is_alnum(ptrs[i][k+2])
&& k>=2 && ptrs[i][k-2]==' ')))
continue;
// comma is ok if surrounded by digits
if ( (ptrs[i][k] == ',' &&
is_digit(ptrs[i][k-1]) &&
is_digit(ptrs[i][k+1]) )) continue;
// percent is ok
if ( ptrs[i][k] == '%' ) continue;
if ( ptrs[i][k] == '&' ) continue;
if ( ptrs[i][k] == '@' ) continue;
if ( ptrs[i][k] == '-' ) continue;
//if ( ptrs[i][k] == '(' ) continue;
//if ( ptrs[i][k] == ')' ) continue;
hasPunct = true;
break;
}
// keep it if words are under limit
// and has no commas
if ( nw <= 2*t->m_maxWordsPerTopic -1 && ! hasPunct )
continue;
lens[i] = 0;
removed++;
}
QUICKPOLL(niceness);
// if we removed enough to fall below maxWinners, redo
got = np - removed;
if ( got >= need ) goto skipdedup;
// if we already did all from "master", no more left!
if ( np >= master->getNumTermsUsed() ) goto skipdedup;
// if we didn't have enough raw results, do not redo it
if ( ! callRedo ) goto skipdedup;
// or if already hit MAX_TOPICS
//if ( maxWinners >= MAX_TOPICS ) goto skipdedup; mdw
if ( got == 0 ) maxWinners = maxWinners*2;
else maxWinners = ((int64_t)maxWinners *
(int64_t)need * 110LL) /
((int64_t)got * 100LL) + 10;
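	// e.g. if maxWinners = 100 yielded only got = 20 survivors but we
	// need = 50, we retry with maxWinners = (100*50*110)/(20*100) + 10
	// = 285: scaled up proportionally, plus a 10% margin and some slack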
goto redo; // mdw
skipdedup:
// free the repeat table if it allocated mem
if ( repeatTable != tmpBuf2 ) {
mfree ( repeatTable , need2 , "Msg24" );
repeatTable = NULL;
}
// how much space do we need for reply?
int32_t size = 0;
// 4 bytes for number of topics
size += 4;
// then how much for each topic?
int32_t ntp = 0;
for ( i = 0 ; i < np ; i++ ) {
// cutoff at min score
if ( scores[i] < t->m_minTopicScore ) continue;
// skip if length is 0, it was a dup from above
if ( lens[i] <= 0 ) continue;
// we always get the count now
if ( st->m_returnDocIds ) {
int32_t count = 0;
DocIdLink *link = (DocIdLink *)(st->m_mem+heads[slots[i]]);
while ( (char *)link >= st->m_mem ) {
count++;
link = (DocIdLink *)(st->m_mem + link->m_next);
}
// space for the docids if they want them
size += 8 * count;
// sanity check
if ( count != pages[i] ) { char *xx = NULL; *xx = 0; }
}
// length (include \0 for null termination)
size += 4 + 4 + 4 + 1 + lens[i] + 1;
// . do we send back docid info?
// . each termId can have a linked list of docids
// . how many are in that list? (0 if none)
size += 4;
// 4 bytes for the dummy place holder. each one of these
// can be a ptr to the list of docids, but it will be NULL
// if we do not have a list of docids for this gigabit.
size += 4;
// the popularity... topic pop
size += 4;
		// count the number of topics we'll store
ntp++;
}
// realloc reply
newSize = *bufSize + size;
char *s = (char *) mrealloc ( *buf , *bufSize , newSize , "Msg24f" );
if ( ! s ) {
if ( tmpBuf ) mfree ( tmpBuf , tmpSize , "Msg24" );
if ( *buf ) mfree ( *buf , *bufSize , "Msg24" );
*buf = NULL;
*bufSize = 0;
// free the links in the linked list, if any
if ( st->m_mem ) {
mfree ( st->m_mem, st->m_memEnd - st->m_mem, "Msg24" );
st->m_mem = NULL;
st->m_memEnd = NULL;
st->m_memPtr = NULL;
}
if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
return log("topics: Realloc reply buf to %"INT32" failed.",newSize);
}
// we realloc'd successfully, use it
*buf = s;
// copy into reply after previous topic groups
char *p = *buf + *bufSize;
// serialize ourselves into the buffer
//serialize2 ( p , ptrs , scores , lens , gids );
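	// reply layout for this topic group, appended after any previous
	// groups already in *buf:
	//   ntp            (4 bytes)  number of topics
	//   pptrs  [ntp]   (4 each)   offsets into the text area
	//   pscores[ntp]   (4 each)
	//   plens  [ntp]   (4 each)
	//   ndocids[ntp]   (4 each)   docid count per topic
	//   dptrs  [ntp]   (4 each)   placeholders, NULL on the wire
	//   ppops  [ntp]   (4 each)
	//   pgids  [ntp]   (1 each)
	//   text area: each topic NULL terminated, then, if m_returnDocIds,
	//   each topic's docids (8 bytes each) in the same topic order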
// store number of topics first
*(int32_t *)p = ntp; p += 4;
// arrays first
char **pptrs = (char **)p; p += ntp * 4;
int32_t *pscores = (int32_t *)p; p += ntp * 4;
int32_t *plens = (int32_t *)p; p += ntp * 4;
int32_t *ndocids = (int32_t *)p; p += ntp * 4;
int64_t **dptrs = (int64_t **)p; p += ntp * 4; // place holder
int32_t *ppops = (int32_t *)p; p += ntp * 4;
char *pgids = (char *)p; p += ntp ;
char *ptext = p;
int32_t j = 0;
for ( i = 0 ; i < np ; i++ ) {
// cutoff at min score
if ( scores[i] < t->m_minTopicScore ) continue;
// skip if length is 0, it was a dup from above
if ( lens[i] <= 0 ) continue;
// store it
pptrs [j] = (char *)(ptext - p);
pscores [j] = scores [i];
plens [j] = lens [i];
pgids [j] = gid;
if ( pops ) ppops [j] = pops[slots[i]];
else ppops [j] = 0;
ndocids [j] = 0;
dptrs [j] = NULL; // dummy placeholder
memcpy ( ptext , ptrs[i] , lens[i] ); ptext += lens[i];
//if ( hashes && j < GIGABITS_IN_VECTOR )
// hashes[j] = hash32Lower (ptrs[i],lens[i]);
*ptext++ = '\0';
j++;
}
QUICKPOLL(niceness);
// fill in docid info
if ( st->m_returnDocIdCount || st->m_returnDocIds ) {
// reset j for this repeat loop
j = 0;
// this loop header is the same as above
for ( i = 0 ; i < np ; i++ ) {
// cutoff at min score
if ( scores[i] < t->m_minTopicScore ) continue;
// skip if length is 0, it was a dup from above
if ( lens[i] <= 0 ) continue;
// count em
int32_t count = 0;
DocIdLink *link = (DocIdLink *)(st->m_mem+heads[slots[i]]);
while ( (char *)link >= st->m_mem ) {
count++;
if ( st->m_returnDocIds ) {
*(int64_t *)ptext = link->m_docId;
ptext += 8;
}
link = (DocIdLink *)(st->m_mem + link->m_next);
}
ndocids[j] = count;
j++;
}
}
//skipd:
// update buf parms for re-calls
*bufSize = newSize;
// free tmp buf
mfree ( tmpBuf , tmpSize , "Msg24" );
// free the links in the linked list, if any
if ( st->m_mem ) {
mfree ( st->m_mem , st->m_memEnd - st->m_mem , "Msg24" );
st->m_mem = NULL;
st->m_memEnd = NULL;
st->m_memPtr = NULL;
}
if ( vecs != vbuf ) mfree ( vecs , vneed , "Msg24" );
// copy into reply topic buf
//char *start = slot->m_tmpBuf;
//char *p = slot->m_tmpBuf;
//char *pend = p + TMPBUFSIZE;
/*
for ( i = 0 ; i < np ; i++ ) {
// cutoff at min score
if ( scores[i] < t->m_minTopicScore ) continue;
// skip if length is 0, it was a dup from above
if ( lens[i] <= 0 ) continue;
if ( p + lens[i] + 9 >= pend ) break;
*(int32_t *)p = scores[i]; p += 4;
*(int32_t *)p = lens [i]; p += 4;
*(char *)p = gid ; p += 1;
memcpy ( p , ptrs[i] , lens[i] ); p += lens[i];
*p++ = '\0';
}
*/
return true;
}
/*
bool onSamePages ( int32_t i, int32_t j, int32_t *slots, int32_t *heads, int32_t *pages ) {
if ( pages[i] != pages[j] ) return false;
DocIdLink *link1 = (DocIdLink *)(st->m_mem+heads[slots[i]]);
DocIdLink *link2 = (DocIdLink *)(st->m_mem+heads[slots[j]]);
while ( (char *)link1 >= st->m_mem ) {
if ( link1->m_docId != link2->m_docId ) return false;
link1 = (DocIdLink *)(st->m_mem + link1->m_next);
link2 = (DocIdLink *)(st->m_mem + link2->m_next);
}
return true;
}
*/
void hashExcerpt ( Query *q , uint64_t *qids , int32_t *qpops ,
int32_t nqi , TermTable *tt , char *buf , int32_t bufLen ,
Words *w , TopicGroup *t , Scores *scoresPtr ,
bool isUnicode , char *repeatTable ,
int32_t repeatTableNumSlots , char language );
// . returns false and sets g_errno on error
// . here's the tricky part
// . *nqiPtr is how many query terms we used - so caller can normalize scores
bool hashSample ( Query *q, char *bigSampleBuf , int32_t bigSampleLen ,
TermTable *master, int32_t *nqiPtr , TopicGroup *t ,
State24 *st, int64_t docId ,
char *vecs , int32_t *numVecs ,
Words *wordsPtr , Scores *scoresPtr , bool isUnicode ,
char *repeatTable , int32_t repeatTableNumSlots ,
char language ) {
// numTerms must be less than this
//if ( q && q->getNumTerms() > MAX_QUERY_TERMS ) (aac)
if ( q && q->m_numWords > MAX_QUERY_TERMS )
return log("topics: Too many query terms for "
"topic generation.");
//bool returnDocIdCount = st->m_returnDocIdCount;
//bool returnDocIds = st->m_returnDocIds;
bool returnPops = st->m_returnPops;
// this is the pure content now
char *content = bigSampleBuf;
int32_t contentLen = bigSampleLen;
// truncate it to 40k, that's enough
//if ( contentLen > 50*1024 ) contentLen = 50*1024;
// bail if empty!
if ( ! wordsPtr && (! content || contentLen <= 0) ) {
log("topics: Got empty document for topic generation.");
return true;
}
// make buf point to the available space
char *buf = content;
// get length of the buffer
int32_t bufLen = contentLen;
#ifdef DEBUG_MSG24
if (q) {
log("topics: Query stats in hashSample");
int32_t numQT = q->getNumTerms();
int32_t numQW = q->m_numWords;
log("topics: \tnumQueryTerms = %"INT32"", numQT);
log("topics: \tnumQueryWords = %"INT32"", numQW);
char *thisQT, *thisQW, iCode, tmpBuf[1024];
int32_t qtLen, qwLen, i, j, k;
for (i = 0; i < numQT; i++) {
thisQT = q->getTerm(i);
qtLen = q->getTermLen(i);
k = 0;
for (j = 0; j < qtLen && k < 1023; j++) {
if (thisQT[j]) tmpBuf[k++] = thisQT[j];
};
tmpBuf[k] = '\0';
log ("topics: \tQT[%"INT32"] = %s", i, &tmpBuf[0]);
};
for (i = 0; i < numQW; i++) {
thisQW = q->m_qwords[i].m_word;
qwLen = q->m_qwords[i].m_wordLen;
iCode = q->m_qwords[i].m_ignoreWord;
k = 0;
for (j = 0; j < qwLen && k < 1023; j++) {
if (thisQW[j]) tmpBuf[k++] = thisQW[j];
};
tmpBuf[k] = '\0';
log ("topics: \tQW[%"INT32"] = %s,\tignore = %i", i, &tmpBuf[0], iCode);
};
};
#endif
// get query hashes/ids, 32 bit, skip phrases
uint64_t qids [MAX_QUERY_TERMS];
int32_t qpops[MAX_QUERY_TERMS];
int32_t nqi = 0;
//for ( int32_t i=0 ; q && i<q->getNumTerms() && nqi<MAX_QUERY_TERMS; i++){ (aac)
for ( int32_t i=0 ; q && i < q->m_numWords && nqi<MAX_QUERY_TERMS; i++){
//if ( q->isPhrase (i) ) continue; (aac)
//if ( q->isQueryStopWord(i) ) continue; (aac)
char ignCode = q->m_qwords[i].m_ignoreWord;
if ( ignCode && ignCode != 8 ) continue;
char *s = q->m_qwords[i].m_word; // q->getTerm(i); (aac)
int32_t slen = q->m_qwords[i].m_wordLen; // q->getTermLen(i); (aac)
int32_t qpop;
int32_t encodeType = csISOLatin1;
if ( q->isUnicode() ) encodeType = csUTF16;
qids[nqi] = hash64d(s, slen, encodeType);
qpop = g_speller.getPhrasePopularity(s, qids[nqi], true,
language);
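		// map the raw phrase popularity into a small multiplier so
		// rarer query terms weigh more: pop < QPOP_ZONE_0 (10) gets
		// QPOP_MULT_0 (10), sliding down to 1 for pop >= QPOP_ZONE_4
		// (300). e.g. a term with pop 25 falls under QPOP_ZONE_1 (30)
		// and gets a multiplier of QPOP_MULT_1 (8).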
if ( qpop < QPOP_ZONE_0 ) qpop = QPOP_MULT_0;
else if ( qpop < QPOP_ZONE_1 ) qpop = QPOP_MULT_1;
else if ( qpop < QPOP_ZONE_2 ) qpop = QPOP_MULT_2;
else if ( qpop < QPOP_ZONE_3 ) qpop = QPOP_MULT_3;
else if ( qpop < QPOP_ZONE_4 ) qpop = QPOP_MULT_4;
else qpop = 1;
// qpop = 1; // this makes no sense here (aac)
qpops[nqi] = qpop;
nqi++;
}
// tell caller how many query terms we used so he can normalize scores
*nqiPtr = nqi;
//int64_t start = gettimeofdayInMilliseconds();
TermTable tt;
if ( ! tt.set(20000,true,true, false , returnPops, false, false,NULL)){
log("topics: Had error allocating a table for topic "
"generation: %s.",mstrerror(g_errno));
//mfree ( buf , bufMaxLen , "Msg24" );
return true;
}
Words w;
	//---> a word next to both query terms should not be beaten by a
	//     word just next to one....
//---> weight by query popularity too!
//log("******** hashing doc *********");
// hash each excerpt
char *p = buf;
	// most samples are under 5k; i've seen a 32k sample take 11ms!
char *pend = buf + bufLen;
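	// excerpts are stored back to back in the sample buffer, each one
	// terminated by a NUL (two NUL bytes when the sample is UTF-16)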
while ( p < pend ) {
// debug
//log("docId=%"INT64" EXCERPT=%s",docId,p);
int32_t plen ;
if ( isUnicode ) plen = ucStrNLen(p,pend-p);
else plen = strlen(p);
// p is only non-NULL if we are doing it the old way
hashExcerpt ( q, qids, qpops, nqi, &tt, p, plen, &w, t , NULL,
isUnicode , repeatTable , repeatTableNumSlots ,
language );
// advance to next excerpt
if ( isUnicode ) p += plen + 2;
else p += plen + 1;
}
// hash the provided wordsPtr as one excerpt if there
if ( wordsPtr )
hashExcerpt ( q, qids, qpops, nqi, &tt, NULL,0, wordsPtr, t ,
scoresPtr , isUnicode ,
repeatTable , repeatTableNumSlots ,
language );
	// . compute the fingerprint/similarity vector from this table
// the same way we do for documents for deduping them at query time
// . or we could just wait for our dedup algo to kick in... (mdw)
// then comment this stuff out ...
if ( t->m_dedupSamplePercent >= 0 ) {
char *v1 = vecs + (*numVecs * SAMPLE_VECTOR_SIZE);
g_clusterdb.getSampleVector ( v1 , &tt );
// compare to others done so far
char *v2 = vecs ;
for ( int32_t i = 0 ; i < *numVecs ; i++,v2+=SAMPLE_VECTOR_SIZE){
char ss = g_clusterdb.getSampleSimilarity(v1,v2,
SAMPLE_VECTOR_SIZE);
// return true if too similar to another sample we did
			if ( ss >= t->m_dedupSamplePercent ) {
log(LOG_DEBUG,"topics: removed dup sample.");
return true;
}
}
// we have another vector to contend with for next time
*numVecs = *numVecs + 1;
}
//log("TOOK %"INT64" ms plen=%"INT32"",gettimeofdayInMilliseconds()-start,
// bufLen);
// . this termtable carries two special buckets per slot in order
// to hold a linked list of docids with each termid in the hash table
// . heads is NULL if returnDocIdCount and returnDocIds are false
int32_t *heads = master->getHeads();
// . now hash the entries of this table, tt, into the master
// . the master contains entries from all the other tables
//log("have %"INT32" terms in termtable. adding to master.",
// tt.getNumTermsUsed());
int32_t nt = tt.getNumTerms();
int32_t pop = 0 ;
for ( int32_t i = 0 ; i < nt ; i++ ) {
// this should be indented
//if ( ! tt.getScoreFromTermNum(i) ) continue;
if ( ! tt.m_scores[i] ) continue;
//int32_t ii = (int32_t)tt.getTermPtr(i);
// then divide by that
int32_t score = tt.getScoreFromTermNum(i) ;
// watch out for 0
if ( score <= 0 ) continue;
		// skip if termid is 0, i've seen this happen before
		if ( tt.getTermId(i) == 0 ) continue;
		// . get the bucket
		// . may or may not be full (score is 0 if empty)
		int32_t n = master->getTermNum ( tt.getTermId(i) );
		// . but now we add one more thing to the termtable,
// a linked list field for keeping track of the docids
// of the documents that contain each termid
// . grab some mem for the link
// . "heads" is NULL if we should not do this...
if ( heads ) {
if ( st->m_memPtr + sizeof(DocIdLink) > st->m_memEnd ) {
int32_t oldSize = st->m_memEnd - st->m_mem;
int32_t newSize = oldSize + 256*1024;
char *s = (char *)mrealloc(st->m_mem,oldSize,
newSize,"Msg24g");
if ( !s )
return log("Msg24: realloc failed.");
int32_t off = st->m_memPtr - st->m_mem;
st->m_mem = s;
st->m_memEnd = s + newSize;
st->m_memPtr = s + off;
}
DocIdLink *link = (DocIdLink *)st->m_memPtr;
st->m_memPtr += sizeof(DocIdLink);
link->m_docId = docId;
// if empty... make new head
if ( master->m_scores[n] == 0 ) {
link->m_next = -1;
master->m_heads[n] = (char *)link - st->m_mem;
}
			// otherwise, prepend the link to the head of
			// this bucket's chain
else {
link->m_next = master->m_heads[n];
master->m_heads[n] = (char *)link - st->m_mem;
}
}
if ( returnPops ) pop = tt.m_pops[i];
// set hi bit of "pop" if in unicode
if ( isUnicode ) pop |= 0x80000000;
else pop &= 0x7fffffff;
// . add term to master table
// . don't keep filling it up if we failed to alloc more space
// because that causes getTermNum() above to crash if the
// table is 100% full.
if ( ! master->addTerm ( tt.getTermId(i) ,
// divide by the AVG score used
//tt.getScoreFromTermNum(i)+30000/pop,
score ,
//tt.getScoreFromTermNum(i)+30000,
0x7fffffff ,
false ,
TITLEREC_CURRENT_VERSION ,
tt.getTermPtr(i) ,
tt.getTermLen(i) ,
n ,// termNum
NULL ,// dummy(char *)link
pop,
isUnicode ) )
break;
// debug msg
if ( g_conf.m_logDebugQuery ) {
char *ww = tt.getTermPtr(i);
int32_t wwlen = tt.getTermLen(i);
char c = ww[wwlen];
ww[wwlen]='\0';
log(LOG_DEBUG,"topics: master termId=%"UINT32" "
"score=%"INT32" cumscore=%"INT32" len=%"INT32" term=%s\n",
(int32_t)tt.getTermId(i),
score,master->getScoreFromTermId(tt.getTermId(i)),
wwlen,ww);
ww[wwlen]=c;
}
}
//log("master has %"INT32" terms",master->getNumTermsUsed());
// clear any error
if ( g_errno ) {
log("topics: Had error getting topic candidates from document: "
"%s.",mstrerror(g_errno));
g_errno = 0;
}
//mfree ( buf , bufMaxLen , "Msg24" );
return true;
}
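// . scores one excerpt in three passes: (1) record the word positions of
//   every query term, (2) give each eligible word a raw QTR score based on
//   its distance to nearby query term occurrences, (3) build candidate
//   phrases ("gigabits") starting at QTR-scoring words, adjust for
//   popularity and sparseness, and add them to the term table "tt"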
void hashExcerpt ( Query *q , uint64_t *qids , int32_t *qpops, int32_t nqi,
TermTable *tt , char *buf , int32_t bufLen ,
Words *w , TopicGroup *t , Scores *scoresPtr ,
bool isUnicode , char *repeatTable ,
int32_t repeatTableNumSlots , char language ) {
// . bring it out
// . allow one more word per gigabit, then remove gigabits that
// are that length. this fixes the problem of having the same
// sentence repeated in different documents, which are fairly
// different as a whole, but have the same repeated sentence or
// paragraph.
// . by only adding one, if the next word is a common word then
// we would fail to make a larger gigabit, that's why i added
// the maxjend code below this.
int32_t maxWordsPerPhrase = t->m_maxWordsPerTopic ;
if ( t->m_topicRemoveOverlaps ) maxWordsPerPhrase += 2;
char enforceQueryRadius = ! t->m_meta[0];
char delimeter = t->m_delimeter; // 0 means none (default)
char idf = t->m_useIdfForTopics;
// or if no query, no query radius
if ( ! q || q->getNumNonFieldedSingletonTerms() == 0 )
enforceQueryRadius = false;
// . now all the data is in buf/bufLen
// . parse it up into Words
// . now XmlDoc::getGigabitVector() calls us and it already has the
	//   Words parsed up, so it will use a NULL buf
if ( buf ) w->set ( isUnicode , // isUnicode?
false , // isNormalized?
buf ,
bufLen ,
TITLEREC_CURRENT_VERSION,
true , // compute word ids?
true ); // has html entities?
int32_t nw = w->getNumWords();
	// don't breach our arrays
if ( nw > 10000 ) nw = 10000;
void *lrgBuf;
int32_t lrgBufSize = 0;
lrgBufSize += 1002 * MAX_QUERY_TERMS * sizeof(int32_t);
lrgBufSize += 2 * nw * sizeof(int32_t);
lrgBufSize += 3 * nw * sizeof(char);
lrgBufSize += nw * sizeof(uint64_t);
lrgBuf = (char *)mmalloc(lrgBufSize, "hashExcerpt (Msg24)");
if (! lrgBuf) {
nw >>= 2;
lrgBufSize = 0;
lrgBufSize += 1002 * MAX_QUERY_TERMS * sizeof(int32_t);
lrgBufSize += 2 * nw * sizeof(int32_t);
lrgBufSize += 3 * nw * sizeof(char);
lrgBufSize += nw * sizeof(uint64_t);
lrgBuf = (char *)mmalloc(lrgBufSize, "hashExcerpt (Msg24)");
};
if (! lrgBuf) {
log("topics: could not allocate local buffer "
"(%"INT32" bytes required)", lrgBufSize);
return;
};
char *lrgBufPtr = (char *)lrgBuf;
// . the popularity of word #i is pops[i]
// . but we only set below if we need to
int32_t *pops = (int32_t *) lrgBufPtr; // popularity 1-1 with first 10000 words
lrgBufPtr += nw * sizeof(int32_t);
char *iqt = lrgBufPtr; // is query term? 1-1 with words
lrgBufPtr += nw * sizeof(char);
char *icw = lrgBufPtr; // do not let frags end in these words
lrgBufPtr += nw * sizeof(char);
int32_t *qtrs = (int32_t *)lrgBufPtr; // the raw QTR scores (aac)
lrgBufPtr += nw * sizeof(int32_t);
// record list of word positions for each query term
int32_t *pos = (int32_t *)lrgBufPtr;
lrgBufPtr += MAX_QUERY_TERMS * 1000 * sizeof(int32_t);
int32_t *posLen = (int32_t *)lrgBufPtr;
lrgBufPtr += MAX_QUERY_TERMS * sizeof(int32_t);
int32_t *posPtr = (int32_t *)lrgBufPtr;
lrgBufPtr += MAX_QUERY_TERMS * sizeof(int32_t);
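	// "pos" is a flattened MAX_QUERY_TERMS x 1000 matrix:
	// pos[1000*j + k] is the word # of the k-th occurrence of query
	// term #j in this excerpt, posLen[j] counts those occurrences and
	// posPtr[j] is a cursor into them for the distance scan below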
//for ( int32_t i = 0 ; q && i < q->getNumTerms() ; i++ ) { (aac)
for (int32_t i = 0; q && i < q->m_numWords && i < MAX_QUERY_TERMS; i++) {
posLen[i] = 0; posPtr[i] = 0; }
// skip punct
int32_t i = 0;
if ( i < nw && w->isPunct(i) ) i++;
qtrs[i] = 0;
uint64_t *wids = (uint64_t *)lrgBufPtr;
lrgBufPtr += nw * sizeof(uint64_t);
// record the positions of all query words
char **wp = w->m_words;
int32_t *wlen = w->m_wordLens;
int32_t step = 2;
int64_t *rwids = w->getWordIds();
int32_t *scores = NULL;
// . now we keep a hash table to zero out repeated fragments
// . it uses a sliding window of 5 words
// . it stores the hash of those 5 words in the hash table
// . if sees how many 5-word matches it gets in a row
// . the more matches it gets, the more it demotes the word scores
// . these are stored in the weights class
// . a repeatScore of 0 means to demote it out completely, 100 means
// it is not repeated at all
// . multiply the final gigabit score by the repeatScore/100.
char *repeatScores = lrgBufPtr;
lrgBufPtr += nw * sizeof(char);
setRepeatScores ( repeatScores , rwids , nw , repeatTable ,
repeatTableNumSlots , w );
QUICKPOLL(0);
// single char length in bytes, etc.
char oneChar = 1;
char twoChars = 2;
char threeChars = 3;
if ( isUnicode ) {
oneChar = 2;
twoChars = 4;
threeChars = 6;
}
// . advance one word at a time if doing it the new way
// . also, the word ids will already be set, so use those to see what
// is indexable and what isn't
if ( ! buf ) {
step = 1;
scores = scoresPtr->m_scores;
}
// loop over the words in our sample
//for ( ; i < nw ; i += 2 ) {
for ( ; i < nw ; i += step ) {
qtrs[i] = 0;
// do we have pre-supplied words and scores from XmlDoc.cpp?
//if ( rwids ) {
// skip if not indexable
if ( ! rwids[i] ) continue;
// or if score is <= 0
if ( scores && scores[i] <= 0 ) continue;
// or repeated too much
if ( repeatScores[i] <= 20 ) continue;
//}
// reset popularity
if ( idf ) pops[i] = -1;
else pops[i] = 1; // assume all same if not using idf
// reset "is query term" array
iqt[i] = 0;
// store the id
int32_t encodeType = csISOLatin1;
if ( isUnicode ) encodeType = csUTF16;
wids[i] = hash64d(wp[i], wlen[i], encodeType);
// . is it a common word?
// . it is if it is just one letter
// . what about X-windows coming up for a 'windows' query?
		//   or e-mail coming up for a 'mail' query?
// . METALINCS likes to have 1 digit topics
if ( wlen[i] <= oneChar && is_lower(wp[i][0]) ) icw[i] = 1;
// unicode ~equivalent
//if ( isUnicode && wlen[i] == 2 ) icw[i] = 1;
// 2004 is common here but if it makes it in, don't remove it
// in the top topics list... no. loses 'atari 2600' then!
//else if ( is_digit(w->getWord(i)[0]) )
// icw[i] = 1;
#ifndef _METALINCS_
else icw[i] = isCommonWord ( (int32_t)rwids[i] );
#else
// always allow gigabits that start with numbers for metalincs
else if ( ! is_digit(wp[i][0]))
icw[i] = isCommonWord ( (int32_t)rwids[i] );
else
icw[i] = 0;
#endif
// debug msg
/*
char *s = w->getWord(i);
int32_t slen = w->getWordLen(i);
char c = s[slen];
s[slen]='\0';
log("icw=%"INT32" %s",icw[i],s);
s[slen]=c;
*/
		// is it a query term? if so, record its word # in "pos" array
for ( int32_t j = 0 ; j < nqi ; j++ ) {
if ( wids[i] != qids[j] ) continue;
if ( posLen[j] >= 1000 ) continue;
pos [ 1000 * j + posLen[j] ] = i;
posLen [ j ]++;
// mark this word so if a phrase only has
// all query terms we do not hash it
iqt[i] = 1;
break;
}
}
QUICKPOLL(0);
	// max score -- ONE max-scoring hit per doc
int32_t maxScore = nqi * MAX_SCORE_MULTIPLIER;
// this happens when generating the gigabit vector for a single doc
// so don't hamper it to such a small ceiling
if ( nqi == 0 ) maxScore = ALT_MAX_SCORE;
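	// e.g. a two-term query yields a cap of 2 * MAX_SCORE_MULTIPLIER
	// = 6000, which is passed to addTerm() below as the max score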
// skip punct
i = 0;
if ( i < nw && w->isPunct(i) ) i++;
// score each word based on distance to query terms
int32_t score;
// loop through all the words
	//for ( ; i < nw ; i += 2 ) {
for ( ; i < nw ; i += step ) {
// debug point
//if ( strncasecmp( wp[i],"Microsoft",9) == 0 )
// log("hey");
// do we have pre-supplied words and scores from XmlDoc.cpp?
//if ( rwids ) {
// skip if not indexable
if ( ! rwids[i] ) continue;
// or if score is <= 0
if ( scores && scores[i] <= 0 ) continue;
//}
// skip if in a repeat chunk of doc
if ( repeatScores[i] <= 20 ) continue;
// protect against misspelled html entities (aac)
if ( (wp[i][-oneChar] == '&' && is_alnum(wp[i][0])) ||
(wp[i][0] == '&' && is_alnum(wp[i][oneChar])) ) continue;
// no more one or two letter gigabits (aac)
if ( wlen[i] < threeChars && (! is_digit(wp[i][0])) ) continue;
//continue; //mdw
// if we had a delimeter, previous word must have it
// or be the first punct word
if ( delimeter && i >= 2 && ! w->hasChar(i-1,delimeter) )
continue;
// skip if a query term, it's ineligible
//if ( w->getWordLen(i) == 0 ) continue;
// if query is NULL, assume we are restricting to meta tags
// and query is not necessary
if ( enforceQueryRadius ) score = 0;
else score = ALT_START_SCORE;
int32_t j ;
int32_t nm = 0; // number of matches
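		// per-term proximity bonus: the distance d (in words) to
		// the nearest occurrence of query term #j maps to a bonus
		// of QTR_BONUS_0 (1000) for d < QTR_ZONE_0 (4), 800 for
		// d < 8, 500 for d < 12, 200 for d < 20, else nothing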
for ( j = 0 ; j < nqi ; j++ ) {
// skip if no query terms in doc for query term #j
if ( posLen[j] <= 0 ) continue;
// get distance in words
int32_t d1 = i - pos[ 1000 * j + posPtr[j] ] ;
if ( d1 < 0 ) d1 = d1 * -1;
if ( posPtr[j] + 1 >= posLen[j] ) {
if (d1 >= QTR_ZONE_3) continue;
if (iqt[i] || icw[i] ||
wlen[i] <= threeChars) {
				// common word, query terms, short words
// are all second class citizens when it
// comes to scoring: they get a small
// bonus, to ensure that they are
// considered in the next stage, but do not
// benefit from QPOP and multiple hit
// bonuses (aac)
score += QTR_BONUS_CW;
continue;
};
if (d1 < QTR_ZONE_0)
score += QTR_BONUS_0;
else if (d1 < QTR_ZONE_1)
score += QTR_BONUS_1;
else if (d1 < QTR_ZONE_2)
score += QTR_BONUS_2;
else
score += QTR_BONUS_3;
nm++;
score *= qpops[j];
continue;
}
int32_t d2 = pos[ 1000 * j + posPtr[j] + 1 ] - i ;
if ( d2 < 0 ) d2 = d2 * -1;
if ( d2 > d1 ) {
if (d1 >= QTR_ZONE_3) continue;
if (iqt[i] || icw[i] ||
wlen[i] <= threeChars) {
				// second-class citizens again: small
				// bonus only (see note above)
score += QTR_BONUS_CW;
continue;
};
if (d1 < QTR_ZONE_0)
score += QTR_BONUS_0;
else if (d1 < QTR_ZONE_1)
score += QTR_BONUS_1;
else if (d1 < QTR_ZONE_2)
score += QTR_BONUS_2;
else
score += QTR_BONUS_3;
nm++;
score *= qpops[j];
continue;
}
if (d2 >= QTR_ZONE_3) { posPtr[j]++; continue; };
if (iqt[i] || icw[i] || wlen[i] <= threeChars) {
				// second-class citizens again: small
				// bonus only (see note above)
score += QTR_BONUS_CW;
continue;
};
if (d2 < QTR_ZONE_0) score += QTR_BONUS_0;
else if (d2 < QTR_ZONE_1) score += QTR_BONUS_1;
else if (d2 < QTR_ZONE_2) score += QTR_BONUS_2;
else score += QTR_BONUS_3;
			nm++;
			score *= qpops[j];
			// advance to the next occurrence of this query term
			posPtr[j]++;
			continue;
}
// skip if too far away from all query terms
if ( score <= 0 ) continue;
// no longer count closeness to query terms for score,
// just use # times topic is in doc(s) and popularity
//score = 1000;
// set pop if it is -1
if ( pops[i] == -1 ) {
pops[i] = g_speller.
getPhrasePopularity( wp[i],wids[i], true,
language );
// decrease popularity by half if
// capitalized so Jack does not have
// same pop as "jack"
if ( is_upper (wp[i][0]) ) pops[i] >>= 1;
if ( pops[i] == 0 ) pops[i] = 1;
QUICKPOLL(0);
}
// give a boost for multiple hits
// the more terms in range, the bigger the boost
if ( nm > 1 ) {
//log("nm=%"INT32"",nm);
score += MULTIPLE_HIT_BOOST * nm;
};
// save the raw QTR score
qtrs[i] = score;
};
QUICKPOLL(0);
int32_t mm = 0;
// skip punct
i = 0;
if ( i < nw && w->isPunct(i) ) i++;
for ( ; i < nw ; i += step ) {
float pop;
int32_t score;
int32_t bonus;
// must start with a QTR-scoring word
if (qtrs[i] <= 0) continue;
// add it to table
// init for debug here
char *ww;
int32_t wwlen;
//char c;
int32_t ss;
ww = wp [i]; // w->getWord(i);
wwlen = wlen[i]; // w->getWordLen(i);
if ( icw[i] ) {
// . skip this and all phrases if we're "to"
// . avoid "to use..." "to do..." "to make..." annoying
// . "to" has score 1, "and" has score 2, "of" is 3,
// . "the" is 4, "this" is 5
if ( icw[i] <= 5 ) continue;
// cannot start with any common word, unless capitalized
if ( is_lower(wp[i][0]) ) continue;
}
// if a hyphen is immediately before us, we cannot start
// a phrase... fu-ture, preven-tion
if ( i > 0 && wp[i][-oneChar]=='-' ) continue;
// same for colon
if ( i > 0 && wp[i][-oneChar]==':' ) continue;
// . if a "'s " is before us, we cannot start either
// . "valentine's day cards"
if ( i >= 3 &&
wp[i][-threeChars]=='\'' &&
wp[i][-twoChars ]=='s' &&
is_space(wp[i][-oneChar]) ) continue;
// or if our first char is a digit and a "digit," is before us
// because we don't want to break numbers with commas in them
if ( is_digit(wp[i][0]) && i >= 2 && wp[i][-oneChar]==',' &&
is_digit(wp[i][-twoChars]) ) continue;
// set initial popularity
if (pops[i] > 0) {
pop = ((float) pops[i]) / MAXPOP;
}
else {
pop = 1.0 / MAXPOP;
};
// set initial score and bonus
score = qtrs[i];
bonus = 0;
uint64_t h = wids[i]; // hash value
// if first letter is upper case, double the score
//if ( is_upper (w->getWord(i)[0]) ) score <<= 1;
// . loop through all phrases that start with this word
// . up to 6 real words per phrase
		// . 'j' counts our 'words', where a run of punct counts as a word
int32_t jend = i + maxWordsPerPhrase * 2; // 12;
int32_t maxjend = jend ;
if ( t->m_topicRemoveOverlaps ) maxjend += 8;
if ( jend > nw ) jend = nw;
if ( maxjend > nw ) maxjend = nw;
QUICKPOLL(0);
int32_t count = 0;
int32_t nqc = 0; // # common/query words in our phrase
int32_t nhw = 0; // # of "hot words" (contribute to score)
if ( scores ) mm = scores[i];
//for ( int32_t j = i ; j < jend ; j += 2 ) {
for ( int32_t j = i ; j < jend ; j += step ) {
// skip if not indexable
if ( ! rwids[j] ) continue;
// or if score is <= 0
if ( scores && scores[j] <= 0 ) continue;
if ( repeatScores[j] <= 20 ) continue;
			// no ending in "ing" unless capitalized
if ( wlen[j] > threeChars &&
wp[j][wlen[j]-oneChar ]=='g' &&
wp[j][wlen[j]-twoChars ]=='n' &&
wp[j][wlen[j]-threeChars]=='i' &&
is_lower(wp[j][0]) )
continue;
if (j == i) {
if (icw[j] || wlen[j] < threeChars) bonus -= FWC_PENALTY;
// if word is 4 letters or more and ends in ed, do not
// allow to be its own gigabit
if ( wlen[j] > threeChars &&
wp[j][wlen[j]-oneChar ]=='d' &&
wp[j][wlen[j]-twoChars]=='e' )
continue;
// no more "com" gigabits, please! (aac)
if ( wlen[j] == threeChars &&
wp[j][0 ]=='c' &&
wp[j][oneChar ]=='o' &&
wp[j][twoChars]=='m') continue;
};
// let's generalize even more! do not allow common
// single words as gigabits, with 250+ pop
//if ( pop > 100 && j == i && is_lower(wp[j][0]) ) continue;
// the above assumes a MAX_POP of 10k (sanity check)
//if ( MAXPOP != 10000 ) { char *xx = NULL; *xx = 0; }
// are we passed the first word in the phrase?
if ( j > i ) {
// advance phrase length
wwlen += wlen[j-1] + wlen[j];
				// . cut phrase short if too much punct between
// the current word, j, and the last one, j-2
// . but allow for abbreviations or initials
// of single letters, like 'harry s. truman'.
// we do not want to break before 's.'
// . because the phrase "s. doesn't stand for
// anything." was unable to form. we only
// got "s." and "doesn't stand for anything."
// as possible gigabit candidates.
//if ( wlen[j-1] > 1 ) {
// if ( wlen[j-1] != 2 ) break;
// if ( wp [j-1][0] != '.' ) break;
// if ( wlen[j-2] > 1 ) break;
//}
// . we now allow most punct since it is
// filtered out above w/ hasPunct variable
// . this a little more than doubles the
// processing overhead going from 1 to 3
// . going from 1 to 2 we see that we take 60ms
// instead of 50ms *when removing overlaps*
// . at 1 we take about 48/45ms, not much
// different when removing overlaps
// . increasing this totally wipes out our
// overlap problem, but it is very expensive,
// so now i just halt after jumping one big
// string of punct below, and filter out
// those gigabits above with hasPunct.
// . i'd really like to NOT have this here
				//     because we get much better gigabits, but
// we need it as a speed saver...
if (wlen[j-1]>t->m_topicMaxPunctLen) break;
// no phrasing across commas, etc.
/*
if ( wlen[j-1] == 2 ) {
// only allow " " or ": " or ". "
if ( wp[j-1][1]!=' ' ) break;
if ( wp[j-1][0]!=' ' &&
wp[j-1][0]!=':' &&
wp[j-1][0]!='\'' && // beatles'
// allow commas here, but we
// remove any gigabits with commas
// because we just use them to
// cancel out bad gigabits.
wp[j-1][0]!=',' &&
wp[j-1][0]!='.' ) break;
// . TODO: add in sgt. col. so that
// stuff can be in a gigabit
// . only allow ". " if prev word was
// abbreviation.
if ( wp[j-1][0]=='.' &&
j >= 2 &&
wlen[j-2] > 3) break; // != 1
}
*/
// or if we just skipped the delimeter,
// we are not allowed to phrase across that
// if one was provided
if ( delimeter && w->hasChar(j-1,delimeter))
break;
// make sure we could phrase across last word
//if ( wlen[j-1] > 1 &&
// bits.getPunctuationBits(wp[j-1],wlen[j-1])
// == 0 ) break;
// accumulate the phrase's hash
h = hash64 ( h , wids[j] );
// set pop if it is -1
if ( pops[j] == -1 ) {
pops[j]= g_speller.
getPhrasePopularity( wp[j],
wids[j], true, language );
// decrease popularity by half if
// capitalized so Jack does not have
// same pop as "jack"
if ( is_upper (wp[j][0]) )
pops[j] >>= 1;
// why was this in there?
if ( pops[j] <= 0 ) pops[j] = 1;
QUICKPOLL(0);
}
// adjust popularity
pop = (pop * pops[j])/MAXPOP;
				// watch out for overflow
if ( pop <= 0.0 ) pop = 1.0/MAXPOP;
// get lowest of scores
if ( scores && scores[j] > mm ) mm = scores[j];
}
// keep track of words
count++;
if ( iqt[j] || icw[j] ) {
nqc++; // increment number of query/commoners
}
else if (qtrs[j] > 0) {
score += qtrs[j];
nhw++; // increment "hot word" counter
};
// keep phrasing until next punct word is delimeter
// or the end
if ( delimeter ) {
// if we end on a punct word, then hash
// our phrase, otherwise, only hash it if
// the next word has the delimeter
if ( j+2 < jend && ! w->hasChar(j+1,delimeter))
continue;
}
// otherwise, ensure phrase is not ALL query terms
else {
// if phrase is all commoners & query skip it
if ( nqc == count ) {
#ifdef DEBUG_MSG24
char saveChar = ww[wwlen];
ww[wwlen] = '\0';
log("topics: phrase is all QT or CW; skipping"
" phrase %s", ww);
ww[wwlen] = saveChar;
#endif
continue;
};
}
// . skip if we're common, pair across common words
		// . BUT it is common for a meta tag to end in ".com"
// so we should not count that one as common
if ( icw[j] ) {
// allow for more words only for purposes
// of fixing the ABCD and BCDE overlap bug
// without having to raise jend for all cases
if ( jend < maxjend ) jend++;
continue;
}
		// do not stop if - . or @ follows us right before an alnum
if ( j+1 < nw && is_alnum(wp[j+1][oneChar]) ) {
if ( wp[j+1][0]=='-' ) continue;
if ( wp[j+1][0]=='.' ) continue;
if ( wp[j+1][0]=='\'') continue;
if ( wp[j+1][0]=='@' ) continue;
// . do not split phrases between capitalized words
// . this should fix the Costa Rica, Costa Blah bug
// . it may decrease score of Belkin for query
// 'Belkin Omni Cube' but that's ok because if
// Belkin is important it will be used independently.
if ( is_upper(wp[j][0]) &&
j + 2 < nw &&
wp[j+1][0]==' ' &&
is_upper(wp[j+2][0]) &&
wlen[j+1] == oneChar &&
t->m_maxWordsPerTopic > 1 )
continue;
}
// do not mix caps
if ( is_upper(wp[i][0]) != is_upper(wp[j][0]) )
continue;
// . do not stop on a single capital letter
// . so we don't stop on "George W->" (george w. bush)
// . i added the " && j > i" so METALINCS can have
// single digit gigabits
if ( wlen[j] == oneChar && j > i ) continue;
// . do not split after Mr. or St. or Ms. or Mt. ...
// . fixes 'st. valentines day'
if ( wlen[j] == twoChars && is_upper(wp[j][0]) &&
wp[j][twoChars]=='.' ) continue;
		// sgt. col. or mrs.
if ( wlen[j] == threeChars && wp[j][threeChars]=='.' ){
if ( to_lower(wp[j][0 ])=='s' &&
to_lower(wp[j][oneChar ])=='g' &&
to_lower(wp[j][twoChars])=='t' ) continue;
if ( to_lower(wp[j][0 ])=='c' &&
to_lower(wp[j][oneChar ])=='o' &&
to_lower(wp[j][twoChars])=='l' ) continue;
if ( to_lower(wp[j][0 ])=='m' &&
to_lower(wp[j][oneChar ])=='r' &&
to_lower(wp[j][twoChars])=='s' ) continue;
}
// . do not split commas in numbers
// . like 1,000,000,000
if ( j >= 2 &&
wp[j][-oneChar ]==',' &&
is_digit(wp[j][-twoChars]) &&
wp[j][wlen[j]]==',' &&
is_digit(wp[j][wlen[j]+oneChar]))
continue;
/*
if ( pop < 1 ) ;
else if ( pop < 2 ) ss = (score * 90) / 100;
else if ( pop < 5 ) ss = (score * 85) / 100;
else if ( pop < 10 ) ss = (score * 80) / 100;
else if ( pop < 20 ) ss = (score * 75) / 100;
else if ( pop < 30 ) ss = (score * 70) / 100;
else if ( pop < 40 ) ss = (score * 65) / 100;
else if ( pop < 50 ) ss = (score * 60) / 100;
else ss = (score * 40) / 100;
*/
//if ( tt->getScoreFromTermId((int64_t)h) > 0 )
// continue;
// debug msg
//char c = ww[wwlen];
//ww[wwlen]='\0';
//fprintf(stderr,"tid=%"UINT32" score=%"INT32" pop=%"INT32" len=%"INT32" "
// "repeat=%"INT32" term=%s\n",h,ss,pop,wwlen,
// repeatScores[i],ww);
//ww[wwlen]=c;
// include any ending or starting ( or )
if ( i > 0 && ww[-oneChar] == '(' ) {
			// ensure we got a ')' somewhere before adding (
for ( int32_t r = 0 ; r <= wwlen ; r++ )
if ( ww[r]==')' ) {
ww--; wwlen++; break; }
}
if ( i < nw && ww[wwlen] == ')' ) {
// we need a '(' somewhere before adding the )
for ( int32_t r = 0 ; r <= wwlen ; r++ )
if ( ww[r]=='(' ) {
wwlen++; break; }
}
// now remove ('s if begin AND end in them
if ( ww[0] == '(' && ww[wwlen-oneChar] == ')' ) {
ww++; wwlen -= twoChars; }
// now double score if capitalized, we need more
// proper nouns for topic clustering to work better,
// but it doesn't count if start of a sentence, so
// there must be some alnum word right before it.
//if (is_upper(ww[0]) && !isUnicode && wwlen>=2 &&
		// (the doubling is applied to "ss" below, after ss is
		// set from "score")
		bool capBoost = ( is_upper(ww[0]) && wwlen>=twoChars &&
				  is_alnum(ww[-twoChars]) );
// adjust the gigabit score using the new scores array
//if ( scores && mm != NORM_WORD_SCORE )
// ss = (ss * mm) / NORM_WORD_SCORE;
// only count the highest scoring guy once per page
//int32_t tn = tt->getTermNum((int64_t)h);
//maxScore = ss;
//if ( tn >= 0 ) {
// int32_t sc = tt->getScoreFromTermNum(tn);
// if ( sc > maxScore ) maxScore = sc;
//}
// . add it
// . now store the popularity, too, so we can display
// it for the winning gigabits
//if ( ! tt->addTerm ((int64_t)h,ss,maxScore,false,
// ww,wwlen,tn,NULL,pop) )
// . weight score by pop
// . lets try weighting more popular phrases more!
		ss = score;
		// double if capitalized (see comment above)
		if ( capBoost ) ss <<= 1;
if (nhw > 0) ss /= nhw;
ss += bonus;
float boost;
if ( ((float)nhw) / count < SPARSE_MARK)
ss -= SPARSE_PENALTY;
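		// "pop" was normalized to [0,1] by MAXPOP above; rarer
		// phrases get a bigger boost, from 3.0x below POP_ZONE_0
		// (0.00001) down to 0.1x at POP_ZONE_3 (0.01) and above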
if (pop < POP_ZONE_0) boost = POP_BOOST_0;
else if (pop < POP_ZONE_1) boost = POP_BOOST_1;
else if (pop < POP_ZONE_2) boost = POP_BOOST_2;
else if (pop < POP_ZONE_3) boost = POP_BOOST_3;
else boost = POP_BOOST_4;
ss = (int32_t)(boost *ss);
if ( ss <= 0 ) ss = 1;
// store it
int32_t ipop = (int32_t)(pop * MAXPOP);
if ( ! tt->addTerm ((int64_t)h,ss,maxScore,false,
TITLEREC_CURRENT_VERSION ,
ww,wwlen,-1,NULL,ipop) ) {
log("topics: No memory to grow table.");
return;
}
		// stop after indexing a word after a long string of
// punct, this is the overlap bug fix without taking
// a performance hit. hasPunct above will remove it.
if ( j > i && wlen[j-1] > twoChars ) break;
}
}
// clear any error
if ( g_errno ) {
log("topics: Had error getting topic candidates from "
"document: %s.",mstrerror(g_errno));
g_errno = 0;
}
mfree(lrgBuf, lrgBufSize, "hashExcerpt (Msg24)");
}
// taken from Weights.cpp's set3() function
void setRepeatScores ( char *repeatScores ,
int64_t *wids ,
int32_t nw ,
char *repeatTable ,
int32_t repeatTableNumSlots ,
Words *words ) {
// if no words, nothing to do
if ( nw == 0 ) return;
char *ptr = repeatTable;
int32_t numSlots = repeatTableNumSlots;
int64_t *hashes = (int64_t *)ptr; ptr += numSlots * 8;
int32_t *vals = (int32_t *)ptr; ptr += numSlots * 4;
int64_t ringWids [ 5 ];
int32_t ringPos [ 5 ];
int32_t ringi = 0;
int32_t count = 0;
int64_t h = 0;
// make the mask
uint32_t mask = numSlots - 1;
// clear ring of hashes
memset ( ringWids , 0 , 5 * sizeof(int64_t) );
// for sanity check
//int32_t lastStart = -1;
// count how many 5-word sequences we match in a row
int32_t matched = 0;
int32_t matchStart = -1;
// reset
memset ( repeatScores , 100 , nw );
// . hash EVERY 5-word sequence in the document
// . if we get a match look and see what sequences it matches
// . we allow multiple instances of the same hash to be stored in
// the hash table, so keep checking for a matching hash until you
// chain to a 0 hash, indicating the chain ends
// . check each matching hash to see if more than 5 words match
// . get the max words that matched from all of the candidates
// . demote the word and phrase weights based on the total/max
// number of words matching
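	// . e.g. a navigation menu repeated verbatim later in the sample:
	//   its 5-word windows hash into slots already holding the same
	//   hashes, "matched" climbs to 3 or more, and every word in the
	//   repeated span gets a repeatScore of 0, so hashExcerpt() skips it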
for ( int32_t i = 0 ; i < nw ; i++ ) {
// skip if not alnum word
if ( ! wids[i] ) continue;
// reset
//repeatScores[i] = 100;
// add new to the 5 word hash
h ^= wids[i];
// . remove old from 5 word hash before adding new...
// . initial ring wids are 0, so should be benign at startup
h ^= ringWids[ringi];
// add to ring
ringWids[ringi] = wids[i];
// save our position
ringPos[ringi] = i;
// wrap the ring ptr if we need to, that is why we are a ring
if ( ++ringi >= 5 ) ringi = 0;
// this 5-word sequence starts with word # "start"
int32_t start = ringPos[ringi];
// need at least 5 words in the ring buffer to do analysis
if ( ++count < 5 ) continue;
// sanity check
//if ( start <= lastStart ) { char *xx = NULL; *xx = 0; }
// look up in the hash table
int32_t n = h & mask;
// stop at new york times - debug
/*
if ( words->m_words[i][0] == 'A' &&
words->m_words[i][1] == 's' &&
words->m_words[i][2] == 'k' &&
words->m_words[i][3] == 'e' &&
words->m_words[i][4] == 'd' &&
words->m_words[i][5] == ' ' &&
words->m_words[i][6] == 'Q' &&
words->m_words[i][7] == 'u' )
log("hey");
*/
loop:
// all done if empty
if ( ! hashes[n] ) {
// add ourselves to the hash table now
hashes[n] = h;
// this is where the 5-word sequence starts
vals [n] = matchStart+1;
			// do not demote any words unless at least 3
			// consecutive 5-word windows matched
if ( matched < 3 ) { matched = 0; continue; }
// reset
matched = 0;
		// . how much should we demote?
// . 10 matching words pretty much means 0 weights
//float demote = 1.0 - ((matched-5)*.10);
//if ( demote >= 1.0 ) continue;
//if ( demote < 0.0 ) demote = 0.0;
// demote the words involved
for ( int32_t j = matchStart ; j < i ; j++ )
repeatScores[j] = 0;
// get next word
continue;
}
// get next in chain if hash does not match
if ( hashes[n] != h ) {
// wrap around the hash table if we hit the end
if ( ++n >= numSlots ) n = 0;
// check out bucket #n now
goto loop;
}
// save start of matching sequence for demote loop
if ( matched == 0 ) matchStart = start;
// inc the match count
matched++;
}
// if we ended without nulling out some matches
if ( matched < 3 ) return;
for ( int32_t j = matchStart ; j < nw ; j++ ) repeatScores[j] = 0;
}
/*
// is it a stop word?
char isCommonPhrase ( int32_t h ) {
static TermTable s_table;
static bool s_isInitialized = false;
// . these have the stop words above plus some foreign stop words
	// . these aren't
	// . i shrunk this list a lot
	// . see backups for the old list
static char *s_stopPhrases[] = {
"all rights reserved" ,
"in addition" ,
"for example" ,
"for more information"
};
// include a bunch of foreign prepositions so they don't get required
// by the bitScores in IndexTable.cpp
if ( ! s_isInitialized ) {
// set up the hash table
if ( ! s_table.set ( sizeof(s_stopPhrases) * 2 ) )
return log("Msg24::isCommonPhrase: error set table");
// now add in all the stop words
int32_t n = (int32_t)sizeof(s_stopPhrases)/ sizeof(char *);
for ( int32_t i = 0 ; i < n ; i++ ) {
// set the phrases
char *sw = s_stopPhrases[i];
int32_t swlen = strlen ( sw );
Words w;
w->set ( false , sw , swlen );
int32_t h = hash64d ( w->getWord (0),
w->getWordLen(0));
for ( int32_t j = 1 ; j < w->getNumWords() ; j++ )
int32_t h2 =
int32_t swh = hash64d ( sw , swlen );
s_table.addTerm ((int32_t)swh,i+1,0x7fffffff,true);
}
s_isInitialized = true;
}
// . all 1 char letter words are stop words
// . good for initials and some contractions
//if ( len == 1 && is_alpha(*s) ) return true;
// get from table
return (char)s_table.getScoreFromTermId ( h );
}
*/
int32_t Msg24::getStoredSize ( ) {
// store number of topics into 4 bytes
int32_t size = 4;
	// sum the size of each topic we have (all related topics with
	// scores >= m_minTopicScore)
for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
// get group info
//TopicGroup *t = &m_topicGroups[m_topicGids[i]];
// break if buf is too small
//if ( size + m_topicLens[i] + 2 + 8 > MAX_REPLY_LEN ) break;
// include \0 and 4 byte score and 4 byte topic length
size +=
4 + // topic ptr
4 + // topicScore
4 + // topicLen
4 + // numDocIds
4 + // ptr to docids
4 + // topic pop
1 + // topic gid
m_topicLens[i] + 1 + // topic string with \0
m_topicNumDocIds[i]*8; // actual docids
}
return size;
}
// . serialize ourselves for the cache
// . returns bytes written
// . returns -1 and sets g_errno on error
// . just like serializing the reply
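// . layout: [numTopics:4][ptr offsets:4*n][scores:4*n][lens:4*n]
//   [numDocIds:4*n][docid ptr placeholders:4*n][pops:4*n][gids:1*n]
//   then the NUL-terminated topic strings, then 8*numDocIds[i] bytes
//   of docids per topic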
int32_t Msg24::serialize ( char *buf , int32_t bufLen ) {
char *p = buf;
// store number of topics
*(int32_t *)p = m_numTopics; p += 4;
// if no topics, bail
if ( m_numTopics <= 0 ) return 4;
// then the ptrs, with offset relative to m_topicPtrs[0] so
// deserialize works
char *base = m_topicPtrs[0];
for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
*(int32_t *)p = m_topicPtrs[i] - base; p += 4; }
// then the scores
memcpy ( p , m_topicScores , m_numTopics * 4 ); p += m_numTopics * 4;
memcpy ( p , m_topicLens , m_numTopics * 4 ); p += m_numTopics * 4;
memcpy ( p , m_topicNumDocIds, m_numTopics * 4 ); p += m_numTopics * 4;
	// these m_topicDocIds are essentially just placeholders for ptrs
	// to the docids, just like the topic ptrs above, but these can all
	// be NULL if we didn't get back the list of docids for each gigabit
p += m_numTopics * 4;
// then the popularity rating of each topic
memcpy ( p , m_topicPops , m_numTopics * 4 ); p += m_numTopics * 4;
memcpy ( p , m_topicGids , m_numTopics ); p += m_numTopics;
// then the text
for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
memcpy ( p , m_topicPtrs[i] , m_topicLens[i] ) ;
p += m_topicLens[i];
*p++ = '\0';
}
// and one array of docids per topic
for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
memcpy ( p , m_topicDocIds[i] , m_topicNumDocIds[i] * 8 );
p += m_topicNumDocIds[i] * 8;
// sanity check
//for ( int32_t k = 0 ; k < m_topicNumDocIds[i] ; k++ )
// if ( m_topicDocIds[i][k] & ~((int64_t)DOCID_MASK) ) {
// log("query: Msg24 bad docid in serialize.");
// char *xx = NULL; *xx = 0;
// }
}
// debug msg
//log("in nt=%"INT32"",*nt);
if ( p - buf > bufLen ) {
log("query: Msg24 serialize overflow.");
char *xx = NULL; *xx = 0;
}
return p - buf;
}
// . deserialize ourselves for the cache
// . returns bytes written
// . returns -1 and sets g_errno on error
// . Msg40 owns the buffer, so we can reference it without having to copy
int32_t Msg24::deserialize ( char *buf , int32_t bufLen ) {
	// sanity check, i've seen this happen before when the handler of
	// the Msg24 runs out of memory at a certain place and ends up
	// sending back a 0 length reply
if ( bufLen < 4 ) {
g_errno = EBADREPLY;
log("query: Msg24::deserialize: bad reply.");
return -1;
}
char *p = buf;
m_numTopics = *(int32_t *)p; p += 4;
// another sanity check, just in case
if ( bufLen < m_numTopics * (6*4+1) ) {
g_errno = EBADREPLY;
log("query: Msg24::deserialize: bad reply 2.");
return -1;
}
m_topicPtrs = (char **)p; p += m_numTopics * 4;
m_topicScores = (int32_t *)p; p += m_numTopics * 4;
m_topicLens = (int32_t *)p; p += m_numTopics * 4;
m_topicNumDocIds = (int32_t *)p; p += m_numTopics * 4; //voters
	m_topicDocIds = (int64_t **)p; p += m_numTopics * 4; // placeholders
m_topicPops = (int32_t *)p; p += m_numTopics * 4;
m_topicGids = p; p += m_numTopics;
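	// note: the ptr arrays above are stepped over as 4 bytes per
	// entry, so this in-place fixup assumes a 32-bit build where
	// sizeof(char *) == 4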
// . make ptrs to topic text
// . we were just provided with offsets to make it portable
char *off = p;
for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
m_topicPtrs[i] = (int32_t)m_topicPtrs[i] + off;
p += m_topicLens[i] + 1;
}
// now for the array of docids per topic
for ( int32_t i = 0 ; i < m_numTopics ; i++ ) {
m_topicDocIds[i] = (int64_t *)p;
p += m_topicNumDocIds[i] * 8;
// sanity check
//for ( int32_t k = 0 ; k < m_topicNumDocIds[i] ; k++ )
// if ( m_topicDocIds[i][k] & ~((int64_t)DOCID_MASK) ) {
// log("query: Msg24 bad docid in deserialize.");
// char *xx = NULL; *xx = 0;
// }
}
if ( p - buf > bufLen ) {
log("query: Msg24 deserialize overflow.");
char *xx = NULL; *xx = 0;
}
return p - buf;
}
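// a minimal, hypothetical cache round-trip sketch (sizing via
// getStoredSize(); the "m24cache" label is illustrative only):
//
//   Msg24 m24;
//   // ... m24 filled in by topic generation ...
//   int32_t need = m24.getStoredSize();
//   char *buf  = (char *)mmalloc ( need , "m24cache" );
//   int32_t used = m24.serialize ( buf , need );
//   Msg24 out;
//   out.deserialize ( buf , used ); // "out" now points into buf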
//if we already have the msg20s, just generate the gigabits from those.
bool Msg24::generateTopicsLocal ( char *coll ,
int32_t collLen ,
char *query ,
int32_t queryLen ,
Msg20** msg20Ptrs ,
int32_t numMsg20s ,
char *clusterLevels ,
TopicGroup *topicGroups ,
int32_t numTopicGroups ,
unsigned char lang ) { // (aac)
// force it to be true, since hi bit is set in pops if topic is unicode
m_returnPops = true;
// warning
if ( ! coll ) log(LOG_LOGIC,"net: NULL collection. msg24.");
// force it
m_returnDocIdCount = true;
// if we don't get docids, then deserialize doesn't work because it
// expects the docids to be valid.
m_returnDocIds = true;
// reset
m_numTopics = 0;
//m_docsToScanForTopics = docsToScanForTopics;
//m_minTopicScore = minTopicScore;
//m_maxTopics = maxTopics;
m_numDocIds = 0;
m_coll = coll;
m_collLen = collLen;
// bail if no operations to do
int32_t numTopicsToGen = topicGroups->m_numTopics;
// get the min we have to scan
int32_t docsToScanForTopics = topicGroups[0].m_docsToScanForTopics;
for ( int32_t i = 1 ; i < numTopicGroups ; i++ ) {
int32_t x = topicGroups[i].m_docsToScanForTopics ;
if ( x > docsToScanForTopics ) docsToScanForTopics = x;
if ( topicGroups[i].m_numTopics > numTopicsToGen )
numTopicsToGen = topicGroups[i].m_numTopics;
}
// bail if none
if ( docsToScanForTopics <= 0 ) return true;
if ( numTopicsToGen == 0 ) return true;
m_startTime = gettimeofdayInMilliseconds();
// save, caller should not delete this!
m_topicGroups = topicGroups;
m_numTopicGroups = numTopicGroups;
// truncate
//if ( maxTopics > MAX_TOPICS ) maxTopics = MAX_TOPICS;
// truncate
//if ( numDocIds > MAX_DOCIDS_TO_SCAN )
// numDocIds = MAX_DOCIDS_TO_SCAN ;
// if ( numDocIds > docsToScanForTopics )
// numDocIds = docsToScanForTopics ;
State24 st;
st.m_slot = NULL;
st.m_niceness = 0;
st.m_numRequests = numMsg20s;
st.m_numReplies = numMsg20s;
memcpy ( st.m_query , query , queryLen );
st.m_query [ queryLen ] = '\0';
st.m_queryLen = queryLen;
st.m_qq.set ( st.m_query , st.m_queryLen , NULL , 0, 2 , true );
st.m_numTopicGroups = m_numTopicGroups;
memcpy(st.m_topicGroups, m_topicGroups,
sizeof(TopicGroup) * m_numTopicGroups);
st.m_maxCacheAge = 0;
st.m_addToCache = false;
st.m_returnDocIdCount = m_returnDocIdCount;
st.m_returnDocIds = m_returnDocIds;
st.m_returnPops = true; // ??? use this in dedup vector?
st.m_docIds = NULL;
st.m_numDocIds = 0;
st.m_clusterLevels = clusterLevels;
st.m_n = 0;
st.m_i = 0;
st.m_coll = coll;
st.m_msg20Ptrs = msg20Ptrs;
st.m_msg20 = NULL;
TermTable master;
if ( ! master.set ( 20000 , true , true ,
st.m_returnDocIdCount | st.m_returnDocIds ,
st.m_returnPops , true, false, NULL ) ) {
log("topics: Could not allocate memory for topic generation.");
return true;
}
char *buf = NULL;
int32_t bufSize = 0;
for ( int32_t i = 0 ; i < st.m_numTopicGroups ; i++ ) {
// get ith topic group descriptor
TopicGroup *t = &st.m_topicGroups[i];
// . generate topics for this topic group
// . serialize them into "p"
// . getTopics will realloc() this "buf" to exactly the size
// it needs
getTopics ( &st , t , &master , &st.m_qq , i ,
// getTopics will realloc this buffer
&buf , &bufSize , NULL , NULL , NULL, lang ); // (aac)
// clear master table each time
if ( i + 1 < st.m_numTopicGroups ) master.clear();
}
// free mem now to avoid fragmentation
master.reset();
deserialize ( buf , bufSize );
//we are pointing into buf, but we want to make sure it gets freed when we
//are done with it, so we make it our m_reply
m_reply = buf;
m_replySize = bufSize;
g_stats.addStat_r ( 0 ,
m_startTime ,
gettimeofdayInMilliseconds(),
"get_gigabits",
0x00d1e1ff ,
STAT_QUERY );
return true;
}