fixed summary initialization error of the flags buffer.
fixed term freq algo. use exact term freq
for qatest123. made Summary.o -O3 again.
fix gbsystem() to disable both timers.
This commit is contained in:
Matt 2014-12-06 10:14:48 -07:00
parent 01d61d5427
commit 41c8817bdb
7 changed files with 172 additions and 65 deletions

View File

@ -2547,22 +2547,24 @@ void Loop::disableTimer() {
}
int gbsystem(char *cmd ) {
if ( ! g_conf.m_runAsDaemon )
setitimer(ITIMER_REAL, &g_loop.m_noInterrupt, NULL);
// if ( ! g_conf.m_runAsDaemon )
// setitimer(ITIMER_REAL, &g_loop.m_noInterrupt, NULL);
g_loop.disableTimer();
log("gb: running system(\"%s\")",cmd);
int ret = system(cmd);
if ( ! g_conf.m_runAsDaemon )
setitimer(ITIMER_REAL, &g_loop.m_realInterrupt, NULL);
g_loop.enableTimer();
// if ( ! g_conf.m_runAsDaemon )
// setitimer(ITIMER_REAL, &g_loop.m_realInterrupt, NULL);
return ret;
}
// void Loop::enableTimer() {
// m_canQuickPoll = true;
// // logf(LOG_WARN, "xxx enabling");
// setitimer(ITIMER_VIRTUAL, &m_quickInterrupt, NULL);
// //setitimer(ITIMER_REAL, &m_quickInterrupt, NULL);
// }
void Loop::enableTimer() {
m_canQuickPoll = true;
// logf(LOG_WARN, "xxx enabling");
setitimer(ITIMER_VIRTUAL, &m_quickInterrupt, NULL);
setitimer(ITIMER_REAL, &m_realInterrupt, NULL);
}

View File

@ -438,8 +438,9 @@ neighborhood.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
TermTable.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
#Summary.o:
# $(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
# why was this commented out?
Summary.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
Title.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp

View File

@ -103,6 +103,9 @@ void Matches::setQuery ( Query *q ) {
//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
// this is word based. these are each 1 byte
memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t));
// # of WORDS in the query
int32_t nqt = m_q->m_numTerms;
@ -110,7 +113,7 @@ void Matches::setQuery ( Query *q ) {
int32_t numToMatch = 0;
for ( int32_t i = 0 ; i < nqt ; i++ ) {
// reset this
m_qwordFlags[i] = 0;
//m_qwordFlags[i] = 0;
// get query word #i
//QueryWord *qw = &m_q->m_qwords[i];
QueryTerm *qt = &m_q->m_qterms[i];

137
Posdb.cpp
View File

@ -461,8 +461,80 @@ static bool s_cacheInit = false;
// . accesses RdbMap to estimate size of the indexList for this termId
// . returns an UPPER BOUND
// . because this is over POSDB now and not indexdb, a document is counted
// once for every occurrence of term "termId" it has... :{
int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
// establish the list boundary keys
key144_t startKey ;
key144_t endKey ;
makeStartKey ( &startKey, termId );
makeEndKey ( &endKey , termId );
// doing qa test?
bool qaTest = false;
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 )
qaTest = true;
// if so, use the exact size
if ( qaTest ) {
Msg5 msg5;
RdbList list;
g_threads.disableThreads();
msg5.getList ( RDB_POSDB ,
collnum ,
&list ,
&startKey ,
&endKey ,
64000000 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
NULL , // msg5b ptr
true );
// re-enable threads
g_threads.enableThreads();
//int64_t numBytes = list.getListSize();
// see how many diff docids we have... easier to debug this
// loop over entries in list
int64_t docId = 0;
int64_t count = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key144_t k; list.getCurrentKey(&k);
// is it a delete?
if ( (k.n0 & 0x01) == 0x00 ) continue;
int64_t d = g_posdb.getDocId(&k);
if ( d == docId ) continue;
docId = d;
count++;
}
// convert to # keys, approx. just an estimate since
// some keys are compressed...
// none except first key are full size. they are all just
// 12 bytes etc.
int64_t numKeys = count;
if ( numKeys < 0 ) numKeys = 0;
// and assume each shard has about the same #
numKeys *= g_hostdb.m_numShards;
return numKeys;
}
//collnum_t collnum = g_collectiondb.getCollnum ( coll );
if ( ! s_cacheInit ) {
@ -494,11 +566,6 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
500 , // maxage secs
true );// promote?
// doing qa test?
bool qaTest = false;
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 )
qaTest = true;
// -1 means not found in cache. if found, return it though.
@ -508,51 +575,44 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
return val;
}
// establish the list boundary keys
key144_t startKey ;
key144_t endKey ;
//makeStartKey ( &startKey, termId );
//makeEndKey ( &endKey , termId );
// . ask rdb for an upper bound on this list size
// . but actually, it will be somewhat of an estimate 'cuz of RdbTree
//key144_t maxKey;
key144_t maxKey;
//int64_t maxRecs;
// . don't count more than these many in the map
// . that's our old truncation limit, the new stuff isn't as dense
//int32_t oldTrunc = 100000;
// turn this off for this
//int64_t oldTrunc = -1;
int64_t oldTrunc = -1;
// get maxKey for only the top "oldTruncLimit" docids because when
// we increase the trunc limit we screw up our extrapolation! BIG TIME!
// maxRecs = m_rdb.getListSize(collnum,
// (char *)&startKey,
// (char *)&endKey,
// (char *)&maxKey,
// oldTrunc );
int64_t maxRecs = m_rdb.getListSize(collnum,
(char *)&startKey,
(char *)&endKey,
(char *)&maxKey,
oldTrunc );
makeStartKey ( &startKey, termId );
makeEndKey ( &endKey , termId );
int64_t numBytes = 0;
// get the # more slowly but exact for qa tests so it agrees
// with the results of the last time we ran it
if ( qaTest )
// TODO: just get the actual list and count unique docids
// with a blocking msg5...
numBytes += m_rdb.m_buckets.getListSizeExact(collnum,
(char *)&startKey,
(char *)&endKey);
else
numBytes += m_rdb.m_buckets.getListSize(collnum,
(char *)&startKey,
(char *)&endKey,
NULL,NULL);
// if ( qaTest )
// // TODO: just get the actual list and count unique docids
// // with a blocking msg5...
// numBytes += m_rdb.m_buckets.getListSizeExact(collnum,
// (char *)&startKey,
// (char *)&endKey);
// else
numBytes += m_rdb.m_buckets.getListSize(collnum,
(char *)&startKey,
(char *)&endKey,
NULL,NULL);
// convert from size in bytes to # of recs
numBytes /= sizeof(POSDBKEY);
maxRecs += numBytes / sizeof(POSDBKEY);
// RdbList list;
// makeStartKey ( &startKey, termId );
@ -570,9 +630,10 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
// if ( numPos*18 != numBytes ) {
// char *xx=NULL;*xx=0; }
// and assume each shard has about the same #
numBytes *= g_hostdb.m_numShards;
maxRecs *= g_hostdb.m_numShards;
// over all splits!
//maxRecs *= g_hostdb.m_numShards;
@ -585,9 +646,9 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
//log("posdb: approx=%"INT64" exact=%"INT64"",maxRecs,numBytes);
// now cache it. it sets g_errno to zero.
g_termFreqCache.addLongLong2 ( collnum, termId, numBytes );
g_termFreqCache.addLongLong2 ( collnum, termId, maxRecs );
// return it
return numBytes;//maxRecs;
return maxRecs;
}
//////////////////
@ -820,6 +881,7 @@ bool PosdbTable::allocTopTree ( ) {
if ( m_r->m_doSiteClustering ) nn *= 2;
// limit to this regardless!
//CollectionRec *cr = g_collectiondb.getRec ( m_coll );
//if ( ! cr ) return false;
@ -832,7 +894,8 @@ bool PosdbTable::allocTopTree ( ) {
if ( nn < m_r->m_docsToGet )
log("query: warning only getting up to %"INT64" docids "
"even though %"INT32" requested!!"
"even though %"INT32" requested because termlist "
"sizes are so small!!"
, nn
, m_r->m_docsToGet );
@ -844,10 +907,10 @@ bool PosdbTable::allocTopTree ( ) {
}
// let's use nn*4 to try to get as many scores as possible, although
// it may still not work!
int32_t xx = m_r->m_docsToGet ;
int32_t xx = nn;//m_r->m_docsToGet ;
// try to fix a core of growing this table in a thread when xx == 1
if ( xx < 32 ) xx = 32;
if ( m_r->m_doSiteClustering ) xx *= 4;
//if ( m_r->m_doSiteClustering ) xx *= 4;
m_maxScores = xx;
// for seeing if a docid is in toptree. niceness=0.
//if ( ! m_docIdTable.set(8,0,xx*4,NULL,0,false,0,"dotb") )

View File

@ -158,6 +158,7 @@ bool Summary::set2 ( Xml *xml ,
// query terms
int32_t numTerms = q->getNumTerms();
// . compute our word weights wrt each query. words which are more rare
// have a higher weight. We use this to weight the terms importance
// when generating the summary.
@ -195,6 +196,15 @@ bool Summary::set2 ( Xml *xml ,
m_wordWeights[i] = 1.0;
}
if ( g_conf.m_logDebugSummary ) {
for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
int64_t tf = -1;
if ( termFreqs ) tf = termFreqs[i];
log("sum: u=%s wordWeights[%"INT32"]=%f tf=%"INT64"",
f->m_url,i,m_wordWeights[i],tf);
}
}
// convenience
m_maxNumCharsPerLine = maxNumCharsPerLine;
//m_qscores = qscores;
@ -802,8 +812,9 @@ int64_t Summary::getBestWindow ( Matches *matches ,
wordCount = 0;
// for debug
char buf[5000];
char *xp = buf;
//char buf[5000];
//char *xp = buf;
SafeBuf xp;
// wtf?
if ( b > nw ) b = nw;
@ -819,8 +830,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
char *c = words->m_words[i]+k;
cs = getUtf8CharSize(c);
if ( is_binary_utf8 ( c ) ) continue;
memcpy ( xp , c , cs );
xp += cs;
xp.safeMemcpy ( c , cs );
xp.nullTerm();
}
}
@ -830,7 +841,7 @@ int64_t Summary::getBestWindow ( Matches *matches ,
// don't count just numeric words
if ( words->isNum(i) ) continue;
// check if there is a url. best way to check for '://'
if ( !wids[i] ){
if ( wids && !wids[i] ){
char *wrd = words->m_words[i];
int32_t wrdLen = words->m_wordLens[i];
if ( wrdLen == 3 &&
@ -859,8 +870,7 @@ int64_t Summary::getBestWindow ( Matches *matches ,
// print the score, "t"
if ( g_conf.m_logDebugSummary ) {
sprintf ( xp ,"(%"INT32")",t);
xp += gbstrlen(xp);
xp.safePrintf("(%"INT32")",t);
}
// skip if not wid
@ -900,8 +910,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
score += t;
if ( g_conf.m_logDebugSummary ) {
sprintf ( xp ,"[%"INT32"]",t);
xp += gbstrlen(xp);
xp.safePrintf ("[%"INT32"]{qwn=%"INT32",ww=%f}",t,qwn,
m_wordWeights[qwn]);
}
// inc the query word count for this window
@ -940,7 +950,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
// show it
if ( g_conf.m_logDebugSummary )
logf(LOG_DEBUG,"score=%08"INT32" prescore=%08"INT32" a=%05"INT32" b=%05"INT32" %s",
(int32_t)score,oldScore,(int32_t)a,(int32_t)b,buf);
(int32_t)score,oldScore,(int32_t)a,(int32_t)b,
xp.getBufStart());
// set lasta, besta, bestb
*lasta = a;

View File

@ -30169,7 +30169,7 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// . no! think about it -- this can be huge for pages like
// google.com!!!
LinkInfo *info1 = ptr_linkInfo1;
if ( info1 && m_req->m_getLinkInfo ) {
if ( info1 ) { // && m_req->m_getLinkInfo ) {
reply->m_pageNumInlinks = info1->m_totalInlinkingDocIds;
reply->m_pageNumGoodInlinks = info1->m_numGoodInlinks;
reply->m_pageNumUniqueIps = info1->m_numUniqueIps;

35
qa.cpp
View File

@ -65,8 +65,8 @@ void markOut ( char *content , char *needle ) {
// a consistent LENGTH if we had 10 hits vs 9... making the hash
// different
// space out digits
for ( ; *s && is_digit(*s); s++ ) *s = ' ';
// space out digits. including decimal point.
for ( ; *s && (is_digit(*s)||*s=='.'); s++ ) *s = ' ';
// loop for more for the "rand64=" thing
content = s;
@ -173,6 +173,9 @@ void processReply ( char *reply , int32_t replyLen ) {
markOut ( content , "<currentTimeUTC>");
markOut ( content , "<responseTimeMS>");
// ...from an index of about 429 pages in 0.91 seconds in collection...
markOut ( content , " pages in ");
// until i figure this one out, take it out
markOut ( content , "<docsInCollection>");
@ -533,7 +536,11 @@ bool qainject1 ( ) {
// turn off images thumbnails
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
// turn off use robots to avoid that
// xyz.com/robots.txt redir to seekseek.com
"&obeyRobots=0"
,
// checksum of reply expected
238170006 ) )
return false;
@ -708,6 +715,8 @@ bool qainject1 ( ) {
return true;
}
//static int32_t s_savedAutoSaveFreq = 0;
bool qainject2 ( ) {
//if ( ! s_callback ) s_callback = qainject2;
@ -718,6 +727,8 @@ bool qainject2 ( ) {
//static bool s_x1 = false;
if ( ! s_flags[0] ) {
s_flags[0] = true;
//s_savedAutoSaveFreq = g_conf.m_autoSaveFrequency;
//g_conf.m_autoSaveFrequency = 0;
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
return false;
}
@ -738,7 +749,12 @@ bool qainject2 ( ) {
// turn off images thumbnails
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// can't turn off spiders because we need for query reindex
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
// turn off use robots to avoid that
// xyz.com/robots.txt redir to seekseek.com
"&obeyRobots=0"
,
// checksum of reply expected
238170006 ) )
return false;
@ -818,7 +834,17 @@ bool qainject2 ( ) {
// mdw: query DELETE test
//
if ( ! s_flags[30] ) {
s_flags[30] = true;
// log("qa: SUCCESSFULLY COMPLETED "
// "QA INJECT TEST 2 *** FAKE");
// //if ( s_callback == qainject ) exit(0);
// g_conf.m_autoSaveFrequency = s_savedAutoSaveFreq;
// return true;
if ( ! getUrl ( "/admin/reindex"
"?c=qatest123"
"&format=xml"
@ -874,6 +900,7 @@ bool qainject2 ( ) {
log("qa: SUCCESSFULLY COMPLETED "
"QA INJECT TEST 2");
//if ( s_callback == qainject ) exit(0);
//g_conf.m_autoSaveFrequency = s_savedAutoSaveFreq;
return true;
}