mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
fixed summary initialization error
of the flags buffer. fixed term freq algo. use exact term freq for qatest123. made Summary.o -O3 again. fix gbsystem() to disable both timers.
This commit is contained in:
parent
01d61d5427
commit
41c8817bdb
22
Loop.cpp
22
Loop.cpp
@ -2547,22 +2547,24 @@ void Loop::disableTimer() {
|
||||
}
|
||||
|
||||
int gbsystem(char *cmd ) {
|
||||
if ( ! g_conf.m_runAsDaemon )
|
||||
setitimer(ITIMER_REAL, &g_loop.m_noInterrupt, NULL);
|
||||
// if ( ! g_conf.m_runAsDaemon )
|
||||
// setitimer(ITIMER_REAL, &g_loop.m_noInterrupt, NULL);
|
||||
g_loop.disableTimer();
|
||||
log("gb: running system(\"%s\")",cmd);
|
||||
int ret = system(cmd);
|
||||
if ( ! g_conf.m_runAsDaemon )
|
||||
setitimer(ITIMER_REAL, &g_loop.m_realInterrupt, NULL);
|
||||
g_loop.enableTimer();
|
||||
// if ( ! g_conf.m_runAsDaemon )
|
||||
// setitimer(ITIMER_REAL, &g_loop.m_realInterrupt, NULL);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
// void Loop::enableTimer() {
|
||||
// m_canQuickPoll = true;
|
||||
// // logf(LOG_WARN, "xxx enabling");
|
||||
// setitimer(ITIMER_VIRTUAL, &m_quickInterrupt, NULL);
|
||||
// //setitimer(ITIMER_REAL, &m_quickInterrupt, NULL);
|
||||
// }
|
||||
void Loop::enableTimer() {
|
||||
m_canQuickPoll = true;
|
||||
// logf(LOG_WARN, "xxx enabling");
|
||||
setitimer(ITIMER_VIRTUAL, &m_quickInterrupt, NULL);
|
||||
setitimer(ITIMER_REAL, &m_realInterrupt, NULL);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
5
Makefile
5
Makefile
@ -438,8 +438,9 @@ neighborhood.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
TermTable.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
#Summary.o:
|
||||
# $(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
# why was this commented out?
|
||||
Summary.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
Title.o:
|
||||
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
|
||||
|
||||
|
@ -103,6 +103,9 @@ void Matches::setQuery ( Query *q ) {
|
||||
|
||||
//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
|
||||
|
||||
// this is word based. these are each 1 byte
|
||||
memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t));
|
||||
|
||||
// # of TERMS in the query
|
||||
int32_t nqt = m_q->m_numTerms;
|
||||
|
||||
@ -110,7 +113,7 @@ void Matches::setQuery ( Query *q ) {
|
||||
int32_t numToMatch = 0;
|
||||
for ( int32_t i = 0 ; i < nqt ; i++ ) {
|
||||
// reset this
|
||||
m_qwordFlags[i] = 0;
|
||||
//m_qwordFlags[i] = 0;
|
||||
// get query word #i
|
||||
//QueryWord *qw = &m_q->m_qwords[i];
|
||||
QueryTerm *qt = &m_q->m_qterms[i];
|
||||
|
137
Posdb.cpp
137
Posdb.cpp
@ -461,8 +461,80 @@ static bool s_cacheInit = false;
|
||||
|
||||
// . accesses RdbMap to estimate size of the indexList for this termId
|
||||
// . returns an UPPER BOUND
|
||||
// . because this is over POSDB now and not indexdb, a document is counted
|
||||
// once for every occurrence of term "termId" it has... :{
|
||||
int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
|
||||
|
||||
// establish the list boundary keys
|
||||
key144_t startKey ;
|
||||
key144_t endKey ;
|
||||
makeStartKey ( &startKey, termId );
|
||||
makeEndKey ( &endKey , termId );
|
||||
|
||||
|
||||
// doing qa test?
|
||||
bool qaTest = false;
|
||||
CollectionRec *cr = g_collectiondb.getRec ( collnum );
|
||||
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 )
|
||||
qaTest = true;
|
||||
|
||||
// if so, use the exact size
|
||||
if ( qaTest ) {
|
||||
Msg5 msg5;
|
||||
RdbList list;
|
||||
g_threads.disableThreads();
|
||||
msg5.getList ( RDB_POSDB ,
|
||||
collnum ,
|
||||
&list ,
|
||||
&startKey ,
|
||||
&endKey ,
|
||||
64000000 , // minRecSizes ,
|
||||
true , // includeTree ,
|
||||
false , // add to cache?
|
||||
0 , // max cache age
|
||||
0 , // startFileNum ,
|
||||
-1 , // numFiles ,
|
||||
NULL , // state
|
||||
NULL , // callback
|
||||
0 , // niceness
|
||||
false , // err correction?
|
||||
NULL ,
|
||||
0 ,
|
||||
-1 ,
|
||||
true ,
|
||||
-1LL ,
|
||||
NULL , // msg5b ptr
|
||||
true );
|
||||
// re-enable threads
|
||||
g_threads.enableThreads();
|
||||
//int64_t numBytes = list.getListSize();
|
||||
// see how many diff docids we have... easier to debug this
|
||||
// loop over entries in list
|
||||
int64_t docId = 0;
|
||||
int64_t count = 0;
|
||||
for ( list.resetListPtr() ; ! list.isExhausted() ;
|
||||
list.skipCurrentRecord() ) {
|
||||
key144_t k; list.getCurrentKey(&k);
|
||||
// is it a delete?
|
||||
if ( (k.n0 & 0x01) == 0x00 ) continue;
|
||||
int64_t d = g_posdb.getDocId(&k);
|
||||
if ( d == docId ) continue;
|
||||
docId = d;
|
||||
count++;
|
||||
}
|
||||
// convert to # keys, approx. just an estimate since
|
||||
// some keys are compressed...
|
||||
// none except first key are full size. they are all just
|
||||
// 12 bytes etc.
|
||||
int64_t numKeys = count;
|
||||
if ( numKeys < 0 ) numKeys = 0;
|
||||
// and assume each shard has about the same #
|
||||
numKeys *= g_hostdb.m_numShards;
|
||||
return numKeys;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//collnum_t collnum = g_collectiondb.getCollnum ( coll );
|
||||
|
||||
if ( ! s_cacheInit ) {
|
||||
@ -494,11 +566,6 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
|
||||
500 , // maxage secs
|
||||
true );// promote?
|
||||
|
||||
// doing qa test?
|
||||
bool qaTest = false;
|
||||
CollectionRec *cr = g_collectiondb.getRec ( collnum );
|
||||
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 )
|
||||
qaTest = true;
|
||||
|
||||
|
||||
// -1 means not found in cache. if found, return it though.
|
||||
@ -508,51 +575,44 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
|
||||
return val;
|
||||
}
|
||||
|
||||
// establish the list boundary keys
|
||||
key144_t startKey ;
|
||||
key144_t endKey ;
|
||||
//makeStartKey ( &startKey, termId );
|
||||
//makeEndKey ( &endKey , termId );
|
||||
// . ask rdb for an upper bound on this list size
|
||||
// . but actually, it will be somewhat of an estimate 'cuz of RdbTree
|
||||
//key144_t maxKey;
|
||||
key144_t maxKey;
|
||||
//int64_t maxRecs;
|
||||
// . don't count more than these many in the map
|
||||
// . that's our old truncation limit, the new stuff isn't as dense
|
||||
//int32_t oldTrunc = 100000;
|
||||
// turn this off for this
|
||||
//int64_t oldTrunc = -1;
|
||||
int64_t oldTrunc = -1;
|
||||
// get maxKey for only the top "oldTruncLimit" docids because when
|
||||
// we increase the trunc limit we screw up our extrapolation! BIG TIME!
|
||||
// maxRecs = m_rdb.getListSize(collnum,
|
||||
// (char *)&startKey,
|
||||
// (char *)&endKey,
|
||||
// (char *)&maxKey,
|
||||
// oldTrunc );
|
||||
int64_t maxRecs = m_rdb.getListSize(collnum,
|
||||
(char *)&startKey,
|
||||
(char *)&endKey,
|
||||
(char *)&maxKey,
|
||||
oldTrunc );
|
||||
|
||||
makeStartKey ( &startKey, termId );
|
||||
makeEndKey ( &endKey , termId );
|
||||
|
||||
int64_t numBytes = 0;
|
||||
|
||||
// get the # more slowly but exact for qa tests so it agrees
|
||||
// with the results of the last time we ran it
|
||||
if ( qaTest )
|
||||
// TODO: just get the actual list and count unique docids
|
||||
// with a blocking msg5...
|
||||
numBytes += m_rdb.m_buckets.getListSizeExact(collnum,
|
||||
(char *)&startKey,
|
||||
(char *)&endKey);
|
||||
else
|
||||
numBytes += m_rdb.m_buckets.getListSize(collnum,
|
||||
(char *)&startKey,
|
||||
(char *)&endKey,
|
||||
NULL,NULL);
|
||||
// if ( qaTest )
|
||||
// // TODO: just get the actual list and count unique docids
|
||||
// // with a blocking msg5...
|
||||
// numBytes += m_rdb.m_buckets.getListSizeExact(collnum,
|
||||
// (char *)&startKey,
|
||||
// (char *)&endKey);
|
||||
// else
|
||||
numBytes += m_rdb.m_buckets.getListSize(collnum,
|
||||
(char *)&startKey,
|
||||
(char *)&endKey,
|
||||
NULL,NULL);
|
||||
|
||||
|
||||
|
||||
// convert from size in bytes to # of recs
|
||||
numBytes /= sizeof(POSDBKEY);
|
||||
maxRecs += numBytes / sizeof(POSDBKEY);
|
||||
|
||||
// RdbList list;
|
||||
// makeStartKey ( &startKey, termId );
|
||||
@ -570,9 +630,10 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
|
||||
// if ( numPos*18 != numBytes ) {
|
||||
// char *xx=NULL;*xx=0; }
|
||||
|
||||
|
||||
|
||||
// and assume each shard has about the same #
|
||||
numBytes *= g_hostdb.m_numShards;
|
||||
maxRecs *= g_hostdb.m_numShards;
|
||||
|
||||
// over all splits!
|
||||
//maxRecs *= g_hostdb.m_numShards;
|
||||
@ -585,9 +646,9 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
|
||||
//log("posdb: approx=%"INT64" exact=%"INT64"",maxRecs,numBytes);
|
||||
|
||||
// now cache it. it sets g_errno to zero.
|
||||
g_termFreqCache.addLongLong2 ( collnum, termId, numBytes );
|
||||
g_termFreqCache.addLongLong2 ( collnum, termId, maxRecs );
|
||||
// return it
|
||||
return numBytes;//maxRecs;
|
||||
return maxRecs;
|
||||
}
|
||||
|
||||
//////////////////
|
||||
@ -820,6 +881,7 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
|
||||
|
||||
if ( m_r->m_doSiteClustering ) nn *= 2;
|
||||
|
||||
// limit to this regardless!
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( m_coll );
|
||||
//if ( ! cr ) return false;
|
||||
@ -832,7 +894,8 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
|
||||
if ( nn < m_r->m_docsToGet )
|
||||
log("query: warning only getting up to %"INT64" docids "
|
||||
"even though %"INT32" requested!!"
|
||||
"even though %"INT32" requested because termlist "
|
||||
"sizes are so small!!"
|
||||
, nn
|
||||
, m_r->m_docsToGet );
|
||||
|
||||
@ -844,10 +907,10 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
}
|
||||
// let's use nn*4 to try to get as many scores as possible, although
|
||||
// it may still not work!
|
||||
int32_t xx = m_r->m_docsToGet ;
|
||||
int32_t xx = nn;//m_r->m_docsToGet ;
|
||||
// try to fix a core of growing this table in a thread when xx == 1
|
||||
if ( xx < 32 ) xx = 32;
|
||||
if ( m_r->m_doSiteClustering ) xx *= 4;
|
||||
//if ( m_r->m_doSiteClustering ) xx *= 4;
|
||||
m_maxScores = xx;
|
||||
// for seeing if a docid is in toptree. niceness=0.
|
||||
//if ( ! m_docIdTable.set(8,0,xx*4,NULL,0,false,0,"dotb") )
|
||||
|
31
Summary.cpp
31
Summary.cpp
@ -158,6 +158,7 @@ bool Summary::set2 ( Xml *xml ,
|
||||
|
||||
// query terms
|
||||
int32_t numTerms = q->getNumTerms();
|
||||
|
||||
// . compute our word weights wrt each query. words which are more rare
|
||||
// have a higher weight. We use this to weight the terms importance
|
||||
// when generating the summary.
|
||||
@ -195,6 +196,15 @@ bool Summary::set2 ( Xml *xml ,
|
||||
m_wordWeights[i] = 1.0;
|
||||
}
|
||||
|
||||
if ( g_conf.m_logDebugSummary ) {
|
||||
for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
|
||||
int64_t tf = -1;
|
||||
if ( termFreqs ) tf = termFreqs[i];
|
||||
log("sum: u=%s wordWeights[%"INT32"]=%f tf=%"INT64"",
|
||||
f->m_url,i,m_wordWeights[i],tf);
|
||||
}
|
||||
}
|
||||
|
||||
// convenience
|
||||
m_maxNumCharsPerLine = maxNumCharsPerLine;
|
||||
//m_qscores = qscores;
|
||||
@ -802,8 +812,9 @@ int64_t Summary::getBestWindow ( Matches *matches ,
|
||||
wordCount = 0;
|
||||
|
||||
// for debug
|
||||
char buf[5000];
|
||||
char *xp = buf;
|
||||
//char buf[5000];
|
||||
//char *xp = buf;
|
||||
SafeBuf xp;
|
||||
|
||||
// wtf?
|
||||
if ( b > nw ) b = nw;
|
||||
@ -819,8 +830,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
|
||||
char *c = words->m_words[i]+k;
|
||||
cs = getUtf8CharSize(c);
|
||||
if ( is_binary_utf8 ( c ) ) continue;
|
||||
memcpy ( xp , c , cs );
|
||||
xp += cs;
|
||||
xp.safeMemcpy ( c , cs );
|
||||
xp.nullTerm();
|
||||
}
|
||||
}
|
||||
|
||||
@ -830,7 +841,7 @@ int64_t Summary::getBestWindow ( Matches *matches ,
|
||||
// don't count just numeric words
|
||||
if ( words->isNum(i) ) continue;
|
||||
// check if there is a url. best way to check for '://'
|
||||
if ( !wids[i] ){
|
||||
if ( wids && !wids[i] ){
|
||||
char *wrd = words->m_words[i];
|
||||
int32_t wrdLen = words->m_wordLens[i];
|
||||
if ( wrdLen == 3 &&
|
||||
@ -859,8 +870,7 @@ int64_t Summary::getBestWindow ( Matches *matches ,
|
||||
|
||||
// print the score, "t"
|
||||
if ( g_conf.m_logDebugSummary ) {
|
||||
sprintf ( xp ,"(%"INT32")",t);
|
||||
xp += gbstrlen(xp);
|
||||
xp.safePrintf("(%"INT32")",t);
|
||||
}
|
||||
|
||||
// skip if not wid
|
||||
@ -900,8 +910,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
|
||||
score += t;
|
||||
|
||||
if ( g_conf.m_logDebugSummary ) {
|
||||
sprintf ( xp ,"[%"INT32"]",t);
|
||||
xp += gbstrlen(xp);
|
||||
xp.safePrintf ("[%"INT32"]{qwn=%"INT32",ww=%f}",t,qwn,
|
||||
m_wordWeights[qwn]);
|
||||
}
|
||||
|
||||
// inc the query word count for this window
|
||||
@ -940,7 +950,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
|
||||
// show it
|
||||
if ( g_conf.m_logDebugSummary )
|
||||
logf(LOG_DEBUG,"score=%08"INT32" prescore=%08"INT32" a=%05"INT32" b=%05"INT32" %s",
|
||||
(int32_t)score,oldScore,(int32_t)a,(int32_t)b,buf);
|
||||
(int32_t)score,oldScore,(int32_t)a,(int32_t)b,
|
||||
xp.getBufStart());
|
||||
|
||||
// set lasta, besta, bestb
|
||||
*lasta = a;
|
||||
|
@ -30169,7 +30169,7 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
// . no! think about it -- this can be huge for pages like
|
||||
// google.com!!!
|
||||
LinkInfo *info1 = ptr_linkInfo1;
|
||||
if ( info1 && m_req->m_getLinkInfo ) {
|
||||
if ( info1 ) { // && m_req->m_getLinkInfo ) {
|
||||
reply->m_pageNumInlinks = info1->m_totalInlinkingDocIds;
|
||||
reply->m_pageNumGoodInlinks = info1->m_numGoodInlinks;
|
||||
reply->m_pageNumUniqueIps = info1->m_numUniqueIps;
|
||||
|
35
qa.cpp
35
qa.cpp
@ -65,8 +65,8 @@ void markOut ( char *content , char *needle ) {
|
||||
// a consistent LENGTH if we had 10 hits vs 9... making the hash
|
||||
// different
|
||||
|
||||
// space out digits
|
||||
for ( ; *s && is_digit(*s); s++ ) *s = ' ';
|
||||
// space out digits. including decimal point.
|
||||
for ( ; *s && (is_digit(*s)||*s=='.'); s++ ) *s = ' ';
|
||||
|
||||
// loop for more for the "rand64=" thing
|
||||
content = s;
|
||||
@ -173,6 +173,9 @@ void processReply ( char *reply , int32_t replyLen ) {
|
||||
markOut ( content , "<currentTimeUTC>");
|
||||
markOut ( content , "<responseTimeMS>");
|
||||
|
||||
// ...from an index of about 429 pages in 0.91 seconds in collection...
|
||||
markOut ( content , " pages in ");
|
||||
|
||||
// until i figure this one out, take it out
|
||||
markOut ( content , "<docsInCollection>");
|
||||
|
||||
@ -533,7 +536,11 @@ bool qainject1 ( ) {
|
||||
// turn off images thumbnails
|
||||
if ( ! s_flags[17] ) {
|
||||
s_flags[17] = true;
|
||||
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
|
||||
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
|
||||
// turn off use robots to avoid that
|
||||
// xyz.com/robots.txt redir to seekseek.com
|
||||
"&obeyRobots=0"
|
||||
,
|
||||
// checksum of reply expected
|
||||
238170006 ) )
|
||||
return false;
|
||||
@ -708,6 +715,8 @@ bool qainject1 ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
//static int32_t s_savedAutoSaveFreq = 0;
|
||||
|
||||
bool qainject2 ( ) {
|
||||
|
||||
//if ( ! s_callback ) s_callback = qainject2;
|
||||
@ -718,6 +727,8 @@ bool qainject2 ( ) {
|
||||
//static bool s_x1 = false;
|
||||
if ( ! s_flags[0] ) {
|
||||
s_flags[0] = true;
|
||||
//s_savedAutoSaveFreq = g_conf.m_autoSaveFrequency;
|
||||
//g_conf.m_autoSaveFrequency = 0;
|
||||
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
|
||||
return false;
|
||||
}
|
||||
@ -738,7 +749,12 @@ bool qainject2 ( ) {
|
||||
// turn off images thumbnails
|
||||
if ( ! s_flags[17] ) {
|
||||
s_flags[17] = true;
|
||||
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
|
||||
// can't turn off spiders because we need for query reindex
|
||||
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
|
||||
// turn off use robots to avoid that
|
||||
// xyz.com/robots.txt redir to seekseek.com
|
||||
"&obeyRobots=0"
|
||||
,
|
||||
// checksum of reply expected
|
||||
238170006 ) )
|
||||
return false;
|
||||
@ -818,7 +834,17 @@ bool qainject2 ( ) {
|
||||
// mdw: query DELETE test
|
||||
//
|
||||
if ( ! s_flags[30] ) {
|
||||
|
||||
|
||||
s_flags[30] = true;
|
||||
|
||||
// log("qa: SUCCESSFULLY COMPLETED "
|
||||
// "QA INJECT TEST 2 *** FAKE");
|
||||
// //if ( s_callback == qainject ) exit(0);
|
||||
// g_conf.m_autoSaveFrequency = s_savedAutoSaveFreq;
|
||||
// return true;
|
||||
|
||||
|
||||
if ( ! getUrl ( "/admin/reindex"
|
||||
"?c=qatest123"
|
||||
"&format=xml"
|
||||
@ -874,6 +900,7 @@ bool qainject2 ( ) {
|
||||
log("qa: SUCCESSFULLY COMPLETED "
|
||||
"QA INJECT TEST 2");
|
||||
//if ( s_callback == qainject ) exit(0);
|
||||
//g_conf.m_autoSaveFrequency = s_savedAutoSaveFreq;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user