fixed summary initialization error of the flags buffer.
fixed term freq algo. use exact term freq
for qatest123. made Summary.o -O3 again.
fix gbsystem() to disable both timers.
This commit is contained in:
Matt 2014-12-06 10:14:48 -07:00
parent 01d61d5427
commit 41c8817bdb
7 changed files with 172 additions and 65 deletions

View File

@ -2547,22 +2547,24 @@ void Loop::disableTimer() {
}
int gbsystem(char *cmd ) {
if ( ! g_conf.m_runAsDaemon )
setitimer(ITIMER_REAL, &g_loop.m_noInterrupt, NULL);
// if ( ! g_conf.m_runAsDaemon )
// setitimer(ITIMER_REAL, &g_loop.m_noInterrupt, NULL);
g_loop.disableTimer();
log("gb: running system(\"%s\")",cmd);
int ret = system(cmd);
if ( ! g_conf.m_runAsDaemon )
setitimer(ITIMER_REAL, &g_loop.m_realInterrupt, NULL);
g_loop.enableTimer();
// if ( ! g_conf.m_runAsDaemon )
// setitimer(ITIMER_REAL, &g_loop.m_realInterrupt, NULL);
return ret;
}
// void Loop::enableTimer() {
// m_canQuickPoll = true;
// // logf(LOG_WARN, "xxx enabling");
// setitimer(ITIMER_VIRTUAL, &m_quickInterrupt, NULL);
// //setitimer(ITIMER_REAL, &m_quickInterrupt, NULL);
// }
void Loop::enableTimer() {
m_canQuickPoll = true;
// logf(LOG_WARN, "xxx enabling");
setitimer(ITIMER_VIRTUAL, &m_quickInterrupt, NULL);
setitimer(ITIMER_REAL, &m_realInterrupt, NULL);
}

View File

@ -438,8 +438,9 @@ neighborhood.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
TermTable.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
#Summary.o:
# $(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
# why was this commented out?
Summary.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp
Title.o:
$(CC) $(DEFS) $(CPPFLAGS) -O2 -c $*.cpp

View File

@ -103,6 +103,9 @@ void Matches::setQuery ( Query *q ) {
//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
// this is word based. these are each 1 byte
memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t));
// # of WORDS in the query
int32_t nqt = m_q->m_numTerms;
@ -110,7 +113,7 @@ void Matches::setQuery ( Query *q ) {
int32_t numToMatch = 0;
for ( int32_t i = 0 ; i < nqt ; i++ ) {
// reset this
m_qwordFlags[i] = 0;
//m_qwordFlags[i] = 0;
// get query word #i
//QueryWord *qw = &m_q->m_qwords[i];
QueryTerm *qt = &m_q->m_qterms[i];

137
Posdb.cpp
View File

@ -461,8 +461,80 @@ static bool s_cacheInit = false;
// . accesses RdbMap to estimate size of the indexList for this termId
// . returns an UPPER BOUND
// . because this is over POSDB now and not indexdb, a document is counted
// once for every occurrence of term "termId" it has... :{
int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
// establish the list boundary keys
key144_t startKey ;
key144_t endKey ;
makeStartKey ( &startKey, termId );
makeEndKey ( &endKey , termId );
// doing qa test?
bool qaTest = false;
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 )
qaTest = true;
// if so, use the exact size
if ( qaTest ) {
Msg5 msg5;
RdbList list;
g_threads.disableThreads();
msg5.getList ( RDB_POSDB ,
collnum ,
&list ,
&startKey ,
&endKey ,
64000000 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL ,
0 ,
-1 ,
true ,
-1LL ,
NULL , // msg5b ptr
true );
// re-enable threads
g_threads.enableThreads();
//int64_t numBytes = list.getListSize();
// see how many diff docids we have... easier to debug this
// loop over entries in list
int64_t docId = 0;
int64_t count = 0;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key144_t k; list.getCurrentKey(&k);
// is it a delete?
if ( (k.n0 & 0x01) == 0x00 ) continue;
int64_t d = g_posdb.getDocId(&k);
if ( d == docId ) continue;
docId = d;
count++;
}
// convert to # keys, approx. just an estimate since
// some keys are compressed...
// none except first key are full size. they are all just
// 12 bytes etc.
int64_t numKeys = count;
if ( numKeys < 0 ) numKeys = 0;
// and assume each shard has about the same #
numKeys *= g_hostdb.m_numShards;
return numKeys;
}
//collnum_t collnum = g_collectiondb.getCollnum ( coll );
if ( ! s_cacheInit ) {
@ -494,11 +566,6 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
500 , // maxage secs
true );// promote?
// doing qa test?
bool qaTest = false;
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( cr && strcmp(cr->m_coll,"qatest123") == 0 )
qaTest = true;
// -1 means not found in cache. if found, return it though.
@ -508,51 +575,44 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
return val;
}
// establish the list boundary keys
key144_t startKey ;
key144_t endKey ;
//makeStartKey ( &startKey, termId );
//makeEndKey ( &endKey , termId );
// . ask rdb for an upper bound on this list size
// . but actually, it will be somewhat of an estimate 'cuz of RdbTree
//key144_t maxKey;
key144_t maxKey;
//int64_t maxRecs;
// . don't count more than these many in the map
// . that's our old truncation limit, the new stuff isn't as dense
//int32_t oldTrunc = 100000;
// turn this off for this
//int64_t oldTrunc = -1;
int64_t oldTrunc = -1;
// get maxKey for only the top "oldTruncLimit" docids because when
// we increase the trunc limit we screw up our extrapolation! BIG TIME!
// maxRecs = m_rdb.getListSize(collnum,
// (char *)&startKey,
// (char *)&endKey,
// (char *)&maxKey,
// oldTrunc );
int64_t maxRecs = m_rdb.getListSize(collnum,
(char *)&startKey,
(char *)&endKey,
(char *)&maxKey,
oldTrunc );
makeStartKey ( &startKey, termId );
makeEndKey ( &endKey , termId );
int64_t numBytes = 0;
// get the # more slowly but exact for qa tests so it agrees
// with the results of the last time we ran it
if ( qaTest )
// TODO: just get the actual list and count unique docids
// with a blocking msg5...
numBytes += m_rdb.m_buckets.getListSizeExact(collnum,
(char *)&startKey,
(char *)&endKey);
else
numBytes += m_rdb.m_buckets.getListSize(collnum,
(char *)&startKey,
(char *)&endKey,
NULL,NULL);
// if ( qaTest )
// // TODO: just get the actual list and count unique docids
// // with a blocking msg5...
// numBytes += m_rdb.m_buckets.getListSizeExact(collnum,
// (char *)&startKey,
// (char *)&endKey);
// else
numBytes += m_rdb.m_buckets.getListSize(collnum,
(char *)&startKey,
(char *)&endKey,
NULL,NULL);
// convert from size in bytes to # of recs
numBytes /= sizeof(POSDBKEY);
maxRecs += numBytes / sizeof(POSDBKEY);
// RdbList list;
// makeStartKey ( &startKey, termId );
@ -570,9 +630,10 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
// if ( numPos*18 != numBytes ) {
// char *xx=NULL;*xx=0; }
// and assume each shard has about the same #
numBytes *= g_hostdb.m_numShards;
maxRecs *= g_hostdb.m_numShards;
// over all splits!
//maxRecs *= g_hostdb.m_numShards;
@ -585,9 +646,9 @@ int64_t Posdb::getTermFreq ( collnum_t collnum, int64_t termId ) {
//log("posdb: approx=%"INT64" exact=%"INT64"",maxRecs,numBytes);
// now cache it. it sets g_errno to zero.
g_termFreqCache.addLongLong2 ( collnum, termId, numBytes );
g_termFreqCache.addLongLong2 ( collnum, termId, maxRecs );
// return it
return numBytes;//maxRecs;
return maxRecs;
}
//////////////////
@ -820,6 +881,7 @@ bool PosdbTable::allocTopTree ( ) {
if ( m_r->m_doSiteClustering ) nn *= 2;
// limit to this regardless!
//CollectionRec *cr = g_collectiondb.getRec ( m_coll );
//if ( ! cr ) return false;
@ -832,7 +894,8 @@ bool PosdbTable::allocTopTree ( ) {
if ( nn < m_r->m_docsToGet )
log("query: warning only getting up to %"INT64" docids "
"even though %"INT32" requested!!"
"even though %"INT32" requested because termlist "
"sizes are so small!!"
, nn
, m_r->m_docsToGet );
@ -844,10 +907,10 @@ bool PosdbTable::allocTopTree ( ) {
}
// let's use nn*4 to try to get as many scores as possible, although
// it may still not work!
int32_t xx = m_r->m_docsToGet ;
int32_t xx = nn;//m_r->m_docsToGet ;
// try to fix a core of growing this table in a thread when xx == 1
if ( xx < 32 ) xx = 32;
if ( m_r->m_doSiteClustering ) xx *= 4;
//if ( m_r->m_doSiteClustering ) xx *= 4;
m_maxScores = xx;
// for seeing if a docid is in toptree. niceness=0.
//if ( ! m_docIdTable.set(8,0,xx*4,NULL,0,false,0,"dotb") )

View File

@ -158,6 +158,7 @@ bool Summary::set2 ( Xml *xml ,
// query terms
int32_t numTerms = q->getNumTerms();
// . compute our word weights wrt each query. words which are more rare
// have a higher weight. We use this to weight the terms importance
// when generating the summary.
@ -195,6 +196,15 @@ bool Summary::set2 ( Xml *xml ,
m_wordWeights[i] = 1.0;
}
if ( g_conf.m_logDebugSummary ) {
for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
int64_t tf = -1;
if ( termFreqs ) tf = termFreqs[i];
log("sum: u=%s wordWeights[%"INT32"]=%f tf=%"INT64"",
f->m_url,i,m_wordWeights[i],tf);
}
}
// convenience
m_maxNumCharsPerLine = maxNumCharsPerLine;
//m_qscores = qscores;
@ -802,8 +812,9 @@ int64_t Summary::getBestWindow ( Matches *matches ,
wordCount = 0;
// for debug
char buf[5000];
char *xp = buf;
//char buf[5000];
//char *xp = buf;
SafeBuf xp;
// wtf?
if ( b > nw ) b = nw;
@ -819,8 +830,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
char *c = words->m_words[i]+k;
cs = getUtf8CharSize(c);
if ( is_binary_utf8 ( c ) ) continue;
memcpy ( xp , c , cs );
xp += cs;
xp.safeMemcpy ( c , cs );
xp.nullTerm();
}
}
@ -830,7 +841,7 @@ int64_t Summary::getBestWindow ( Matches *matches ,
// don't count just numeric words
if ( words->isNum(i) ) continue;
// check if there is a url. best way to check for '://'
if ( !wids[i] ){
if ( wids && !wids[i] ){
char *wrd = words->m_words[i];
int32_t wrdLen = words->m_wordLens[i];
if ( wrdLen == 3 &&
@ -859,8 +870,7 @@ int64_t Summary::getBestWindow ( Matches *matches ,
// print the score, "t"
if ( g_conf.m_logDebugSummary ) {
sprintf ( xp ,"(%"INT32")",t);
xp += gbstrlen(xp);
xp.safePrintf("(%"INT32")",t);
}
// skip if not wid
@ -900,8 +910,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
score += t;
if ( g_conf.m_logDebugSummary ) {
sprintf ( xp ,"[%"INT32"]",t);
xp += gbstrlen(xp);
xp.safePrintf ("[%"INT32"]{qwn=%"INT32",ww=%f}",t,qwn,
m_wordWeights[qwn]);
}
// inc the query word count for this window
@ -940,7 +950,8 @@ int64_t Summary::getBestWindow ( Matches *matches ,
// show it
if ( g_conf.m_logDebugSummary )
logf(LOG_DEBUG,"score=%08"INT32" prescore=%08"INT32" a=%05"INT32" b=%05"INT32" %s",
(int32_t)score,oldScore,(int32_t)a,(int32_t)b,buf);
(int32_t)score,oldScore,(int32_t)a,(int32_t)b,
xp.getBufStart());
// set lasta, besta, bestb
*lasta = a;

View File

@ -30169,7 +30169,7 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// . no! think about it -- this can be huge for pages like
// google.com!!!
LinkInfo *info1 = ptr_linkInfo1;
if ( info1 && m_req->m_getLinkInfo ) {
if ( info1 ) { // && m_req->m_getLinkInfo ) {
reply->m_pageNumInlinks = info1->m_totalInlinkingDocIds;
reply->m_pageNumGoodInlinks = info1->m_numGoodInlinks;
reply->m_pageNumUniqueIps = info1->m_numUniqueIps;

35
qa.cpp
View File

@ -65,8 +65,8 @@ void markOut ( char *content , char *needle ) {
// a consistent LENGTH if we had 10 hits vs 9... making the hash
// different
// space out digits
for ( ; *s && is_digit(*s); s++ ) *s = ' ';
// space out digits. including decimal point.
for ( ; *s && (is_digit(*s)||*s=='.'); s++ ) *s = ' ';
// loop for more for the "rand64=" thing
content = s;
@ -173,6 +173,9 @@ void processReply ( char *reply , int32_t replyLen ) {
markOut ( content , "<currentTimeUTC>");
markOut ( content , "<responseTimeMS>");
// ...from an index of about 429 pages in 0.91 seconds in collection...
markOut ( content , " pages in ");
// until i figure this one out, take it out
markOut ( content , "<docsInCollection>");
@ -533,7 +536,11 @@ bool qainject1 ( ) {
// turn off images thumbnails
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
// turn off use robots to avoid that
// xyz.com/robots.txt redir to seekseek.com
"&obeyRobots=0"
,
// checksum of reply expected
238170006 ) )
return false;
@ -708,6 +715,8 @@ bool qainject1 ( ) {
return true;
}
//static int32_t s_savedAutoSaveFreq = 0;
bool qainject2 ( ) {
//if ( ! s_callback ) s_callback = qainject2;
@ -718,6 +727,8 @@ bool qainject2 ( ) {
//static bool s_x1 = false;
if ( ! s_flags[0] ) {
s_flags[0] = true;
//s_savedAutoSaveFreq = g_conf.m_autoSaveFrequency;
//g_conf.m_autoSaveFrequency = 0;
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
return false;
}
@ -738,7 +749,12 @@ bool qainject2 ( ) {
// turn off images thumbnails
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// can't turn off spiders because we need for query reindex
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1"
// turn off use robots to avoid that
// xyz.com/robots.txt redir to seekseek.com
"&obeyRobots=0"
,
// checksum of reply expected
238170006 ) )
return false;
@ -818,7 +834,17 @@ bool qainject2 ( ) {
// mdw: query DELETE test
//
if ( ! s_flags[30] ) {
s_flags[30] = true;
// log("qa: SUCCESSFULLY COMPLETED "
// "QA INJECT TEST 2 *** FAKE");
// //if ( s_callback == qainject ) exit(0);
// g_conf.m_autoSaveFrequency = s_savedAutoSaveFreq;
// return true;
if ( ! getUrl ( "/admin/reindex"
"?c=qatest123"
"&format=xml"
@ -874,6 +900,7 @@ bool qainject2 ( ) {
log("qa: SUCCESSFULLY COMPLETED "
"QA INJECT TEST 2");
//if ( s_callback == qainject ) exit(0);
//g_conf.m_autoSaveFrequency = s_savedAutoSaveFreq;
return true;
}