more spider speed fixes. tried to fix corrupt rdbcache.
Matt Wells 2014-02-06 09:25:27 -08:00
parent 9145d89e3f
commit 4029b0b937
6 changed files with 39 additions and 13 deletions

View File

@@ -1105,10 +1105,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// and clear the robots.txt cache in case we recently spidered a
// robots.txt; we don't want to use it, we want to use the one we
// have in the test-parser subdir so we are consistent
RdbCache *robots = Msg13::getHttpCacheRobots();
RdbCache *others = Msg13::getHttpCacheOthers();
robots->clear ( oldCollnum );
others->clear ( oldCollnum );
//RdbCache *robots = Msg13::getHttpCacheRobots();
//RdbCache *others = Msg13::getHttpCacheOthers();
// clear() was removed due to possible corruption
//robots->clear ( oldCollnum );
//others->clear ( oldCollnum );
//g_templateTable.reset();
//g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
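With the per-collection clear() calls commented out, a reset collection can still see robots.txt replies cached before the reset; the main remaining guard would be the cache's own age limit. A minimal sketch of that age check, assuming the maxAge convention visible in the RdbCache::getLong call later in this commit (seconds, -1 meaning no limit); the struct and function names here are illustrative, not Gigablast's:

#include <ctime>

struct CachedRobots {
    time_t cachedTime;          // when the robots.txt reply was stored
};

// maxAge follows the convention in the getLong call later in this commit:
// seconds, with -1 meaning no age limit
bool isFresh(const CachedRobots &r, long maxAge) {
    if (maxAge < 0) return true;                       // never expires
    return (long)(time(nullptr) - r.cachedTime) <= maxAge;
}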

View File

@@ -18364,8 +18364,8 @@ void handleRequest3fLoop ( void *weArg ) {
if ( cx->m_spiderColl ) {
log("parms: forcing waiting tree rebuild");
cx->m_spiderColl->m_waitingTreeNeedsRebuild = true;
// and the dup cache because rebuilding the waiting tree
cx->m_spiderColl->m_dupCache.clear(0);
// reset the dup cache since we are rebuilding the waiting tree
cx->m_spiderColl->m_dupCache.reset();//clear(0);
}
// . assume we have urls ready to spider too
// . no, because if they change the filters and there are

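The hunk above swaps the per-collection clear(0) on the dup cache for a full reset(), in line with the clear() corruption concern elsewhere in this commit. A minimal sketch of the handler's shape, assuming a SpiderColl-like object; only the two members the hunk touches are modeled, the rest is illustrative:

#include <cstdint>
#include <unordered_set>

struct DupCacheLite {
    std::unordered_set<uint64_t> seen;  // url hashes added recently
    void reset() { seen.clear(); }      // drop everything, start empty
};

struct SpiderCollLite {
    bool         m_waitingTreeNeedsRebuild = false;
    DupCacheLite m_dupCache;
};

// after a spider-parm change: force the waiting tree to rebuild and wipe
// the whole dup cache, rather than using the per-collection clear(0) this
// commit treats as a corruption risk
void onSpiderParmsChanged(SpiderCollLite *sc) {
    if (!sc) return;
    sc->m_waitingTreeNeedsRebuild = true;
    sc->m_dupCache.reset();
}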
View File

@@ -2370,7 +2370,10 @@ bool Rdb::addRecord ( collnum_t collnum,
}
}
// cancel any spider request that is a dup in the dupcache to save disk space
// . cancel any spider request that is a dup in the dupcache to save disk space
// . MDW: can't do it this way; the dup check is local, so the request can
//   still end up in the twin's spiderdb and the twins go out of sync.
//   just rely on dedupSpiderList() in the merge.
/*
if ( m_rdbId == RDB_SPIDERDB && ! KEYNEG(key) ) {
// . this will create it if spiders are on and it's NULL
// . even if spiders are off we need to create it so
@@ -2385,6 +2388,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// in Spider.cpp will do that when called from addSpiderRequest() below
if ( isReq && sc->isInDupCache ( sreq , false ) ) return true;
}
*/
if ( m_useTree && (tn=m_tree.addNode ( collnum, key , data , dataSize ))>=0) {
// if adding to spiderdb, add to cache, too
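Dropping the dup check from Rdb::addRecord() means duplicates are only removed at merge time, where every twin runs the same deterministic pass over the same sorted data. A sketch of that idea (not Gigablast's actual dedupSpiderList(); the record type is a trimmed stand-in):

#include <cstdint>
#include <vector>

// trimmed stand-in for a spiderdb request record
struct SpiderReqLite { uint64_t urlHash48; int64_t addedTimeMs; };

// assumes the merged list is already sorted by urlHash48, as spiderdb keys
// would be; keeps the first request per url hash so every host, running
// the same merge over the same data, drops exactly the same duplicates
std::vector<SpiderReqLite> dedupMergedList(const std::vector<SpiderReqLite> &in) {
    std::vector<SpiderReqLite> out;
    for (const SpiderReqLite &r : in) {
        if (!out.empty() && out.back().urlHash48 == r.urlHash48)
            continue;                   // later duplicate, safe to drop
        out.push_back(r);
    }
    return out;
}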

View File

@@ -325,7 +325,7 @@ long RdbCache::getLong ( collnum_t collnum ,
(char *)&k,
&rec ,
&recSize ,
false ,
false , // do copy?
maxAge , // in seconds, -1 means none
true , // incCounts?
NULL , // cacheTime ptr
@@ -745,6 +745,9 @@ bool RdbCache::addRecord ( collnum_t collnum ,
//long long startTime = gettimeofdayInMillisecondsLocal();
if ( collnum < (collnum_t)0) {char *xx=NULL;*xx=0; }
if ( collnum >= m_maxColls ) {char *xx=NULL;*xx=0; }
// full key not allowed because we use that in markDeletedRecord()
if ( KEYCMP(cacheKey,KEYMAX(),m_cks) == 0 ) { char *xx=NULL;*xx=0; }
// bail if cache empty
if ( m_totalBufSize <= 0 ) return true;
// debug msg
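The new sanity check above rejects the full key because markDeletedRecord() reuses it as the "voided slot" marker. A small sketch of that sentinel convention, with an illustrative key size in place of m_cks:

#include <cstring>

enum { KS = 12 };                       // illustrative key size (m_cks)

bool isMaxKey(const char *key) {
    for (int i = 0; i < KS; i++)
        if ((unsigned char)key[i] != 0xff) return false;
    return true;
}

// markDeletedRecord()-style voiding: stamp the slot's key with the max key
void markDeleted(char *recKeyInBuf) {
    memset(recKeyInBuf, 0xff, KS);
}

bool addRecordChecked(const char *cacheKey) {
    // the all-0xff key is reserved as the "voided slot" marker, so a real
    // record may never be stored under it; mirrors the new sanity check
    if (isMaxKey(cacheKey)) return false;
    // ... normal add path elided ...
    return true;
}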
@@ -953,7 +956,7 @@ bool RdbCache::addRecord ( collnum_t collnum ,
// delete the rec at m_tail from the hashtable
bool RdbCache::deleteRec ( ) {
// sanity.
if ( m_tail < 0 || m_tail > m_totalBufSize ) {
if ( m_tail < 0 || m_tail >= m_totalBufSize ) {
char *xx = NULL; *xx = 0;}
// don't do anything if we're empty
@@ -996,6 +999,7 @@ bool RdbCache::deleteRec ( ) {
"maxCollNum=%li dbname=%s", (long)start,
(long)collnum, g_collectiondb.m_numRecsUsed,
m_dbname);
char *xx=NULL;*xx=0;
// exception for gourav's bug (dbname=Users)
// i am tired of it crapping out every 2-3 wks
if ( m_dbname[0]=='U' ) return true;
@@ -1064,11 +1068,12 @@ bool RdbCache::deleteRec ( ) {
m_tail += (p - start);
// sanity. this must be failing due to a corrupt dataSize...
if ( m_tail < 0 || m_tail > m_totalBufSize ) {
if ( m_tail < 0 || m_tail + sizeof(collnum_t) + m_cks + 4 > m_totalBufSize ) {
char *xx = NULL; *xx = 0;}
// delete key from hash table, iff it is for THIS record
// but if it has not already been voided
// but only if it has not already been voided.
// we set the key to KEYMAX() in markDeletedRecord()
if ( KEYCMP(k,KEYMAX(),m_cks) != 0 ){
removeKey ( collnum , k , start );
markDeletedRecord(start);
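Both deleteRec() checks get tightened in this file: m_tail may no longer equal m_totalBufSize, and after advancing it must still leave room for a record header (collnum, key, 4-byte dataSize). A sketch of the tightened bound, assuming that header layout; collnum_t is modeled as a 2-byte integer here:

#include <cstdint>

typedef int16_t collnum_t;      // assumed 2-byte collnum for illustration

// each buffered record starts with: collnum | key (keySize bytes) |
// dataSize (4 bytes); a sane tail must leave room for at least that header
bool tailLooksSane(long tail, long totalBufSize, long keySize /* m_cks */) {
    if (tail < 0) return false;
    long minHeader = (long)sizeof(collnum_t) + keySize + 4;
    return tail + minHeader <= totalBufSize;
}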
@@ -1291,9 +1296,14 @@ void RdbCache::clearAll ( ) {
}
*/
//
// . MDW: took out clear() on suspicion of corruption... i think ninad's
// corruption detection would panic on collnum_t's of -1 anyway...
//
// . this just clears the contents of the cache
// . used when deleting a collection in Rdb::delColl() and used in
// Rdb::updateToRebuild() when updating/setting the rdb to a rebuilt rdb
/*
void RdbCache::clear ( collnum_t collnum ) {
// bail if no writing ops allowed now
if ( ! g_cacheWritesEnabled ) { char *xx=NULL;*xx=0; }
@@ -1309,6 +1319,7 @@ void RdbCache::clear ( collnum_t collnum ) {
*(collnum_t *)m_ptrs[i] = -1;
}
}
*/
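The commented-out clear() "deletes" a collection's records by stamping their collnum with -1, which is exactly the kind of value a corruption scan would flag, per the MDW note above. A toy model of that conflict (not the real RdbCache buffer layout):

#include <cassert>
#include <vector>

struct SlotHeader { short collnum; int dataSize; };

// old-style clear(): walk the cache and stamp matching slots with -1
void clearColl(std::vector<SlotHeader> &slots, short collnum) {
    for (SlotHeader &s : slots)
        if (s.collnum == collnum) s.collnum = -1;
}

// a scan that treats any negative collnum as damage will panic on slots
// clear() stamped, even though those stamps were intentional
void corruptionScan(const std::vector<SlotHeader> &slots) {
    for (const SlotHeader &s : slots)
        assert(s.collnum >= 0 && "negative collnum looks like corruption");
}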
bool RdbCache::load ( ) {
return load ( m_dbname );

View File

@@ -29,7 +29,8 @@
// . i'd like to set back to 10 for speed... maybe even 5 or less
#define SPIDER_DONE_TIMER 20
#define MAX_WINNER_NODES 40
// seems like timecity.com has gigabytes of spiderdb data so up from 40 to 400
#define MAX_WINNER_NODES 400
Doledb g_doledb;
@@ -615,7 +616,9 @@ bool Spiderdb::init ( ) {
-1 , // fixedDataSize
// now that we have MAX_WINNER_NODES allowed in doledb
// we don't have to keep spiderdb so tightly merged i guess..
3,//g_conf.m_spiderdbMinFilesToMerge , mintomerge
// MDW: performance seems to suffer when spiderdb is not tightly merged
// so put this back to "2"...
2,//g_conf.m_spiderdbMinFilesToMerge , mintomerge
maxMem,//g_conf.m_spiderdbMaxTreeMem ,
maxTreeNodes ,
true , // balance tree?
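A rough way to see why a loosely merged spiderdb slows things down: each read over a key range has to consult every unmerged file on disk, so lookup cost grows with the file count, and minFilesToMerge=2 keeps that count low by merging early. Illustrative cost model only, not taken from the codebase:

// roughly one seek per on-disk file that may hold keys in the requested
// range, plus the in-memory tree at no seek cost
long estimatedSeeksPerRead(long numFilesOnDisk) {
    return numFilesOnDisk;
}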

View File

@@ -22656,6 +22656,8 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
bool ignore = false;
if ( mbuf[0] == '1' ) ignore = true;
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
//
// serialize each link into the metalist now
//
@@ -22934,6 +22936,10 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// set the key, ksr.m_key. isDel = false
ksr.setKey ( firstIp, *d , false );
// if we've recently added this url to spiderdb in Spider.cpp, skip it
if ( sc && sc->isInDupCache ( &ksr , false ) )
continue;
// . technically speaking we do not have any reply so we
// should not be calling this! cuz we don't have all the info
// . see if banned or filtered, etc.
@@ -22987,6 +22993,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
//if ( priority == SPIDER_PRIORITY_BANNED ) {
// linksBanned++; continue; }
// serialize into the buffer
long need = ksr.getRecSize();
// is that what we thought it would be?
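The new check above skips outlinks whose spider requests are already in the dup cache before they are serialized into the metalist. A stripped-down sketch of that filter; the request struct and hash-set dup cache are stand-ins for the real SpiderRequest and SpiderColl cache:

#include <cstdint>
#include <unordered_set>
#include <vector>

// stand-ins: a trimmed spider request and a dup cache keyed on url hash
struct SpiderRequestLite { int32_t firstIp; uint64_t urlHash48; };
using DupCacheLite = std::unordered_set<uint64_t>;

// mirrors the new check: if the spider coll's dup cache says this url was
// added to spiderdb recently, skip it instead of serializing it again
size_t serializeOutlinks(const std::vector<SpiderRequestLite> &links,
                         const DupCacheLite *dupCache,
                         std::vector<SpiderRequestLite> &metaList) {
    for (const SpiderRequestLite &ksr : links) {
        if (dupCache && dupCache->count(ksr.urlHash48))
            continue;                   // recently added, drop it here
        metaList.push_back(ksr);        // stand-in for real serialization
    }
    return metaList.size();
}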