fix the source of lots of corruption in spiderdb and titledb.

rdbmem.cpp was storing new records in the mem region being dumped,
which got reset when the dump completed, leaving them pointing at
garbage. also do not add keys that fall in the collnum and key range
of the list currently being dumped; return ETRYAGAIN instead. added
verify writes parms. clean titledb and spiderdb corruption out of
the tree on startup.
Matt Wells 2016-03-15 15:54:12 -07:00
parent 0fdbaa4196
commit 8a65d21371
11 changed files with 177 additions and 15 deletions
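
For context on the ETRYAGAIN path added in Rdb.cpp below: an add that collides with the collnum/key range being dumped now fails with g_errno set to ETRYAGAIN, and the caller is expected to retry rather than drop the record. A minimal caller sketch; the retry helper and its backoff are hypothetical, and the full addList() signature is truncated in the hunk, so the call shape is approximate.

bool addListWithRetry ( Rdb *rdb , collnum_t collnum , RdbList *list ) {
	for ( ; ; ) {
		g_errno = 0;
		// fails with ETRYAGAIN if the list overlaps the range
		// currently being dumped for this collnum
		if ( rdb->addList ( collnum , list , 1/*niceness*/ ) )
			return true;
		if ( g_errno != ETRYAGAIN ) return false; // real error
		usleep ( 100 * 1000 ); // back off 100ms and retry
	}
}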

CollectionRec.cpp

@@ -3922,6 +3922,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// crawl everything else, but don't harvest links,
// we have to see if the page content matches the "ppp"
@@ -3929,6 +3932,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("default");
m_spiderPriorities [i] = 52;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
m_harvestLinks [i] = false;
i++;
goto done;
@@ -3939,19 +3945,27 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp && matchesupp");
m_spiderPriorities [i] = 55;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// if just matches ucp, just crawl it, do not process
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// just process, do not spider links if it does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
m_harvestLinks [i] = false;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
@@ -3973,6 +3987,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away.
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
// process everything since upp is empty
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
@@ -3995,6 +4012,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_harvestLinks [i] = false;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;

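The hunks above append rows to a url-filter table kept as parallel arrays indexed by i. Condensed into a hypothetical helper (names mirror the diff): a respiderFreq above zero means the crawl round was restarted, so matching urls get spider frequency 0 and process right away.

void addFilterRow ( CollectionRec *cr , int32_t &i , const char *expr ,
                    int32_t priority , float respiderFreq ,
                    bool harvestLinks ) {
	cr->m_regExs[i].set ( expr );
	cr->m_spiderPriorities [i] = priority;
	if ( cr->m_collectiveRespiderFrequency <= 0.0 )
		cr->m_spiderFreqs [i] = 0;
	// restarted round: process these right away
	if ( respiderFreq > 0.0 ) cr->m_spiderFreqs [i] = 0.0;
	cr->m_harvestLinks [i] = harvestLinks;
	i++;
}
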
Conf.h

@@ -503,6 +503,10 @@ class Conf {
// lookup requests to a host to maximize tfndb page cache hits?
//bool m_useBiasedTfndb;
// just ensure lists being written are valid rdb records (titlerecs)
// trying to isolate titlerec corruption
bool m_verifyDumpedLists;
// calls fsync(fd) if true after each write
bool m_flushWrites ;
bool m_verifyWrites;

Parms.cpp

@@ -12426,6 +12426,22 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "verify written lists";
m->m_desc = "Ensure lists being written to disk are not corrupt. "
"That title recs appear valid, etc. Helps isolate sources "
"of corruption. Used for debugging.";
m->m_cgi = "vwl";
m->m_off = (char *)&g_conf.m_verifyDumpedLists - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "

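The two parms differ in scope: verify written lists (m_verifyDumpedLists) checks records in memory before they hit disk, while verify disk writes (m_verifyWrites) reads the bytes back after each write. A sketch of the read-back idea, using plain POSIX i/o rather than the codebase's own file layer:

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

// write, flush, then read back and compare; combines the
// m_flushWrites and m_verifyWrites behaviors into one call
bool verifiedWrite ( int fd , const void *buf , size_t len , off_t off ) {
	if ( pwrite ( fd , buf , len , off ) != (ssize_t)len ) return false;
	if ( fsync ( fd ) != 0 ) return false;
	char *check = (char *)malloc ( len );
	if ( ! check ) return false;
	bool ok = pread ( fd , check , len , off ) == (ssize_t)len &&
	          memcmp ( check , buf , len ) == 0;
	free ( check );
	return ok;
}
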
Rdb.cpp

@@ -2004,6 +2004,10 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
g_errno = ETRYAGAIN;
return false;
}
// if ( m_inDumpLoop ) {
// g_errno = ETRYAGAIN;
// return false;
// }
// if we are well into repair mode, level 2, do not add anything
// to spiderdb or titledb... that can mess up our titledb scan.
// we always rebuild tfndb, clusterdb, checksumdb and spiderdb
@@ -2419,6 +2423,30 @@ bool Rdb::addRecord ( collnum_t collnum,
return false;
}
// do not add if the key is in the range being dumped, because when
// the dump completes it calls deleteList() and removes the nodes from
// the tree, so if we overwrote a node currently being dumped we
// would lose it.
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
// ensure the dump is dumping the collnum of this key
m_dump.m_collnum == collnum &&
m_dump.m_lastKeyInQueue &&
// the dump does not split positive/negative key pairs, so our
// positive/negative twin is either in the dump with us or not in
// it at all; any positive/negative annihilation below should be
// ok and we should be safe to call deleteNode() below
KEYCMP(key,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(key,m_dump.getLastKeyInQueue (),m_ks)<=0 ) {
// tell caller to wait and try again later
g_errno = ETRYAGAIN;
return false;
}
// save orig
char *orig = NULL;
@@ -2618,13 +2646,17 @@ bool Rdb::addRecord ( collnum_t collnum,
// CAUTION: we should not annihilate with oppKey if oppKey may
// be in the process of being dumped to disk! This would
// render our annihilation useless and make undeletable data
/*
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
// ensure the dump is dumping the collnum of this key
m_dump.m_collnum == collnum &&
m_dump.m_lastKeyInQueue &&
KEYCMP(oppKey,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(oppKey,m_dump.getLastKeyInQueue (),m_ks)<=0 )
goto addIt;
*/
// BEFORE we delete it, save it. this is a special hack
// so we can UNDO this deleteNode() should the titledb rec
// add fail.
@@ -2698,7 +2730,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// if we did not find an oppKey and are tfndb, flag this
//if ( n<0 && m_rdbId == RDB_TFNDB ) s_tfndbHadOppKey = false;
addIt:
// addIt:
// mark as changed
//if ( ! m_needsSave ) {
// m_needsSave = true;

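The new guard in addRecord() is a simple inclusive window test over the dump queue. A standalone sketch; KEYCMP here is a memcmp stand-in, which assumes byte order matches key order, while the real macro compares gb's variable-width keys:

#include <string.h>

// true if key falls inside [first,last], the range queued for dump;
// such keys must be rejected with ETRYAGAIN until the dump finishes
bool keyInDumpWindow ( const char *key , const char *first ,
                       const char *last , int ks ) {
	return memcmp ( key , first , ks ) >= 0 &&
	       memcmp ( key , last  , ks ) <= 0;
}
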
RdbDump.cpp

@@ -406,10 +406,13 @@ bool RdbDump::dumpTree ( bool recall ) {
// . check the list we got from the tree for problems
// . ensures keys are ordered from lowest to highest as well
//#ifdef GBSANITYCHECK
if ( g_conf.m_verifyWrites ) {
if ( 1==1 ||
g_conf.m_verifyWrites ||
g_conf.m_verifyDumpedLists ) {
char *s = "none";
if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
log("dump: verifying list before dumping (rdb=%s)",s);
log("dump: verifying list before dumping (rdb=%s "
"collnum=%i)",s,(int)m_collnum);
m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );

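checkList_r() walks the whole list before it is written out; stripped of record parsing, its core invariant is that keys come out in ascending order. A simplified sketch assuming fixed-width records (real RdbList records are variable-width):

#include <stdint.h>
#include <string.h>

bool keysAscending ( const char *recs , int32_t nrecs , int32_t ks ) {
	for ( int32_t i = 1 ; i < nrecs ; i++ ) {
		const char *prev = recs + (i - 1) * ks;
		const char *cur  = recs + i * ks;
		// each key must be >= the one before it
		if ( memcmp ( cur , prev , ks ) < 0 ) return false;
	}
	return true;
}
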
RdbList.cpp

@@ -776,7 +776,7 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
char *rec = getCurrentRec();
int32_t usize = *(int32_t *)(rec+12+4);
if ( usize <= 0 ) {
if ( usize <= 0 || usize>100000000) {
log("db: bad titlerec uncompress size");
char *xx=NULL;*xx=0;
}

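The offsets in that check can be read off the diff: rec+12+4 skips the 12-byte titledb key and the 4-byte dataSize field, landing on the uncompressed-size header of the compressed titlerec. As a standalone sketch, with the same arbitrary 100MB cap the diff adds:

#include <stdint.h>
#include <string.h>

bool titleRecLooksSane ( const char *rec ) {
	int32_t usize;
	// first 4 bytes of the data portion = uncompressed size
	memcpy ( &usize , rec + 12 + 4 , 4 );
	return usize > 0 && usize <= 100000000;
}
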
RdbMem.cpp

@@ -90,15 +90,21 @@ void *RdbMem::dupData ( char *key , char *data , int32_t dataSize ,
void *RdbMem::allocData ( char *key , int32_t dataSize , collnum_t collnum ) {
// if we're dumping and key has been dumped, use the secondary mem
//if ( m_dump->isDumping() && key < m_dump->getLastKeyInQueue() ) {
if ( m_rdb->m_inDumpLoop && // m_dump->isDumping() &&
( collnum < m_rdb->m_dumpCollnum ||
(collnum == m_rdb->m_dumpCollnum &&
// if dump fails to alloc mem in RdbDump::dumpTree it does
// a sleep wrapper and keeps retrying, and
// RdbDump::m_lastKeyInQueue can remain NULL because we've
// never dumped out a list from the tree yet
m_rdb->m_dump.m_lastKeyInQueue &&
KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
if ( m_rdb->m_inDumpLoop ) {
/////
// MDW: 3/15/2016
// if we're dumping then ALWAYS use secondary mem, wtf...
// primary is being dumped out and when the dump completes
// the ptr gets reset so we'll end up pointing to garbage.
///////
// ( collnum < m_rdb->m_dumpCollnum ||
// (collnum == m_rdb->m_dumpCollnum &&
// // if dump fails to alloc mem in RdbDump::dumpTree it does
// // a sleep wrapper and keeps retrying, and
// // RdbDump::m_lastKeyInQueue can remain NULL because we've
// // never dumped out a list from the tree yet
// m_rdb->m_dump.m_lastKeyInQueue &&
// KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0))){
// if secondary mem is growing down...
if ( m_ptr2 > m_ptr1 ) {
// return NULL if it would breach,

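The failure mode being fixed: RdbMem keeps two regions, and the one feeding the dump has its pointer reset when the dump completes, so a record placed there mid-dump ends up pointing at reclaimed space. A toy model of the fix, omitting bounds checks and the grow-down secondary region:

#include <stddef.h>

struct TwoArena {
	char  m_buf[1024*1024];
	char *m_primary;    // consumed, then reset, by the dump
	char *m_secondary;  // survives the dump
	bool  m_inDumpLoop;
};

char *allocData ( TwoArena *m , size_t n ) {
	// the 3/15/2016 fix: while dumping, ALWAYS use secondary mem;
	// the old code picked primary for some collnum/key combinations
	char *&p = m->m_inDumpLoop ? m->m_secondary : m->m_primary;
	char *ret = p;
	p += n;
	return ret;
}

void onDumpComplete ( TwoArena *m ) {
	m->m_primary    = m->m_buf; // anything left here is now garbage
	m->m_inDumpLoop = false;
}
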
RdbTree.cpp

@@ -1145,6 +1145,8 @@ void RdbTree::deleteOrderedList ( collnum_t collnum ,
if ( m_useProtection ) protect ( );
}
#include "Spider.h"
// . this fixes the tree
// returns false if could not fix tree and sets g_errno, otherwise true
@@ -1170,6 +1172,12 @@ bool RdbTree::fixTree ( ) {
//CollectionRec *recs = g_collectiondb.m_recs;
int32_t max = g_collectiondb.m_numRecs;
log("db: Valid collection numbers range from 0 to %"INT32".",max);
bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
// now re-add the old nodes to the tree, they should not be overwritten
// by addNode()
for ( int32_t i = 0 ; i < n ; i++ ) {
@@ -1178,6 +1186,34 @@ bool RdbTree::fixTree ( ) {
log("db: Fixing node #%"INT32" of %"INT32".",i,n);
// skip if empty
if ( m_parents[i] <= -2 ) continue;
if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: removing titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
continue;
}
}
char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: removing spiderrequest bad url "
"%s from tree",sreq->m_url);
//return false;
continue;
}
}
collnum_t cn = m_collnums[i];
// verify collnum
if ( cn < 0 ) continue;
@@ -1185,6 +1221,7 @@ bool RdbTree::fixTree ( ) {
// collnum of non-existent coll
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
continue;
// now add just to set m_right/m_left/m_parent
if ( m_fixedDataSize == 0 )
addNode(cn,&m_keys[i*m_ks], NULL, 0 );
@@ -1233,6 +1270,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"linkdb" ) ) useHalfKeys = true;
bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
// now check parent kid correlations
for ( int32_t i = 0 ; i < m_minUnusedNode ; i++ ) {
// this thing blocks for 1.5 secs for indexdb
@@ -1250,6 +1293,31 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }
if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: found titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
return false;
}
}
char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: spiderrequest bad url "
"%s",sreq->m_url);
return false;
}
}
// bad collnum?
if ( doCollRecCheck ) {
collnum_t cn = m_collnums[i];

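Both loops recover the full record from a tree slot the same way: m_data[i] points just past the record's key and its 4-byte dataSize, while SpiderRequest's layout begins at the key itself, so the pointer is backed up before the cast. A sketch of the two pieces; the key size is whatever sizeof(SPIDERDBKEY) is in the codebase:

#include <string.h>

// back up over the key and the 4-byte dataSize to the record start
const char *recFromTreeData ( const char *treeData , size_t keySize ) {
	return treeData - keySize - 4;
}

// a spider request whose url does not start with "http" is corrupt
bool urlLooksSane ( const char *url ) {
	return strncmp ( url , "http" , 4 ) == 0;
}
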
Spider.cpp

@@ -6044,7 +6044,8 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
int64_t waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
// do not spider more than once per 15 seconds ever!
// no! might be a query reindex!!
if ( waitInSecs < 15 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
/*
if ( waitInSecs < 1 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
static bool s_printed = false;
if ( ! s_printed ) {
s_printed = true;
@@ -6053,6 +6054,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
}
waitInSecs = 15;//900; this was 15 minutes
}
*/
// in fact, force docid based guys to be zero!
//if ( sreq->m_urlIsDocId ) waitInSecs = 0;
if ( sreq->m_isPageReindex ) waitInSecs = 0;

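With the 15-second floor commented out, the wait now comes straight from the url filter's frequency. The surrounding arithmetic, condensed into a sketch (m_spiderFreqs is in days; this is not the full getSpiderTimeMS()):

#include <stdint.h>

uint64_t nextSpiderTimeMS ( double spiderFreqDays ,
                            uint64_t lastSpideredMS ,
                            bool isPageReindex ) {
	int64_t waitInSecs = (int64_t)(spiderFreqDays * 3600.0 * 24.0);
	if ( isPageReindex ) waitInSecs = 0; // reindex: no delay
	return lastSpideredMS + (uint64_t)waitInSecs * 1000;
}
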
XmlDoc.cpp

@@ -213,6 +213,8 @@ class XmlDoc *g_xd;
void XmlDoc::reset ( ) {
m_oldDocExistedButHadError = false;
m_addedStatusDocId = 0;
if ( m_diffbotProxyReplyValid && m_diffbotProxyReply ) {
@@ -12087,6 +12089,7 @@ XmlDoc **XmlDoc::getOldXmlDoc ( ) {
// ok, fix the memleak here
mdelete ( m_oldDoc , sizeof(XmlDoc), "odnuke" );
delete ( m_oldDoc );
m_oldDocExistedButHadError = true;
//log("xmldoc: nuke xmldoc1=%"PTRFMT"",(PTRTYPE)m_oldDoc);
m_oldDoc = NULL;
g_errno = saved;
@@ -16156,6 +16159,12 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
od && od->m_gotDiffbotSuccessfulReply )
m_recycleDiffbotReply = true;
// to fight off corrupted title recs just assume that even though
// we could not uncompress the title rec that it had a successful reply
// if ( cr->m_diffbotOnlyProcessIfNewUrl &&
// m_oldDocExistedButHadError )
// m_recycleDiffbotReply = true;
// don't recycle if specfically asked to reindex though
if ( m_sreqValid && m_sreq.m_isPageReindex )
m_recycleDiffbotReply = false;

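The recycle decision reduces to three inputs as the hunk leaves it; the first clause of the condition is truncated in the hunk and assumed here to be the collection's only-process-if-new-url setting, as in the commented-out block:

bool shouldRecycleDiffbotReply ( bool onlyProcessIfNewUrl ,
                                 bool oldDocGotSuccessfulReply ,
                                 bool isPageReindex ) {
	bool recycle = onlyProcessIfNewUrl && oldDocGotSuccessfulReply;
	// an explicit page reindex must always reprocess
	if ( isPageReindex ) recycle = false;
	return recycle;
}
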
XmlDoc.h

@@ -1643,6 +1643,8 @@ class XmlDoc {
char m_isInIndex;
char m_wasInIndex;
bool m_oldDocExistedButHadError;
Msg8a m_msg8a;
char *m_tagdbColl;
int32_t m_tagdbCollLen;