fix the source of lots of corruption in spiderdb and titledb.

rdbmem.cpp was storing new records in the mem region being dumped,
which got reset when the dump completed, leaving them pointing at
garbage. also do not add keys that fall in the collnum and key range
of the list currently being dumped; return ETRYAGAIN instead. added
verify writes parms. clean titledb and spiderdb corruption out of
the tree on startup.
Matt Wells 2016-03-15 15:54:12 -07:00
parent 0fdbaa4196
commit 8a65d21371
11 changed files with 177 additions and 15 deletions
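
For context on the ETRYAGAIN path added in Rdb.cpp below: an add that collides with the collnum/key range being dumped now fails with g_errno set to ETRYAGAIN, and the caller is expected to retry rather than drop the record. A minimal caller sketch; the retry helper and its backoff are hypothetical, and the full addList() signature is truncated in the hunk, so the call shape is approximate.

bool addListWithRetry ( Rdb *rdb , collnum_t collnum , RdbList *list ) {
	for ( ; ; ) {
		g_errno = 0;
		// fails with ETRYAGAIN if the list overlaps the range
		// currently being dumped for this collnum
		if ( rdb->addList ( collnum , list , 1/*niceness*/ ) )
			return true;
		if ( g_errno != ETRYAGAIN ) return false; // real error
		usleep ( 100 * 1000 ); // back off 100ms and retry
	}
}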

CollectionRec.cpp

@@ -3922,6 +3922,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// crawl everything else, but don't harvest links,
// we have to see if the page content matches the "ppp"
@@ -3929,6 +3932,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("default");
m_spiderPriorities [i] = 52;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
m_harvestLinks [i] = false;
i++;
goto done;
@@ -3939,19 +3945,27 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp && matchesupp");
m_spiderPriorities [i] = 55;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// if just matches ucp, just crawl it, do not process
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
i++;
// just process, do not spider links if it does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
m_harvestLinks [i] = false;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
@@ -3973,6 +3987,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 53;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away.
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
// process everything since upp is empty
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
@@ -3995,6 +4012,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 54;
if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
// let's always make this without delay because if we
// restart the round we want these to process right away
if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
//m_harvestLinks [i] = false;
//m_spiderDiffbotApiUrl[i].set ( api );
i++;

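The hunks above append rows to a url-filter table kept as parallel arrays indexed by i. Condensed into a hypothetical helper (names mirror the diff): a respiderFreq above zero means the crawl round was restarted, so matching urls get spider frequency 0 and process right away.

void addFilterRow ( CollectionRec *cr , int32_t &i , const char *expr ,
                    int32_t priority , float respiderFreq ,
                    bool harvestLinks ) {
	cr->m_regExs[i].set ( expr );
	cr->m_spiderPriorities [i] = priority;
	if ( cr->m_collectiveRespiderFrequency <= 0.0 )
		cr->m_spiderFreqs [i] = 0;
	// restarted round: process these right away
	if ( respiderFreq > 0.0 ) cr->m_spiderFreqs [i] = 0.0;
	cr->m_harvestLinks [i] = harvestLinks;
	i++;
}
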
Conf.h

@@ -503,6 +503,10 @@ class Conf {
// lookup requests to a host to maximize tfndb page cache hits?
//bool m_useBiasedTfndb;
// just ensure lists being written are valid rdb records (titlerecs)
// trying to isolate titlerec corruption
bool m_verifyDumpedLists;
// calls fsync(fd) if true after each write
bool m_flushWrites ;
bool m_verifyWrites;

Parms.cpp

@@ -12426,6 +12426,22 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "verify written lists";
m->m_desc = "Ensure lists being written to disk are not corrupt. "
"That title recs appear valid, etc. Helps isolate sources "
"of corruption. Used for debugging.";
m->m_cgi = "vwl";
m->m_off = (char *)&g_conf.m_verifyDumpedLists - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
m++;
m->m_title = "verify disk writes";
m->m_desc = "Read what was written in a verification step. Decreases "
"performance, but may help fight disk corruption mostly on "

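The two parms differ in scope: verify written lists (m_verifyDumpedLists) checks records in memory before they hit disk, while verify disk writes (m_verifyWrites) reads the bytes back after each write. A sketch of the read-back idea, using plain POSIX i/o rather than the codebase's own file layer:

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

// write, flush, then read back and compare; combines the
// m_flushWrites and m_verifyWrites behaviors into one call
bool verifiedWrite ( int fd , const void *buf , size_t len , off_t off ) {
	if ( pwrite ( fd , buf , len , off ) != (ssize_t)len ) return false;
	if ( fsync ( fd ) != 0 ) return false;
	char *check = (char *)malloc ( len );
	if ( ! check ) return false;
	bool ok = pread ( fd , check , len , off ) == (ssize_t)len &&
	          memcmp ( check , buf , len ) == 0;
	free ( check );
	return ok;
}
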
Rdb.cpp

@@ -2004,6 +2004,10 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
g_errno = ETRYAGAIN;
return false;
}
// if ( m_inDumpLoop ) {
// g_errno = ETRYAGAIN;
// return false;
// }
// if we are well into repair mode, level 2, do not add anything
// to spiderdb or titledb... that can mess up our titledb scan.
// we always rebuild tfndb, clusterdb, checksumdb and spiderdb
@@ -2419,6 +2423,30 @@ bool Rdb::addRecord ( collnum_t collnum,
return false;
}
// do not add if the key is in the range being dumped, because when
// the dump completes it calls deleteList() and removes the nodes from
// the tree, so if we overwrote a node currently being dumped we
// would lose it.
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
// ensure the dump is dumping the collnum of this key
m_dump.m_collnum == collnum &&
m_dump.m_lastKeyInQueue &&
// the dump does not split positive/negative key pairs, so our
// positive/negative twin is either in the dump with us or not in
// it at all; any positive/negative annihilation below should be
// ok and we should be safe to call deleteNode() below
KEYCMP(key,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(key,m_dump.getLastKeyInQueue (),m_ks)<=0 ) {
// tell caller to wait and try again later
g_errno = ETRYAGAIN;
return false;
}
// save orig
char *orig = NULL;
@@ -2618,13 +2646,17 @@ bool Rdb::addRecord ( collnum_t collnum,
// CAUTION: we should not annihilate with oppKey if oppKey may
// be in the process of being dumped to disk! This would
// render our annihilation useless and make undeletable data
/*
if ( m_dump.isDumping() &&
//oppKey >= m_dump.getFirstKeyInQueue() &&
// ensure the dump is dumping the collnum of this key
m_dump.m_collnum == collnum &&
m_dump.m_lastKeyInQueue &&
KEYCMP(oppKey,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
//oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
KEYCMP(oppKey,m_dump.getLastKeyInQueue (),m_ks)<=0 )
goto addIt;
*/
// BEFORE we delete it, save it. this is a special hack
// so we can UNDO this deleteNode() should the titledb rec
// add fail.
@@ -2698,7 +2730,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// if we did not find an oppKey and are tfndb, flag this
//if ( n<0 && m_rdbId == RDB_TFNDB ) s_tfndbHadOppKey = false;
addIt:
// addIt:
// mark as changed
//if ( ! m_needsSave ) {
// m_needsSave = true;

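The new guard in addRecord() is a simple inclusive window test over the dump queue. A standalone sketch; KEYCMP here is a memcmp stand-in, which assumes byte order matches key order, while the real macro compares gb's variable-width keys:

#include <string.h>

// true if key falls inside [first,last], the range queued for dump;
// such keys must be rejected with ETRYAGAIN until the dump finishes
bool keyInDumpWindow ( const char *key , const char *first ,
                       const char *last , int ks ) {
	return memcmp ( key , first , ks ) >= 0 &&
	       memcmp ( key , last  , ks ) <= 0;
}
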
RdbDump.cpp

@@ -406,10 +406,13 @@ bool RdbDump::dumpTree ( bool recall ) {
// . check the list we got from the tree for problems
// . ensures keys are ordered from lowest to highest as well
//#ifdef GBSANITYCHECK
if ( g_conf.m_verifyWrites ) {
if ( 1==1 ||
g_conf.m_verifyWrites ||
g_conf.m_verifyDumpedLists ) {
char *s = "none";
if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
log("dump: verifying list before dumping (rdb=%s)",s);
log("dump: verifying list before dumping (rdb=%s "
"collnum=%i)",s,(int)m_collnum);
m_list->checkList_r ( false , // removeNegRecs?
false , // sleep on problem?
m_rdb->m_rdbId );

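checkList_r() walks the whole list before it is written out; stripped of record parsing, its core invariant is that keys come out in ascending order. A simplified sketch assuming fixed-width records (real RdbList records are variable-width):

#include <stdint.h>
#include <string.h>

bool keysAscending ( const char *recs , int32_t nrecs , int32_t ks ) {
	for ( int32_t i = 1 ; i < nrecs ; i++ ) {
		const char *prev = recs + (i - 1) * ks;
		const char *cur  = recs + i * ks;
		// each key must be >= the one before it
		if ( memcmp ( cur , prev , ks ) < 0 ) return false;
	}
	return true;
}
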
RdbList.cpp

@@ -776,7 +776,7 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
char *rec = getCurrentRec();
int32_t usize = *(int32_t *)(rec+12+4);
if ( usize <= 0 ) {
if ( usize <= 0 || usize>100000000) {
log("db: bad titlerec uncompress size");
char *xx=NULL;*xx=0;
}

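The offsets in that check can be read off the diff: rec+12+4 skips the 12-byte titledb key and the 4-byte dataSize field, landing on the uncompressed-size header of the compressed titlerec. As a standalone sketch, with the same arbitrary 100MB cap the diff adds:

#include <stdint.h>
#include <string.h>

bool titleRecLooksSane ( const char *rec ) {
	int32_t usize;
	// first 4 bytes of the data portion = uncompressed size
	memcpy ( &usize , rec + 12 + 4 , 4 );
	return usize > 0 && usize <= 100000000;
}
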
RdbMem.cpp

@@ -90,15 +90,21 @@ void *RdbMem::dupData ( char *key , char *data , int32_t dataSize ,
void *RdbMem::allocData ( char *key , int32_t dataSize , collnum_t collnum ) {
// if we're dumping and key has been dumped, use the secondary mem
//if ( m_dump->isDumping() && key < m_dump->getLastKeyInQueue() ) {
if ( m_rdb->m_inDumpLoop && // m_dump->isDumping() &&
( collnum < m_rdb->m_dumpCollnum ||
(collnum == m_rdb->m_dumpCollnum &&
// if dump fails to alloc mem in RdbDump::dumpTree it does
// a sleep wrapper and keeps retrying, and
// RdbDump::m_lastKeyInQueue can remain NULL because we've
// never dumped out a list from the tree yet
m_rdb->m_dump.m_lastKeyInQueue &&
KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
if ( m_rdb->m_inDumpLoop ) {
/////
// MDW: 3/15/2016
// if we're dumping then ALWAYS use secondary mem, wtf...
// primary is being dumped out and when the dump completes
// the ptr gets reset so we'll end up pointing to garbage.
///////
// ( collnum < m_rdb->m_dumpCollnum ||
// (collnum == m_rdb->m_dumpCollnum &&
// // if dump fails to alloc mem in RdbDump::dumpTree it does
// // a sleep wrapper and keeps retrying, and
// // RdbDump::m_lastKeyInQueue can remain NULL because we've
// // never dumped out a list from the tree yet
// m_rdb->m_dump.m_lastKeyInQueue &&
// KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0))){
// if secondary mem is growing down...
if ( m_ptr2 > m_ptr1 ) {
// return NULL if it would breach,

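The failure mode being fixed: RdbMem keeps two regions, and the one feeding the dump has its pointer reset when the dump completes, so a record placed there mid-dump ends up pointing at reclaimed space. A toy model of the fix, omitting bounds checks and the grow-down secondary region:

#include <stddef.h>

struct TwoArena {
	char  m_buf[1024*1024];
	char *m_primary;    // consumed, then reset, by the dump
	char *m_secondary;  // survives the dump
	bool  m_inDumpLoop;
};

char *allocData ( TwoArena *m , size_t n ) {
	// the 3/15/2016 fix: while dumping, ALWAYS use secondary mem;
	// the old code picked primary for some collnum/key combinations
	char *&p = m->m_inDumpLoop ? m->m_secondary : m->m_primary;
	char *ret = p;
	p += n;
	return ret;
}

void onDumpComplete ( TwoArena *m ) {
	m->m_primary    = m->m_buf; // anything left here is now garbage
	m->m_inDumpLoop = false;
}
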
RdbTree.cpp

@@ -1145,6 +1145,8 @@ void RdbTree::deleteOrderedList ( collnum_t collnum ,
if ( m_useProtection ) protect ( );
}
#include "Spider.h"
// . this fixes the tree
// returns false if could not fix tree and sets g_errno, otherwise true
@@ -1170,6 +1172,12 @@ bool RdbTree::fixTree ( ) {
//CollectionRec *recs = g_collectiondb.m_recs;
int32_t max = g_collectiondb.m_numRecs;
log("db: Valid collection numbers range from 0 to %"INT32".",max);
bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
// now re-add the old nodes to the tree, they should not be overwritten
// by addNode()
for ( int32_t i = 0 ; i < n ; i++ ) {
@@ -1178,6 +1186,34 @@ bool RdbTree::fixTree ( ) {
log("db: Fixing node #%"INT32" of %"INT32".",i,n);
// skip if empty
if ( m_parents[i] <= -2 ) continue;
if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: removing titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
continue;
}
}
char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: removing spiderrequest bad url "
"%s from tree",sreq->m_url);
//return false;
continue;
}
}
collnum_t cn = m_collnums[i];
// verify collnum
if ( cn < 0 ) continue;
@@ -1185,6 +1221,7 @@ bool RdbTree::fixTree ( ) {
// collnum of non-existent coll
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
continue;
// now add just to set m_right/m_left/m_parent
if ( m_fixedDataSize == 0 )
addNode(cn,&m_keys[i*m_ks], NULL, 0 );
@@ -1233,6 +1270,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"linkdb" ) ) useHalfKeys = true;
bool isTitledb = false;
if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
bool isSpiderdb = false;
if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
// now check parent kid correlations
for ( int32_t i = 0 ; i < m_minUnusedNode ; i++ ) {
// this thing blocks for 1.5 secs for indexdb
@@ -1250,6 +1293,31 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }
if ( isTitledb && m_data[i] ) {
char *data = m_data[i];
int32_t ucompSize = *(int32_t *)data;
if ( ucompSize < 0 || ucompSize > 100000000 ) {
log("db: found titlerec with uncompressed "
"size of %i from tree",(int)ucompSize);
return false;
}
}
char *key = &m_keys[i*m_ks];
if ( isSpiderdb && m_data[i] &&
g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
char *data = m_data[i];
data -= sizeof(SPIDERDBKEY);
data -= 4;
SpiderRequest *sreq ;
sreq =(SpiderRequest *)data;
if ( strncmp(sreq->m_url,"http",4) ) {
log("db: spiderrequest bad url "
"%s",sreq->m_url);
return false;
}
}
// bad collnum?
if ( doCollRecCheck ) {
collnum_t cn = m_collnums[i];

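Both loops recover the full record from a tree slot the same way: m_data[i] points just past the record's key and its 4-byte dataSize, while SpiderRequest's layout begins at the key itself, so the pointer is backed up before the cast. A sketch of the two pieces; the key size is whatever sizeof(SPIDERDBKEY) is in the codebase:

#include <string.h>

// back up over the key and the 4-byte dataSize to the record start
const char *recFromTreeData ( const char *treeData , size_t keySize ) {
	return treeData - keySize - 4;
}

// a spider request whose url does not start with "http" is corrupt
bool urlLooksSane ( const char *url ) {
	return strncmp ( url , "http" , 4 ) == 0;
}
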
Spider.cpp

@@ -6044,7 +6044,8 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
int64_t waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
// do not spider more than once per 15 seconds ever!
// no! might be a query reindex!!
if ( waitInSecs < 15 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
/*
if ( waitInSecs < 1 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
static bool s_printed = false;
if ( ! s_printed ) {
s_printed = true;
@@ -6053,6 +6054,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
}
waitInSecs = 15;//900; this was 15 minutes
}
*/
// in fact, force docid based guys to be zero!
//if ( sreq->m_urlIsDocId ) waitInSecs = 0;
if ( sreq->m_isPageReindex ) waitInSecs = 0;

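With the 15-second floor commented out, the wait now comes straight from the url filter's frequency. The surrounding arithmetic, condensed into a sketch (m_spiderFreqs is in days; this is not the full getSpiderTimeMS()):

#include <stdint.h>

uint64_t nextSpiderTimeMS ( double spiderFreqDays ,
                            uint64_t lastSpideredMS ,
                            bool isPageReindex ) {
	int64_t waitInSecs = (int64_t)(spiderFreqDays * 3600.0 * 24.0);
	if ( isPageReindex ) waitInSecs = 0; // reindex: no delay
	return lastSpideredMS + (uint64_t)waitInSecs * 1000;
}
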
XmlDoc.cpp

@@ -213,6 +213,8 @@ class XmlDoc *g_xd;
void XmlDoc::reset ( ) {
m_oldDocExistedButHadError = false;
m_addedStatusDocId = 0;
if ( m_diffbotProxyReplyValid && m_diffbotProxyReply ) {
@@ -12087,6 +12089,7 @@ XmlDoc **XmlDoc::getOldXmlDoc ( ) {
// ok, fix the memleak here
mdelete ( m_oldDoc , sizeof(XmlDoc), "odnuke" );
delete ( m_oldDoc );
m_oldDocExistedButHadError = true;
//log("xmldoc: nuke xmldoc1=%"PTRFMT"",(PTRTYPE)m_oldDoc);
m_oldDoc = NULL;
g_errno = saved;
@@ -16156,6 +16159,12 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
od && od->m_gotDiffbotSuccessfulReply )
m_recycleDiffbotReply = true;
// to fight off corrupted title recs just assume that even though
// we could not uncompress the title rec that it had a successful reply
// if ( cr->m_diffbotOnlyProcessIfNewUrl &&
// m_oldDocExistedButHadError )
// m_recycleDiffbotReply = true;
// don't recycle if specfically asked to reindex though
if ( m_sreqValid && m_sreq.m_isPageReindex )
m_recycleDiffbotReply = false;

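The recycle decision reduces to three inputs as the hunk leaves it; the first clause of the condition is truncated in the hunk and assumed here to be the collection's only-process-if-new-url setting, as in the commented-out block:

bool shouldRecycleDiffbotReply ( bool onlyProcessIfNewUrl ,
                                 bool oldDocGotSuccessfulReply ,
                                 bool isPageReindex ) {
	bool recycle = onlyProcessIfNewUrl && oldDocGotSuccessfulReply;
	// an explicit page reindex must always reprocess
	if ( isPageReindex ) recycle = false;
	return recycle;
}
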
XmlDoc.h

@@ -1643,6 +1643,8 @@ class XmlDoc {
char m_isInIndex;
char m_wasInIndex;
bool m_oldDocExistedButHadError;
Msg8a m_msg8a;
char *m_tagdbColl;
int32_t m_tagdbCollLen;