diff --git a/Collectiondb.cpp b/Collectiondb.cpp
index d5557df0..45ffbdc8 100644
--- a/Collectiondb.cpp
+++ b/Collectiondb.cpp
@@ -3922,6 +3922,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
 	m_regExs[i].set("matchesucp");
 	m_spiderPriorities [i] = 53;
 	if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
+	// let's always make this without delay because if we
+	// restart the round we want these to process right away
+	if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
 	i++;
 	// crawl everything else, but don't harvest links,
 	// we have to see if the page content matches the "ppp"
@@ -3929,6 +3932,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
 	m_regExs[i].set("default");
 	m_spiderPriorities [i] = 52;
 	if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
+	// let's always make this without delay because if we
+	// restart the round we want these to process right away
+	if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
 	m_harvestLinks [i] = false;
 	i++;
 	goto done;
@@ -3939,19 +3945,27 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
 	m_regExs[i].set("matchesucp && matchesupp");
 	m_spiderPriorities [i] = 55;
 	if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
-
+	// let's always make this without delay because if we
+	// restart the round we want these to process right away
+	if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
 	//m_spiderDiffbotApiUrl[i].set ( api );
 	i++;
 	// if just matches ucp, just crawl it, do not process
 	m_regExs[i].set("matchesucp");
 	m_spiderPriorities [i] = 53;
 	if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
+	// let's always make this without delay because if we
+	// restart the round we want these to process right away
+	if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
 	i++;
 	// just process, do not spider links if does not match ucp
 	m_regExs[i].set("matchesupp");
 	m_spiderPriorities [i] = 54;
 	m_harvestLinks [i] = false;
 	if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
+	// let's always make this without delay because if we
+	// restart the round we want these to process right away
+	if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
 	//m_spiderDiffbotApiUrl[i].set ( api );
 	i++;
 	// do not crawl anything else
@@ -3973,6 +3987,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
 	m_regExs[i].set("matchesucp");
 	m_spiderPriorities [i] = 53;
 	if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
+	// let's always make this without delay because if we
+	// restart the round we want these to process right away.
+	if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
 	// process everything since upp is empty
 	//m_spiderDiffbotApiUrl[i].set ( api );
 	i++;
@@ -3995,6 +4012,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
 	m_regExs[i].set("matchesupp");
 	m_spiderPriorities [i] = 54;
 	if ( m_collectiveRespiderFrequency<=0.0) m_spiderFreqs [i] = 0;
+	// let's always make this without delay because if we
+	// restart the round we want these to process right away
+	if ( respiderFreq > 0.0 ) m_spiderFreqs[i] = 0.0;
 	//m_harvestLinks [i] = false;
 	//m_spiderDiffbotApiUrl[i].set ( api );
 	i++;
diff --git a/Conf.h b/Conf.h
index 63cf6707..83a05837 100644
--- a/Conf.h
+++ b/Conf.h
@@ -503,6 +503,10 @@ class Conf {
 	// lookup requests to a host to maxmize tfndb page cache hits?
 	//bool m_useBiasedTfndb;
 
+	// just ensure lists being written are valid rdb records (titlerecs)
+	// trying to isolate titlerec corruption
+	bool m_verifyDumpedLists;
+
 	// calls fsync(fd) if true after each write
 	bool m_flushWrites ;
 	bool m_verifyWrites;
diff --git a/Parms.cpp b/Parms.cpp
index 0e95fe53..4e014bd0 100644
--- a/Parms.cpp
+++ b/Parms.cpp
@@ -12426,6 +12426,22 @@ void Parms::init ( ) {
 	m++;
 	*/
 
+	m->m_title = "verify written lists";
+	m->m_desc  = "Ensure lists being written to disk are not corrupt. "
+		"That title recs appear valid, etc. Helps isolate sources "
+		"of corruption. Used for debugging.";
+	m->m_cgi   = "vwl";
+	m->m_off   = (char *)&g_conf.m_verifyDumpedLists - g;
+	m->m_type  = TYPE_BOOL;
+	m->m_def   = "0";
+	m->m_group = 0;
+	m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
+	m->m_page  = PAGE_MASTER;
+	m->m_obj   = OBJ_CONF;
+	m->m_group = 0;
+	m++;
+
+
 	m->m_title = "verify disk writes";
 	m->m_desc  = "Read what was written in a verification step. Decreases "
 		"performance, but may help fight disk corruption mostly on "
diff --git a/Rdb.cpp b/Rdb.cpp
index f7d4ac00..8c26c176 100644
--- a/Rdb.cpp
+++ b/Rdb.cpp
@@ -2004,6 +2004,10 @@ bool Rdb::addList ( collnum_t collnum , RdbList *list,
 		g_errno = ETRYAGAIN;
 		return false;
 	}
+	// if ( m_inDumpLoop ) {
+	// 	g_errno = ETRYAGAIN;
+	// 	return false;
+	// }
 	// if we are well into repair mode, level 2, do not add anything
 	// to spiderdb or titledb... that can mess up our titledb scan.
 	// we always rebuild tfndb, clusterdb, checksumdb and spiderdb
@@ -2419,6 +2423,30 @@ bool Rdb::addRecord ( collnum_t collnum,
 		return false;
 	}
 
+
+	// do not add if the key is in the range being dumped, because when
+	// the dump completes it calls deleteList() and removes the nodes
+	// from the tree, so if we were overriding a node currently being
+	// dumped we would lose it.
+	if ( m_dump.isDumping() &&
+	     //oppKey >= m_dump.getFirstKeyInQueue() &&
+	     // ensure the dump is dumping the collnum of this key
+	     m_dump.m_collnum == collnum &&
+	     m_dump.m_lastKeyInQueue &&
+	     // the dump does not split positive/negative key pairs, so our
+	     // positive/negative twin should either be entirely in the
+	     // dump with us or entirely out of it; any positive/negative
+	     // annihilation below should therefore be ok and we should be
+	     // safe to call deleteNode() below
+	     KEYCMP(key,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
+	     //oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
+	     KEYCMP(key,m_dump.getLastKeyInQueue (),m_ks)<=0 ) {
+		// tell caller to wait and try again later
+		g_errno = ETRYAGAIN;
+		return false;
+	}
+
+
 	// save orig
 	char *orig = NULL;
 
@@ -2618,13 +2646,17 @@ bool Rdb::addRecord ( collnum_t collnum,
 
 	// CAUTION: we should not annihilate with oppKey if oppKey may
 	// be in the process of being dumped to disk! This would
 	// render our annihilation useless and make undeletable data
+	/*
 	if ( m_dump.isDumping() &&
 	     //oppKey >= m_dump.getFirstKeyInQueue() &&
+	     // ensure the dump is dumping the collnum of this key
+	     m_dump.m_collnum == collnum &&
 	     m_dump.m_lastKeyInQueue &&
 	     KEYCMP(oppKey,m_dump.getFirstKeyInQueue(),m_ks)>=0 &&
 	     //oppKey <= m_dump.getLastKeyInQueue () ) goto addIt;
 	     KEYCMP(oppKey,m_dump.getLastKeyInQueue (),m_ks)<=0 ) goto addIt;
+	*/
 	// BEFORE we delete it, save it. this is a special hack
 	// so we can UNDO this deleteNode() should the titledb rec
 	// add fail.
@@ -2698,7 +2730,7 @@ bool Rdb::addRecord ( collnum_t collnum,
 	// if we did not find an oppKey and are tfndb, flag this
 	//if ( n<0 && m_rdbId == RDB_TFNDB ) s_tfndbHadOppKey = false;
 
- addIt:
+	// addIt:
 	// mark as changed
 	//if ( ! m_needsSave ) {
 	//	m_needsSave = true;
diff --git a/RdbDump.cpp b/RdbDump.cpp
index 8301bfc8..42186d85 100644
--- a/RdbDump.cpp
+++ b/RdbDump.cpp
@@ -406,10 +406,13 @@ bool RdbDump::dumpTree ( bool recall ) {
 	// . check the list we got from the tree for problems
 	// . ensures keys are ordered from lowest to highest as well
 	//#ifdef GBSANITYCHECK
-	if ( g_conf.m_verifyWrites ) {
+	if ( 1==1 ||
+	     g_conf.m_verifyWrites ||
+	     g_conf.m_verifyDumpedLists ) {
 		char *s = "none";
 		if ( m_rdb ) s = getDbnameFromId(m_rdb->m_rdbId);
-		log("dump: verifying list before dumping (rdb=%s)",s);
+		log("dump: verifying list before dumping (rdb=%s "
+		    "collnum=%i)",s,(int)m_collnum);
 		m_list->checkList_r ( false , // removeNegRecs?
 				      false , // sleep on problem?
 				      m_rdb->m_rdbId );
diff --git a/RdbList.cpp b/RdbList.cpp
index dea9d5b9..09950f30 100644
--- a/RdbList.cpp
+++ b/RdbList.cpp
@@ -776,7 +776,7 @@ bool RdbList::checkList_r ( bool removeNegRecs , bool sleepOnProblem ,
 		if ( rdbId == RDB_TITLEDB && ! KEYNEG(k) ) {
 			char *rec = getCurrentRec();
 			int32_t usize = *(int32_t *)(rec+12+4);
-			if ( usize <= 0 ) {
+			if ( usize <= 0 || usize>100000000) {
 				log("db: bad titlerec uncompress size");
 				char *xx=NULL;*xx=0;
 			}
diff --git a/RdbMem.cpp b/RdbMem.cpp
index 022671d9..bdc47145 100644
--- a/RdbMem.cpp
+++ b/RdbMem.cpp
@@ -90,15 +90,21 @@ void *RdbMem::dupData ( char *key , char *data , int32_t dataSize ,
 void *RdbMem::allocData ( char *key , int32_t dataSize , collnum_t collnum ) {
 	// if we're dumping and key has been dumped, use the secondary mem
 	//if ( m_dump->isDumping() && key < m_dump->getLastKeyInQueue() ) {
-	if ( m_rdb->m_inDumpLoop && // m_dump->isDumping() &&
-	     ( collnum < m_rdb->m_dumpCollnum ||
-	       (collnum == m_rdb->m_dumpCollnum &&
-		// if dump fails to alloc mem in RdbDump::dumpTree it does
-		// a sleep wrapper and keeps retrying, and
-		// RdbDump::m_lastKeyInQueue can remain NULL because we've
-		// never dumped out a list from the tree yet
-		m_rdb->m_dump.m_lastKeyInQueue &&
-		KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0)) ){
+	if ( m_rdb->m_inDumpLoop ) {
+		/////
+		// MDW: 3/15/2016
+		// if we're dumping then ALWAYS use secondary mem, wtf...
+		// primary is being dumped out and when the dump completes
+		// the ptr gets reset so we'll end up pointing to garbage.
+		///////
+		// ( collnum < m_rdb->m_dumpCollnum ||
+		//   (collnum == m_rdb->m_dumpCollnum &&
+		//   // if dump fails to alloc mem in RdbDump::dumpTree it does
+		//   // a sleep wrapper and keeps retrying, and
+		//   // RdbDump::m_lastKeyInQueue can remain NULL because we've
+		//   // never dumped out a list from the tree yet
+		//   m_rdb->m_dump.m_lastKeyInQueue &&
+		//   KEYCMP(key,m_rdb->m_dump.getLastKeyInQueue(),m_ks)<0))){
 		// if secondary mem is growing down...
 		if ( m_ptr2 > m_ptr1 ) {
 			// return NULL if it would breech,
diff --git a/RdbTree.cpp b/RdbTree.cpp
index 8cf271b8..eda40331 100644
--- a/RdbTree.cpp
+++ b/RdbTree.cpp
@@ -1145,6 +1145,8 @@ void RdbTree::deleteOrderedList ( collnum_t collnum ,
 	if ( m_useProtection ) protect ( );
 }
 
+#include "Spider.h"
+
 // . this fixes the tree
 // returns false if could not fix tree and sets g_errno, otherwise true
 bool RdbTree::fixTree ( ) {
@@ -1170,6 +1172,12 @@ bool RdbTree::fixTree ( ) {
 	//CollectionRec *recs = g_collectiondb.m_recs;
 	int32_t max = g_collectiondb.m_numRecs;
 	log("db: Valid collection numbers range from 0 to %"INT32".",max);
+
+	bool isTitledb = false;
+	if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
+	bool isSpiderdb = false;
+	if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
+
 	// now re-add the old nods to the tree, they should not be overwritten
 	// by addNode()
 	for ( int32_t i = 0 ; i < n ; i++ ) {
@@ -1178,6 +1186,34 @@ bool RdbTree::fixTree ( ) {
 		log("db: Fixing node #%"INT32" of %"INT32".",i,n);
 		// skip if empty
 		if ( m_parents[i] <= -2 ) continue;
+
+
+		if ( isTitledb && m_data[i] ) {
+			char *data = m_data[i];
+			int32_t ucompSize = *(int32_t *)data;
+			if ( ucompSize < 0 || ucompSize > 100000000 ) {
+				log("db: removing titlerec with uncompressed "
+				    "size of %i from tree",(int)ucompSize);
+				continue;
+			}
+		}
+
+		char *key = &m_keys[i*m_ks];
+		if ( isSpiderdb && m_data[i] &&
+		     g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
+			char *data = m_data[i];
+			data -= sizeof(SPIDERDBKEY);
+			data -= 4;
+			SpiderRequest *sreq ;
+			sreq =(SpiderRequest *)data;
+			if ( strncmp(sreq->m_url,"http",4) ) {
+				log("db: removing spiderrequest bad url "
+				    "%s from tree",sreq->m_url);
+				//return false;
+				continue;
+			}
+		}
+
 		collnum_t cn = m_collnums[i];
 		// verify collnum
 		if ( cn < 0 ) continue;
@@ -1185,6 +1221,7 @@ bool RdbTree::fixTree ( ) {
 		// collnum of non-existent coll
 		if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
			continue;
+
 		// now add just to set m_right/m_left/m_parent
 		if ( m_fixedDataSize == 0 )
			addNode(cn,&m_keys[i*m_ks], NULL, 0 );
@@ -1233,6 +1270,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
 	if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
 	if ( !strcmp(m_dbname,"tfndb"  ) ) useHalfKeys = true;
 	if ( !strcmp(m_dbname,"linkdb" ) ) useHalfKeys = true;
+
+	bool isTitledb = false;
+	if ( !strcmp(m_dbname,"titledb" ) ) isTitledb = true;
+	bool isSpiderdb = false;
+	if ( !strcmp(m_dbname,"spiderdb" ) ) isSpiderdb = true;
+
 	// now check parent kid correlations
 	for ( int32_t i = 0 ; i < m_minUnusedNode ; i++ ) {
 		// this thing blocks for 1.5 secs for indexdb
@@ -1250,6 +1293,31 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
 		if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
 			char *xx=NULL;*xx=0; }
 
+		if ( isTitledb && m_data[i] ) {
+			char *data = m_data[i];
+			int32_t ucompSize = *(int32_t *)data;
+			if ( ucompSize < 0 || ucompSize > 100000000 ) {
+				log("db: found titlerec with uncompressed "
+				    "size of %i from tree",(int)ucompSize);
+				return false;
+			}
+		}
+
+		char *key = &m_keys[i*m_ks];
+		if ( isSpiderdb && m_data[i] &&
+		     g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)key ) ) {
+			char *data = m_data[i];
+			data -= sizeof(SPIDERDBKEY);
+			data -= 4;
+			SpiderRequest *sreq ;
+			sreq =(SpiderRequest *)data;
+			if ( strncmp(sreq->m_url,"http",4) ) {
+				log("db: spiderrequest bad url "
+				    "%s",sreq->m_url);
+				return false;
+			}
+		}
+
 		// bad collnum?
 		if ( doCollRecCheck ) {
 			collnum_t cn = m_collnums[i];
diff --git a/Spider.cpp b/Spider.cpp
index dc1b0139..13aa5797 100644
--- a/Spider.cpp
+++ b/Spider.cpp
@@ -6044,7 +6044,8 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
 	int64_t waitInSecs = (uint64_t)(m_cr->m_spiderFreqs[ufn]*3600*24.0);
 	// do not spider more than once per 15 seconds ever!
 	// no! might be a query reindex!!
-	if ( waitInSecs < 15 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
+	/*
+	if ( waitInSecs < 1 && ! sreq->m_isPageReindex ) { //urlIsDocId ) {
 		static bool s_printed = false;
 		if ( ! s_printed ) {
 			s_printed = true;
@@ -6053,6 +6054,7 @@ uint64_t SpiderColl::getSpiderTimeMS ( SpiderRequest *sreq,
 		}
 		waitInSecs = 15;//900; this was 15 minutes
 	}
+	*/
 	// in fact, force docid based guys to be zero!
 	//if ( sreq->m_urlIsDocId ) waitInSecs = 0;
 	if ( sreq->m_isPageReindex ) waitInSecs = 0;
diff --git a/XmlDoc.cpp b/XmlDoc.cpp
index 108e79eb..a0d66794 100644
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@@ -213,6 +213,8 @@ class XmlDoc *g_xd;
 
 void XmlDoc::reset ( ) {
 
+	m_oldDocExistedButHadError = false;
+
 	m_addedStatusDocId = 0;
 
 	if ( m_diffbotProxyReplyValid && m_diffbotProxyReply ) {
@@ -12087,6 +12089,7 @@ XmlDoc **XmlDoc::getOldXmlDoc ( ) {
 		// ok, fix the memleak here
 		mdelete ( m_oldDoc , sizeof(XmlDoc), "odnuke" );
 		delete ( m_oldDoc );
+		m_oldDocExistedButHadError = true;
 		//log("xmldoc: nuke xmldoc1=%"PTRFMT"",(PTRTYPE)m_oldDoc);
 		m_oldDoc = NULL;
 		g_errno = saved;
@@ -16156,6 +16159,12 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
 	     od && od->m_gotDiffbotSuccessfulReply )
 		m_recycleDiffbotReply = true;
 
+	// to fight off corrupted title recs, just assume that even though
+	// we could not uncompress the title rec, it had a successful reply
+	// if ( cr->m_diffbotOnlyProcessIfNewUrl &&
+	//      m_oldDocExistedButHadError )
+	// 	m_recycleDiffbotReply = true;
+
 	// don't recycle if specfically asked to reindex though
 	if ( m_sreqValid && m_sreq.m_isPageReindex )
 		m_recycleDiffbotReply = false;
diff --git a/XmlDoc.h b/XmlDoc.h
index 6696d5d7..58657d65 100644
--- a/XmlDoc.h
+++ b/XmlDoc.h
@@ -1643,6 +1643,8 @@ class XmlDoc {
 	char       m_isInIndex;
 	char       m_wasInIndex;
 
+	bool       m_oldDocExistedButHadError;
+
 	Msg8a      m_msg8a;
 	char      *m_tagdbColl;
 	int32_t    m_tagdbCollLen;