diff --git a/Process.cpp b/Process.cpp
index 49418750..29509485 100644
--- a/Process.cpp
+++ b/Process.cpp
@@ -1340,6 +1340,7 @@ bool Process::save ( ) {
 	logf(LOG_INFO,"db: Entering lock mode for saving.");
 	m_mode = LOCK_MODE; // SAVE_MODE;
 	m_urgent = false;
+	m_calledSave = false;
 	return save2();
 }
 
@@ -1360,6 +1361,8 @@ bool Process::shutdown ( bool urgent ,
 	m_mode = EXIT_MODE;
 	m_urgent = urgent;
 
+	m_calledSave = false;
+
 	// check memory buffers for overruns/underrunds to see if that
 	// caused this core
 	if ( urgent ) g_mem.printBreeches(false);
@@ -1738,16 +1741,30 @@ bool Process::saveRdbTrees ( bool useThread , bool shuttingDown ) {
 	if ( g_conf.m_readOnlyMode ) return true;
 	// no thread if shutting down
 	if ( shuttingDown ) useThread = false;
+	// debug note
+	log("gb: shuttingdown=%i",(int)shuttingDown);
 	// turn off statsdb until everyone is done
 	//g_statsdb.m_disabled = true;
 	// loop over all Rdbs and save them
-	for ( int32_t i = 0 ; ! m_calledSave && i < m_numRdbs ; i++ ) {
+	for ( int32_t i = 0 ; i < m_numRdbs ; i++ ) {
+		if ( m_calledSave ) {
+			log("gb: already saved trees, skipping.");
+			break;
+		}
 		Rdb *rdb = m_rdbs[i];
 		// if we save doledb while spidering it screws us up
 		// because Spider.cpp can not directly write into the
 		// rdb tree and it expects that to always be available!
 		if ( ! shuttingDown && rdb->m_rdbId == RDB_DOLEDB ) continue;
+		// note it
+		if ( ! rdb->m_dbname || ! rdb->m_dbname[0] )
+			log("gb: calling save tree for rdbid %i",
+			    (int)rdb->m_rdbId);
+		else
+			log("gb: calling save tree for %s",
+			    rdb->m_dbname);
+
 		rdb->saveTree ( useThread );
 
diff --git a/RdbTree.cpp b/RdbTree.cpp
index 42591690..4a829c95 100644
--- a/RdbTree.cpp
+++ b/RdbTree.cpp
@@ -162,7 +162,9 @@ void RdbTree::reset ( ) {
 	     strcmp(m_dbname,"accessdb") &&
 	     strcmp(m_dbname,"statsdb") ) {
 	     //strcmp(m_dbname,"doledb") ) {
-		log("rdb: Resetting unsaved tree %s.",m_dbname);
+		log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
+		log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
+		log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
 		// when DELETING a collection from pagecrawlbot.cpp
 		// it calls Collectiondb::deleteRec() which calls
 		// SpiderColl::reset() which calls m_waitingTree.reset()
diff --git a/Spider.cpp b/Spider.cpp
index a2a2bff2..c7a8abc6 100644
--- a/Spider.cpp
+++ b/Spider.cpp
@@ -1878,6 +1878,14 @@ bool SpiderColl::updateSiteNumInlinksTable ( int32_t siteHash32,
 // the count in m_doleIpTable here
 bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
 
+	////
+	//
+	// skip if not assigned to us for doling
+	//
+	////
+	if ( ! isAssignedToUs ( srep->m_firstIp ) )
+		return true;
+
 	/////////
 	//
 	// remove the lock here
@@ -1891,8 +1899,10 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
 	time_t nowGlobal = getTimeGlobal();
 
 	if ( g_conf.m_logDebugSpider )
-		logf(LOG_DEBUG,"spider: scheduled lock removal in 5 secs for "
-		     "lockKey=%"UINT64"", lockKey );
+		logf(LOG_DEBUG,"spider: removing lock uh48=%"INT64" "
+		     "lockKey=%"UINT64"",
+		     srep->getUrlHash48(),
+		     lockKey );
 
 	// test it
 	//if ( m_nowGlobal == 0 && lock )
@@ -1927,14 +1937,6 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
 		logf(LOG_DEBUG,"spider: rdb: lockKey=%"UINT64" "
 		     "was not in lock table",lockKey);
 
-	////
-	//
-	// skip if not assigned to us for doling
-	//
-	////
-	if ( ! isAssignedToUs ( srep->m_firstIp ) )
-		return true;
-
 	// now just remove it since we only spider our own urls
 	// and doledb is in memory
 	g_spiderLoop.m_lockTable.removeKey ( &lockKey );
@@ -3486,7 +3488,7 @@ bool SpiderColl::evalIpLoop ( ) {
 				  &cachedTimestamp , // rec timestamp
 				  true ); // promote rec?
 	// doleBuf could be NULL i guess...
-	if ( inCache && doleBufSize > 0 ) {
+	if ( inCache ) { // && doleBufSize > 0 ) {
 		if ( g_conf.m_logDebugSpider )
 			log("spider: GOT %"INT32" bytes of SpiderRequests "
 			    "from winnerlistcache for ip %s",doleBufSize,
@@ -4162,6 +4164,11 @@ bool SpiderColl::scanListForWinners ( ) {
 		if ( g_conf.m_logDebugSpider )
 			log("spider: got ufn=%"INT32" for %s",ufn,sreq->m_url);
+
+		if ( g_conf.m_logDebugSpider && srep )
+			log("spider: lastspidered=%"UINT32"",
+			    srep->m_spideredTime);
+
 		// spiders disabled for this row in url filteres?
 		//if ( ! m_cr->m_spidersEnabled[ufn] ) continue;
 
@@ -4212,6 +4219,11 @@ bool SpiderColl::scanListForWinners ( ) {
 			continue;
 		}
 
+		// debug point
+		// if ( ((long long)srep->m_spideredTime)*1000LL >
+		//      nowGlobalMS - 86400LL*1000LL*30LL )
+		// 	log("spider: should not be spidering this!");
+
 		//////
 		//
 		// MDW: no, take this out now that we allow multiple urls
@@ -4312,6 +4324,14 @@ bool SpiderColl::scanListForWinners ( ) {
 	// "firstIp". this way we can spider multiple urls from the
 	// same ip at the same time.
 	int64_t key = makeLockTableKey ( sreq );
+
+	if ( g_conf.m_logDebugSpider )
+		log("spider: checking uh48=%"INT64" lockkey=%"INT64" "
+		    "used=%"INT32"",
+		    uh48,key,
+		    g_spiderLoop.m_lockTable.getNumUsedSlots());
+
+	// MDW
 	if ( g_spiderLoop.m_lockTable.isInTable ( &key ) ) {
 		// get it
 		//CrawlInfo *ci = &m_cr->m_localCrawlInfo;
@@ -5179,17 +5199,41 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 	// allow this to add a 0 length record otherwise we keep the same
 	// old url in here and keep spidering it over and over again!
 	bool addToCache = false;
-	if ( skipSize && m_doleBuf.length() - skipSize >= 0 ) addToCache =true;
+	if ( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
 	// if winnertree was empty, then we might have scanned like 10M
 	// twitter.com urls and not wanted any of them, so we don't want to
 	// have to keep redoing that!
 	if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
 
+	RdbCache *wc = &g_spiderLoop.m_winnerListCache;
+
+	// remove from cache? if we added the last spider request in the
+	// cached dolebuf to doledb then remove it from cache so it's not
+	// a cached empty dolebuf and we recompute it not using the cache.
+	if ( isFromCache && skipSize && m_doleBuf.length() - skipSize == 0 ) {
+		if ( addToCache ) { char *xx=NULL;*xx=0; }
+		// let's get this working right...
+		//wc->removeKey ( collnum , k , start );
+		//wc->markDeletedRecord(start);
+		// i don't think we can remove keys from cache so add
+		// a rec with a byte size of 1 to indicate for us to ignore.
+		// set the timestamp to 12345 so the getRecord above will
+		// not get it and promote it in the linked list.
+		char byte = 0;
+		key_t cacheKey;
+		cacheKey.n0 = firstIp;
+		cacheKey.n1 = 0;
+		wc->addRecord ( m_collnum,
+				(char *)&cacheKey,
+				&byte ,
+				1 ,
+				12345 );//cachedTimestamp );
+	}
+
 	if ( addToCache ) {
 		key_t cacheKey;
 		cacheKey.n0 = firstIp;
 		cacheKey.n1 = 0;
-		RdbCache *wc = &g_spiderLoop.m_winnerListCache;
 		if ( g_conf.m_logDebugSpider )
 			log("spider: adding %"INT32" bytes of SpiderRequests "
 			    "to winnerlistcache for ip %s",
@@ -5243,6 +5287,11 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
 	// now that doledb is tree-only and never dumps to disk, just
 	// add it directly
 	g_doledb.m_rdb.addList ( m_collnum , &tmpList , MAX_NICENESS );
+
+	if ( g_conf.m_logDebugSpider )
+		log("spider: adding doledb tree node size=%"INT32"",skipSize);
+
+
 	// and it happens right away. just add it locally.
 	bool status = true;
 
@@ -7287,6 +7336,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
 				      true);
 	if ( node == -1 ) { char *xx=NULL;*xx=0; }
 
+	if ( g_conf.m_logDebugSpider )
+		log("spider: deleting doledb tree node %"INT32,node);
+
 	// now remove from doleiptable since we removed from doledb
 	m_sc->removeFromDoledbTable ( sreq->m_firstIp );
 
@@ -7326,6 +7378,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
 	tmp.m_spiderOutstanding = 0;
 	tmp.m_confirmed = 1;
 	tmp.m_collnum = m_collnum;
+	if ( g_conf.m_logDebugSpider )
+		log("spider: adding lock uh48=%"INT64" lockkey=%"INT64"",
+		    m_sreq->getUrlHash48(),lockKeyUh48);
 	if ( ! ht->addKey ( &lockKeyUh48 , &tmp ) )
 		return true;
 
diff --git a/XmlDoc.cpp b/XmlDoc.cpp
index 1374795f..cf04a162 100644
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@@ -20135,7 +20135,7 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
 		time_t spideredTime = m_oldDoc->getSpideredTime();
 		struct tm *timeStruct = gmtime ( &spideredTime );
 		char tmp[64];
-		strftime(tmp,64,"lastspidered=%b-%d-%Y(%H:%M:%S)",timeStruct);
+		strftime(tmp,64,"lastindexed=%b-%d-%Y(%H:%M:%S)",timeStruct);
 		sb->safePrintf("%s(%"UINT32") ", tmp,(uint32_t)spideredTime);
 	}
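
Note on the addDoleBufIntoDoledb() change above: because the winner-list cache cannot delete a key outright, the patch overwrites the stale entry with a 1-byte record carrying the sentinel timestamp 12345, which readers then treat as "not cached". The following standalone sketch illustrates that tombstone pattern only; SimpleCache, its methods, and SENTINEL_TS are illustrative stand-ins, not the actual RdbCache interface.

    // Sketch of the "tombstone record" idea: a cache that cannot remove keys
    // marks an entry invalid by storing a 1-byte record with a sentinel
    // timestamp, and lookups skip any record that matches that shape.
    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    static const int32_t SENTINEL_TS = 12345; // stand-in for the 12345 used in the patch

    struct CachedRec {
            std::vector<char> data;
            int32_t           timestamp;
    };

    class SimpleCache {                       // NOT the real RdbCache API
    public:
            void addRecord ( int64_t key , const char *buf , int32_t size ,
                             int32_t timestamp ) {
                    m_map[key] = CachedRec{ std::vector<char>(buf, buf + size),
                                            timestamp };
            }
            // returns false when the key is absent OR holds a tombstone
            bool getRecord ( int64_t key , const CachedRec **out ) const {
                    auto it = m_map.find ( key );
                    if ( it == m_map.end() ) return false;
                    if ( it->second.timestamp == SENTINEL_TS &&
                         it->second.data.size() == 1 ) return false; // skip tombstone
                    *out = &it->second;
                    return true;
            }
            // "remove" a key by overwriting it with a 1-byte sentinel record
            void tombstone ( int64_t key ) {
                    char byte = 0;
                    addRecord ( key , &byte , 1 , SENTINEL_TS );
            }
    private:
            std::map<int64_t,CachedRec> m_map;
    };

    int main ( ) {
            SimpleCache cache;
            std::string doleBuf = "serialized spider requests";
            cache.addRecord ( 0x7f000001 , doleBuf.data() ,
                              (int32_t)doleBuf.size() , 1000 );

            const CachedRec *rec = NULL;
            printf("before tombstone: hit=%d\n",
                   (int)cache.getRecord(0x7f000001,&rec));

            // last cached request was doled out; invalidate the entry
            cache.tombstone ( 0x7f000001 );
            printf("after tombstone:  hit=%d\n",
                   (int)cache.getRecord(0x7f000001,&rec));
            return 0;
    }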