diff --git a/CollectionRec.cpp b/CollectionRec.cpp
index be9894e7..a346b198 100644
--- a/CollectionRec.cpp
+++ b/CollectionRec.cpp
@@ -101,6 +101,8 @@ void CollectionRec::reset() {
 	m_replies = 0;
 }
 
+CollectionRec *g_cr = NULL;
+
 // . load this data from a conf file
 // . values we do not explicitly have will be taken from "default",
 //   collection config file. if it does not have them then we use
diff --git a/CollectionRec.h b/CollectionRec.h
index 5a47385e..0bf34484 100644
--- a/CollectionRec.h
+++ b/CollectionRec.h
@@ -97,6 +97,11 @@ class CrawlInfo {
 	// this is non-zero if urls are available to be spidered right now.
 	long m_hasUrlsReadyToSpider;
 
+	// last time we launched a spider. 0 on startup.
+	time_t m_lastSpiderAttempt;
+	// time we had or might have had a url available for spidering
+	time_t m_lastSpiderCouldLaunch;
+
 	// have we sent out email/webhook notifications crawl has no urls
 	// currently in the ready queue (doledb) to spider?
 	char m_sentCrawlDoneAlert;
diff --git a/Collectiondb.cpp b/Collectiondb.cpp
index 5842bafe..19853d6a 100644
--- a/Collectiondb.cpp
+++ b/Collectiondb.cpp
@@ -774,6 +774,9 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
 	cr->m_spiderRoundNum = 0;
 	cr->m_spiderRoundStartTime = 0;
 
+	cr->m_spiderStatus = 0;
+	cr->m_spiderStatusMsg = NULL;
+
 	// reset seed buf
 	cr->m_diffbotSeeds.purge();
 
diff --git a/Mem.cpp b/Mem.cpp
index 325ab225..59f70f92 100644
--- a/Mem.cpp
+++ b/Mem.cpp
@@ -12,7 +12,7 @@
 //#include "Stats.h"
 
 // put me back
-#define _EFENCE_
+//#define _EFENCE_
 
 // uncomment this for _EFENCE_ to do underflow checks instead of the
 // default overflow checks
diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp
index c538063c..e05ddf62 100644
--- a/PageCrawlBot.cpp
+++ b/PageCrawlBot.cpp
@@ -2652,12 +2652,35 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 		char *ss = "Normal";
 		if ( cx->m_spiderStatusMsg )
 			ss = cx->m_spiderStatusMsg;
+		// 0 means not to RE-crawl
+		char tmp[256];
+		// indicate if we are WAITING for next round...
+		if ( cx->m_collectiveRespiderFrequency > 0.0 &&
+		     getTimeGlobal() < cr->m_spiderRoundStartTime ) {
+			long now = getTimeGlobal();
+			sprintf(tmp,"Spidering next round in %li "
+				"seconds.",
+				cr->m_spiderRoundStartTime - now
+				);
+			ss = tmp;
+		}
+		// if we sent an email simply because no urls
+		// were left and we are not recrawling!
+		if ( cx->m_collectiveRespiderFrequency == 0.0 &&
+		     ! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
+			ss = "Crawl has exhausted all urls and "
+			     "repeatCrawl is set to 0.0";
+		}
+		CrawlInfo *ci = &cx->m_localCrawlInfo;
+		long sentAlert = (long)ci->m_sentCrawlDoneAlert;
+		if ( sentAlert ) sentAlert = 1;
 		//if ( cx->m_spideringEnabled ) paused = 0;
 		sb.safePrintf("\n\n{"
 			      "\"name\":\"%s\",\n"
 			      //"\"alias\":\"%s\",\n"
 			      "\"crawlingEnabled\":%li,\n"
-			      "\"crawlingStatus\":\"%s\",\n"
+			      "\"crawlStatus\":\"%s\",\n"
+			      "\"sentCrawlDoneNotification\":%li,\n"
 			      //"\"crawlingPaused\":%li,\n"
 			      "\"objectsFound\":%lli,\n"
 			      "\"urlsHarvested\":%lli,\n"
@@ -2678,6 +2701,7 @@
 			      //, alias
 			      , (long)cx->m_spideringEnabled
 			      , ss
+			      , sentAlert
 			      //, (long)paused
 			      , cx->m_globalCrawlInfo.m_objectsAdded -
 			        cx->m_globalCrawlInfo.m_objectsDeleted
diff --git a/Spider.cpp b/Spider.cpp
index 9bcc6214..82d5b81e 100644
--- a/Spider.cpp
+++ b/Spider.cpp
@@ -1000,8 +1000,8 @@ SpiderColl::SpiderColl () {
 	m_numAdded = 0;
 	m_numBytesScanned = 0;
 	m_lastPrintCount = 0;
-	m_lastSpiderAttempt = 0;
-	m_lastSpiderCouldLaunch = 0;
+	//m_lastSpiderAttempt = 0;
+	//m_lastSpiderCouldLaunch = 0;
 	//m_numRoundsDone = 0;
 	//m_lastDoledbReadEmpty = false; // over all priorities in this coll
 	// re-set this to min and set m_needsWaitingTreeRebuild to true
@@ -3954,14 +3954,23 @@ void doneSendingNotification ( void *state ) {
 	// as false again! use LOCAL crawlInfo, since global is reset often.
 	cr->m_localCrawlInfo.m_sentCrawlDoneAlert = cr->m_spiderStatus;//1;
 
+	// be sure to save state so we do not re-send emails
+	cr->m_needsSave = 1;
+
+	// sanity
+	if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }
+
 	// sanity check
 	if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
 
-	// if not round done we are done
-	if ( cr->m_spiderStatus != SP_ROUNDDONE ) return;
+	// advance round if that round has completed, or there are no
+	// more urls to spider. if we hit maxToProcess/maxToCrawl then
+	// do not increment the round #. otherwise we should increment it.
+	if ( cr->m_spiderStatus == SP_MAXTOCRAWL ) return;
+	if ( cr->m_spiderStatus == SP_MAXTOPROCESS ) return;
 
 	// this should have been set below
-	if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
+	//if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
 
 	// how is this possible
 	//if ( getTimeGlobal()
@@ -3980,7 +3989,10 @@
 		break;
 	}
 
-	if ( respiderFreq == -1.0 ) return;
+	// if not REcrawling, set this to 0 so we at least update our
+	// round # and round start time...
+	if ( respiderFreq == -1.0 )
+		respiderFreq = 0.0;
 
 	if ( respiderFreq < 0.0 ) {
 		log("spider: bad respiderFreq of %f. making 0.",
@@ -3989,6 +4001,9 @@
 
 	long seconds = respiderFreq * 24*3600;
+	// add 1 for lastspidertime round off errors so we can be assured
+	// all spiders have a lastspidertime LESS than the new
+	// m_spiderRoundStartTime we set below.
 	if ( seconds <= 0 ) seconds = 1;
 
 	// now update this round start time. all the other hosts should
@@ -4139,7 +4154,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
 
 		if ( ! cr->m_spideringEnabled ) continue;
 		// hit crawl round max?
-		if ( //cr->m_maxCrawlRounds > 0 &&
+		if ( cr->m_maxCrawlRounds > 0 &&
 		     cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
 			cr->m_spiderStatus = SP_MAXROUNDS;
 			cr->m_spiderStatusMsg = "Hit maxCrawlRounds limit.";
@@ -4175,6 +4190,24 @@
 		// set current time, synced with host #0
 		nowGlobal = getTimeGlobal();
 
+		// shortcut
+		CrawlInfo *ci = &cr->m_localCrawlInfo;
+
+		// the last time we attempted to spider a url for this coll
+		//m_sc->m_lastSpiderAttempt = nowGlobal;
+		// now we save this so when we restart these two times
+		// are from where we left off so we do not end up setting
+		// hasUrlsReadyToSpider to true which in turn sets
+		// the sentEmailAlert flag to false, which makes us
+		// send ANOTHER email alert!!
+		ci->m_lastSpiderAttempt = nowGlobal;
+
+		// update this for the first time in case it is never updated.
+		// then after 60 seconds we assume the crawl is done and
+		// we send out notifications. see below.
+		if ( ci->m_lastSpiderCouldLaunch == 0 )
+			ci->m_lastSpiderCouldLaunch = nowGlobal;
+
 		//
 		// . if doing respider with roundstarttime....
 		// . roundstarttime is > 0 if m_collectiveRespiderFrequency
@@ -4184,19 +4217,13 @@
 		//
 		if ( nowGlobal < cr->m_spiderRoundStartTime ) continue;
 
-		// the last time we attempted to spider a url for this coll
-		m_sc->m_lastSpiderAttempt = nowGlobal;
-		// update this for the first time in case it is never updated.
-		// then after 60 seconds we assume the crawl is done and
-		// we send out notifications. see below.
-		if ( m_sc->m_lastSpiderCouldLaunch == 0 )
-			m_sc->m_lastSpiderCouldLaunch = nowGlobal;
 		// if populating this collection's waitingtree assume
 		// we would have found something to launch as well. it might
 		// mean the waitingtree-saved.dat file was deleted from disk
 		// so we need to rebuild it at startup.
 		if ( m_sc->m_waitingTreeNeedsRebuild )
-			m_sc->m_lastSpiderCouldLaunch = nowGlobal;
+			ci->m_lastSpiderCouldLaunch = nowGlobal;
+
 		// get max spiders
 		long maxSpiders = cr->m_maxNumSpiders;
 		if ( m_sc->m_isTestColl ) {
@@ -4215,7 +4242,7 @@
 		// obey max spiders per collection too
 		if ( m_sc->m_spidersOut >= maxSpiders ) {
 			// assume we would have launched a spider
-			m_sc->m_lastSpiderCouldLaunch = nowGlobal;
+			ci->m_lastSpiderCouldLaunch = nowGlobal;
 			// try next collection
 			continue;
 		}
@@ -4279,10 +4306,13 @@
 
 loop:
 
+	// shortcut
+	CrawlInfo *ci = &cr->m_localCrawlInfo;
+
 	// bail if waiting for lock reply, no point in reading more
 	if ( m_msg12.m_gettingLocks ) {
 		// assume we would have launched a spider for this coll
-		m_sc->m_lastSpiderCouldLaunch = nowGlobal;
+		ci->m_lastSpiderCouldLaunch = nowGlobal;
 		// wait for sleep callback to re-call us in 10ms
 		return;
 	}
@@ -4344,7 +4374,7 @@
 	// skip?
 	if ( out >= max ) {
 		// assume we could have launched a spider
-		if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
+		if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
 		// count as non-empty then!
 		//m_sc->m_encounteredDoledbRecs = true;
 		// try the priority below us
@@ -4464,6 +4494,10 @@ bool SpiderLoop::gotDoledbList2 ( ) {
 	// unlock
 	m_gettingDoledbList = false;
 
+	// shortcuts
+	CollectionRec *cr = m_sc->m_cr;
+	CrawlInfo *ci = &cr->m_localCrawlInfo;
+
 	// update m_msg5StartKey for next read
 	if ( m_list.getListSize() > 0 ) {
 		m_list.getLastKey((char *)&m_sc->m_msg5StartKey);
@@ -4495,7 +4529,7 @@
 
 	if ( bail ) {
 		// assume we could have launched a spider
-		m_sc->m_lastSpiderCouldLaunch = getTimeGlobal();
+		ci->m_lastSpiderCouldLaunch = getTimeGlobal();
 		// return false to indicate to try another
 		return false;
 	}
@@ -4623,7 +4657,6 @@
 	if ( pri < 0 || pri >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
 	// skip the priority if we already have enough spiders on it
 	long out = m_sc->m_outstandingSpiders[pri];
-	CollectionRec *cr = m_sc->m_cr;
 	// get the first ufn that uses this priority
 	//long max = getMaxAllowableSpidersOut ( pri );
 	// how many spiders can we have out?
@@ -4661,7 +4694,7 @@
 	// skip? and re-get another doledb list from next priority...
 	if ( out >= max ) {
 		// assume we could have launched a spider
-		if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
+		if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
 		// this priority is maxed out, try next
 		m_sc->devancePriority();
 		// assume not an empty read
@@ -4850,12 +4883,18 @@
 	// assume we launch the spider below. really this timestamp indicates
 	// the last time we COULD HAVE LAUNCHED *OR* did actually launch
 	// a spider
-	m_sc->m_lastSpiderCouldLaunch = nowGlobal;
+	ci->m_lastSpiderCouldLaunch = nowGlobal;
 
 	// set crawl done email sent flag so another email can be sent again
 	// in case the user upped the maxToCrawl limit, for instance,
 	// so that the crawl could continue.
-	m_sc->m_cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
+	//ci->m_sentCrawlDoneAlert = 0;
+
+	// there are urls ready to spider
+	ci->m_hasUrlsReadyToSpider = true;
+
+	// be sure to save state so we do not re-send emails
+	cr->m_needsSave = 1;
 
 	// assume not an empty read
 	//m_sc->m_encounteredDoledbRecs = true;
@@ -9863,9 +9902,13 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
 		// but only if it was a crawl round done alert,
 		// not a maxToCrawl or maxToProcess or maxRounds
 		// alert.
-		if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
-		     SP_ROUNDDONE )
-			cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
+		// we can't do this because on startup we end up
+		// setting hasUrlsReadyToSpider to true and we
+		// may have already sent an email, and it gets RESET
+		// here when it shouldn't be
+		//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
+		//     SP_ROUNDDONE )
+		//	cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
 	}
 	}
 	// return if still waiting on more to come in
@@ -9874,6 +9917,15 @@
 
 	// sanity check
 	if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
+
+	//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE )
+
+	// if we have urls ready to be spidered then prepare to send another
+	// email/webhook notification
+	if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
+		cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
+
+
 	// update cache time
 	cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();
 
@@ -9932,9 +9984,9 @@
 
 	// if urls were considered and roundstarttime is still 0 then
 	// set it to the current time...
-	if ( cr->m_spiderRoundStartTime == 0 )
-		// all hosts in the network should sync with host #0 on this
-		cr->m_spiderRoundStartTime = getTimeGlobal();
+	//if ( cr->m_spiderRoundStartTime == 0 )
+	//	// all hosts in the network should sync with host #0 on this
+	//	cr->m_spiderRoundStartTime = getTimeGlobal();
 
 	// but of course if it has urls ready to spider, do not send alert...
 	// or if this is -1, indicating "unknown".
@@ -9987,20 +10039,23 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
 
 	//long now = getTimeGlobal();
 
-	SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
+	//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
+
+	// shortcut
+	CrawlInfo *ci = &cr->m_localCrawlInfo;
 
 	// assume it does
-	cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
+	//ci->m_hasUrlsReadyToSpider = 1;
 
 	// if we haven't spidered anything in 1 min assume the
 	// queue is basically empty...
-	if ( sc->m_lastSpiderAttempt &&
-	     sc->m_lastSpiderCouldLaunch &&
+	if ( ci->m_lastSpiderAttempt &&
+	     ci->m_lastSpiderCouldLaunch &&
 	     //cr->m_spideringEnabled &&
 	     //g_conf.m_spideringEnabled &&
-	     sc->m_lastSpiderAttempt - sc->m_lastSpiderCouldLaunch > 60 )
+	     ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch > 60 )
 		// assume our crawl on this host is completed i guess
-		cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 0;
+		ci->m_hasUrlsReadyToSpider = 0;
 
 
 
diff --git a/Spider.h b/Spider.h
index bc1d1360..59376ed8 100644
--- a/Spider.h
+++ b/Spider.h
@@ -980,11 +980,6 @@ class SpiderColl {
 
 	bool m_useTree;
 
-	// last time we launched a spider. 0 on startup.
-	time_t m_lastSpiderAttempt;
-	// time we had or might have had a url available for spidering
-	time_t m_lastSpiderCouldLaunch;
-
 	//bool m_lastDoledbReadEmpty;
 	//bool m_encounteredDoledbRecs;
 	//long long m_numRoundsDone;
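Note on the Spider.cpp changes: the m_lastSpiderAttempt / m_lastSpiderCouldLaunch timestamps move from SpiderColl into the per-collection CrawlInfo so they are saved with collection state across restarts, and handleRequestc1() uses them to decide when a host has nothing left to spider. A minimal standalone sketch of that idle-detection heuristic is below; CrawlState and crawlLooksDone are illustrative names only, not code from this patch.

```cpp
// Standalone sketch (not Gigablast code): models the idle-detection
// heuristic above. The real fields live in CrawlInfo and the real check
// is in handleRequestc1() in Spider.cpp.
#include <ctime>
#include <cstdio>

struct CrawlState {
	time_t lastSpiderAttempt;     // last time we tried to launch a spider
	time_t lastSpiderCouldLaunch; // last time a url was (or may have been) launchable
	bool   sentCrawlDoneAlert;    // has the "crawl done" notification gone out?
};

// A crawl looks done once we keep attempting launches but have had
// nothing launchable for more than 60 seconds.
bool crawlLooksDone ( const CrawlState &cs ) {
	if ( ! cs.lastSpiderAttempt     ) return false; // never attempted yet
	if ( ! cs.lastSpiderCouldLaunch ) return false; // never had a chance yet
	return cs.lastSpiderAttempt - cs.lastSpiderCouldLaunch > 60;
}

int main ( ) {
	CrawlState cs = { 0, 0, false };
	time_t now = time(NULL);
	// urls were last launchable 2 minutes ago, but we keep attempting
	cs.lastSpiderCouldLaunch = now - 120;
	cs.lastSpiderAttempt     = now;
	if ( crawlLooksDone(cs) && ! cs.sentCrawlDoneAlert ) {
		printf("crawl looks done, send one notification\n");
		// persisting this flag (cr->m_needsSave in the patch) is what
		// prevents a duplicate email after a restart
		cs.sentCrawlDoneAlert = true;
	}
	return 0;
}
```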
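The doneSendingNotification() changes also make the round always advance: a collectiveRespiderFrequency of -1.0 (no recrawl configured) is now treated as 0.0 so the round number and round start time still update, and the computed interval is floored at 1 second so every url's lastspidertime stays below the new round start. A hedged sketch of that arithmetic follows; nextRoundStartTime is an illustrative helper, and the assumption that the new start time is simply "now + interval" is mine, since the actual assignment to m_spiderRoundStartTime falls outside the hunks shown.

```cpp
#include <ctime>

// respiderFreqDays: the collection's collectiveRespiderFrequency, in days.
// Returns when the next spider round should begin, relative to "now".
time_t nextRoundStartTime ( time_t now , double respiderFreqDays ) {
	// -1.0 means "not recrawling"; treat as 0.0 so the round number
	// and round start time still advance (per the patch)
	if ( respiderFreqDays == -1.0 ) respiderFreqDays = 0.0;
	// negative values are invalid; the patch logs and clamps to 0
	if ( respiderFreqDays <   0.0 ) respiderFreqDays = 0.0;
	long seconds = (long)(respiderFreqDays * 24*3600);
	// keep at least 1 second so lastspidertime round-off cannot put a
	// url at or past the new round start time
	if ( seconds <= 0 ) seconds = 1;
	// assumption: new round start is the current time plus the interval
	return now + seconds;
}
```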