fix crawl round end detection etc.

inc round counter even if not repeating crawl
Matt Wells 2013-10-23 15:53:59 -07:00
parent 469be5f216
commit c39b45ff88
7 changed files with 126 additions and 42 deletions

View File

@@ -101,6 +101,8 @@ void CollectionRec::reset() {
m_replies = 0;
}
CollectionRec *g_cr = NULL;
// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use

View File

@@ -97,6 +97,11 @@ class CrawlInfo {
// this is non-zero if urls are available to be spidered right now.
long m_hasUrlsReadyToSpider;
// last time we launched a spider. 0 on startup.
time_t m_lastSpiderAttempt;
// time we had or might have had a url available for spidering
time_t m_lastSpiderCouldLaunch;
// have we sent out email/webhook notifications crawl has no urls
// currently in the ready queue (doledb) to spider?
char m_sentCrawlDoneAlert;
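
These two timestamps (and the alert flag) now live in CrawlInfo, which is saved with the collection, rather than in the per-host SpiderColl (see the Spider.cpp and Spider.h hunks below), so they survive a restart. A deliberately simplified sketch of how the fields drive the crawl-done notification; the struct and function here are stand-ins, and the real decision in Spider.cpp also consults the global crawl info and the spidering-enabled flags:

// --- sketch only, not part of the commit ---
#include <ctime>

struct CrawlInfoSketch {
	long   m_hasUrlsReadyToSpider;  // non-zero if urls are spiderable right now
	time_t m_lastSpiderAttempt;     // last time we tried to launch a spider
	time_t m_lastSpiderCouldLaunch; // last time a url was (or may have been) launchable
	char   m_sentCrawlDoneAlert;    // crawl-done email/webhook already sent?
};

// should the "crawl has no more urls" notification fire now?
static bool shouldSendCrawlDoneAlert ( const CrawlInfoSketch &ci ) {
	// urls still (or again) available: nothing to announce
	if ( ci.m_hasUrlsReadyToSpider ) return false;
	// already announced for this round
	if ( ci.m_sentCrawlDoneAlert   ) return false;
	return true;
}
// --- end sketch ---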

View File

@@ -774,6 +774,9 @@ bool Collectiondb::resetColl ( char *coll , bool resetTurkdb ) {
cr->m_spiderRoundNum = 0;
cr->m_spiderRoundStartTime = 0;
cr->m_spiderStatus = 0;
cr->m_spiderStatusMsg = NULL;
// reset seed buf
cr->m_diffbotSeeds.purge();

View File

@@ -12,7 +12,7 @@
//#include "Stats.h"
// put me back
#define _EFENCE_
//#define _EFENCE_
// uncomment this for _EFENCE_ to do underflow checks instead of the
// default overflow checks

View File

@@ -2652,12 +2652,35 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
char *ss = "Normal";
if ( cx->m_spiderStatusMsg )
ss = cx->m_spiderStatusMsg;
// 0 means not to RE-crawl
char tmp[256];
// indicate if we are WAITING for next round...
if ( cx->m_collectiveRespiderFrequency > 0.0 &&
getTimeGlobal() < cr->m_spiderRoundStartTime ) {
long now = getTimeGlobal();
sprintf(tmp,"Spidering next round in %li "
"seconds.",
cr->m_spiderRoundStartTime - now
);
ss = tmp;
}
// if we sent an email simply because no urls
// were left and we are not recrawling!
if ( cx->m_collectiveRespiderFrequency == 0.0 &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
ss = "Crawl has exhausted all urls and "
"repeatCrawl is set to 0.0";
}
CrawlInfo *ci = &cx->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
//if ( cx->m_spideringEnabled ) paused = 0;
sb.safePrintf("\n\n{"
"\"name\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
"\"crawlingEnabled\":%li,\n"
"\"crawlingStatus\":\"%s\",\n"
"\"crawlStatus\":\"%s\",\n"
"\"sentCrawlDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
"\"urlsHarvested\":%lli,\n"
@@ -2678,6 +2701,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//, alias
, (long)cx->m_spideringEnabled
, ss
, sentAlert
//, (long)paused
, cx->m_globalCrawlInfo.m_objectsAdded -
cx->m_globalCrawlInfo.m_objectsDeleted
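
A compact sketch of the status-string selection the new lines perform before the JSON is printed. Here respiderFreq, roundStartTime, now and hasUrlsReady stand in for cx->m_collectiveRespiderFrequency, cr->m_spiderRoundStartTime, getTimeGlobal() and cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider; everything else is assumed:

// --- sketch only, not part of the commit ---
#include <cstdio>

static const char *crawlStatusString ( double respiderFreq ,
				       long roundStartTime ,
				       long now ,
				       bool hasUrlsReady ,
				       char *tmp ,
				       unsigned long tmpSize ) {
	const char *ss = "Normal";
	// waiting for the next round of a repeating crawl
	if ( respiderFreq > 0.0 && now < roundStartTime ) {
		snprintf ( tmp , tmpSize ,
			   "Spidering next round in %li seconds." ,
			   roundStartTime - now );
		ss = tmp;
	}
	// a non-repeating crawl that has run out of urls
	if ( respiderFreq == 0.0 && ! hasUrlsReady )
		ss = "Crawl has exhausted all urls and "
		     "repeatCrawl is set to 0.0";
	return ss;
}
// --- end sketch ---

With 90 seconds left before the next round this yields "Spidering next round in 90 seconds."; a one-shot crawl with no urls ready yields the exhausted-urls message.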

View File

@@ -1000,8 +1000,8 @@ SpiderColl::SpiderColl () {
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
m_lastSpiderAttempt = 0;
m_lastSpiderCouldLaunch = 0;
//m_lastSpiderAttempt = 0;
//m_lastSpiderCouldLaunch = 0;
//m_numRoundsDone = 0;
//m_lastDoledbReadEmpty = false; // over all priorities in this coll
// re-set this to min and set m_needsWaitingTreeRebuild to true
@@ -3954,14 +3954,23 @@ void doneSendingNotification ( void *state ) {
// as false again! use LOCAL crawlInfo, since global is reset often.
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = cr->m_spiderStatus;//1;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
// sanity
if ( cr->m_spiderStatus == 0 ) { char *xx=NULL;*xx=0; }
// sanity check
if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; }
// if not round done we are done
if ( cr->m_spiderStatus != SP_ROUNDDONE ) return;
// advance round if that round has completed, or there are no
// more urls to spider. if we hit maxToProcess/maxToCrawl then
// do not increment the round #. otherwise we should increment it.
if ( cr->m_spiderStatus == SP_MAXTOCRAWL ) return;
if ( cr->m_spiderStatus == SP_MAXTOPROCESS ) return;
// this should have been set below
if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
//if ( cr->m_spiderRoundStartTime == 0 ) { char *xx=NULL;*xx=0; }
// how is this possible
//if ( getTimeGlobal()
@@ -3980,7 +3989,10 @@ void doneSendingNotification ( void *state ) {
break;
}
if ( respiderFreq == -1.0 ) return;
// if not REcrawling, set this to 0 so we at least update our
// round # and round start time...
if ( respiderFreq == -1.0 )
respiderFreq = 0.0;
if ( respiderFreq < 0.0 ) {
log("spider: bad respiderFreq of %f. making 0.",
@@ -3989,6 +4001,9 @@ void doneSendingNotification ( void *state ) {
}
long seconds = respiderFreq * 24*3600;
// add 1 for lastspidertime round off errors so we can be assured
// all spiders have a lastspidertime LESS than the new
// m_spiderRoundStartTime we set below.
if ( seconds <= 0 ) seconds = 1;
// now update this round start time. all the other hosts should
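
Read together, these three doneSendingNotification() hunks say: only SP_ROUNDDONE advances the round (maxToCrawl/maxToProcess return early), a non-repeating crawl now uses a respider frequency of 0 instead of returning, and the new round start time is pushed at least one second into the future so every existing lastspidertime stays below it. A condensed, hedged sketch; the status values are stand-ins, the start-time assignment and the round increment sit below the hunk boundary and are inferred from the comments and the commit message:

// --- sketch only, not part of the commit; status values are stand-ins ---
#include <ctime>

enum { SP_ROUNDDONE = 1 , SP_MAXTOCRAWL = 2 , SP_MAXTOPROCESS = 3 };

struct CollSketch {
	int    m_spiderStatus;
	double m_collectiveRespiderFrequency; // in days; 0.0 = do not re-crawl
	long   m_spiderRoundStartTime;
	long   m_spiderRoundNum;
};

// condensed sketch of the round-advance path; the real code also
// propagates the new values to the other hosts in the network
static void advanceRoundIfDone ( CollSketch *cr , long nowGlobal ) {
	// hitting maxToCrawl/maxToProcess must NOT bump the round counter
	if ( cr->m_spiderStatus == SP_MAXTOCRAWL   ) return;
	if ( cr->m_spiderStatus == SP_MAXTOPROCESS ) return;
	// only a completed round advances
	if ( cr->m_spiderStatus != SP_ROUNDDONE    ) return;
	double respiderFreq = cr->m_collectiveRespiderFrequency;
	// if not RE-crawling, still update the round # and round start time
	if ( respiderFreq == -1.0 ) respiderFreq = 0.0;
	if ( respiderFreq <   0.0 ) respiderFreq = 0.0;
	long seconds = (long)(respiderFreq * 24 * 3600);
	// add at least 1 so every lastspidertime is strictly below the new start
	if ( seconds <= 0 ) seconds = 1;
	cr->m_spiderRoundStartTime = nowGlobal + seconds;
	// "inc round counter even if not repeating crawl" (commit message);
	// the increment itself is outside the visible hunks
	cr->m_spiderRoundNum++;
}

int main ( ) {
	CollSketch cr = { SP_ROUNDDONE , 0.0 , 0 , 4 };
	advanceRoundIfDone ( &cr , (long)time(NULL) );
	return cr.m_spiderRoundNum == 5 ? 0 : 1;
}
// --- end sketch ---
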
@@ -4139,7 +4154,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
if ( ! cr->m_spideringEnabled ) continue;
// hit crawl round max?
if ( //cr->m_maxCrawlRounds > 0 &&
if ( cr->m_maxCrawlRounds > 0 &&
cr->m_spiderRoundNum >= cr->m_maxCrawlRounds ) {
cr->m_spiderStatus = SP_MAXROUNDS;
cr->m_spiderStatusMsg = "Hit maxCrawlRounds limit.";
@@ -4175,6 +4190,24 @@ void SpiderLoop::spiderDoledUrls ( ) {
// set current time, synced with host #0
nowGlobal = getTimeGlobal();
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// the last time we attempted to spider a url for this coll
//m_sc->m_lastSpiderAttempt = nowGlobal;
// now we save this so when we restart these two times
// are from where we left off so we do not end up setting
// hasUrlsReadyToSpider to true which in turn sets
// the sentEmailAlert flag to false, which makes us
// send ANOTHER email alert!!
ci->m_lastSpiderAttempt = nowGlobal;
// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( ci->m_lastSpiderCouldLaunch == 0 )
ci->m_lastSpiderCouldLaunch = nowGlobal;
//
// . if doing respider with roundstarttime....
// . roundstarttime is > 0 if m_collectiveRespiderFrequency
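
This hunk and the next one hold the core ordering change in spiderDoledUrls(): the attempt/could-launch timestamps now live in the saved local CrawlInfo and are bumped before the round-start-time check, which appears to be what lets a collection waiting between rounds be recognized as having no launchable urls. A stripped-down sketch of that ordering; everything except the field names shown in the diff is assumed:

// --- sketch only, not part of the commit ---
#include <ctime>

struct CiSketch {
	time_t m_lastSpiderAttempt;
	time_t m_lastSpiderCouldLaunch;
};

// sketch of the per-collection prologue of SpiderLoop::spiderDoledUrls():
// the timestamps are bumped BEFORE the round-start-time check, so a
// collection waiting between rounds keeps advancing m_lastSpiderAttempt
// while m_lastSpiderCouldLaunch stays put, and the 60-second test in
// handleRequestc1() (further down) can flag the round as out of urls.
static bool considerCollection ( CiSketch *ci ,
				 time_t nowGlobal ,
				 time_t spiderRoundStartTime ,
				 bool   waitingTreeNeedsRebuild ) {
	ci->m_lastSpiderAttempt = nowGlobal;
	// seed this once so the difference is meaningful on a fresh start
	if ( ci->m_lastSpiderCouldLaunch == 0 )
		ci->m_lastSpiderCouldLaunch = nowGlobal;
	// next round not due yet: skip, but keep the timestamps just set
	if ( nowGlobal < spiderRoundStartTime ) return false;
	// rebuilding the waiting tree: assume something could have launched
	if ( waitingTreeNeedsRebuild )
		ci->m_lastSpiderCouldLaunch = nowGlobal;
	return true; // go on and try to launch a url
}
// --- end sketch ---
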
@@ -4184,19 +4217,13 @@ void SpiderLoop::spiderDoledUrls ( ) {
//
if ( nowGlobal < cr->m_spiderRoundStartTime ) continue;
// the last time we attempted to spider a url for this coll
m_sc->m_lastSpiderAttempt = nowGlobal;
// update this for the first time in case it is never updated.
// then after 60 seconds we assume the crawl is done and
// we send out notifications. see below.
if ( m_sc->m_lastSpiderCouldLaunch == 0 )
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
// if populating this collection's waitingtree assume
// we would have found something to launch as well. it might
// mean the waitingtree-saved.dat file was deleted from disk
// so we need to rebuild it at startup.
if ( m_sc->m_waitingTreeNeedsRebuild )
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// get max spiders
long maxSpiders = cr->m_maxNumSpiders;
if ( m_sc->m_isTestColl ) {
@@ -4215,7 +4242,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// obey max spiders per collection too
if ( m_sc->m_spidersOut >= maxSpiders ) {
// assume we would have launched a spider
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// try next collection
continue;
}
@@ -4279,10 +4306,13 @@ void SpiderLoop::spiderDoledUrls ( ) {
loop:
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// bail if waiting for lock reply, no point in reading more
if ( m_msg12.m_gettingLocks ) {
// assume we would have launched a spider for this coll
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// wait for sleep callback to re-call us in 10ms
return;
}
@@ -4344,7 +4374,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
// skip?
if ( out >= max ) {
// assume we could have launched a spider
if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// count as non-empty then!
//m_sc->m_encounteredDoledbRecs = true;
// try the priority below us
@@ -4464,6 +4494,10 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// unlock
m_gettingDoledbList = false;
// shortcuts
CollectionRec *cr = m_sc->m_cr;
CrawlInfo *ci = &cr->m_localCrawlInfo;
// update m_msg5StartKey for next read
if ( m_list.getListSize() > 0 ) {
m_list.getLastKey((char *)&m_sc->m_msg5StartKey);
@@ -4495,7 +4529,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( bail ) {
// assume we could have launched a spider
m_sc->m_lastSpiderCouldLaunch = getTimeGlobal();
ci->m_lastSpiderCouldLaunch = getTimeGlobal();
// return false to indicate to try another
return false;
}
@@ -4623,7 +4657,6 @@ bool SpiderLoop::gotDoledbList2 ( ) {
if ( pri < 0 || pri >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
// skip the priority if we already have enough spiders on it
long out = m_sc->m_outstandingSpiders[pri];
CollectionRec *cr = m_sc->m_cr;
// get the first ufn that uses this priority
//long max = getMaxAllowableSpidersOut ( pri );
// how many spiders can we have out?
@@ -4661,7 +4694,7 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// skip? and re-get another doledb list from next priority...
if ( out >= max ) {
// assume we could have launched a spider
if ( max > 0 ) m_sc->m_lastSpiderCouldLaunch = nowGlobal;
if ( max > 0 ) ci->m_lastSpiderCouldLaunch = nowGlobal;
// this priority is maxed out, try next
m_sc->devancePriority();
// assume not an empty read
@@ -4850,12 +4883,18 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// assume we launch the spider below. really this timestamp indicates
// the last time we COULD HAVE LAUNCHED *OR* did actually launch
// a spider
m_sc->m_lastSpiderCouldLaunch = nowGlobal;
ci->m_lastSpiderCouldLaunch = nowGlobal;
// set crawl done email sent flag so another email can be sent again
// in case the user upped the maxToCrawl limit, for instance,
// so that the crawl could continue.
m_sc->m_cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
//ci->m_sentCrawlDoneAlert = 0;
// there are urls ready to spider
ci->m_hasUrlsReadyToSpider = true;
// be sure to save state so we do not re-send emails
cr->m_needsSave = 1;
// assume not an empty read
//m_sc->m_encounteredDoledbRecs = true;
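
When a spiderable doledb record is actually found, the collection is flagged as having urls ready and marked dirty so the flag is saved; clearing the sent-alert flag no longer happens here but in gotCrawlInfoReply() (next hunks). A small sketch of that bookkeeping with stand-in types:

// --- sketch only, not part of the commit ---
#include <ctime>

struct CiSketch { long m_hasUrlsReadyToSpider; char m_sentCrawlDoneAlert; };
struct CrSketch { CiSketch m_localCrawlInfo; char m_needsSave; };

// sketch of the bookkeeping gotDoledbList2() does once a launchable
// doledb record is in hand
static void noteUrlReady ( CrSketch *cr ,
			   time_t nowGlobal ,
			   time_t *lastSpiderCouldLaunch ) {
	// we could have launched (or did launch) a spider right now
	*lastSpiderCouldLaunch = nowGlobal;
	// there are urls ready to spider
	cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
	// save state so a restart does not re-send the crawl-done email
	cr->m_needsSave = 1;
	// m_sentCrawlDoneAlert is deliberately NOT cleared here any more;
	// that now happens in gotCrawlInfoReply() from the global view
}
// --- end sketch ---
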
@@ -9863,9 +9902,13 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// but only if it was a crawl round done alert,
// not a maxToCrawl or maxToProcess or maxRounds
// alert.
if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
SP_ROUNDDONE )
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// we can't do this because on startup we end up
// setting hasUrlsReadyToSpider to true and we
// may have already sent an email, and it gets RESET
// here when it shouldn't be
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert ==
// SP_ROUNDDONE )
// cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
}
}
// return if still waiting on more to come in
@@ -9874,6 +9917,15 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// sanity check
if ( cr->m_replies > cr->m_requests ) { char *xx=NULL;*xx=0; }
//if ( cr->m_localCrawlInfo.m_sentCrawlDoneAlert == SP_ROUNDDONE )
// if we have urls ready to be spidered then prepare to send another
// email/webhook notification
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
cr->m_localCrawlInfo.m_sentCrawlDoneAlert = 0;
// update cache time
cr->m_globalCrawlInfo.m_lastUpdateTime = getTime();
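
Once all hosts have replied, the aggregated (global) crawl info decides whether the crawl-done notification is re-armed; the per-host reset removed in the previous hunk is what could wrongly clear the flag on startup and trigger a duplicate email. A hedged sketch of that aggregation step, with assumed names for everything not shown in the diff:

// --- sketch only, not part of the commit ---
// tail of gotCrawlInfoReply() once every host has replied: the GLOBAL
// (aggregated) crawl info decides whether the crawl-done alert is re-armed
struct GlobalCiSketch { long m_hasUrlsReadyToSpider; long m_lastUpdateTime; };

static void finishCrawlInfoRound ( GlobalCiSketch *global ,
				   char *sentCrawlDoneAlert ,
				   long  now ) {
	// urls exist somewhere in the network: allow a future crawl-done
	// email/webhook by clearing the "already sent" flag
	if ( global->m_hasUrlsReadyToSpider ) *sentCrawlDoneAlert = 0;
	// refresh the cache time for the aggregated crawl info
	global->m_lastUpdateTime = now;
}
// --- end sketch ---
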
@@ -9932,9 +9984,9 @@ void gotCrawlInfoReply ( void *state , UdpSlot *slot ) {
// if urls were considered and roundstarttime is still 0 then
// set it to the current time...
if ( cr->m_spiderRoundStartTime == 0 )
// all hosts in the network should sync with host #0 on this
cr->m_spiderRoundStartTime = getTimeGlobal();
//if ( cr->m_spiderRoundStartTime == 0 )
// // all hosts in the network should sync with host #0 on this
// cr->m_spiderRoundStartTime = getTimeGlobal();
// but of course if it has urls ready to spider, do not send alert...
// or if this is -1, indicating "unknown".
@@ -9987,20 +10039,23 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) {
//long now = getTimeGlobal();
SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum);
// shortcut
CrawlInfo *ci = &cr->m_localCrawlInfo;
// assume it does
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 1;
//ci->m_hasUrlsReadyToSpider = 1;
// if we haven't spidered anything in 1 min assume the
// queue is basically empty...
if ( sc->m_lastSpiderAttempt &&
sc->m_lastSpiderCouldLaunch &&
if ( ci->m_lastSpiderAttempt &&
ci->m_lastSpiderCouldLaunch &&
//cr->m_spideringEnabled &&
//g_conf.m_spideringEnabled &&
sc->m_lastSpiderAttempt - sc->m_lastSpiderCouldLaunch > 60 )
ci->m_lastSpiderAttempt - ci->m_lastSpiderCouldLaunch > 60 )
// assume our crawl on this host is completed i guess
cr->m_localCrawlInfo.m_hasUrlsReadyToSpider = 0;
ci->m_hasUrlsReadyToSpider = 0;
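
The host-status reply now reads both timestamps from the saved CrawlInfo instead of the in-memory SpiderColl, so the "no urls left on this host" conclusion survives a restart. A compact sketch of the test, with stand-in names for everything not shown in the diff:

// --- sketch only, not part of the commit ---
#include <ctime>

// the decision handleRequestc1() makes before replying: if we have kept
// attempting for over 60 seconds without ever having a launchable url,
// report that this host has no urls ready to spider
static long hostHasUrlsReady ( time_t lastSpiderAttempt ,
			       time_t lastSpiderCouldLaunch ) {
	long hasUrlsReadyToSpider = 1; // assume it does
	if ( lastSpiderAttempt &&
	     lastSpiderCouldLaunch &&
	     lastSpiderAttempt - lastSpiderCouldLaunch > 60 )
		hasUrlsReadyToSpider = 0;
	return hasUrlsReadyToSpider;
}
// --- end sketch ---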

View File

@@ -980,11 +980,6 @@ class SpiderColl {
bool m_useTree;
// last time we launched a spider. 0 on startup.
time_t m_lastSpiderAttempt;
// time we had or might have had a url available for spidering
time_t m_lastSpiderCouldLaunch;
//bool m_lastDoledbReadEmpty;
//bool m_encounteredDoledbRecs;
//long long m_numRoundsDone;