fix neverending crawl rounds by only trying each url once per round. updated url filters.
Matt Wells 2016-02-22 09:28:46 -08:00
parent da9949f462
commit 412b04bbd4
2 changed files with 38 additions and 15 deletions
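
To see why rounds never ended, here is a minimal sketch of the round-advance condition (hypothetical types and names, not Gigablast code): a round can only advance once no url is still eligible to spider, so urls that error out and stay eligible for retry keep the round open indefinitely.

#include <cstdint>
#include <vector>

// hypothetical per-url state, simplified from what spiderdb actually tracks
struct UrlState {
	int64_t m_lastSpiderTime; // 0 if never attempted
	int32_t m_errorCount;
};

// a url is eligible this round only if it has not been attempted since the
// round started; before this commit, permanently erroring urls stayed
// eligible for retry, so a handful of urls that always fail could keep the
// round from ever finishing
static bool eligibleThisRound ( const UrlState &u , int64_t roundStart ) {
	return u.m_lastSpiderTime < roundStart;
}

// the round can advance only when nothing is left to try in this round
static bool roundCanAdvance ( const std::vector<UrlState> &urls ,
			      int64_t roundStart ) {
	for ( const UrlState &u : urls )
		if ( eligibleThisRound ( u , roundStart ) )
			return false;
	return true;
}

The diff below does this declaratively instead: it adds a url-filter rule that matches anything spidered since {roundstart} and caps it at zero spiders for the rest of the round.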


@@ -3841,6 +3841,33 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
}
// 3rd rule for respidering
// put this above the errorcount>= rules below, otherwise the crawl
// may never advance its round because it keeps retrying a ton of
// errored urls.
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if doing a one-shot crawl, limit error retries to 3 times, or stop
// if no urls are currently available to spider, whichever comes first.
else {
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 11;
m_spiderFreqs [i] = 0.0416;
m_maxSpidersPerRule [i] = 0; // turn off spiders
i++;
}
// diffbot needs to retry even on 500 or 404 errors since sometimes
// a seed url gets a 500 error mistakenly and it halts the crawl.
// so take out "!hastmperror".
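
Url-filter rules are applied first-match-wins, which is why the ordering comment above matters: the {roundstart} rule has to sit above the errorcount retry rules so that a url already attempted this round resolves to zero spiders before any retry rule can see it. A rough sketch of that evaluation, not part of the diff (the FilterRule and UrlInfo types below are simplified stand-ins for the real parallel arrays and spiderdb state):

#include <cstdint>
#include <cstring>

// simplified stand-in for the parallel m_regExs/m_maxSpidersPerRule arrays
struct FilterRule {
	const char *m_expr;
	int32_t     m_maxSpidersPerRule;
};

struct UrlInfo {
	int64_t m_lastSpiderTime;
	int32_t m_errorCount;
	int64_t m_roundStart;
};

static bool ruleMatches ( const FilterRule &r , const UrlInfo &u ) {
	if ( strcmp ( r.m_expr , "lastspidertime>={roundstart}" ) == 0 )
		return u.m_lastSpiderTime >= u.m_roundStart;
	if ( strcmp ( r.m_expr , "errorcount>=3" ) == 0 )
		return u.m_errorCount >= 3;
	return false;
}

// first matching rule wins; because the {roundstart} rule sits above the
// errorcount retry rules, a url already attempted this round gets 0 spiders
// and cannot be retried until the next round starts
static int32_t maxSpidersFor ( const FilterRule *rules , int32_t n ,
			       const UrlInfo &u ) {
	for ( int32_t i = 0 ; i < n ; i++ )
		if ( ruleMatches ( rules[i] , u ) )
			return rules[i].m_maxSpidersPerRule;
	return 1; // later rules in the real table would decide
}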
@@ -3871,23 +3898,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
i++;
// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if collectiverespiderfreq is 0 or less, then do not RE-spider
// documents already indexed.
else {
if ( respiderFreq <= 0.0 ) { // else {
// this does NOT work! error docs continuously respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//m_regExs[i].set("isindexed");


@@ -4214,6 +4214,16 @@ bool SpiderColl::scanListForWinners ( ) {
srepUh48 = srep->getUrlHash48();
continue;
}
// MDW: this is handled in url filters now just fine.
// regardless of the spider request, if it has a spider
// reply for THIS ROUND, and we are doing crawl rounds,
// then skip it
// if ( m_cr->m_isCustomCrawl &&
// srep &&
// srep->m_spideredTime >= m_cr->m_spiderRoundStartTime )
// continue;
// cast it
SpiderRequest *sreq = (SpiderRequest *)rec;
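
For reference, the check commented out above can be read as the predicate below (a sketch with simplified, hypothetical types; the real code works on SpiderReply records from spiderdb). It is now redundant because the lastspidertime>={roundstart} rule added in the url filters already assigns such urls maxSpidersPerRule = 0 for the rest of the round.

#include <cstdint>

// simplified stand-in for the fields the commented-out check consulted
struct SpiderReplyLite {
	int64_t m_spideredTime;
};

// true if this url was already spidered during the current round of a
// custom (diffbot) crawl; the scan used to 'continue' past such requests,
// now the url filters give them zero spiders instead
static bool doneThisRound ( bool isCustomCrawl ,
			    const SpiderReplyLite *srep ,
			    int64_t spiderRoundStartTime ) {
	return isCustomCrawl && srep &&
	       srep->m_spideredTime >= spiderRoundStartTime;
}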