fix neverending crawl rounds by only trying each url once per round. updated url filters.
Matt Wells 2016-02-22 09:28:46 -08:00
parent da9949f462
commit 412b04bbd4
2 changed files with 38 additions and 15 deletions
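
To see why rounds never ended, here is a minimal sketch of the round-advance condition (hypothetical types and names, not Gigablast code): a round can only advance once no url is still eligible to spider, so urls that error out and stay eligible for retry keep the round open indefinitely.

#include <cstdint>
#include <vector>

// hypothetical per-url state, simplified from what spiderdb actually tracks
struct UrlState {
	int64_t m_lastSpiderTime; // 0 if never attempted
	int32_t m_errorCount;
};

// a url is eligible this round only if it has not been attempted since the
// round started; before this commit, permanently erroring urls stayed
// eligible for retry, so a handful of urls that always fail could keep the
// round from ever finishing
static bool eligibleThisRound ( const UrlState &u , int64_t roundStart ) {
	return u.m_lastSpiderTime < roundStart;
}

// the round can advance only when nothing is left to try in this round
static bool roundCanAdvance ( const std::vector<UrlState> &urls ,
			      int64_t roundStart ) {
	for ( const UrlState &u : urls )
		if ( eligibleThisRound ( u , roundStart ) )
			return false;
	return true;
}

The diff below does this declaratively instead: it adds a url-filter rule that matches anything spidered since {roundstart} and caps it at zero spiders for the rest of the round.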


@@ -3841,6 +3841,33 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
}
// 3rd rule for respidering
// put this above the errorcount>= rules below, otherwise the crawl
// may never advance its round because it keeps retrying a ton of
// errored urls.
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if doing a one-shot crawl, limit error retries to 3 times, or stop
// if no urls are currently available to spider, whichever comes first.
else {
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 11;
m_spiderFreqs [i] = 0.0416;
m_maxSpidersPerRule [i] = 0; // turn off spiders
i++;
}
// diffbot needs to retry even on 500 or 404 errors since sometimes
// a seed url gets a 500 error mistakenly and it halts the crawl.
// so take out "!hastmperror".
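
Url-filter rules are applied first-match-wins, which is why the ordering comment above matters: the {roundstart} rule has to sit above the errorcount retry rules so that a url already attempted this round resolves to zero spiders before any retry rule can see it. A rough sketch of that evaluation, not part of the diff (the FilterRule and UrlInfo types below are simplified stand-ins for the real parallel arrays and spiderdb state):

#include <cstdint>
#include <cstring>

// simplified stand-in for the parallel m_regExs/m_maxSpidersPerRule arrays
struct FilterRule {
	const char *m_expr;
	int32_t     m_maxSpidersPerRule;
};

struct UrlInfo {
	int64_t m_lastSpiderTime;
	int32_t m_errorCount;
	int64_t m_roundStart;
};

static bool ruleMatches ( const FilterRule &r , const UrlInfo &u ) {
	if ( strcmp ( r.m_expr , "lastspidertime>={roundstart}" ) == 0 )
		return u.m_lastSpiderTime >= u.m_roundStart;
	if ( strcmp ( r.m_expr , "errorcount>=3" ) == 0 )
		return u.m_errorCount >= 3;
	return false;
}

// first matching rule wins; because the {roundstart} rule sits above the
// errorcount retry rules, a url already attempted this round gets 0 spiders
// and cannot be retried until the next round starts
static int32_t maxSpidersFor ( const FilterRule *rules , int32_t n ,
			       const UrlInfo &u ) {
	for ( int32_t i = 0 ; i < n ; i++ )
		if ( ruleMatches ( rules[i] , u ) )
			return rules[i].m_maxSpidersPerRule;
	return 1; // later rules in the real table would decide
}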
@@ -3871,23 +3898,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
i++;
// 3rd rule for respidering
if ( respiderFreq > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
//m_spidersEnabled [i] = 0;
m_maxSpidersPerRule[i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
//m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if collectiverespiderfreq is 0 or less, then do not RE-spider
// documents already indexed.
else {
if ( respiderFreq <= 0.0 ) { // else {
// this does NOT work! error docs continuously respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//m_regExs[i].set("isindexed");


@@ -4214,6 +4214,16 @@ bool SpiderColl::scanListForWinners ( ) {
srepUh48 = srep->getUrlHash48();
continue;
}
// MDW: this is handled in url filters now just fine.
// regardless of the spider request, if it has a spider
// reply for THIS ROUND, and we are doing crawl rounds,
// then skip it
// if ( m_cr->m_isCustomCrawl &&
// srep &&
// srep->m_spideredTime >= m_cr->m_spiderRoundStartTime )
// continue;
// cast it
SpiderRequest *sreq = (SpiderRequest *)rec;
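
For reference, the check commented out above can be read as the predicate below (a sketch with simplified, hypothetical types; the real code works on SpiderReply records from spiderdb). It is now redundant because the lastspidertime>={roundstart} rule added in the url filters already assigns such urls maxSpidersPerRule = 0 for the rest of the round.

#include <cstdint>

// simplified stand-in for the fields the commented-out check consulted
struct SpiderReplyLite {
	int64_t m_spideredTime;
};

// true if this url was already spidered during the current round of a
// custom (diffbot) crawl; the scan used to 'continue' past such requests,
// now the url filters give them zero spiders instead
static bool doneThisRound ( bool isCustomCrawl ,
			    const SpiderReplyLite *srep ,
			    int64_t spiderRoundStartTime ) {
	return isCustomCrawl && srep &&
	       srep->m_spideredTime >= spiderRoundStartTime;
}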