From dfc069aaa195e5b7b02620bef4d3245006be04e0 Mon Sep 17 00:00:00 2001 From: mwells Date: Tue, 17 Mar 2015 20:27:23 -0600 Subject: [PATCH] do away with filtered/banned spider priorities. add checkbox to signify force deletes to remove urls from index if in the index, or not allow them in. --- Collectiondb.cpp | 38 ++++++++++++++++++++++++++++---------- Collectiondb.h | 3 +++ Makefile | 5 +++++ PageCrawlBot.cpp | 12 ++++++++---- Parms.cpp | 25 ++++++++++++++++++++++--- Parms.h | 8 ++++---- Spider.cpp | 24 ++++++++++++++++-------- XmlDoc.cpp | 44 ++++++++++++++++++++++++++------------------ 8 files changed, 112 insertions(+), 47 deletions(-) diff --git a/Collectiondb.cpp b/Collectiondb.cpp index 99fc784f..b61a2bff 100644 --- a/Collectiondb.cpp +++ b/Collectiondb.cpp @@ -2198,7 +2198,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) { m_maxSpidersPerRule [n] = 1; // max spiders m_spiderIpMaxSpiders [n] = 1; // max spiders per ip m_spiderIpWaits [n] = 1000; // same ip wait - m_spiderPriorities [n] = 3; + m_spiderPriorities [n] = 100; + m_forceDelete [n] = 1; n++; m_regExs[n].set("errorcount>=1 && hastmperror"); @@ -2456,6 +2457,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) { m_numRegExs5 = n; m_numRegExs6 = n; m_numRegExs8 = n; + m_numRegExs7 = n; // more rules @@ -2510,7 +2512,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) { m_maxSpidersPerRule [n] = 1; // max spiders m_spiderIpMaxSpiders [n] = 1; // max spiders per ip m_spiderIpWaits [n] = 1000; // same ip wait - m_spiderPriorities [n] = 3; + m_spiderPriorities [n] = 100; + m_forceDelete [n] = 1; n++; m_regExs[n].set("errorcount>=1 && hastmperror"); @@ -2871,6 +2874,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) { m_numRegExs5 = n; m_numRegExs6 = n; m_numRegExs8 = n; + m_numRegExs7 = n; // done rebuilding CHINESE rules return true; @@ -2914,7 +2918,8 @@ bool CollectionRec::rebuildShallowRules ( ) { m_maxSpidersPerRule [n] = 1; // max spiders m_spiderIpMaxSpiders [n] = 1; // max spiders per ip m_spiderIpWaits [n] = 1000; // same ip wait - m_spiderPriorities [n] = 3; + m_spiderPriorities [n] = 100; + m_forceDelete [n] = 1; n++; m_regExs[n].set("errorcount>=1 && hastmperror"); @@ -3089,6 +3094,7 @@ bool CollectionRec::rebuildShallowRules ( ) { m_numRegExs5 = n; m_numRegExs6 = n; m_numRegExs8 = n; + m_numRegExs7 = n; // done rebuilding SHALLOW rules return true; @@ -3465,6 +3471,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { m_spiderFreqs [i] = respiderFreq; //m_spiderDiffbotApiUrl[i].purge(); m_harvestLinks[i] = true; + m_forceDelete [i] = false; } int32_t i = 0; @@ -3477,7 +3484,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { // 2nd default url m_regExs[i].set("ismedia && !ismanualadd"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + m_maxSpidersPerRule [i] = 0; i++; // hopcount filter if asked for @@ -3495,7 +3503,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { m_regExs[i].set(hopcountStr); // means DELETE : - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + + // just don't spider + m_maxSpidersPerRule[i] = 0; // compatibility with m_spiderRoundStartTime: m_spiderFreqs[i] = 0.0; @@ -3516,7 +3527,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { // MDW: even if they supplied a crawl pattern let's restrict to seed // domains 12/15/14 m_regExs[i].set("!isonsamedomain && !ismanualadd"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + 
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + m_maxSpidersPerRule [i] = 0; i++; //} @@ -3529,7 +3541,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { // only negative patterns then restrict to domains of seeds if ( ucp && ! ucpHasPositive && ! m_hasucr ) { m_regExs[i].set("!isonsamedomain && !ismanualadd"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + m_maxSpidersPerRule [i] = 0; i++; } @@ -3555,7 +3568,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { // excessive errors? (tcp/dns timed out, etc.) retry once per month? m_regExs[i].set("errorcount>=3 && hastmperror"); - m_spiderPriorities [i] = 30; + m_spiderPriorities [i] = 3; m_spiderFreqs [i] = 30; // 30 days // if bulk job, do not download a url more than 3 times if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0; @@ -3633,7 +3646,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { i++; // do not crawl anything else m_regExs[i].set("default"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + // don't spider + m_maxSpidersPerRule[i] = 0; // this needs to be zero so &spiderRoundStart=0 // functionality which sets m_spiderRoundStartTime // to the current time works @@ -3653,7 +3668,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { i++; // do not crawl anything else m_regExs[i].set("default"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + // don't delete, just don't spider + m_maxSpidersPerRule[i] = 0; // this needs to be zero so &spiderRoundStart=0 // functionality which sets m_spiderRoundStartTime // to the current time works @@ -3707,6 +3724,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { m_numRegExs6 = i; //m_numRegExs7 = i; m_numRegExs8 = i; + m_numRegExs7 = i; //m_numRegExs11 = i; diff --git a/Collectiondb.h b/Collectiondb.h index 43faceed..f6a2e567 100644 --- a/Collectiondb.h +++ b/Collectiondb.h @@ -814,6 +814,9 @@ class CollectionRec { int32_t m_numRegExs8; char m_harvestLinks [ MAX_FILTERS ]; + int32_t m_numRegExs7; + char m_forceDelete [ MAX_FILTERS ]; + // dummy? int32_t m_numRegExs9; diff --git a/Makefile b/Makefile index 6e79c11f..4e6eefac 100644 --- a/Makefile +++ b/Makefile @@ -175,6 +175,11 @@ vclean: @echo "" @echo "sudo yum install gcc-c++" @echo "" + @echo "" + @echo "If make fails on CentOS then first run:" + @echo "" + @echo "sudo yum install gcc-c++ openssl-devel" + @echo "" @echo "*****" @echo "" diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp index c513da35..8213b3f3 100644 --- a/PageCrawlBot.cpp +++ b/PageCrawlBot.cpp @@ -848,8 +848,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ // lastspidertime>={roundstart} --> spiders disabled rule // so that we do not spider a url twice in the same round if ( ufn >= 0 && //! 
cr->m_spidersEnabled[ufn] ) { + cr->m_regExs[ufn].length() && // we set this to 0 instead of using the checkbox - cr->m_maxSpidersPerRule[ufn] <= 0 ) { + strstr(cr->m_regExs[ufn].getBufStart(),"round") ) { + //cr->m_maxSpidersPerRule[ufn] <= 0 ) { priority = -5; } @@ -935,10 +937,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ //, iptoa(sreq->m_firstIp) ); // print priority - if ( priority == SPIDER_PRIORITY_FILTERED ) + //if ( priority == SPIDER_PRIORITY_FILTERED ) + // we just turn off the spiders now + if ( ufn >= 0 && cr->m_maxSpidersPerRule[ufn] <= 0 ) sb->safePrintf("url ignored"); - else if ( priority == SPIDER_PRIORITY_BANNED ) - sb->safePrintf("url banned"); + //else if ( priority == SPIDER_PRIORITY_BANNED ) + // sb->safePrintf("url banned"); else if ( priority == -4 ) sb->safePrintf("error"); else if ( priority == -5 ) diff --git a/Parms.cpp b/Parms.cpp index f055ff7f..6ba9da17 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -1625,6 +1625,11 @@ bool printDropDown ( int32_t n , SafeBuf* sb, char *name, int32_t select, // . by default, minus 2 includes minus 3, the new "FILTERED" priority // . it is link "BANNED" but does not mean the url is low quality necessarily if ( includeMinusTwo ) i = -3; + + // no more DELETE, etc. + i = 0; + if ( select < 0 ) select = 0; + for ( ; i < n ; i++ ) { if ( i == select ) s = " selected"; else s = ""; @@ -12987,9 +12992,10 @@ void Parms::init ( ) { "together in the same expression text box. " "A spider priority of " //"FILTERED or BANNED " - "DELETE " - "will cause the URL to not be spidered, or if it has already " - "been indexed, it will be deleted when it is respidered." + // "DELETE " + // "will cause the URL to not be spidered, " + // "or if it has already " + // "been indexed, it will be deleted when it is respidered." "

"; /* @@ -13173,6 +13179,19 @@ void Parms::init ( ) { m->m_addin = 1; // "insert" follows? m++; + m->m_title = "delete"; + m->m_cgi = "fdu"; + m->m_xml = "forceDeleteUrls"; + m->m_max = MAX_FILTERS; + m->m_off = (char *)cr.m_forceDelete - x; + m->m_type = TYPE_CHECKBOX; + m->m_def = "0"; + m->m_page = PAGE_FILTERS; + m->m_rowid = 1; + m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; + m->m_obj = OBJ_COLL; + m++; + /* m->m_title = "diffbot api"; m->m_cgi = "dapi"; diff --git a/Parms.h b/Parms.h index 7bb9038f..2483f246 100644 --- a/Parms.h +++ b/Parms.h @@ -29,10 +29,10 @@ void handleRequest3f ( UdpSlot *slot , int32_t niceness ) ; // special priorities for the priority drop down // in the url filters table -enum { - SPIDER_PRIORITY_FILTERED = -3 , - SPIDER_PRIORITY_BANNED = -2 , - SPIDER_PRIORITY_UNDEFINED = -1 }; +//enum { +// SPIDER_PRIORITY_FILTERED = -3 , +// SPIDER_PRIORITY_BANNED = -2 , +// SPIDER_PRIORITY_UNDEFINED = -1 }; enum { OBJ_CONF = 1 , diff --git a/Spider.cpp b/Spider.cpp index eb6ae6ff..c966ccaf 100644 --- a/Spider.cpp +++ b/Spider.cpp @@ -2313,14 +2313,16 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq , if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;} // do not add to doledb if bad - if ( priority == SPIDER_PRIORITY_FILTERED ) { + //if ( priority == SPIDER_PRIORITY_FILTERED ) { + if ( m_cr->m_forceDelete[ufn] ) { if ( g_conf.m_logDebugSpider ) log("spider: request %s is filtered ufn=%"INT32"", sreq->m_url,ufn); return true; } - if ( priority == SPIDER_PRIORITY_BANNED ) { + //if ( priority == SPIDER_PRIORITY_BANNED ) { + if ( m_cr->m_forceDelete[ufn] ) { if ( g_conf.m_logDebugSpider ) log("spider: request %s is banned ufn=%"INT32"", sreq->m_url,ufn); @@ -4267,8 +4269,11 @@ bool SpiderColl::scanListForWinners ( ) { } // set the priority (might be the same as old) int32_t priority = m_cr->m_spiderPriorities[ufn]; + // now get rid of negative priorities since we added a + // separate force delete checkbox in the url filters + if ( priority < 0 ) priority = 0; // sanity checks - if ( priority == -1 ) { char *xx=NULL;*xx=0; } + //if ( priority == -1 ) { char *xx=NULL;*xx=0; } if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;} if ( g_conf.m_logDebugSpider ) @@ -4285,10 +4290,11 @@ bool SpiderColl::scanListForWinners ( ) { // skip if banned (unless need to delete from index) bool skip = false; - if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true; - if ( priority == SPIDER_PRIORITY_BANNED ) skip = true; + // if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true; + // if ( priority == SPIDER_PRIORITY_BANNED ) skip = true; + if ( m_cr->m_forceDelete[ufn] ) skip = true; // but if it is currently indexed we have to delete it - if ( srep && srep->m_isIndexed ) skip = false; + if ( skip && srep && srep->m_isIndexed ) skip = false; if ( skip ) continue; // temp debug @@ -4298,8 +4304,10 @@ bool SpiderColl::scanListForWinners ( ) { // because we need to delete the url from the index. // seems like we need priority to be in [0-127] so make it 127. 
// just make 127 a reserved priority; - if ( priority < 0 ) - priority = 127; + if ( skip ) { + // force it to a delete + sreq->m_forceDelete = true; + } int64_t spiderTimeMS; spiderTimeMS = getSpiderTimeMS ( sreq,ufn,srep,nowGlobalMS ); diff --git a/XmlDoc.cpp b/XmlDoc.cpp index 5b250f36..df90aff9 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -3551,16 +3551,16 @@ int32_t *XmlDoc::getIndexCode2 ( ) { // and return to be called again i hope return (int32_t *)priority; } - if ( *priority == SPIDER_PRIORITY_FILTERED ) { + if ( *priority == -3 ) { // SPIDER_PRIORITY_FILTERED ) { m_indexCode = EDOCFILTERED; m_indexCodeValid = true; return &m_indexCode; } - if ( *priority == SPIDER_PRIORITY_BANNED ) { - m_indexCode = EDOCBANNED; - m_indexCodeValid = true; - return &m_indexCode; - } + // if ( *priority == SPIDER_PRIORITY_BANNED ) { + // m_indexCode = EDOCBANNED; + // m_indexCodeValid = true; + // return &m_indexCode; + // } // . if using diffbot and the diffbot reply had a time out error // or otherwise... diffbot failure demands a re-try always i guess. @@ -19907,8 +19907,9 @@ char *XmlDoc::getIsFiltered ( ) { int32_t *priority = getSpiderPriority(); if ( ! priority || priority == (void *)-1 ) return (char *)priority; m_isFiltered = false; - if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true; - if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true; + // if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true; + // if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true; + if ( *priority == -3 ) m_isFiltered = true; m_isFilteredValid = true; return &m_isFiltered; } @@ -19921,7 +19922,7 @@ int32_t *XmlDoc::getSpiderPriority ( ) { if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr; // this is an automatic ban! if ( gr->getLong("manualban",0) ) { - m_priority = SPIDER_PRIORITY_BANNED; + m_priority = -3;//SPIDER_PRIORITY_BANNED; m_priorityValid = true; return &m_priority; } @@ -19931,7 +19932,12 @@ int32_t *XmlDoc::getSpiderPriority ( ) { if ( *ufn < 0 ) { char *xx=NULL;*xx=0; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; + m_priority = cr->m_spiderPriorities[*ufn]; + + // continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now + if ( cr->m_forceDelete[*ufn] ) m_priority = -3; + m_priorityValid = true; return &m_priority; } @@ -30244,13 +30250,15 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) { // save it reply->m_urlFilterNum = ufn; // get spider priority if ufn is valid - int32_t pr = 0; if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn]; + int32_t pr = 0; + //if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn]; + if ( cr->m_forceDelete[ufn] ) pr = -3; // this is an automatic ban! - if ( gr->getLong("manualban",0) ) pr = SPIDER_PRIORITY_BANNED; + if ( gr->getLong("manualban",0))pr=-3;//SPIDER_PRIORITY_BANNED; // is it banned - if ( pr == SPIDER_PRIORITY_BANNED ) { // -2 + if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2 // set m_errno reply->m_errno = EDOCBANNED; // and this @@ -30266,12 +30274,12 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) { pr = 0; - if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3 - // set m_errno - reply->m_errno = EDOCFILTERED; - // and this - reply->m_isFiltered = true; - } + // if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3 + // // set m_errno + // reply->m_errno = EDOCFILTERED; + // // and this + // reply->m_isFiltered = true; + // } // done if we are if ( reply->m_errno && ! m_req->m_showBanned ) { // give back the url at least
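
Note on the change (not part of the patch): the hunks above retire the special negative priorities (SPIDER_PRIORITY_FILTERED = -3, SPIDER_PRIORITY_BANNED = -2) in favor of a per-rule force-delete checkbox (m_forceDelete, cgi "fdu" / forceDeleteUrls) combined with m_maxSpidersPerRule = 0 to stop spidering, and they clamp any leftover negative priority to 0. The stand-alone C++ sketch below is only an illustration of that decision flow under simplified, hypothetical names (UrlFilterRule, decideAction, Action); it is not code from Spider.cpp or XmlDoc.cpp and omits the real CollectionRec/SpiderColl plumbing.

    // Minimal sketch of the new per-rule decision flow, assuming simplified
    // types. forceDelete stands in for CollectionRec::m_forceDelete[ufn],
    // maxSpidersPerRule for m_maxSpidersPerRule[ufn], isIndexed for
    // SpiderReply::m_isIndexed.
    #include <cstdio>

    enum class Action { Spider, Ignore, DeleteFromIndex };

    struct UrlFilterRule {
        int  spiderPriority;    // now always >= 0; negatives get clamped
        int  maxSpidersPerRule; // 0 means "do not spider urls matching this rule"
        bool forceDelete;       // the new checkbox ("fdu" / forceDeleteUrls)
    };

    // Decide what to do with a url that matched rule r.
    Action decideAction(const UrlFilterRule &r, bool isIndexed, int *priorityOut) {
        // negative priorities no longer carry special meaning; clamp to 0
        int priority = (r.spiderPriority < 0) ? 0 : r.spiderPriority;
        if (priorityOut) *priorityOut = priority;

        if (r.forceDelete) {
            // already indexed: respider once so the delete can be applied;
            // otherwise simply refuse to let the url in
            return isIndexed ? Action::DeleteFromIndex : Action::Ignore;
        }
        if (r.maxSpidersPerRule <= 0)
            return Action::Ignore; // replaces the old FILTERED priority
        return Action::Spider;
    }

    int main() {
        UrlFilterRule deleted = {  0, 0, true  }; // old BANNED/FILTERED + delete
        UrlFilterRule ignored = {  0, 0, false }; // old FILTERED, kept if indexed
        UrlFilterRule normal  = { 50, 1, false };

        int pr = 0;
        std::printf("delete rule, indexed url -> %d\n",
                    (int)decideAction(deleted, true,  &pr));
        std::printf("delete rule, new url     -> %d\n",
                    (int)decideAction(deleted, false, &pr));
        std::printf("ignore rule, new url     -> %d\n",
                    (int)decideAction(ignored, false, &pr));
        std::printf("normal rule, new url     -> %d (priority %d)\n",
                    (int)decideAction(normal,  false, &pr), pr);
        return 0;
    }

Under this reading, the drop-down in the url filters table only ever offers non-negative priorities, and "delete" behavior is expressed solely through the checkbox rather than through a magic priority value.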