From dfc069aaa195e5b7b02620bef4d3245006be04e0 Mon Sep 17 00:00:00 2001 From: mwells Date: Tue, 17 Mar 2015 20:27:23 -0600 Subject: [PATCH] do away with filtered/banned spider priorities. add checkbox to signify force deletes to remove urls from index if in the index, or not allow them in. --- Collectiondb.cpp | 38 ++++++++++++++++++++++++++++---------- Collectiondb.h | 3 +++ Makefile | 5 +++++ PageCrawlBot.cpp | 12 ++++++++---- Parms.cpp | 25 ++++++++++++++++++++++--- Parms.h | 8 ++++---- Spider.cpp | 24 ++++++++++++++++-------- XmlDoc.cpp | 44 ++++++++++++++++++++++++++------------------ 8 files changed, 112 insertions(+), 47 deletions(-) diff --git a/Collectiondb.cpp b/Collectiondb.cpp index 99fc784f..b61a2bff 100644 --- a/Collectiondb.cpp +++ b/Collectiondb.cpp @@ -2198,7 +2198,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) { m_maxSpidersPerRule [n] = 1; // max spiders m_spiderIpMaxSpiders [n] = 1; // max spiders per ip m_spiderIpWaits [n] = 1000; // same ip wait - m_spiderPriorities [n] = 3; + m_spiderPriorities [n] = 100; + m_forceDelete [n] = 1; n++; m_regExs[n].set("errorcount>=1 && hastmperror"); @@ -2456,6 +2457,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) { m_numRegExs5 = n; m_numRegExs6 = n; m_numRegExs8 = n; + m_numRegExs7 = n; // more rules @@ -2510,7 +2512,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) { m_maxSpidersPerRule [n] = 1; // max spiders m_spiderIpMaxSpiders [n] = 1; // max spiders per ip m_spiderIpWaits [n] = 1000; // same ip wait - m_spiderPriorities [n] = 3; + m_spiderPriorities [n] = 100; + m_forceDelete [n] = 1; n++; m_regExs[n].set("errorcount>=1 && hastmperror"); @@ -2871,6 +2874,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) { m_numRegExs5 = n; m_numRegExs6 = n; m_numRegExs8 = n; + m_numRegExs7 = n; // done rebuilding CHINESE rules return true; @@ -2914,7 +2918,8 @@ bool CollectionRec::rebuildShallowRules ( ) { m_maxSpidersPerRule [n] = 1; // max spiders m_spiderIpMaxSpiders [n] = 1; // max spiders per ip m_spiderIpWaits [n] = 1000; // same ip wait - m_spiderPriorities [n] = 3; + m_spiderPriorities [n] = 100; + m_forceDelete [n] = 1; n++; m_regExs[n].set("errorcount>=1 && hastmperror"); @@ -3089,6 +3094,7 @@ bool CollectionRec::rebuildShallowRules ( ) { m_numRegExs5 = n; m_numRegExs6 = n; m_numRegExs8 = n; + m_numRegExs7 = n; // done rebuilding SHALLOW rules return true; @@ -3465,6 +3471,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { m_spiderFreqs [i] = respiderFreq; //m_spiderDiffbotApiUrl[i].purge(); m_harvestLinks[i] = true; + m_forceDelete [i] = false; } int32_t i = 0; @@ -3477,7 +3484,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { // 2nd default url m_regExs[i].set("ismedia && !ismanualadd"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + m_maxSpidersPerRule [i] = 0; i++; // hopcount filter if asked for @@ -3495,7 +3503,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { m_regExs[i].set(hopcountStr); // means DELETE : - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + + // just don't spider + m_maxSpidersPerRule[i] = 0; // compatibility with m_spiderRoundStartTime: m_spiderFreqs[i] = 0.0; @@ -3516,7 +3527,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { // MDW: even if they supplied a crawl pattern let's restrict to seed // domains 12/15/14 m_regExs[i].set("!isonsamedomain && !ismanualadd"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + 
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + m_maxSpidersPerRule [i] = 0; i++; //} @@ -3529,7 +3541,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { // only negative patterns then restrict to domains of seeds if ( ucp && ! ucpHasPositive && ! m_hasucr ) { m_regExs[i].set("!isonsamedomain && !ismanualadd"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + m_maxSpidersPerRule [i] = 0; i++; } @@ -3555,7 +3568,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { // excessive errors? (tcp/dns timed out, etc.) retry once per month? m_regExs[i].set("errorcount>=3 && hastmperror"); - m_spiderPriorities [i] = 30; + m_spiderPriorities [i] = 3; m_spiderFreqs [i] = 30; // 30 days // if bulk job, do not download a url more than 3 times if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0; @@ -3633,7 +3646,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { i++; // do not crawl anything else m_regExs[i].set("default"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + // don't spider + m_maxSpidersPerRule[i] = 0; // this needs to be zero so &spiderRoundStart=0 // functionality which sets m_spiderRoundStartTime // to the current time works @@ -3653,7 +3668,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { i++; // do not crawl anything else m_regExs[i].set("default"); - m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; + m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED; + // don't delete, just don't spider + m_maxSpidersPerRule[i] = 0; // this needs to be zero so &spiderRoundStart=0 // functionality which sets m_spiderRoundStartTime // to the current time works @@ -3707,6 +3724,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() { m_numRegExs6 = i; //m_numRegExs7 = i; m_numRegExs8 = i; + m_numRegExs7 = i; //m_numRegExs11 = i; diff --git a/Collectiondb.h b/Collectiondb.h index 43faceed..f6a2e567 100644 --- a/Collectiondb.h +++ b/Collectiondb.h @@ -814,6 +814,9 @@ class CollectionRec { int32_t m_numRegExs8; char m_harvestLinks [ MAX_FILTERS ]; + int32_t m_numRegExs7; + char m_forceDelete [ MAX_FILTERS ]; + // dummy? int32_t m_numRegExs9; diff --git a/Makefile b/Makefile index 6e79c11f..4e6eefac 100644 --- a/Makefile +++ b/Makefile @@ -175,6 +175,11 @@ vclean: @echo "" @echo "sudo yum install gcc-c++" @echo "" + @echo "" + @echo "If make fails on CentOS then first run:" + @echo "" + @echo "sudo yum install gcc-c++ openssl-devel" + @echo "" @echo "*****" @echo "" diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp index c513da35..8213b3f3 100644 --- a/PageCrawlBot.cpp +++ b/PageCrawlBot.cpp @@ -848,8 +848,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ // lastspidertime>={roundstart} --> spiders disabled rule // so that we do not spider a url twice in the same round if ( ufn >= 0 && //! 
cr->m_spidersEnabled[ufn] ) { + cr->m_regExs[ufn].length() && // we set this to 0 instead of using the checkbox - cr->m_maxSpidersPerRule[ufn] <= 0 ) { + strstr(cr->m_regExs[ufn].getBufStart(),"round") ) { + //cr->m_maxSpidersPerRule[ufn] <= 0 ) { priority = -5; } @@ -935,10 +937,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ //, iptoa(sreq->m_firstIp) ); // print priority - if ( priority == SPIDER_PRIORITY_FILTERED ) + //if ( priority == SPIDER_PRIORITY_FILTERED ) + // we just turn off the spiders now + if ( ufn >= 0 && cr->m_maxSpidersPerRule[ufn] <= 0 ) sb->safePrintf("url ignored"); - else if ( priority == SPIDER_PRIORITY_BANNED ) - sb->safePrintf("url banned"); + //else if ( priority == SPIDER_PRIORITY_BANNED ) + // sb->safePrintf("url banned"); else if ( priority == -4 ) sb->safePrintf("error"); else if ( priority == -5 ) diff --git a/Parms.cpp b/Parms.cpp index f055ff7f..6ba9da17 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -1625,6 +1625,11 @@ bool printDropDown ( int32_t n , SafeBuf* sb, char *name, int32_t select, // . by default, minus 2 includes minus 3, the new "FILTERED" priority // . it is link "BANNED" but does not mean the url is low quality necessarily if ( includeMinusTwo ) i = -3; + + // no more DELETE, etc. + i = 0; + if ( select < 0 ) select = 0; + for ( ; i < n ; i++ ) { if ( i == select ) s = " selected"; else s = ""; @@ -12987,9 +12992,10 @@ void Parms::init ( ) { "together in the same expression text box. " "A spider priority of " //"FILTERED or BANNED " - "DELETE " - "will cause the URL to not be spidered, or if it has already " - "been indexed, it will be deleted when it is respidered." + // "DELETE " + // "will cause the URL to not be spidered, " + // "or if it has already " + // "been indexed, it will be deleted when it is respidered." "

"; /* @@ -13173,6 +13179,19 @@ void Parms::init ( ) { m->m_addin = 1; // "insert" follows? m++; + m->m_title = "delete"; + m->m_cgi = "fdu"; + m->m_xml = "forceDeleteUrls"; + m->m_max = MAX_FILTERS; + m->m_off = (char *)cr.m_forceDelete - x; + m->m_type = TYPE_CHECKBOX; + m->m_def = "0"; + m->m_page = PAGE_FILTERS; + m->m_rowid = 1; + m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE; + m->m_obj = OBJ_COLL; + m++; + /* m->m_title = "diffbot api"; m->m_cgi = "dapi"; diff --git a/Parms.h b/Parms.h index 7bb9038f..2483f246 100644 --- a/Parms.h +++ b/Parms.h @@ -29,10 +29,10 @@ void handleRequest3f ( UdpSlot *slot , int32_t niceness ) ; // special priorities for the priority drop down // in the url filters table -enum { - SPIDER_PRIORITY_FILTERED = -3 , - SPIDER_PRIORITY_BANNED = -2 , - SPIDER_PRIORITY_UNDEFINED = -1 }; +//enum { +// SPIDER_PRIORITY_FILTERED = -3 , +// SPIDER_PRIORITY_BANNED = -2 , +// SPIDER_PRIORITY_UNDEFINED = -1 }; enum { OBJ_CONF = 1 , diff --git a/Spider.cpp b/Spider.cpp index eb6ae6ff..c966ccaf 100644 --- a/Spider.cpp +++ b/Spider.cpp @@ -2313,14 +2313,16 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq , if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;} // do not add to doledb if bad - if ( priority == SPIDER_PRIORITY_FILTERED ) { + //if ( priority == SPIDER_PRIORITY_FILTERED ) { + if ( m_cr->m_forceDelete[ufn] ) { if ( g_conf.m_logDebugSpider ) log("spider: request %s is filtered ufn=%"INT32"", sreq->m_url,ufn); return true; } - if ( priority == SPIDER_PRIORITY_BANNED ) { + //if ( priority == SPIDER_PRIORITY_BANNED ) { + if ( m_cr->m_forceDelete[ufn] ) { if ( g_conf.m_logDebugSpider ) log("spider: request %s is banned ufn=%"INT32"", sreq->m_url,ufn); @@ -4267,8 +4269,11 @@ bool SpiderColl::scanListForWinners ( ) { } // set the priority (might be the same as old) int32_t priority = m_cr->m_spiderPriorities[ufn]; + // now get rid of negative priorities since we added a + // separate force delete checkbox in the url filters + if ( priority < 0 ) priority = 0; // sanity checks - if ( priority == -1 ) { char *xx=NULL;*xx=0; } + //if ( priority == -1 ) { char *xx=NULL;*xx=0; } if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;} if ( g_conf.m_logDebugSpider ) @@ -4285,10 +4290,11 @@ bool SpiderColl::scanListForWinners ( ) { // skip if banned (unless need to delete from index) bool skip = false; - if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true; - if ( priority == SPIDER_PRIORITY_BANNED ) skip = true; + // if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true; + // if ( priority == SPIDER_PRIORITY_BANNED ) skip = true; + if ( m_cr->m_forceDelete[ufn] ) skip = true; // but if it is currently indexed we have to delete it - if ( srep && srep->m_isIndexed ) skip = false; + if ( skip && srep && srep->m_isIndexed ) skip = false; if ( skip ) continue; // temp debug @@ -4298,8 +4304,10 @@ bool SpiderColl::scanListForWinners ( ) { // because we need to delete the url from the index. // seems like we need priority to be in [0-127] so make it 127. 
// just make 127 a reserved priority; - if ( priority < 0 ) - priority = 127; + if ( skip ) { + // force it to a delete + sreq->m_forceDelete = true; + } int64_t spiderTimeMS; spiderTimeMS = getSpiderTimeMS ( sreq,ufn,srep,nowGlobalMS ); diff --git a/XmlDoc.cpp b/XmlDoc.cpp index 5b250f36..df90aff9 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -3551,16 +3551,16 @@ int32_t *XmlDoc::getIndexCode2 ( ) { // and return to be called again i hope return (int32_t *)priority; } - if ( *priority == SPIDER_PRIORITY_FILTERED ) { + if ( *priority == -3 ) { // SPIDER_PRIORITY_FILTERED ) { m_indexCode = EDOCFILTERED; m_indexCodeValid = true; return &m_indexCode; } - if ( *priority == SPIDER_PRIORITY_BANNED ) { - m_indexCode = EDOCBANNED; - m_indexCodeValid = true; - return &m_indexCode; - } + // if ( *priority == SPIDER_PRIORITY_BANNED ) { + // m_indexCode = EDOCBANNED; + // m_indexCodeValid = true; + // return &m_indexCode; + // } // . if using diffbot and the diffbot reply had a time out error // or otherwise... diffbot failure demands a re-try always i guess. @@ -19907,8 +19907,9 @@ char *XmlDoc::getIsFiltered ( ) { int32_t *priority = getSpiderPriority(); if ( ! priority || priority == (void *)-1 ) return (char *)priority; m_isFiltered = false; - if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true; - if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true; + // if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true; + // if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true; + if ( *priority == -3 ) m_isFiltered = true; m_isFilteredValid = true; return &m_isFiltered; } @@ -19921,7 +19922,7 @@ int32_t *XmlDoc::getSpiderPriority ( ) { if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr; // this is an automatic ban! if ( gr->getLong("manualban",0) ) { - m_priority = SPIDER_PRIORITY_BANNED; + m_priority = -3;//SPIDER_PRIORITY_BANNED; m_priorityValid = true; return &m_priority; } @@ -19931,7 +19932,12 @@ int32_t *XmlDoc::getSpiderPriority ( ) { if ( *ufn < 0 ) { char *xx=NULL;*xx=0; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; + m_priority = cr->m_spiderPriorities[*ufn]; + + // continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now + if ( cr->m_forceDelete[*ufn] ) m_priority = -3; + m_priorityValid = true; return &m_priority; } @@ -30244,13 +30250,15 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) { // save it reply->m_urlFilterNum = ufn; // get spider priority if ufn is valid - int32_t pr = 0; if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn]; + int32_t pr = 0; + //if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn]; + if ( cr->m_forceDelete[ufn] ) pr = -3; // this is an automatic ban! - if ( gr->getLong("manualban",0) ) pr = SPIDER_PRIORITY_BANNED; + if ( gr->getLong("manualban",0))pr=-3;//SPIDER_PRIORITY_BANNED; // is it banned - if ( pr == SPIDER_PRIORITY_BANNED ) { // -2 + if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2 // set m_errno reply->m_errno = EDOCBANNED; // and this @@ -30266,12 +30274,12 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) { pr = 0; - if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3 - // set m_errno - reply->m_errno = EDOCFILTERED; - // and this - reply->m_isFiltered = true; - } + // if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3 + // // set m_errno + // reply->m_errno = EDOCFILTERED; + // // and this + // reply->m_isFiltered = true; + // } // done if we are if ( reply->m_errno && ! m_req->m_showBanned ) { // give back the url at least
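
Note on the change (not part of the patch): the hunks above retire the special negative priorities (SPIDER_PRIORITY_FILTERED = -3, SPIDER_PRIORITY_BANNED = -2) in favor of a per-rule force-delete checkbox (m_forceDelete, cgi "fdu" / forceDeleteUrls) combined with m_maxSpidersPerRule = 0 to stop spidering, and they clamp any leftover negative priority to 0. The stand-alone C++ sketch below is only an illustration of that decision flow under simplified, hypothetical names (UrlFilterRule, decideAction, Action); it is not code from Spider.cpp or XmlDoc.cpp and omits the real CollectionRec/SpiderColl plumbing.

    // Minimal sketch of the new per-rule decision flow, assuming simplified
    // types. forceDelete stands in for CollectionRec::m_forceDelete[ufn],
    // maxSpidersPerRule for m_maxSpidersPerRule[ufn], isIndexed for
    // SpiderReply::m_isIndexed.
    #include <cstdio>

    enum class Action { Spider, Ignore, DeleteFromIndex };

    struct UrlFilterRule {
        int  spiderPriority;    // now always >= 0; negatives get clamped
        int  maxSpidersPerRule; // 0 means "do not spider urls matching this rule"
        bool forceDelete;       // the new checkbox ("fdu" / forceDeleteUrls)
    };

    // Decide what to do with a url that matched rule r.
    Action decideAction(const UrlFilterRule &r, bool isIndexed, int *priorityOut) {
        // negative priorities no longer carry special meaning; clamp to 0
        int priority = (r.spiderPriority < 0) ? 0 : r.spiderPriority;
        if (priorityOut) *priorityOut = priority;

        if (r.forceDelete) {
            // already indexed: respider once so the delete can be applied;
            // otherwise simply refuse to let the url in
            return isIndexed ? Action::DeleteFromIndex : Action::Ignore;
        }
        if (r.maxSpidersPerRule <= 0)
            return Action::Ignore; // replaces the old FILTERED priority
        return Action::Spider;
    }

    int main() {
        UrlFilterRule deleted = {  0, 0, true  }; // old BANNED/FILTERED + delete
        UrlFilterRule ignored = {  0, 0, false }; // old FILTERED, kept if indexed
        UrlFilterRule normal  = { 50, 1, false };

        int pr = 0;
        std::printf("delete rule, indexed url -> %d\n",
                    (int)decideAction(deleted, true,  &pr));
        std::printf("delete rule, new url     -> %d\n",
                    (int)decideAction(deleted, false, &pr));
        std::printf("ignore rule, new url     -> %d\n",
                    (int)decideAction(ignored, false, &pr));
        std::printf("normal rule, new url     -> %d (priority %d)\n",
                    (int)decideAction(normal,  false, &pr), pr);
        return 0;
    }

Under this reading, the drop-down in the url filters table only ever offers non-negative priorities, and "delete" behavior is expressed solely through the checkbox rather than through a magic priority value.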