do away with filtered/banned spider priorities.

add a checkbox to force deletes: remove urls from the index if they are
already indexed, or do not allow them in at all.
mwells 2015-03-17 20:27:23 -06:00
parent dea534827e
commit dfc069aaa1
8 changed files with 112 additions and 47 deletions
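
In short: a url-filter row used to request filtering or banning by carrying one of the negative sentinel priorities (SPIDER_PRIORITY_FILTERED = -3, SPIDER_PRIORITY_BANNED = -2). After this commit every priority is non-negative; a row stops spidering by setting its max spiders to 0, and the new force-delete checkbox (m_forceDelete) removes a matching url from the index if it is already there and keeps it out otherwise. Below is a minimal standalone sketch of those rule semantics, using a hypothetical Rule struct rather than the real CollectionRec fields:

// Minimal sketch of the new rule semantics; Rule and decide() are
// illustrative stand-ins, not Gigablast's real types or field names.
#include <cstdio>

struct Rule {
    int  priority;          // now always >= 0 (the old -2/-3 sentinels are gone)
    int  maxSpidersPerRule; // 0 means "do not spider urls matching this rule"
    bool forceDelete;       // new checkbox: delete from / keep out of the index
};

// what the spider should do with a url matching rule r;
// isIndexed = an older copy of the url is already in the index
static const char *decide ( const Rule &r , bool isIndexed ) {
    if ( r.forceDelete )
        return isIndexed ? "respider only to delete it"
                         : "never let it into the index";
    if ( r.maxSpidersPerRule <= 0 )
        return "do not spider (leave any indexed copy alone)";
    return "spider normally";
}

int main ( ) {
    // an old "FILTERED" row becomes priority 0 with 0 spiders allowed
    Rule filtered  = { 0   , 0 , false };
    // the hastmperror rows below become priority 100 with the box checked
    Rule hardError = { 100 , 1 , true  };
    printf ( "filtered rule, not indexed: %s\n" , decide(filtered ,false) );
    printf ( "error rule, already indexed: %s\n" , decide(hardError,true ) );
    return 0;
}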

View File

@@ -2198,7 +2198,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
@@ -2456,6 +2457,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// more rules
@@ -2510,7 +2512,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
@@ -2871,6 +2874,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// done rebuilding CHINESE rules
return true;
@@ -2914,7 +2918,8 @@ bool CollectionRec::rebuildShallowRules ( ) {
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
m_spiderPriorities [n] = 100;
m_forceDelete [n] = 1;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
@@ -3089,6 +3094,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
m_numRegExs7 = n;
// done rebuilding SHALLOW rules
return true;
@@ -3465,6 +3471,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_spiderFreqs [i] = respiderFreq;
//m_spiderDiffbotApiUrl[i].purge();
m_harvestLinks[i] = true;
m_forceDelete [i] = false;
}
int32_t i = 0;
@@ -3477,7 +3484,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// 2nd default url
m_regExs[i].set("ismedia && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
m_maxSpidersPerRule [i] = 0;
i++;
// hopcount filter if asked for
@@ -3495,7 +3503,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_regExs[i].set(hopcountStr);
// means DELETE :
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// just don't spider
m_maxSpidersPerRule[i] = 0;
// compatibility with m_spiderRoundStartTime:
m_spiderFreqs[i] = 0.0;
@@ -3516,7 +3527,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// MDW: even if they supplied a crawl pattern let's restrict to seed
// domains 12/15/14
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
m_maxSpidersPerRule [i] = 0;
i++;
//}
@@ -3529,7 +3541,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// only negative patterns then restrict to domains of seeds
if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
m_maxSpidersPerRule [i] = 0;
i++;
}
@@ -3555,7 +3568,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
m_regExs[i].set("errorcount>=3 && hastmperror");
m_spiderPriorities [i] = 30;
m_spiderPriorities [i] = 3;
m_spiderFreqs [i] = 30; // 30 days
// if bulk job, do not download a url more than 3 times
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
@@ -3633,7 +3646,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
// do not crawl anything else
m_regExs[i].set("default");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// don't spider
m_maxSpidersPerRule[i] = 0;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
@@ -3653,7 +3668,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
i++;
// do not crawl anything else
m_regExs[i].set("default");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
// don't delete, just don't spider
m_maxSpidersPerRule[i] = 0;
// this needs to be zero so &spiderRoundStart=0
// functionality which sets m_spiderRoundStartTime
// to the current time works
@@ -3707,6 +3724,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
m_numRegExs6 = i;
//m_numRegExs7 = i;
m_numRegExs8 = i;
m_numRegExs7 = i;
//m_numRegExs11 = i;

View File

@@ -814,6 +814,9 @@ class CollectionRec {
int32_t m_numRegExs8;
char m_harvestLinks [ MAX_FILTERS ];
int32_t m_numRegExs7;
char m_forceDelete [ MAX_FILTERS ];
// dummy?
int32_t m_numRegExs9;
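
For readers unfamiliar with CollectionRec: the url-filters table is stored column-by-column as parallel arrays, each with its own m_numRegExsN count, which is why the rebuild routines above now fill m_forceDelete and keep m_numRegExs7 in sync alongside the other counters. A rough, simplified sketch of that layout (hypothetical subset of the fields, MAX_FILTERS value assumed only for illustration):

// Simplified parallel-array layout; a hypothetical subset of CollectionRec.
#include <cstdint>

#define MAX_FILTERS 96  // assumed value, for illustration only

struct MiniCollectionRec {
    int32_t     m_numRegExs;                         // rows in use
    const char *m_regExs            [ MAX_FILTERS ]; // column: expression
    int32_t     m_spiderPriorities  [ MAX_FILTERS ]; // column: priority (>= 0)
    int32_t     m_maxSpidersPerRule [ MAX_FILTERS ]; // column: max spiders
    int32_t     m_numRegExs7;                        // count kept for the new column
    char        m_forceDelete       [ MAX_FILTERS ]; // column: delete checkbox
};

// appending a rule fills one slot in every column and bumps every count,
// mirroring the n++ pattern in rebuildUrlFilters2() and friends
static void addRule ( MiniCollectionRec &cr , const char *expr ,
                      int32_t prio , int32_t maxSpiders , char del ) {
    int32_t n = cr.m_numRegExs;
    cr.m_regExs            [n] = expr;
    cr.m_spiderPriorities  [n] = prio;
    cr.m_maxSpidersPerRule [n] = maxSpiders;
    cr.m_forceDelete       [n] = del;
    cr.m_numRegExs  = n + 1;
    cr.m_numRegExs7 = n + 1; // keep the per-column count in sync
}

int main ( ) {
    static MiniCollectionRec cr = {};
    addRule ( cr , "errorcount>=3 && hastmperror" , 100 , 1 , 1 );
    return 0;
}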

View File

@@ -175,6 +175,11 @@ vclean:
@echo ""
@echo "sudo yum install gcc-c++"
@echo ""
@echo ""
@echo "If make fails on CentOS then first run:"
@echo ""
@echo "sudo yum install gcc-c++ openssl-devel"
@echo ""
@echo "*****"
@echo ""

View File

@@ -848,8 +848,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// lastspidertime>={roundstart} --> spiders disabled rule
// so that we do not spider a url twice in the same round
if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) {
cr->m_regExs[ufn].length() &&
// we set this to 0 instead of using the checkbox
cr->m_maxSpidersPerRule[ufn] <= 0 ) {
strstr(cr->m_regExs[ufn].getBufStart(),"round") ) {
//cr->m_maxSpidersPerRule[ufn] <= 0 ) {
priority = -5;
}
@@ -935,10 +937,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
//, iptoa(sreq->m_firstIp)
);
// print priority
if ( priority == SPIDER_PRIORITY_FILTERED )
//if ( priority == SPIDER_PRIORITY_FILTERED )
// we just turn off the spiders now
if ( ufn >= 0 && cr->m_maxSpidersPerRule[ufn] <= 0 )
sb->safePrintf("url ignored");
else if ( priority == SPIDER_PRIORITY_BANNED )
sb->safePrintf("url banned");
//else if ( priority == SPIDER_PRIORITY_BANNED )
// sb->safePrintf("url banned");
else if ( priority == -4 )
sb->safePrintf("error");
else if ( priority == -5 )

View File

@@ -1625,6 +1625,11 @@ bool printDropDown ( int32_t n , SafeBuf* sb, char *name, int32_t select,
// . by default, minus 2 includes minus 3, the new "FILTERED" priority
// . it is link "BANNED" but does not mean the url is low quality necessarily
if ( includeMinusTwo ) i = -3;
// no more DELETE, etc.
i = 0;
if ( select < 0 ) select = 0;
for ( ; i < n ; i++ ) {
if ( i == select ) s = " selected";
else s = "";
@@ -12987,9 +12992,10 @@ void Parms::init ( ) {
"together in the same expression text box. "
"A <i>spider priority</i> of "
//"<i>FILTERED</i> or <i>BANNED</i> "
"<i>DELETE</i> "
"will cause the URL to not be spidered, or if it has already "
"been indexed, it will be deleted when it is respidered."
// "<i>DELETE</i> "
// "will cause the URL to not be spidered, "
// "or if it has already "
// "been indexed, it will be deleted when it is respidered."
"<br><br>";
/*
@@ -13173,6 +13179,19 @@ void Parms::init ( ) {
m->m_addin = 1; // "insert" follows?
m++;
m->m_title = "delete";
m->m_cgi = "fdu";
m->m_xml = "forceDeleteUrls";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_forceDelete - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_page = PAGE_FILTERS;
m->m_rowid = 1;
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
m->m_obj = OBJ_COLL;
m++;
/*
m->m_title = "diffbot api";
m->m_cgi = "dapi";

View File

@@ -29,10 +29,10 @@ void handleRequest3f ( UdpSlot *slot , int32_t niceness ) ;
// special priorities for the priority drop down
// in the url filters table
enum {
SPIDER_PRIORITY_FILTERED = -3 ,
SPIDER_PRIORITY_BANNED = -2 ,
SPIDER_PRIORITY_UNDEFINED = -1 };
//enum {
// SPIDER_PRIORITY_FILTERED = -3 ,
// SPIDER_PRIORITY_BANNED = -2 ,
// SPIDER_PRIORITY_UNDEFINED = -1 };
enum {
OBJ_CONF = 1 ,

View File

@@ -2313,14 +2313,16 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
// do not add to doledb if bad
if ( priority == SPIDER_PRIORITY_FILTERED ) {
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
if ( m_cr->m_forceDelete[ufn] ) {
if ( g_conf.m_logDebugSpider )
log("spider: request %s is filtered ufn=%"INT32"",
sreq->m_url,ufn);
return true;
}
if ( priority == SPIDER_PRIORITY_BANNED ) {
//if ( priority == SPIDER_PRIORITY_BANNED ) {
if ( m_cr->m_forceDelete[ufn] ) {
if ( g_conf.m_logDebugSpider )
log("spider: request %s is banned ufn=%"INT32"",
sreq->m_url,ufn);
@@ -4267,8 +4269,11 @@ bool SpiderColl::scanListForWinners ( ) {
}
// set the priority (might be the same as old)
int32_t priority = m_cr->m_spiderPriorities[ufn];
// now get rid of negative priorities since we added a
// separate force delete checkbox in the url filters
if ( priority < 0 ) priority = 0;
// sanity checks
if ( priority == -1 ) { char *xx=NULL;*xx=0; }
//if ( priority == -1 ) { char *xx=NULL;*xx=0; }
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
if ( g_conf.m_logDebugSpider )
@@ -4285,10 +4290,11 @@ bool SpiderColl::scanListForWinners ( ) {
// skip if banned (unless need to delete from index)
bool skip = false;
if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true;
if ( priority == SPIDER_PRIORITY_BANNED ) skip = true;
// if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true;
// if ( priority == SPIDER_PRIORITY_BANNED ) skip = true;
if ( m_cr->m_forceDelete[ufn] ) skip = true;
// but if it is currently indexed we have to delete it
if ( srep && srep->m_isIndexed ) skip = false;
if ( skip && srep && srep->m_isIndexed ) skip = false;
if ( skip ) continue;
// temp debug
@@ -4298,8 +4304,10 @@ bool SpiderColl::scanListForWinners ( ) {
// because we need to delete the url from the index.
// seems like we need priority to be in [0-127] so make it 127.
// just make 127 a reserved priority;
if ( priority < 0 )
priority = 127;
if ( skip ) {
// force it to a delete
sreq->m_forceDelete = true;
}
int64_t spiderTimeMS;
spiderTimeMS = getSpiderTimeMS ( sreq,ufn,srep,nowGlobalMS );
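
Taken together, the SpiderColl changes above replace the two negative-priority checks with the checkbox: stale negative priorities are clamped to 0, a url matching a force-delete rule is normally skipped, but if an indexed copy exists the request is kept and flagged with m_forceDelete so it is fetched once just to delete it. A standalone sketch of that intended decision (placeholder structs, not the real SpiderRequest/SpiderReply, and not the literal control flow of scanListForWinners):

// Standalone sketch of the intended skip / force-delete decision.
#include <cstdio>

struct MiniRequest { bool m_forceDelete; };
struct MiniReply   { bool m_isIndexed;   };

// returns true if the request should be dropped from doledb entirely
static bool applyForceDelete ( MiniRequest *sreq , const MiniReply *srep ,
                               int &priority , bool ruleForceDelete ) {
    // negative priorities no longer exist; clamp anything stale to 0
    if ( priority < 0 ) priority = 0;
    bool skip = ruleForceDelete;
    // but if the url is currently indexed we still have to spider it
    // once so the delete can actually be applied
    if ( skip && srep && srep->m_isIndexed ) skip = false;
    if ( skip ) return true;                           // never indexed: just drop it
    if ( ruleForceDelete ) sreq->m_forceDelete = true; // fetch only to delete
    return false;
}

int main ( ) {
    MiniRequest req = { false };
    MiniReply   rep = { true };   // an indexed copy exists
    int prio = -3;                // a stale negative priority
    bool dropped = applyForceDelete ( &req , &rep , prio , true );
    printf ( "dropped=%d forceDelete=%d priority=%d\n" ,
             dropped , req.m_forceDelete , prio );
    return 0;
}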

View File

@@ -3551,16 +3551,16 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
// and return to be called again i hope
return (int32_t *)priority;
}
if ( *priority == SPIDER_PRIORITY_FILTERED ) {
if ( *priority == -3 ) { // SPIDER_PRIORITY_FILTERED ) {
m_indexCode = EDOCFILTERED;
m_indexCodeValid = true;
return &m_indexCode;
}
if ( *priority == SPIDER_PRIORITY_BANNED ) {
m_indexCode = EDOCBANNED;
m_indexCodeValid = true;
return &m_indexCode;
}
// if ( *priority == SPIDER_PRIORITY_BANNED ) {
// m_indexCode = EDOCBANNED;
// m_indexCodeValid = true;
// return &m_indexCode;
// }
// . if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
@@ -19907,8 +19907,9 @@ char *XmlDoc::getIsFiltered ( ) {
int32_t *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
m_isFiltered = false;
if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
if ( *priority == -3 ) m_isFiltered = true;
m_isFilteredValid = true;
return &m_isFiltered;
}
@@ -19921,7 +19922,7 @@ int32_t *XmlDoc::getSpiderPriority ( ) {
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
// this is an automatic ban!
if ( gr->getLong("manualban",0) ) {
m_priority = SPIDER_PRIORITY_BANNED;
m_priority = -3;//SPIDER_PRIORITY_BANNED;
m_priorityValid = true;
return &m_priority;
}
@@ -19931,7 +19932,12 @@ int32_t *XmlDoc::getSpiderPriority ( ) {
if ( *ufn < 0 ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
m_priority = cr->m_spiderPriorities[*ufn];
// continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now
if ( cr->m_forceDelete[*ufn] ) m_priority = -3;
m_priorityValid = true;
return &m_priority;
}
@@ -30244,13 +30250,15 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// save it
reply->m_urlFilterNum = ufn;
// get spider priority if ufn is valid
int32_t pr = 0; if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn];
int32_t pr = 0;
//if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn];
if ( cr->m_forceDelete[ufn] ) pr = -3;
// this is an automatic ban!
if ( gr->getLong("manualban",0) ) pr = SPIDER_PRIORITY_BANNED;
if ( gr->getLong("manualban",0))pr=-3;//SPIDER_PRIORITY_BANNED;
// is it banned
if ( pr == SPIDER_PRIORITY_BANNED ) { // -2
if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
// set m_errno
reply->m_errno = EDOCBANNED;
// and this
@@ -30266,12 +30274,12 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
pr = 0;
if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3
// set m_errno
reply->m_errno = EDOCFILTERED;
// and this
reply->m_isFiltered = true;
}
// if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3
// // set m_errno
// reply->m_errno = EDOCFILTERED;
// // and this
// reply->m_isFiltered = true;
// }
// done if we are
if ( reply->m_errno && ! m_req->m_showBanned ) {
// give back the url at least
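
Across the XmlDoc changes, -3 survives only as an internal marker: getSpiderPriority() returns it when the matching url-filter row has the delete checkbox checked or the tag record carries a manualban, and the callers map that marker to EDOCFILTERED / EDOCBANNED. A condensed sketch of that flow (illustrative enum values and helpers, not the real error codes or XmlDoc methods):

// Condensed sketch of how the -3 marker flows after this commit.
#include <cstdio>

enum { EDOCOK = 0 , EDOCFILTERED = 1 , EDOCBANNED = 2 };

// stand-in for getSpiderPriority(): -3 now only marks filtered/banned urls
static int spiderPriority ( int rulePriority , bool forceDelete ,
                            bool manualBan ) {
    if ( manualBan   ) return -3;  // tagdb "manualban" tag
    if ( forceDelete ) return -3;  // url-filters delete checkbox
    return rulePriority;           // otherwise a normal value >= 0
}

// stand-in for getIndexCode2(): the separate BANNED branch is gone
static int indexCode ( int priority ) {
    if ( priority == -3 ) return EDOCFILTERED;
    return EDOCOK;
}

int main ( ) {
    printf ( "normal rule -> %d\n" , indexCode ( spiderPriority(50,false,false) ) );
    printf ( "delete rule -> %d\n" , indexCode ( spiderPriority(50,true ,false) ) );
    printf ( "manual ban  -> %d\n" , indexCode ( spiderPriority(50,false,true ) ) );
    return 0;
}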