mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Do away with the FILTERED/BANNED spider priorities.
Add a "force delete" checkbox to the url filters: a matching url is removed from the index if it is already indexed, and is otherwise not allowed in.
This commit is contained in:
parent
dea534827e
commit
dfc069aaa1
@ -2198,7 +2198,8 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -2456,6 +2457,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// more rules
|
||||
|
||||
@ -2510,7 +2512,8 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -2871,6 +2874,7 @@ bool CollectionRec::rebuildLangRules ( char *langStr , char *tldStr ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// done rebuilding CHINESE rules
|
||||
return true;
|
||||
@ -2914,7 +2918,8 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
m_spiderPriorities [n] = 100;
|
||||
m_forceDelete [n] = 1;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
@ -3089,6 +3094,7 @@ bool CollectionRec::rebuildShallowRules ( ) {
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
m_numRegExs7 = n;
|
||||
|
||||
// done rebuilding SHALLOW rules
|
||||
return true;
|
||||
@ -3465,6 +3471,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_spiderFreqs [i] = respiderFreq;
|
||||
//m_spiderDiffbotApiUrl[i].purge();
|
||||
m_harvestLinks[i] = true;
|
||||
m_forceDelete [i] = false;
|
||||
}
|
||||
|
||||
int32_t i = 0;
|
||||
@ -3477,7 +3484,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
|
||||
// 2nd default url
|
||||
m_regExs[i].set("ismedia && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
i++;
|
||||
|
||||
// hopcount filter if asked for
|
||||
@ -3495,7 +3503,10 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_regExs[i].set(hopcountStr);
|
||||
|
||||
// means DELETE :
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
|
||||
// just don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
|
||||
// compatibility with m_spiderRoundStartTime:
|
||||
m_spiderFreqs[i] = 0.0;
|
||||
@ -3516,7 +3527,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
// MDW: even if they supplied a crawl pattern let's restrict to seed
|
||||
// domains 12/15/14
|
||||
m_regExs[i].set("!isonsamedomain && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
i++;
|
||||
//}
|
||||
|
||||
@ -3529,7 +3541,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
// only negative patterns then restrict to domains of seeds
|
||||
if ( ucp && ! ucpHasPositive && ! m_hasucr ) {
|
||||
m_regExs[i].set("!isonsamedomain && !ismanualadd");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
m_maxSpidersPerRule [i] = 0;
|
||||
i++;
|
||||
}
|
||||
|
||||
@ -3555,7 +3568,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
|
||||
// excessive errors? (tcp/dns timed out, etc.) retry once per month?
|
||||
m_regExs[i].set("errorcount>=3 && hastmperror");
|
||||
m_spiderPriorities [i] = 30;
|
||||
m_spiderPriorities [i] = 3;
|
||||
m_spiderFreqs [i] = 30; // 30 days
|
||||
// if bulk job, do not download a url more than 3 times
|
||||
if ( m_isCustomCrawl == 2 ) m_maxSpidersPerRule [i] = 0;
|
||||
@ -3633,7 +3646,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
i++;
|
||||
// do not crawl anything else
|
||||
m_regExs[i].set("default");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
// don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
// this needs to be zero so &spiderRoundStart=0
|
||||
// functionality which sets m_spiderRoundStartTime
|
||||
// to the current time works
|
||||
@ -3653,7 +3668,9 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
i++;
|
||||
// do not crawl anything else
|
||||
m_regExs[i].set("default");
|
||||
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
m_spiderPriorities [i] = 0;//SPIDER_PRIORITY_FILTERED;
|
||||
// don't delete, just don't spider
|
||||
m_maxSpidersPerRule[i] = 0;
|
||||
// this needs to be zero so &spiderRoundStart=0
|
||||
// functionality which sets m_spiderRoundStartTime
|
||||
// to the current time works
|
||||
@ -3707,6 +3724,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
m_numRegExs6 = i;
|
||||
//m_numRegExs7 = i;
|
||||
m_numRegExs8 = i;
|
||||
m_numRegExs7 = i;
|
||||
//m_numRegExs11 = i;
|
||||
|
||||
|
||||
|
@ -814,6 +814,9 @@ class CollectionRec {
|
||||
int32_t m_numRegExs8;
|
||||
char m_harvestLinks [ MAX_FILTERS ];
|
||||
|
||||
int32_t m_numRegExs7;
|
||||
char m_forceDelete [ MAX_FILTERS ];
|
||||
|
||||
// dummy?
|
||||
int32_t m_numRegExs9;
|
||||
|
||||
|
5
Makefile
5
Makefile
@ -175,6 +175,11 @@ vclean:
|
||||
@echo ""
|
||||
@echo "sudo yum install gcc-c++"
|
||||
@echo ""
|
||||
@echo ""
|
||||
@echo "If make fails on CentOS then first run:"
|
||||
@echo ""
|
||||
@echo "sudo yum install gcc-c++ openssl-devel"
|
||||
@echo ""
|
||||
@echo "*****"
|
||||
@echo ""
|
||||
|
||||
|
@ -848,8 +848,10 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
// lastspidertime>={roundstart} --> spiders disabled rule
|
||||
// so that we do not spider a url twice in the same round
|
||||
if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) {
|
||||
cr->m_regExs[ufn].length() &&
|
||||
// we set this to 0 instead of using the checkbox
|
||||
cr->m_maxSpidersPerRule[ufn] <= 0 ) {
|
||||
strstr(cr->m_regExs[ufn].getBufStart(),"round") ) {
|
||||
//cr->m_maxSpidersPerRule[ufn] <= 0 ) {
|
||||
priority = -5;
|
||||
}
|
||||
|
||||
@ -935,10 +937,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
//, iptoa(sreq->m_firstIp)
|
||||
);
|
||||
// print priority
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED )
|
||||
//if ( priority == SPIDER_PRIORITY_FILTERED )
|
||||
// we just turn off the spiders now
|
||||
if ( ufn >= 0 && cr->m_maxSpidersPerRule[ufn] <= 0 )
|
||||
sb->safePrintf("url ignored");
|
||||
else if ( priority == SPIDER_PRIORITY_BANNED )
|
||||
sb->safePrintf("url banned");
|
||||
//else if ( priority == SPIDER_PRIORITY_BANNED )
|
||||
// sb->safePrintf("url banned");
|
||||
else if ( priority == -4 )
|
||||
sb->safePrintf("error");
|
||||
else if ( priority == -5 )
|
||||
|
25
Parms.cpp
25
Parms.cpp
@ -1625,6 +1625,11 @@ bool printDropDown ( int32_t n , SafeBuf* sb, char *name, int32_t select,
|
||||
// . by default, minus 2 includes minus 3, the new "FILTERED" priority
|
||||
// . it is link "BANNED" but does not mean the url is low quality necessarily
|
||||
if ( includeMinusTwo ) i = -3;
|
||||
|
||||
// no more DELETE, etc.
|
||||
i = 0;
|
||||
if ( select < 0 ) select = 0;
|
||||
|
||||
for ( ; i < n ; i++ ) {
|
||||
if ( i == select ) s = " selected";
|
||||
else s = "";
|
||||
@ -12987,9 +12992,10 @@ void Parms::init ( ) {
|
||||
"together in the same expression text box. "
|
||||
"A <i>spider priority</i> of "
|
||||
//"<i>FILTERED</i> or <i>BANNED</i> "
|
||||
"<i>DELETE</i> "
|
||||
"will cause the URL to not be spidered, or if it has already "
|
||||
"been indexed, it will be deleted when it is respidered."
|
||||
// "<i>DELETE</i> "
|
||||
// "will cause the URL to not be spidered, "
|
||||
// "or if it has already "
|
||||
// "been indexed, it will be deleted when it is respidered."
|
||||
"<br><br>";
|
||||
|
||||
/*
|
||||
@ -13173,6 +13179,19 @@ void Parms::init ( ) {
|
||||
m->m_addin = 1; // "insert" follows?
|
||||
m++;
|
||||
|
||||
m->m_title = "delete";
|
||||
m->m_cgi = "fdu";
|
||||
m->m_xml = "forceDeleteUrls";
|
||||
m->m_max = MAX_FILTERS;
|
||||
m->m_off = (char *)cr.m_forceDelete - x;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_page = PAGE_FILTERS;
|
||||
m->m_rowid = 1;
|
||||
m->m_flags = PF_REBUILDURLFILTERS | PF_CLONE;
|
||||
m->m_obj = OBJ_COLL;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "diffbot api";
|
||||
m->m_cgi = "dapi";
|
||||
|
8
Parms.h
8
Parms.h
@ -29,10 +29,10 @@ void handleRequest3f ( UdpSlot *slot , int32_t niceness ) ;
|
||||
|
||||
// special priorities for the priority drop down
|
||||
// in the url filters table
|
||||
enum {
|
||||
SPIDER_PRIORITY_FILTERED = -3 ,
|
||||
SPIDER_PRIORITY_BANNED = -2 ,
|
||||
SPIDER_PRIORITY_UNDEFINED = -1 };
|
||||
//enum {
|
||||
// SPIDER_PRIORITY_FILTERED = -3 ,
|
||||
// SPIDER_PRIORITY_BANNED = -2 ,
|
||||
// SPIDER_PRIORITY_UNDEFINED = -1 };
|
||||
|
||||
enum {
|
||||
OBJ_CONF = 1 ,
|
||||
|
24
Spider.cpp
24
Spider.cpp
@ -2313,14 +2313,16 @@ bool SpiderColl::addSpiderRequest ( SpiderRequest *sreq ,
|
||||
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
|
||||
|
||||
// do not add to doledb if bad
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED ) {
|
||||
//if ( priority == SPIDER_PRIORITY_FILTERED ) {
|
||||
if ( m_cr->m_forceDelete[ufn] ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: request %s is filtered ufn=%"INT32"",
|
||||
sreq->m_url,ufn);
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( priority == SPIDER_PRIORITY_BANNED ) {
|
||||
//if ( priority == SPIDER_PRIORITY_BANNED ) {
|
||||
if ( m_cr->m_forceDelete[ufn] ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: request %s is banned ufn=%"INT32"",
|
||||
sreq->m_url,ufn);
|
||||
@ -4267,8 +4269,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
}
|
||||
// set the priority (might be the same as old)
|
||||
int32_t priority = m_cr->m_spiderPriorities[ufn];
|
||||
// now get rid of negative priorities since we added a
|
||||
// separate force delete checkbox in the url filters
|
||||
if ( priority < 0 ) priority = 0;
|
||||
// sanity checks
|
||||
if ( priority == -1 ) { char *xx=NULL;*xx=0; }
|
||||
//if ( priority == -1 ) { char *xx=NULL;*xx=0; }
|
||||
if ( priority >= MAX_SPIDER_PRIORITIES) {char *xx=NULL;*xx=0;}
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
@ -4285,10 +4290,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
|
||||
// skip if banned (unless need to delete from index)
|
||||
bool skip = false;
|
||||
if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true;
|
||||
if ( priority == SPIDER_PRIORITY_BANNED ) skip = true;
|
||||
// if ( priority == SPIDER_PRIORITY_FILTERED ) skip = true;
|
||||
// if ( priority == SPIDER_PRIORITY_BANNED ) skip = true;
|
||||
if ( m_cr->m_forceDelete[ufn] ) skip = true;
|
||||
// but if it is currently indexed we have to delete it
|
||||
if ( srep && srep->m_isIndexed ) skip = false;
|
||||
if ( skip && srep && srep->m_isIndexed ) skip = false;
|
||||
if ( skip ) continue;
|
||||
|
||||
// temp debug
|
||||
@ -4298,8 +4304,10 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
// because we need to delete the url from the index.
|
||||
// seems like we need priority to be in [0-127] so make it 127.
|
||||
// just make 127 a reserved priority;
|
||||
if ( priority < 0 )
|
||||
priority = 127;
|
||||
if ( skip ) {
|
||||
// force it to a delete
|
||||
sreq->m_forceDelete = true;
|
||||
}
|
||||
|
||||
int64_t spiderTimeMS;
|
||||
spiderTimeMS = getSpiderTimeMS ( sreq,ufn,srep,nowGlobalMS );
|
||||
|
44
XmlDoc.cpp
44
XmlDoc.cpp
@ -3551,16 +3551,16 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
|
||||
// and return to be called again i hope
|
||||
return (int32_t *)priority;
|
||||
}
|
||||
if ( *priority == SPIDER_PRIORITY_FILTERED ) {
|
||||
if ( *priority == -3 ) { // SPIDER_PRIORITY_FILTERED ) {
|
||||
m_indexCode = EDOCFILTERED;
|
||||
m_indexCodeValid = true;
|
||||
return &m_indexCode;
|
||||
}
|
||||
if ( *priority == SPIDER_PRIORITY_BANNED ) {
|
||||
m_indexCode = EDOCBANNED;
|
||||
m_indexCodeValid = true;
|
||||
return &m_indexCode;
|
||||
}
|
||||
// if ( *priority == SPIDER_PRIORITY_BANNED ) {
|
||||
// m_indexCode = EDOCBANNED;
|
||||
// m_indexCodeValid = true;
|
||||
// return &m_indexCode;
|
||||
// }
|
||||
|
||||
// . if using diffbot and the diffbot reply had a time out error
|
||||
// or otherwise... diffbot failure demands a re-try always i guess.
|
||||
@ -19907,8 +19907,9 @@ char *XmlDoc::getIsFiltered ( ) {
|
||||
int32_t *priority = getSpiderPriority();
|
||||
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
|
||||
m_isFiltered = false;
|
||||
if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
|
||||
if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
|
||||
// if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true;
|
||||
// if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true;
|
||||
if ( *priority == -3 ) m_isFiltered = true;
|
||||
m_isFilteredValid = true;
|
||||
return &m_isFiltered;
|
||||
}
|
||||
@ -19921,7 +19922,7 @@ int32_t *XmlDoc::getSpiderPriority ( ) {
|
||||
if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
|
||||
// this is an automatic ban!
|
||||
if ( gr->getLong("manualban",0) ) {
|
||||
m_priority = SPIDER_PRIORITY_BANNED;
|
||||
m_priority = -3;//SPIDER_PRIORITY_BANNED;
|
||||
m_priorityValid = true;
|
||||
return &m_priority;
|
||||
}
|
||||
@ -19931,7 +19932,12 @@ int32_t *XmlDoc::getSpiderPriority ( ) {
|
||||
if ( *ufn < 0 ) { char *xx=NULL;*xx=0; }
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
m_priority = cr->m_spiderPriorities[*ufn];
|
||||
|
||||
// continue to use -3 to indicate SPIDER_PRIORITY_FILTERED for now
|
||||
if ( cr->m_forceDelete[*ufn] ) m_priority = -3;
|
||||
|
||||
m_priorityValid = true;
|
||||
return &m_priority;
|
||||
}
|
||||
@ -30244,13 +30250,15 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
// save it
|
||||
reply->m_urlFilterNum = ufn;
|
||||
// get spider priority if ufn is valid
|
||||
int32_t pr = 0; if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn];
|
||||
int32_t pr = 0;
|
||||
//if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn];
|
||||
if ( cr->m_forceDelete[ufn] ) pr = -3;
|
||||
|
||||
// this is an automatic ban!
|
||||
if ( gr->getLong("manualban",0) ) pr = SPIDER_PRIORITY_BANNED;
|
||||
if ( gr->getLong("manualban",0))pr=-3;//SPIDER_PRIORITY_BANNED;
|
||||
|
||||
// is it banned
|
||||
if ( pr == SPIDER_PRIORITY_BANNED ) { // -2
|
||||
if ( pr == -3 ) { // SPIDER_PRIORITY_BANNED ) { // -2
|
||||
// set m_errno
|
||||
reply->m_errno = EDOCBANNED;
|
||||
// and this
|
||||
@ -30266,12 +30274,12 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
|
||||
pr = 0;
|
||||
|
||||
|
||||
if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3
|
||||
// set m_errno
|
||||
reply->m_errno = EDOCFILTERED;
|
||||
// and this
|
||||
reply->m_isFiltered = true;
|
||||
}
|
||||
// if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3
|
||||
// // set m_errno
|
||||
// reply->m_errno = EDOCFILTERED;
|
||||
// // and this
|
||||
// reply->m_isFiltered = true;
|
||||
// }
|
||||
// done if we are
|
||||
if ( reply->m_errno && ! m_req->m_showBanned ) {
|
||||
// give back the url at least
|
||||
|
Loading…
Reference in New Issue
Block a user