mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
checkpoint for auto proxy logic
This commit is contained in:
parent
f3d9b016ce
commit
6d8bb19962
@ -1936,6 +1936,14 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
|
||||
m_coll,
|
||||
(int32_t)m_collnum,
|
||||
(int32_t)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
|
||||
|
||||
// the list of ip addresses that we have detected as being throttled
|
||||
// and therefore backoff and use proxies for
|
||||
sb.reset();
|
||||
sb.safePrintf("%scoll.%s.%"INT32"/",
|
||||
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
|
||||
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
|
||||
|
||||
|
||||
|
||||
////////////
|
||||
@ -3207,6 +3215,13 @@ bool CollectionRec::save ( ) {
|
||||
g_errno = 0;
|
||||
}
|
||||
|
||||
// the list of ip addresses that we have detected as being throttled
|
||||
// and therefore backoff and use proxies for
|
||||
sb.reset();
|
||||
sb.safePrintf("%scoll.%s.%"INT32"/",
|
||||
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
|
||||
m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
|
||||
|
||||
// do not need a save now
|
||||
m_needsSave = false;
|
||||
|
||||
|
@ -426,6 +426,10 @@ class CollectionRec {
|
||||
|
||||
int64_t m_spiderCorruptCount;
|
||||
|
||||
// holds ips that have been detected as being throttled and we need
|
||||
// to backoff and use proxies on
|
||||
HashTableX m_twitchyTable;
|
||||
|
||||
//
|
||||
// CLOUD SEARCH ENGINE SUPPORT
|
||||
//
|
||||
@ -511,7 +515,7 @@ class CollectionRec {
|
||||
char m_doIpLookups ; // considered iff using proxy
|
||||
char m_useRobotsTxt ;
|
||||
char m_forceUseFloaters ;
|
||||
char m_autoUseProxyIps ;
|
||||
char m_automaticallyUseProxies ;
|
||||
//char m_restrictDomain ; // say on same domain as seeds?
|
||||
char m_doTuringTest ; // for addurl
|
||||
char m_applyFilterToText ; // speeds us up
|
||||
|
46
Msg13.cpp
46
Msg13.cpp
@ -729,6 +729,23 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
downloadTheDocForReals2 ( r );
|
||||
}
|
||||
|
||||
// Returns true iff "ip" has been recorded in the collection's twitchy
// table — the set of IP addresses we have detected as throttling or
// banning us, for which we back off and/or route through proxies.
// A NULL collection record or a never-allocated table means "not listed".
bool isIpInTwitchyTable ( CollectionRec *cr , int32_t ip ) {
	// without a collection record there is no table to consult
	if ( ! cr ) return false;
	HashTableX *table = &cr->m_twitchyTable;
	// table was never allocated, so it cannot contain any ip
	if ( table->m_numSlots == 0 ) return false;
	// the ip is listed iff its key occupies a slot
	int32_t slot = table->getSlot ( &ip );
	return slot >= 0;
}
|
||||
|
||||
// Records "ip" in the collection's twitchy table so that subsequent
// downloads from that address back off and/or use proxies. The table
// is lazily initialized on first use. Returns the result of the key
// insertion; a NULL collection record is treated as a no-op success.
bool addIpToTwitchyTable ( CollectionRec *cr , int32_t ip ) {
	// nothing to record into — report success so callers proceed
	if ( ! cr ) return true;
	HashTableX *table = &cr->m_twitchyTable;
	// lazy init: 4-byte keys, no payload, 16 initial slots
	if ( table->m_numSlots == 0 )
		table->set ( 4,0,16,NULL,0,false,MAX_NICENESS,"twitchtbl",true);
	return table->addKey ( &ip );
}
|
||||
|
||||
|
||||
|
||||
// insertion point when we try to get another proxy to use because the one
|
||||
// we tried seemed to be ip-banned
|
||||
void downloadTheDocForReals2 ( Msg13Request *r ) {
|
||||
@ -741,10 +758,15 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
|
||||
// for diffbot turn ON if use robots is off
|
||||
if ( r->m_forceUseFloaters ) useProxies = true;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
||||
|
||||
// if you turned on automatically use proxies in spider controls...
|
||||
if ( ! useProxies &&
|
||||
cr &&
|
||||
r->m_urlIp != 0 &&
|
||||
r->m_urlIp != -1 &&
|
||||
cr->m_automaticallyUseProxies &&
|
||||
isIpInTwitchyList( cr, r->m_ip ) )
|
||||
isIpInTwitchyTable( cr, r->m_urlIp ) )
|
||||
useProxies = true;
|
||||
|
||||
// we gotta have some proxy ips that we can use
|
||||
@ -1401,15 +1423,20 @@ void gotHttpReply2 ( void *state ,
|
||||
Msg13Request *r = (Msg13Request *) state;
|
||||
UdpSlot *slot = r->m_udpSlot;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
||||
|
||||
// error?
|
||||
if ( g_errno && g_conf.m_logDebugSpider )
|
||||
log("spider: http reply (msg13) had error = %s "
|
||||
"for %s at ip %s",
|
||||
mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
|
||||
|
||||
bool banned = false;
|
||||
char *banMsg = NULL;
|
||||
const char *banMsg = NULL;
|
||||
if ( ! g_errno &&
|
||||
// must have a collrec to hold the ips
|
||||
cr &&
|
||||
r->m_urlIp != 0 &&
|
||||
r->m_urlIp != -1 &&
|
||||
// if we should use them automatically
|
||||
// now even if we don't do auto proxies, at least back off if
|
||||
// an ip is in the list. do a crawl delay.
|
||||
@ -1421,14 +1448,14 @@ void gotHttpReply2 ( void *state ,
|
||||
// should we turn proxies on for this IP address only?
|
||||
log("msg13: url %s detected as banned (%s), "
|
||||
"automatically using proxies for ip %s"
|
||||
, r->m_url
|
||||
, r->ptr_url
|
||||
, banMsg
|
||||
, iptoa(ts->m_ip)
|
||||
, iptoa(r->m_urlIp)
|
||||
);
|
||||
// . store in our table of ips we should use proxies for
|
||||
// . also start off with a crawldelay of like 1 sec for this
|
||||
// which is not normal for using proxies.
|
||||
addIpToTwitchyList ( cr , ts->m_ip );
|
||||
addIpToTwitchyTable ( cr , r->m_urlIp );
|
||||
/// and retry. it should use the proxy
|
||||
downloadTheDocForReals2 ( r );
|
||||
// that's it. if it had an error it will send back a reply.
|
||||
@ -2905,9 +2932,14 @@ bool addToHammerQueue ( Msg13Request *r ) {
|
||||
|
||||
int32_t crawlDelayMS = r->m_crawlDelayMS;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
|
||||
|
||||
// if not using proxies, but the ip is banning us, then at least
|
||||
// backoff a bit
|
||||
if ( isIpInTwitchyList ( cr , r->m_ip ) )
|
||||
if ( cr &&
|
||||
r->m_urlIp != 0 &&
|
||||
r->m_urlIp != -1 &&
|
||||
isIpInTwitchyTable ( cr , r->m_urlIp ) )
|
||||
// 1 second = 1000 milliseconds
|
||||
if ( crawlDelayMS < 1000 ) crawlDelayMS = 1000;
|
||||
|
||||
|
3
Msg13.h
3
Msg13.h
@ -36,6 +36,8 @@ public:
|
||||
char m_opCode;
|
||||
char m_lastHack;
|
||||
|
||||
collnum_t m_collnum;
|
||||
|
||||
// not part of the proxy request, but set from ProxyReply:
|
||||
int32_t m_numBannedProxies;
|
||||
// . if using proxies, how many proxies have we tried to download
|
||||
@ -153,6 +155,7 @@ public:
|
||||
m_maxTextDocLen = -1; // no limit
|
||||
m_maxOtherDocLen = -1; // no limit
|
||||
m_crawlDelayMS = -1; // unknown or none
|
||||
m_collnum = (collnum_t)-1;
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -16462,8 +16462,8 @@ void Parms::init ( ) {
|
||||
"learn the webserver's spidering policy so that our spiders "
|
||||
"can be more polite. If not proxies are listed on the "
|
||||
"proxies page then this parameter will have no affect.";
|
||||
m->m_cgi = "autouseproxyips";
|
||||
m->m_off = (char *)&cr.m_autoUseProxyIps - g;
|
||||
m->m_cgi = "automaticallyuseproxies";
|
||||
m->m_off = (char *)&cr.m_automaticallyUseProxies - g;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_page = PAGE_SPIDER;
|
||||
|
@ -15923,6 +15923,9 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
// turn this off too
|
||||
r->m_attemptedIframeExpansion = false;
|
||||
|
||||
r->m_collnum = (collnum_t)-1;
|
||||
if ( m_collnumValid )r->m_collnum = m_collnum;
|
||||
|
||||
// turn off
|
||||
r->m_useCompressionProxy = false;
|
||||
r->m_compressReply = false;
|
||||
|
Loading…
Reference in New Issue
Block a user