checkpoint for auto proxy logic

Matt 2015-04-30 13:28:57 -07:00
parent f3d9b016ce
commit 6d8bb19962
6 changed files with 67 additions and 10 deletions

View File

@@ -1936,6 +1936,14 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
m_coll,
(int32_t)m_collnum,
(int32_t)m_globalCrawlInfo.m_hasUrlsReadyToSpider);
// the list of ip addresses that we have detected as being throttled
// and therefore back off and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.load ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
////////////
@@ -3207,6 +3215,13 @@ bool CollectionRec::save ( ) {
g_errno = 0;
}
// the list of ip addresses that we have detected as being throttled
// and therefore back off and use proxies for
sb.reset();
sb.safePrintf("%scoll.%s.%"INT32"/",
g_hostdb.m_dir , m_coll , (int32_t)m_collnum );
m_twitchyTable.save ( sb.getBufStart() , "ipstouseproxiesfor.dat" );
// do not need a save now
m_needsSave = false;
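For reference, the load/save pair added above is symmetric: build the per-collection directory path with safePrintf(), then hand it to HashTableX::load() or save() with the fixed filename "ipstouseproxiesfor.dat". A minimal round-trip sketch using std::unordered_set and text I/O in place of the Gigablast-internal HashTableX and SafeBuf (illustrative only; the real on-disk format is whatever HashTableX writes):

    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <unordered_set>

    // load throttled ips from <collDir>/ipstouseproxiesfor.dat;
    // a missing file just means nothing has been detected yet
    static bool loadTwitchyIps ( const std::string &collDir ,
                                 std::unordered_set<int32_t> &ips ) {
            std::ifstream in ( collDir + "ipstouseproxiesfor.dat" );
            if ( ! in.is_open() ) return false;
            int32_t ip;
            while ( in >> ip ) ips.insert ( ip );
            return true;
    }

    // save the set back out, mirroring CollectionRec::save() above
    static bool saveTwitchyIps ( const std::string &collDir ,
                                 const std::unordered_set<int32_t> &ips ) {
            std::ofstream out ( collDir + "ipstouseproxiesfor.dat" );
            if ( ! out.is_open() ) return false;
            for ( int32_t ip : ips ) out << ip << '\n';
            return (bool)out;
    }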

View File

@@ -426,6 +426,10 @@ class CollectionRec {
int64_t m_spiderCorruptCount;
// holds ips that have been detected as being throttled and we need
// to back off and use proxies on
HashTableX m_twitchyTable;
//
// CLOUD SEARCH ENGINE SUPPORT
//
@@ -511,7 +515,7 @@ class CollectionRec {
char m_doIpLookups ; // considered iff using proxy
char m_useRobotsTxt ;
char m_forceUseFloaters ;
char m_autoUseProxyIps ;
char m_automaticallyUseProxies ;
//char m_restrictDomain ; // say on same domain as seeds?
char m_doTuringTest ; // for addurl
char m_applyFilterToText ; // speeds us up

View File

@@ -729,6 +729,23 @@ void downloadTheDocForReals ( Msg13Request *r ) {
downloadTheDocForReals2 ( r );
}
bool isIpInTwitchyTable ( CollectionRec *cr , int32_t ip ) {
if ( ! cr ) return false;
HashTableX *ht = &cr->m_twitchyTable;
if ( ht->m_numSlots == 0 ) return false;
return ( ht->getSlot ( &ip ) >= 0 );
}
bool addIpToTwitchyTable ( CollectionRec *cr , int32_t ip ) {
if ( ! cr ) return true;
HashTableX *ht = &cr->m_twitchyTable;
if ( ht->m_numSlots == 0 )
ht->set ( 4,0,16,NULL,0,false,MAX_NICENESS,"twitchtbl",true);
return ht->addKey ( &ip );
}
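// a note on the set() call above, reading the args as a sketch of the
// signature: 4-byte keys (the ip), zero data bytes (the table acts as a
// set) and 16 initial slots; the rest are buffer, niceness and allocation
// details internal to HashTableX. the same lazily built membership set in
// standard c++, for comparison:
//
//     #include <cstdint>
//     #include <unordered_set>
//
//     static bool isTwitchy ( std::unordered_set<int32_t> &t , int32_t ip ) {
//             return t.count ( ip ) != 0;
//     }
//     static bool markTwitchy ( std::unordered_set<int32_t> &t , int32_t ip ) {
//             return t.insert ( ip ).second; // true if newly added
//     }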
// insertion point when we try to get another proxy to use because the one
// we tried seemed to be ip-banned
void downloadTheDocForReals2 ( Msg13Request *r ) {
@@ -741,10 +758,15 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
// for diffbot turn ON if use robots is off
if ( r->m_forceUseFloaters ) useProxies = true;
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
// if you turned on 'automatically use proxies' in the spider controls...
if ( ! useProxies &&
cr &&
r->m_urlIp != 0 &&
r->m_urlIp != -1 &&
cr->m_automaticallyUseProxies &&
isIpInTwitchyList( cr, r->m_ip ) )
isIpInTwitchyTable( cr, r->m_urlIp ) )
useProxies = true;
// we gotta have some proxy ips that we can use
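The chained condition above only flips useProxies on when everything lines up: a live collection record, the feature enabled, a resolved IP (0 and -1 both mean unknown) and a prior throttle detection. Folded into a standalone predicate, it reads roughly:

    // sketch of the auto-proxy decision, mirroring the condition above
    static bool shouldAutoUseProxies ( CollectionRec *cr , int32_t urlIp ) {
            // the feature must be enabled on the collection
            if ( ! cr || ! cr->m_automaticallyUseProxies ) return false;
            // 0 and -1 mean the ip never resolved
            if ( urlIp == 0 || urlIp == -1 ) return false;
            // only for ips that have already throttled us
            return isIpInTwitchyTable ( cr , urlIp );
    }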
@@ -1401,15 +1423,20 @@ void gotHttpReply2 ( void *state ,
Msg13Request *r = (Msg13Request *) state;
UdpSlot *slot = r->m_udpSlot;
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
// error?
if ( g_errno && g_conf.m_logDebugSpider )
log("spider: http reply (msg13) had error = %s "
"for %s at ip %s",
mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
bool banned = false;
char *banMsg = NULL;
const char *banMsg = NULL;
if ( ! g_errno &&
// must have a collrec to hold the ips
cr &&
r->m_urlIp != 0 &&
r->m_urlIp != -1 &&
// if we should use them automatically
// now even if we don't do auto proxies, at least back off if
// an ip is in the list. do a crawl delay.
@@ -1421,14 +1448,14 @@ void gotHttpReply2 ( void *state ,
// should we turn proxies on for this IP address only?
log("msg13: url %s detected as banned (%s), "
"automatically using proxies for ip %s"
, r->m_url
, r->ptr_url
, banMsg
, iptoa(ts->m_ip)
, iptoa(r->m_urlIp)
);
// . store in our table of ips we should use proxies for
// . also start off with a crawldelay of about 1 sec for this,
// which is not normal when using proxies.
addIpToTwitchyList ( cr , ts->m_ip );
addIpToTwitchyTable ( cr , r->m_urlIp );
// and retry. it should use the proxy
downloadTheDocForReals2 ( r );
// that's it. if it had an error it will send back a reply.
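Pieced together, the ban path in gotHttpReply2() is: detect the ban, log it, remember the IP, then re-enter downloadTheDocForReals2() so the retry goes out through a proxy. In outline (the banned/banMsg detection and the exact guard are partly elided from this diff, so this is a sketch):

    if ( banned ) {
            // remember the ip so future fetches start on proxies
            addIpToTwitchyTable ( cr , r->m_urlIp );
            // retry; useProxies will now be true for this ip
            downloadTheDocForReals2 ( r );
            // the retry path sends the reply, so stop here
            return;
    }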
@@ -2905,9 +2932,14 @@ bool addToHammerQueue ( Msg13Request *r ) {
int32_t crawlDelayMS = r->m_crawlDelayMS;
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
// if not using proxies, but the ip is banning us, then at least
// back off a bit
if ( isIpInTwitchyList ( cr , r->m_ip ) )
if ( cr &&
r->m_urlIp != 0 &&
r->m_urlIp != -1 &&
isIpInTwitchyTable ( cr , r->m_urlIp ) )
// 1 second = 1000 milliseconds
if ( crawlDelayMS < 1000 ) crawlDelayMS = 1000;
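So even with auto-proxies off, an IP in the twitchy table gets a one-second floor on its crawl delay. As a small helper, the clamp would be:

    // sketch: enforce a minimum crawl delay for twitchy ips
    static int32_t backoffCrawlDelayMS ( int32_t delayMS , bool ipIsTwitchy ) {
            // 1 second = 1000 milliseconds
            if ( ipIsTwitchy && delayMS < 1000 ) return 1000;
            return delayMS;
    }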

View File

@@ -36,6 +36,8 @@ public:
char m_opCode;
char m_lastHack;
collnum_t m_collnum;
// not part of the proxy request, but set from ProxyReply:
int32_t m_numBannedProxies;
// . if using proxies, how many proxies have we tried to download
@@ -153,6 +155,7 @@ public:
m_maxTextDocLen = -1; // no limit
m_maxOtherDocLen = -1; // no limit
m_crawlDelayMS = -1; // unknown or none
m_collnum = (collnum_t)-1;
};
};

View File

@@ -16462,8 +16462,8 @@ void Parms::init ( ) {
"learn the webserver's spidering policy so that our spiders "
can be more polite. If no proxies are listed on the
proxies page then this parameter will have no effect.";
m->m_cgi = "autouseproxyips";
m->m_off = (char *)&cr.m_autoUseProxyIps - g;
m->m_cgi = "automaticallyuseproxies";
m->m_off = (char *)&cr.m_automaticallyUseProxies - g;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_page = PAGE_SPIDER;
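The block above is the usual Parms registration: a CGI variable name plus a byte offset into CollectionRec ("g" is the base address of a template record), so the generic parser can write the field without knowing its name. The offset trick in isolation, as a self-contained sketch with an illustrative struct:

    #include <cstddef>
    #include <cstdio>

    // stand-in for CollectionRec with one bool-as-char parm
    struct Rec { char m_automaticallyUseProxies; };

    int main ( ) {
            Rec r;
            // what m->m_off encodes: the field's byte distance
            // from the record base
            size_t off = offsetof ( Rec , m_automaticallyUseProxies );
            // generic write through the offset, as the parser does
            // after reading cgi var "automaticallyuseproxies"
            *((char *)&r + off) = 1;
            printf ( "parm = %d\n" , r.m_automaticallyUseProxies );
            return 0;
    }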

View File

@@ -15923,6 +15923,9 @@ char **XmlDoc::getHttpReply2 ( ) {
// turn this off too
r->m_attemptedIframeExpansion = false;
r->m_collnum = (collnum_t)-1;
if ( m_collnumValid ) r->m_collnum = m_collnum;
// turn off
r->m_useCompressionProxy = false;
r->m_compressReply = false;
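Carrying m_collnum on the Msg13Request is what makes the g_collectiondb.getRec(r->m_collnum) calls earlier in this diff work; (collnum_t)-1 stays the "unset" sentinel from the request constructor. A guard on the consuming side might look like:

    // sketch: resolve the collection record from a request, treating
    // the constructor default of -1 as "no collection attached"
    static CollectionRec *getRequestColl ( Msg13Request *r ) {
            if ( r->m_collnum == (collnum_t)-1 ) return NULL;
            return g_collectiondb.getRec ( r->m_collnum );
    }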