fix floater bug from reading hashtable off disk.

Force use of floaters if ! useRobots and this is a diffbot crawl.
This commit is contained in:
mwells 2014-09-26 15:30:42 -07:00
parent a7bb1c59a3
commit c2f98a81b6
4 changed files with 29 additions and 3 deletions

View File

@ -432,7 +432,6 @@ bool HashTableX::load ( char *dir , char *filename , SafeBuf *fillBuf ) {
// both return false and set g_errno on error, true otherwise
bool HashTableX::load ( char *dir, char *filename, char **tbuf, long *tsize ) {
reset();
File f;
f.set ( dir , filename );
if ( ! f.doesExist() ) return false;
@ -447,10 +446,27 @@ bool HashTableX::load ( char *dir, char *filename, char **tbuf, long *tsize ) {
off += 4;
if ( ! f.read ( &numSlotsUsed , 4 , off ) ) return false;
off += 4;
if ( ! f.read ( &m_ks , 4 , off ) ) return false;
long ks;
if ( ! f.read ( &ks , 4 , off ) ) return false;
off += 4;
if ( ! f.read ( &m_ds , 4 , off ) ) return false;
long ds;
if ( ! f.read ( &ds , 4 , off ) ) return false;
off += 4;
// bogus key size?
if ( ks <= 0 ) {
log("htable: reading hashtable from %s%s: "
"bogus keysize of %li",
dir,filename,ks );
return false;
}
// just in case m_ks was already set, call reset() down here
reset();
m_ks = ks;
m_ds = ds;
if ( ! setTableSize ( numSlots , NULL , 0 ) ) return false;
if ( ! f.read ( m_keys , numSlots * m_ks , off ) ) return false;
off += numSlots * m_ks;

View File

@ -732,9 +732,13 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
// user can turn off proxy use with this switch
if ( ! g_conf.m_useProxyIps ) useProxies = false;
// for diffbot turn ON if use robots is off
if ( r->m_forceUseFloaters ) useProxies = true;
// we gotta have some proxy ips that we can use
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
// we did not need a spider proxy ip so send this request to a host
// to download the url
if ( ! useProxies ) {

View File

@ -97,6 +97,7 @@ public:
long m_isSquidProxiedUrl:1;
long m_foundInCache:1;
long m_forceUseFloaters:1;
//long m_testParserEnabled:1;
//long m_testSpiderEnabled:1;

View File

@ -15261,6 +15261,11 @@ char **XmlDoc::getHttpReply2 ( ) {
if ( od )
r->m_contentHash32 = od->m_contentHash32;
// force floater usage on even if the "use spider proxies" parm is off
// if we're a diffbot crawl and use robots is off.
if ( cr && ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
r->m_forceUseFloaters = true;
// eventgurubot is the max
//char *userAgent = g_conf.m_spiderUserAgent;
// hardcode it