mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
fix floater bug from reading hashtable off disk.
force use of floaters if ! useRobots and it is a diffbot crawl.
This commit is contained in:
parent
a7bb1c59a3
commit
c2f98a81b6
@ -432,7 +432,6 @@ bool HashTableX::load ( char *dir , char *filename , SafeBuf *fillBuf ) {
|
||||
|
||||
// both return false and set g_errno on error, true otherwise
|
||||
bool HashTableX::load ( char *dir, char *filename, char **tbuf, long *tsize ) {
|
||||
reset();
|
||||
File f;
|
||||
f.set ( dir , filename );
|
||||
if ( ! f.doesExist() ) return false;
|
||||
@ -447,10 +446,27 @@ bool HashTableX::load ( char *dir, char *filename, char **tbuf, long *tsize ) {
|
||||
off += 4;
|
||||
if ( ! f.read ( &numSlotsUsed , 4 , off ) ) return false;
|
||||
off += 4;
|
||||
if ( ! f.read ( &m_ks , 4 , off ) ) return false;
|
||||
long ks;
|
||||
if ( ! f.read ( &ks , 4 , off ) ) return false;
|
||||
off += 4;
|
||||
if ( ! f.read ( &m_ds , 4 , off ) ) return false;
|
||||
long ds;
|
||||
if ( ! f.read ( &ds , 4 , off ) ) return false;
|
||||
off += 4;
|
||||
|
||||
// bogus key size?
|
||||
if ( ks <= 0 ) {
|
||||
log("htable: reading hashtable from %s%s: "
|
||||
"bogus keysize of %li",
|
||||
dir,filename,ks );
|
||||
return false;
|
||||
}
|
||||
|
||||
// just in case m_ks was already set, call reset() down here
|
||||
reset();
|
||||
|
||||
m_ks = ks;
|
||||
m_ds = ds;
|
||||
|
||||
if ( ! setTableSize ( numSlots , NULL , 0 ) ) return false;
|
||||
if ( ! f.read ( m_keys , numSlots * m_ks , off ) ) return false;
|
||||
off += numSlots * m_ks;
|
||||
|
@ -732,9 +732,13 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
|
||||
// user can turn off proxy use with this switch
|
||||
if ( ! g_conf.m_useProxyIps ) useProxies = false;
|
||||
|
||||
// for diffbot turn ON if use robots is off
|
||||
if ( r->m_forceUseFloaters ) useProxies = true;
|
||||
|
||||
// we gotta have some proxy ips that we can use
|
||||
if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
|
||||
|
||||
|
||||
// we did not need a spider proxy ip so send this request to a host
|
||||
// to download the url
|
||||
if ( ! useProxies ) {
|
||||
|
1
Msg13.h
1
Msg13.h
@ -97,6 +97,7 @@ public:
|
||||
long m_isSquidProxiedUrl:1;
|
||||
|
||||
long m_foundInCache:1;
|
||||
long m_forceUseFloaters:1;
|
||||
|
||||
//long m_testParserEnabled:1;
|
||||
//long m_testSpiderEnabled:1;
|
||||
|
@ -15261,6 +15261,11 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
if ( od )
|
||||
r->m_contentHash32 = od->m_contentHash32;
|
||||
|
||||
// force floater usage on even if the "use spider proxies" parm is off
|
||||
// if we're a diffbot crawl and use robots is off.
|
||||
if ( cr && ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
|
||||
r->m_forceUseFloaters = true;
|
||||
|
||||
// eventgurubot is the max
|
||||
//char *userAgent = g_conf.m_spiderUserAgent;
|
||||
// hardcode it
|
||||
|
Loading…
Reference in New Issue
Block a user