diff --git a/Parms.cpp b/Parms.cpp index c7c54489..18207734 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -3577,6 +3577,8 @@ bool Parms::saveToXml ( char *THIS , char *f ) { if ( THIS != (char *)&g_conf && m->m_obj == OBJ_CONF) continue; if ( m->m_type == TYPE_MONOD2 ) continue; if ( m->m_type == TYPE_MONOM2 ) continue; + if ( m->m_type == TYPE_CMD ) continue; + if ( m->m_type == TYPE_BOOL2 ) continue; // skip if we should not save to xml if ( ! m->m_save ) continue; // allow comments though diff --git a/Spider.cpp b/Spider.cpp index 98cc5529..38fe6538 100644 --- a/Spider.cpp +++ b/Spider.cpp @@ -1513,10 +1513,15 @@ void SpiderColl::clear ( ) { //m_lastDownloadCache.clear ( m_collnum ); // copied from reset() below - for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) + for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) { m_nextKeys[i] = g_doledb.makeFirstKey2 ( i ); + m_isDoledbEmpty[i] = 0; + } + // assume the whole thing is not empty + m_allDoledbPrioritiesEmpty = false; } + void SpiderColl::reset ( ) { // reset these for SpiderLoop; @@ -1547,8 +1552,14 @@ void SpiderColl::reset ( ) { // key annihilations related to starting at the top of the priority // queue every time we scan it, which causes us to do upwards of // 300 re-reads! - for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) + for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) { m_nextKeys[i] = g_doledb.makeFirstKey2 ( i ); + m_isDoledbEmpty[i] = 0; + } + + // assume the whole thing is not empty + m_allDoledbPrioritiesEmpty = false; + } bool SpiderColl::updateSiteNumInlinksTable ( long siteHash32, @@ -3774,6 +3785,17 @@ bool SpiderColl::scanSpiderdb ( bool needList ) { // a single ip address. maybe use msg1 here not msg4? if ( ! addToDoleTable ( m_bestRequest ) ) return true; + // . if it was empty it is no longer + // . 
we have this flag here to avoid scanning empty doledb priorities + // because it saves us a msg5 call to doledb in the scanning loop + long bp = m_bestRequest->m_priority; + if ( bp < 0 ) { char *xx=NULL;*xx=0; } + if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; } + m_isDoledbEmpty [ bp ] = 0; + + // and the whole thing is no longer empty + m_allDoledbPrioritiesEmpty = false; + // // delete the winner from ufntree as well // @@ -3947,15 +3969,15 @@ bool SpiderColl::addToDoleTable ( SpiderRequest *sreq ) { // update how many per ip we got doled long *score = (long *)m_doleIpTable.getValue32 ( sreq->m_firstIp ); // debug point - if ( g_conf.m_logDebugSpider ) { + if ( g_conf.m_logDebugSpider && 1 == 2 ) { // disable for now, spammy long long uh48 = sreq->getUrlHash48(); long long pdocid = sreq->getParentDocId(); long ss = 1; if ( score ) ss = *score + 1; - //log("spider: added to doletbl uh48=%llu parentdocid=%llu " - // "ipdolecount=%li ufn=%li priority=%li firstip=%s", - // uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority, - // iptoa(sreq->m_firstIp)); + log("spider: added to doletbl uh48=%llu parentdocid=%llu " + "ipdolecount=%li ufn=%li priority=%li firstip=%s", + uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority, + iptoa(sreq->m_firstIp)); } // we had a score there already, so inc it if ( score ) { @@ -4138,7 +4160,7 @@ void SpiderLoop::startLoop ( ) { // spider some urls that were doled to us //g_spiderLoop.spiderDoledUrls( ); // sleep for .1 seconds = 100ms - if (!g_loop.registerSleepCallback(10,this,doneSleepingWrapperSL)) + if (!g_loop.registerSleepCallback(50,this,doneSleepingWrapperSL)) log("build: Failed to register timer callback. Spidering " "is permanently disabled. Restart to fix."); @@ -4583,6 +4605,13 @@ void SpiderLoop::spiderDoledUrls ( ) { // try next collection continue; } + + // . if all doledb priorities are empty, skip it quickly + // . 
do this only after we update lastSpiderAttempt above + if ( cr->m_spiderColl && + cr->m_spiderColl->m_allDoledbPrioritiesEmpty ) + continue; + // ok, we are good to launch a spider for coll m_cri break; } @@ -4681,6 +4710,15 @@ void SpiderLoop::spiderDoledUrls ( ) { goto collLoop; } + // . skip priority if we know it's empty in doledb + // . this will save us a call to msg5 below + if ( m_sc->m_isDoledbEmpty [ m_sc->m_pri2 ] ) { + // decrease the priority + m_sc->devancePriority(); + // and try the one below + goto loop; + } + // shortcut //CollectionRec *cr = m_sc->m_cr; // sanity @@ -4879,6 +4917,18 @@ bool SpiderLoop::gotDoledbList2 ( ) { // bail if list is empty if ( m_list.getListSize() <= 0 ) { + // don't bother with this priority again until a key is + // added to it! + m_sc->m_isDoledbEmpty [ m_sc->m_pri2 ] = 1; + + // if all priorities now empty set another flag + m_sc->m_allDoledbPrioritiesEmpty = true; + for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) { + if ( m_sc->m_isDoledbEmpty[i] ) continue; + m_sc->m_allDoledbPrioritiesEmpty = false; + break; + } + // if no spiders... //if ( g_conf.m_logDebugSpider ) { // log("spider: empty doledblist collnum=%li " diff --git a/Spider.h b/Spider.h index a07cc3c8..d4299460 100644 --- a/Spider.h +++ b/Spider.h @@ -1045,6 +1045,12 @@ class SpiderColl { // doledb cursor keys for each priority to speed up performance key_t m_nextKeys[MAX_SPIDER_PRIORITIES]; + // save us scanning empty priorities + char m_isDoledbEmpty [MAX_SPIDER_PRIORITIES]; + + // are all priority slots empty? + bool m_allDoledbPrioritiesEmpty; + // maps priority to first ufn that uses that // priority. map to -1 if no ufn uses it. that way when we scan // priorities for spiderrequests to dole out we can start with diff --git a/UdpServer.h b/UdpServer.h index a51db545..7fda0e33 100644 --- a/UdpServer.h +++ b/UdpServer.h @@ -119,6 +119,8 @@ class UdpServer { // host. 
for niceness=0 requests the backoff is usually constant // and set to about 30 ms. so if you set maxResends to 10 that is // probably at least 300 ms of resending tries. + // . use an ip of 0 and port of 0 if you provide a hostId. use a hostid + // of -1 to indicate no hostid. bool sendRequest ( char *msg , long msgSize , unsigned char msgType , diff --git a/gb.conf b/gb.conf index 2a5cbc37..0bb6da46 100644 --- a/gb.conf +++ b/gb.conf @@ -57,7 +57,7 @@ 0 # Overrides all spidering for all collections on just this host. -0 +1 # Overrides all add urls for all collections on just this host. 1 @@ -72,12 +72,6 @@ # compares to the last run and outputs the diffs for inspection and validation. 1 -# Enable spidering on all hosts -0 - -# Disable spidering on all hosts -0 - # Serves ads unless pure=1 is in cgi parms. 0