save resources by not doing reads

on an empty doledb priority.
stop saving allSpidersOn and Off parms.
This commit is contained in:
Matt Wells 2013-12-08 14:07:31 -07:00
parent a2e52a5dc3
commit 144e2c898e
5 changed files with 69 additions and 15 deletions

View File

@ -3577,6 +3577,8 @@ bool Parms::saveToXml ( char *THIS , char *f ) {
if ( THIS != (char *)&g_conf && m->m_obj == OBJ_CONF) continue;
if ( m->m_type == TYPE_MONOD2 ) continue;
if ( m->m_type == TYPE_MONOM2 ) continue;
if ( m->m_type == TYPE_CMD ) continue;
if ( m->m_type == TYPE_BOOL2 ) continue;
// skip if we should not save to xml
if ( ! m->m_save ) continue;
// allow comments though

View File

@ -1513,10 +1513,15 @@ void SpiderColl::clear ( ) {
//m_lastDownloadCache.clear ( m_collnum );
// copied from reset() below
for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ )
for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) {
m_nextKeys[i] = g_doledb.makeFirstKey2 ( i );
m_isDoledbEmpty[i] = 0;
}
// assume the whole thing is not empty
m_allDoledbPrioritiesEmpty = false;
}
void SpiderColl::reset ( ) {
// reset these for SpiderLoop;
@ -1547,8 +1552,14 @@ void SpiderColl::reset ( ) {
// key annihilations related to starting at the top of the priority
// queue every time we scan it, which causes us to do upwards of
// 300 re-reads!
for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ )
for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) {
m_nextKeys[i] = g_doledb.makeFirstKey2 ( i );
m_isDoledbEmpty[i] = 0;
}
// assume the whole thing is not empty
m_allDoledbPrioritiesEmpty = false;
}
bool SpiderColl::updateSiteNumInlinksTable ( long siteHash32,
@ -3774,6 +3785,17 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
// a single ip address. maybe use msg1 here not msg4?
if ( ! addToDoleTable ( m_bestRequest ) ) return true;
// . if it was empty it is no longer
// . we have this flag here to avoid scanning empty doledb priorities
// because it saves us a msg5 call to doledb in the scanning loop
long bp = m_bestRequest->m_priority;
if ( bp < 0 ) { char *xx=NULL;*xx=0; }
if ( bp >= MAX_SPIDER_PRIORITIES ) { char *xx=NULL;*xx=0; }
m_isDoledbEmpty [ bp ] = 0;
// and the whole thing is no longer empty
m_allDoledbPrioritiesEmpty = false;
//
// delete the winner from ufntree as well
//
@ -3947,15 +3969,15 @@ bool SpiderColl::addToDoleTable ( SpiderRequest *sreq ) {
// update how many per ip we got doled
long *score = (long *)m_doleIpTable.getValue32 ( sreq->m_firstIp );
// debug point
if ( g_conf.m_logDebugSpider ) {
if ( g_conf.m_logDebugSpider && 1 == 2 ) { // disable for now, spammy
long long uh48 = sreq->getUrlHash48();
long long pdocid = sreq->getParentDocId();
long ss = 1;
if ( score ) ss = *score + 1;
//log("spider: added to doletbl uh48=%llu parentdocid=%llu "
// "ipdolecount=%li ufn=%li priority=%li firstip=%s",
// uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority,
// iptoa(sreq->m_firstIp));
log("spider: added to doletbl uh48=%llu parentdocid=%llu "
"ipdolecount=%li ufn=%li priority=%li firstip=%s",
uh48,pdocid,ss,(long)sreq->m_ufn,(long)sreq->m_priority,
iptoa(sreq->m_firstIp));
}
// we had a score there already, so inc it
if ( score ) {
@ -4138,7 +4160,7 @@ void SpiderLoop::startLoop ( ) {
// spider some urls that were doled to us
//g_spiderLoop.spiderDoledUrls( );
// sleep for .1 seconds = 100ms
if (!g_loop.registerSleepCallback(10,this,doneSleepingWrapperSL))
if (!g_loop.registerSleepCallback(50,this,doneSleepingWrapperSL))
log("build: Failed to register timer callback. Spidering "
"is permanently disabled. Restart to fix.");
@ -4583,6 +4605,13 @@ void SpiderLoop::spiderDoledUrls ( ) {
// try next collection
continue;
}
// . if all doledb priorities are empty, skip it quickly
// . do this only after we update lastSpiderAttempt above
if ( cr->m_spiderColl &&
cr->m_spiderColl->m_allDoledbPrioritiesEmpty )
continue;
// ok, we are good to launch a spider for coll m_cri
break;
}
@ -4681,6 +4710,15 @@ void SpiderLoop::spiderDoledUrls ( ) {
goto collLoop;
}
// . skip priority if we knows its empty in doledb
// . this will save us a call to msg5 below
if ( m_sc->m_isDoledbEmpty [ m_sc->m_pri2 ] ) {
// decrease the priority
m_sc->devancePriority();
// and try the one below
goto loop;
}
// shortcut
//CollectionRec *cr = m_sc->m_cr;
// sanity
@ -4879,6 +4917,18 @@ bool SpiderLoop::gotDoledbList2 ( ) {
// bail if list is empty
if ( m_list.getListSize() <= 0 ) {
// don't bother with this priority again until a key is
// added to it!
m_sc->m_isDoledbEmpty [ m_sc->m_pri2 ] = 1;
// if all priorities now empty set another flag
m_sc->m_allDoledbPrioritiesEmpty = true;
for ( long i = 0 ; i < MAX_SPIDER_PRIORITIES ; i++ ) {
if ( m_sc->m_isDoledbEmpty[m_sc->m_pri2] ) continue;
m_sc->m_allDoledbPrioritiesEmpty = false;
break;
}
// if no spiders...
//if ( g_conf.m_logDebugSpider ) {
// log("spider: empty doledblist collnum=%li "

View File

@ -1045,6 +1045,12 @@ class SpiderColl {
// doledb cursor keys for each priority to speed up performance
key_t m_nextKeys[MAX_SPIDER_PRIORITIES];
// save us scanning empty priorities
char m_isDoledbEmpty [MAX_SPIDER_PRIORITIES];
// are all priority slots empt?
bool m_allDoledbPrioritiesEmpty;
// maps priority to first ufn that uses that
// priority. map to -1 if no ufn uses it. that way when we scan
// priorities for spiderrequests to dole out we can start with

View File

@ -119,6 +119,8 @@ class UdpServer {
// host. for niceness=0 requests the backoff is usually constant
// and set to about 30 ms. so if you set maxResends to 10 that is
// probably at least 300 ms of resending tries.
// . use an ip of 0 and port of 0 if you provide a hostId. use a hostid
// of -1 to indicate no hostid.
bool sendRequest ( char *msg ,
long msgSize ,
unsigned char msgType ,

View File

@ -57,7 +57,7 @@
<doNarrowSearch>0</>
# Overrides all spidering for all collections on just this host.
<localSpideringEnabled>0</>
<localSpideringEnabled>1</>
# Overrides all add urls for all collections on just this host.
<localAddUrlEnabled>1</>
@ -72,12 +72,6 @@
# compares to the last run and outputs the diffs for inspection and validation.
<qaSearchTestEnabled>1</>
# Enable spidering on all hosts
<allSpidersOn>0</>
# Disable spidering on all hosts
<allSpidersOff>0</>
# Serves ads unless pure=1 is in cgi parms.
<adFeedEnabled>0</>