more spider speed fixes. tried to fix corrupt rdbcache.
Matt Wells 2014-02-06 09:25:27 -08:00
parent 9145d89e3f
commit 4029b0b937
6 changed files with 39 additions and 13 deletions

View File

@@ -1105,10 +1105,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// and clear the robots.txt cache in case we recently spidered a
// robots.txt; we don't want to use it, we want to use the one we
// have in the test-parser subdir so we are consistent
RdbCache *robots = Msg13::getHttpCacheRobots();
RdbCache *others = Msg13::getHttpCacheOthers();
robots->clear ( oldCollnum );
others->clear ( oldCollnum );
//RdbCache *robots = Msg13::getHttpCacheRobots();
//RdbCache *others = Msg13::getHttpCacheOthers();
// clear() was removed due to possible corruption
//robots->clear ( oldCollnum );
//others->clear ( oldCollnum );
//g_templateTable.reset();
//g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
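With the per-collection clear() calls commented out, a reset collection can still see robots.txt replies cached before the reset; the main remaining guard would be the cache's own age limit. A minimal sketch of that age check, assuming the maxAge convention visible in the RdbCache::getLong call later in this commit (seconds, -1 meaning no limit); the struct and function names here are illustrative, not Gigablast's:

#include <ctime>

struct CachedRobots {
    time_t cachedTime;          // when the robots.txt reply was stored
};

// maxAge follows the convention in the getLong call later in this commit:
// seconds, with -1 meaning no age limit
bool isFresh(const CachedRobots &r, long maxAge) {
    if (maxAge < 0) return true;                       // never expires
    return (long)(time(nullptr) - r.cachedTime) <= maxAge;
}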

View File

@@ -18364,8 +18364,8 @@ void handleRequest3fLoop ( void *weArg ) {
if ( cx->m_spiderColl ) {
log("parms: forcing waiting tree rebuild");
cx->m_spiderColl->m_waitingTreeNeedsRebuild = true;
// and the dup cache because rebuilding the waiting tree
cx->m_spiderColl->m_dupCache.clear(0);
// reset the dup cache since we are rebuilding the waiting tree
cx->m_spiderColl->m_dupCache.reset();//clear(0);
}
// . assume we have urls ready to spider too
// . no, because if they change the filters and there are

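The hunk above swaps the per-collection clear(0) on the dup cache for a full reset(), in line with the clear() corruption concern elsewhere in this commit. A minimal sketch of the handler's shape, assuming a SpiderColl-like object; only the two members the hunk touches are modeled, the rest is illustrative:

#include <cstdint>
#include <unordered_set>

struct DupCacheLite {
    std::unordered_set<uint64_t> seen;  // url hashes added recently
    void reset() { seen.clear(); }      // drop everything, start empty
};

struct SpiderCollLite {
    bool         m_waitingTreeNeedsRebuild = false;
    DupCacheLite m_dupCache;
};

// after a spider-parm change: force the waiting tree to rebuild and wipe
// the whole dup cache, rather than using the per-collection clear(0) this
// commit treats as a corruption risk
void onSpiderParmsChanged(SpiderCollLite *sc) {
    if (!sc) return;
    sc->m_waitingTreeNeedsRebuild = true;
    sc->m_dupCache.reset();
}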
View File

@@ -2370,7 +2370,10 @@ bool Rdb::addRecord ( collnum_t collnum,
}
}
// cancel any spider request that is a dup in the dupcache to save disk space
// . cancel any spider request that is a dup in the dupcache to save disk space
// . MDW: can't do it this way; the dup check is local, so the request can
//   still end up in the twin's spiderdb and the twins go out of sync.
//   just rely on dedupSpiderList() in the merge.
/*
if ( m_rdbId == RDB_SPIDERDB && ! KEYNEG(key) ) {
// . this will create it if spiders are on and it's NULL
// . even if spiders are off we need to create it so
@@ -2385,6 +2388,7 @@ bool Rdb::addRecord ( collnum_t collnum,
// in Spider.cpp will do that when called from addSpiderRequest() below
if ( isReq && sc->isInDupCache ( sreq , false ) ) return true;
}
*/
if ( m_useTree && (tn=m_tree.addNode ( collnum, key , data , dataSize ))>=0) {
// if adding to spiderdb, add to cache, too
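Dropping the dup check from Rdb::addRecord() means duplicates are only removed at merge time, where every twin runs the same deterministic pass over the same sorted data. A sketch of that idea (not Gigablast's actual dedupSpiderList(); the record type is a trimmed stand-in):

#include <cstdint>
#include <vector>

// trimmed stand-in for a spiderdb request record
struct SpiderReqLite { uint64_t urlHash48; int64_t addedTimeMs; };

// assumes the merged list is already sorted by urlHash48, as spiderdb keys
// would be; keeps the first request per url hash so every host, running
// the same merge over the same data, drops exactly the same duplicates
std::vector<SpiderReqLite> dedupMergedList(const std::vector<SpiderReqLite> &in) {
    std::vector<SpiderReqLite> out;
    for (const SpiderReqLite &r : in) {
        if (!out.empty() && out.back().urlHash48 == r.urlHash48)
            continue;                   // later duplicate, safe to drop
        out.push_back(r);
    }
    return out;
}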

View File

@@ -325,7 +325,7 @@ long RdbCache::getLong ( collnum_t collnum ,
(char *)&k,
&rec ,
&recSize ,
false ,
false , // do copy?
maxAge , // in seconds, -1 means none
true , // incCounts?
NULL , // cacheTime ptr
@@ -745,6 +745,9 @@ bool RdbCache::addRecord ( collnum_t collnum ,
//long long startTime = gettimeofdayInMillisecondsLocal();
if ( collnum < (collnum_t)0) {char *xx=NULL;*xx=0; }
if ( collnum >= m_maxColls ) {char *xx=NULL;*xx=0; }
// full key not allowed because we use that in markDeletedRecord()
if ( KEYCMP(cacheKey,KEYMAX(),m_cks) == 0 ) { char *xx=NULL;*xx=0; }
// bail if cache empty
if ( m_totalBufSize <= 0 ) return true;
// debug msg
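The new sanity check above rejects the full key because markDeletedRecord() reuses it as the "voided slot" marker. A small sketch of that sentinel convention, with an illustrative key size in place of m_cks:

#include <cstring>

enum { KS = 12 };                       // illustrative key size (m_cks)

bool isMaxKey(const char *key) {
    for (int i = 0; i < KS; i++)
        if ((unsigned char)key[i] != 0xff) return false;
    return true;
}

// markDeletedRecord()-style voiding: stamp the slot's key with the max key
void markDeleted(char *recKeyInBuf) {
    memset(recKeyInBuf, 0xff, KS);
}

bool addRecordChecked(const char *cacheKey) {
    // the all-0xff key is reserved as the "voided slot" marker, so a real
    // record may never be stored under it; mirrors the new sanity check
    if (isMaxKey(cacheKey)) return false;
    // ... normal add path elided ...
    return true;
}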
@@ -953,7 +956,7 @@ bool RdbCache::addRecord ( collnum_t collnum ,
// delete the rec at m_tail from the hashtable
bool RdbCache::deleteRec ( ) {
// sanity.
if ( m_tail < 0 || m_tail > m_totalBufSize ) {
if ( m_tail < 0 || m_tail >= m_totalBufSize ) {
char *xx = NULL; *xx = 0;}
// don't do anything if we're empty
@@ -996,6 +999,7 @@ bool RdbCache::deleteRec ( ) {
"maxCollNum=%li dbname=%s", (long)start,
(long)collnum, g_collectiondb.m_numRecsUsed,
m_dbname);
char *xx=NULL;*xx=0;
// exception for gourav's bug (dbname=Users)
// i am tired of it crapping out every 2-3 wks
if ( m_dbname[0]=='U' ) return true;
@@ -1064,11 +1068,12 @@ bool RdbCache::deleteRec ( ) {
m_tail += (p - start);
// sanity. this must be failing due to a corrupt dataSize...
if ( m_tail < 0 || m_tail > m_totalBufSize ) {
if ( m_tail < 0 || m_tail + sizeof(collnum_t) + m_cks + 4 > m_totalBufSize ) {
char *xx = NULL; *xx = 0;}
// delete key from hash table, iff it is for THIS record
// but if it has not already been voided
// but only if it has not already been voided.
// we set the key to KEYMAX() in markDeletedRecord()
if ( KEYCMP(k,KEYMAX(),m_cks) != 0 ){
removeKey ( collnum , k , start );
markDeletedRecord(start);
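Both deleteRec() checks get tightened in this file: m_tail may no longer equal m_totalBufSize, and after advancing it must still leave room for a record header (collnum, key, 4-byte dataSize). A sketch of the tightened bound, assuming that header layout; collnum_t is modeled as a 2-byte integer here:

#include <cstdint>

typedef int16_t collnum_t;      // assumed 2-byte collnum for illustration

// each buffered record starts with: collnum | key (keySize bytes) |
// dataSize (4 bytes); a sane tail must leave room for at least that header
bool tailLooksSane(long tail, long totalBufSize, long keySize /* m_cks */) {
    if (tail < 0) return false;
    long minHeader = (long)sizeof(collnum_t) + keySize + 4;
    return tail + minHeader <= totalBufSize;
}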
@@ -1291,9 +1296,14 @@ void RdbCache::clearAll ( ) {
}
*/
//
// . MDW: took out clear() on suspicion of corruption... i think ninad's
// corruption detection would panic on collnum_t's of -1 anyway...
//
// . this just clears the contents of the cache
// . used when deleting a collection in Rdb::delColl() and used in
// Rdb::updateToRebuild() when updating/setting the rdb to a rebuilt rdb
/*
void RdbCache::clear ( collnum_t collnum ) {
// bail if no writing ops allowed now
if ( ! g_cacheWritesEnabled ) { char *xx=NULL;*xx=0; }
@@ -1309,6 +1319,7 @@ void RdbCache::clear ( collnum_t collnum ) {
*(collnum_t *)m_ptrs[i] = -1;
}
}
*/
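The commented-out clear() "deletes" a collection's records by stamping their collnum with -1, which is exactly the kind of value a corruption scan would flag, per the MDW note above. A toy model of that conflict (not the real RdbCache buffer layout):

#include <cassert>
#include <vector>

struct SlotHeader { short collnum; int dataSize; };

// old-style clear(): walk the cache and stamp matching slots with -1
void clearColl(std::vector<SlotHeader> &slots, short collnum) {
    for (SlotHeader &s : slots)
        if (s.collnum == collnum) s.collnum = -1;
}

// a scan that treats any negative collnum as damage will panic on slots
// clear() stamped, even though those stamps were intentional
void corruptionScan(const std::vector<SlotHeader> &slots) {
    for (const SlotHeader &s : slots)
        assert(s.collnum >= 0 && "negative collnum looks like corruption");
}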
bool RdbCache::load ( ) {
return load ( m_dbname );

View File

@@ -29,7 +29,8 @@
// . i'd like to set back to 10 for speed... maybe even 5 or less
#define SPIDER_DONE_TIMER 20
#define MAX_WINNER_NODES 40
// seems like timecity.com has gigabytes of spiderdb data so up from 40 to 400
#define MAX_WINNER_NODES 400
Doledb g_doledb;
@@ -615,7 +616,9 @@ bool Spiderdb::init ( ) {
-1 , // fixedDataSize
// now that we have MAX_WINNER_NODES allowed in doledb
// we don't have to keep spiderdb so tightly merged i guess..
3,//g_conf.m_spiderdbMinFilesToMerge , mintomerge
// MDW: performance seems to suffer when spiderdb is not tightly merged
// so put this back to "2"...
2,//g_conf.m_spiderdbMinFilesToMerge , mintomerge
maxMem,//g_conf.m_spiderdbMaxTreeMem ,
maxTreeNodes ,
true , // balance tree?
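A rough way to see why a loosely merged spiderdb slows things down: each read over a key range has to consult every unmerged file on disk, so lookup cost grows with the file count, and minFilesToMerge=2 keeps that count low by merging early. Illustrative cost model only, not taken from the codebase:

// roughly one seek per on-disk file that may hold keys in the requested
// range, plus the in-memory tree at no seek cost
long estimatedSeeksPerRead(long numFilesOnDisk) {
    return numFilesOnDisk;
}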

View File

@@ -22656,6 +22656,8 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
bool ignore = false;
if ( mbuf[0] == '1' ) ignore = true;
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
//
// serialize each link into the metalist now
//
@@ -22934,6 +22936,10 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// set the key, ksr.m_key. isDel = false
ksr.setKey ( firstIp, *d , false );
// if we've recently added this url to spiderdb in Spider.cpp, skip it
if ( sc && sc->isInDupCache ( &ksr , false ) )
continue;
// . technically speaking we do not have any reply so we
// should not be calling this! cuz we don't have all the info
// . see if banned or filtered, etc.
@@ -22987,6 +22993,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
//if ( priority == SPIDER_PRIORITY_BANNED ) {
// linksBanned++; continue; }
// serialize into the buffer
long need = ksr.getRecSize();
// is that what we thought it would be?
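The new check above skips outlinks whose spider requests are already in the dup cache before they are serialized into the metalist. A stripped-down sketch of that filter; the request struct and hash-set dup cache are stand-ins for the real SpiderRequest and SpiderColl cache:

#include <cstdint>
#include <unordered_set>
#include <vector>

// stand-ins: a trimmed spider request and a dup cache keyed on url hash
struct SpiderRequestLite { int32_t firstIp; uint64_t urlHash48; };
using DupCacheLite = std::unordered_set<uint64_t>;

// mirrors the new check: if the spider coll's dup cache says this url was
// added to spiderdb recently, skip it instead of serializing it again
size_t serializeOutlinks(const std::vector<SpiderRequestLite> &links,
                         const DupCacheLite *dupCache,
                         std::vector<SpiderRequestLite> &metaList) {
    for (const SpiderRequestLite &ksr : links) {
        if (dupCache && dupCache->count(ksr.urlHash48))
            continue;                   // recently added, drop it here
        metaList.push_back(ksr);        // stand-in for real serialization
    }
    return metaList.size();
}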