mirror of https://github.com/gigablast/open-source-search-engine.git
more spider speed fixes. tried to fix corrupt RdbCache.
parent 9145d89e3f
commit 4029b0b937
Collectiondb.cpp
@@ -1105,10 +1105,11 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
 	// and clear the robots.txt cache in case we recently spidered a
 	// robots.txt, we don't want to use it, we want to use the one we
 	// have in the test-parser subdir so we are consistent
-	RdbCache *robots = Msg13::getHttpCacheRobots();
-	RdbCache *others = Msg13::getHttpCacheOthers();
-	robots->clear ( oldCollnum );
-	others->clear ( oldCollnum );
+	//RdbCache *robots = Msg13::getHttpCacheRobots();
+	//RdbCache *others = Msg13::getHttpCacheOthers();
+	// clear() was removed due to possible corruption
+	//robots->clear ( oldCollnum );
+	//others->clear ( oldCollnum );
 
 	//g_templateTable.reset();
 	//g_templateTable.save( g_hostdb.m_dir , "turkedtemplates.dat" );
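With the explicit clear() calls disabled, a robots.txt cached just before a collection reset can linger for the old collnum; its staleness is then bounded only by the age check applied on lookup (the RdbCache.cpp getLong() hunk below passes a maxAge in seconds, -1 meaning none). A minimal sketch of that age-gated lookup rule, with hypothetical names rather than RdbCache's real interface:

// Sketch of an age-gated cache lookup (illustrative names only).
// With no explicit per-collection clear, a stale robots.txt entry is
// filtered out only once it is older than maxAge.
#include <ctime>
#include <string>
#include <unordered_map>

struct CachedPage { std::string body; time_t cachedAt; };

// return the cached robots.txt only if younger than maxAge seconds;
// maxAge == -1 means no age limit, matching the getLong() convention
const std::string *getIfFresh(
        const std::unordered_map<std::string, CachedPage> &cache,
        const std::string &hostKey, long maxAge) {
    auto it = cache.find(hostKey);
    if (it == cache.end()) return nullptr;
    if (maxAge >= 0 && time(nullptr) - it->second.cachedAt > maxAge)
        return nullptr; // too old: treat as a miss so the caller refetches
    return &it->second.body;
}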
Parms.cpp
@@ -18364,8 +18364,8 @@ void handleRequest3fLoop ( void *weArg ) {
 	if ( cx->m_spiderColl ) {
 		log("parms: forcing waiting tree rebuild");
 		cx->m_spiderColl->m_waitingTreeNeedsRebuild = true;
-		// and the dup cache because rebuilding the waiting tree
-		cx->m_spiderColl->m_dupCache.clear(0);
+		// reset dup cache because rebuilding the waiting tree
+		cx->m_spiderColl->m_dupCache.reset();//clear(0);
 	}
 	// . assume we have urls ready to spider too
 	// . no, because if they change the filters and there are
Rdb.cpp
@@ -2370,7 +2370,10 @@ bool Rdb::addRecord ( collnum_t collnum,
 	}
 	}
 
-	// cancel any spider request that is a dup in the dupcache to save disk space
+	// . cancel any spider request that is a dup in the dupcache to save disk space
+	// . MDW: can't do it this way, it will end up in twins spiderdb then and they
+	//   will be out of sync. just rely on dedupSpiderList() in the merge.
+	/*
 	if ( m_rdbId == RDB_SPIDERDB && ! KEYNEG(key) ) {
 		// . this will create it if spiders are on and its NULL
 		// . even if spiders are off we need to create it so
@@ -2385,6 +2388,7 @@ bool Rdb::addRecord ( collnum_t collnum,
 	// in Spider.cpp will do that when called from addSpiderRequest() below
 	if ( isReq && sc->isInDupCache ( sreq , false ) ) return true;
 	}
+	*/
 
 	if ( m_useTree && (tn=m_tree.addNode ( collnum, key , data , dataSize ))>=0) {
 		// if adding to spiderdb, add to cache, too
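The MDW comment above is the heart of this change: spiderdb records are replicated to twin hosts, and a dup filter inside Rdb::addRecord() consults a local cache whose contents differ from host to host (different evictions, different resets), so one twin could drop a record its twin keeps. A toy model of that divergence, and of why dedup applied identically at merge time stays consistent; all names here are hypothetical, not Gigablast code:

// Toy model (hypothetical, not Gigablast code) of twin divergence under
// a local write-time dup filter versus a deterministic merge-time dedup.
#include <cassert>
#include <set>
#include <vector>

using Rec = int;

// write-time filter: outcome depends on this host's cache contents
std::vector<Rec> addWithLocalFilter(const std::vector<Rec> &in,
                                    std::set<Rec> &localDupCache) {
    std::vector<Rec> stored;
    for (Rec r : in)
        if (localDupCache.insert(r).second) // drop if already cached here
            stored.push_back(r);
    return stored;
}

// merge-time dedup: same inputs give the same output on every twin
std::vector<Rec> dedupOnMerge(const std::vector<Rec> &in) {
    std::set<Rec> seen;
    std::vector<Rec> out;
    for (Rec r : in)
        if (seen.insert(r).second) out.push_back(r);
    return out;
}

int main() {
    std::vector<Rec> incoming = {7, 7, 9};
    std::set<Rec> cacheA = {7}; // twin A saw 7 recently
    std::set<Rec> cacheB;       // twin B's cache was reset
    std::vector<Rec> a = addWithLocalFilter(incoming, cacheA); // {9}
    std::vector<Rec> b = addWithLocalFilter(incoming, cacheB); // {7, 9}
    assert(a != b);                                  // twins diverged
    assert(dedupOnMerge(incoming) == dedupOnMerge(incoming)); // twins agree
    return 0;
}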
RdbCache.cpp
@@ -325,7 +325,7 @@ long RdbCache::getLong ( collnum_t collnum ,
 			(char *)&k,
 			&rec ,
 			&recSize ,
-			false ,
+			false , // do copy?
 			maxAge , // in seconds, -1 means none
 			true , // incCounts?
 			NULL , // cacheTime ptr
@@ -745,6 +745,9 @@ bool RdbCache::addRecord ( collnum_t collnum ,
 	//long long startTime = gettimeofdayInMillisecondsLocal();
 	if ( collnum < (collnum_t)0) {char *xx=NULL;*xx=0; }
 	if ( collnum >= m_maxColls ) {char *xx=NULL;*xx=0; }
+	// full key not allowed because we use that in markDeletedRecord()
+	if ( KEYCMP(cacheKey,KEYMAX(),m_cks) == 0 ) { char *xx=NULL;*xx=0; }
+
 	// bail if cache empty
 	if ( m_totalBufSize <= 0 ) return true;
 	// debug msg
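The new KEYCMP check protects a sentinel: per the comment, markDeletedRecord() stamps a voided slot's key with KEYMAX(), so a real record whose cache key happened to be all 0xff bytes would later be indistinguishable from a voided slot. A minimal sketch of that invariant, with illustrative types rather than RdbCache's actual buffer layout:

// Minimal sketch of the sentinel-key invariant (illustrative layout,
// not RdbCache's): the all-0xff key doubles as the "slot was voided"
// marker, so it must never be accepted as a legitimate cache key.
#include <array>
#include <stdexcept>

constexpr int KS = 12; // key size in bytes (m_cks in RdbCache)
using Key = std::array<unsigned char, KS>;

Key keyMax() { Key k; k.fill(0xff); return k; } // the sentinel

struct Slot { Key key; /* record bytes follow in the real buffer */ };

void markDeletedRecord(Slot &s) { s.key = keyMax(); } // void the slot

void addRecord(Slot &s, const Key &k) {
    // a record stored under the sentinel key would later look voided;
    // the commit crashes on purpose here, this sketch throws instead
    if (k == keyMax()) throw std::invalid_argument("full key not allowed");
    s.key = k;
}

bool isVoided(const Slot &s) { return s.key == keyMax(); }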
@@ -953,7 +956,7 @@ bool RdbCache::addRecord ( collnum_t collnum ,
 // delete the rec at m_tail from the hashtable
 bool RdbCache::deleteRec ( ) {
 	// sanity.
-	if ( m_tail < 0 || m_tail > m_totalBufSize ) {
+	if ( m_tail < 0 || m_tail >= m_totalBufSize ) {
 		char *xx = NULL; *xx = 0;}
 
 	// don't do anything if we're empty
@@ -996,6 +999,7 @@ bool RdbCache::deleteRec ( ) {
 			"maxCollNum=%li dbname=%s", (long)start,
 			(long)collnum, g_collectiondb.m_numRecsUsed,
 			m_dbname);
+		char *xx=NULL;*xx=0;
 		// exception for gourav's bug (dbname=Users)
 		// i am tired of it crapping out every 2-3 wks
 		if ( m_dbname[0]=='U' ) return true;
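The added char *xx=NULL;*xx=0; is the codebase's assert idiom: a deliberate null write that segfaults immediately so the kernel leaves a core dump at the exact failure site. Placed here it also appears to make the "Users" exemption below unreachable, re-arming the crash that the exemption used to suppress. A hedged sketch of the same idiom as a macro (illustrative; the code inlines the null write directly):

// Sketch of the same idiom as a macro. abort() also leaves a core
// dump, without relying on undefined behavior that a compiler may
// legally optimize away.
#include <cstdio>
#include <cstdlib>

#define GB_SANITY(cond)                                             \
    do {                                                            \
        if (!(cond)) {                                              \
            fprintf(stderr, "sanity failed: %s at %s:%d\n",         \
                    #cond, __FILE__, __LINE__);                     \
            abort(); /* SIGABRT: core dump at the failure site */   \
        }                                                           \
    } while (0)

// usage, mirroring the tail checks in deleteRec():
//   GB_SANITY( m_tail >= 0 && m_tail < m_totalBufSize );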
@@ -1064,11 +1068,12 @@ bool RdbCache::deleteRec ( ) {
 	m_tail += (p - start);
 
 	// sanity. this must be failing due to a corrupt dataSize...
-	if ( m_tail < 0 || m_tail > m_totalBufSize ) {
+	if ( m_tail < 0 || m_tail +sizeof(collnum_t)+m_cks+4>m_totalBufSize){
 		char *xx = NULL; *xx = 0;}
 
 	// delete key from hash table, iff is for THIS record
-	// but if it has not already been voided
+	// but if it has not already been voided.
+	// we set key to KEYMAX() in markDeletedRecord()
 	if ( KEYCMP(k,KEYMAX(),m_cks) != 0 ){
 		removeKey ( collnum , k , start );
 		markDeletedRecord(start);
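Taken with the >= fix in the previous hunk, the tightened check encodes the record layout: judging by the expression, each ring-buffer record begins with a collnum_t, then the key (m_cks bytes), then a 4-byte dataSize, so a tail that cannot fit that fixed header is already corrupt. A small sketch of the same predicate; the field order is inferred from the diff and should be treated as an assumption:

// Sketch of the tightened bound (field order inferred, so an
// assumption): a record header is collnum_t + key + 4-byte dataSize,
// and m_tail must leave room for at least that much.
#include <cstdint>
#include <cstdio>

using collnum_t = int16_t;

bool tailHeaderFits(long tail, long totalBufSize, long keySize) {
    // old check only rejected tail > totalBufSize, letting tails that
    // cannot hold a full header slip through; require the header to fit
    long header = (long)sizeof(collnum_t) + keySize + 4 /* dataSize */;
    return tail >= 0 && tail + header <= totalBufSize;
}

int main() {
    long bufSize = 1024, keySize = 12; // header = 2 + 12 + 4 = 18 bytes
    printf("%d\n", tailHeaderFits(1006, bufSize, keySize)); // 1: exactly fits
    printf("%d\n", tailHeaderFits(1007, bufSize, keySize)); // 0: would overrun
    printf("%d\n", tailHeaderFits(1024, bufSize, keySize)); // 0: old check allowed this
    return 0;
}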
@@ -1291,9 +1296,14 @@ void RdbCache::clearAll ( ) {
 }
 */
 
+//
+// . MDW: took out clear() for corruption suspicion... i think ninad's
+//   corruption detection would panic on collnum_t's of -1 anyway...
+//
 // . this just clears the contents of the cache
 // . used when deleting a collection in Rdb::delColl() and used in
 //   Rdb::updateToRebuild() when updating/setting the rdb to a rebuilt rdb
+/*
 void RdbCache::clear ( collnum_t collnum ) {
 	// bail if no writing ops allowed now
 	if ( ! g_cacheWritesEnabled ) { char *xx=NULL;*xx=0; }
@@ -1309,6 +1319,7 @@ void RdbCache::clear ( collnum_t collnum ) {
 		*(collnum_t *)m_ptrs[i] = -1;
 	}
 }
+*/
 
 bool RdbCache::load ( ) {
 	return load ( m_dbname );
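The commented-out body above shows what clear() actually did: stamp each of the collection's live records with a collnum of -1, in place. That is exactly what the MDW note worries about, since any buffer scan that treats negative collnums as corruption will panic on the stamped slots. One corruption-safe alternative is lazy invalidation through a per-collection generation counter; the following is a sketch under assumed semantics, not Gigablast's API:

// Sketch (assumed semantics, not Gigablast's API) of lazy per-collection
// invalidation: bump a generation counter instead of rewriting live
// buffer slots, so no scan ever sees a stamped collnum of -1.
#include <cstdint>
#include <map>
#include <string>
#include <utility>

struct GenCache {
    std::map<int16_t, uint32_t> gen; // current generation per collection
    struct Entry { uint32_t gen; std::string val; };
    std::map<std::pair<int16_t, std::string>, Entry> slots;

    // O(1) "clear": old-generation entries simply stop matching
    void invalidateColl(int16_t collnum) { ++gen[collnum]; }

    void put(int16_t collnum, const std::string &key, std::string val) {
        slots[{collnum, key}] = {gen[collnum], std::move(val)};
    }

    bool get(int16_t collnum, const std::string &key, std::string *out) {
        auto it = slots.find({collnum, key});
        if (it == slots.end()) return false;
        if (it->second.gen != gen[collnum]) return false; // stale: miss
        *out = it->second.val;
        return true;
    }
};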
Spider.cpp
@@ -29,7 +29,8 @@
 // . i'd like to set back to 10 for speed... maybe even 5 or less
 #define SPIDER_DONE_TIMER 20
 
-#define MAX_WINNER_NODES 40
+// seems like timecity.com has gigabytes of spiderdb data so up from 40 to 400
+#define MAX_WINNER_NODES 400
 
 Doledb g_doledb;
@@ -615,7 +616,9 @@ bool Spiderdb::init ( ) {
 		-1 , // fixedDataSize
 		// now that we have MAX_WINNER_NODES allowed in doledb
 		// we don't have to keep spiderdb so tightly merged i guess..
-		3,//g_conf.m_spiderdbMinFilesToMerge , mintomerge
+		// MDW: it seems to slow performance when not tightly merged
+		// so put this back to "2"...
+		2,//g_conf.m_spiderdbMinFilesToMerge , mintomerge
 		maxMem,//g_conf.m_spiderdbMaxTreeMem ,
 		maxTreeNodes ,
 		true , // balance tree?
XmlDoc.cpp
@@ -22656,6 +22656,8 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
 	bool ignore = false;
 	if ( mbuf[0] == '1' ) ignore = true;
 
+	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum );
+
 	//
 	// serialize each link into the metalist now
 	//
@@ -22934,6 +22936,10 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
 		// set the key, ksr.m_key. isDel = false
 		ksr.setKey ( firstIp, *d , false );
 
+		// if we've recently added this url to spiderdb in Spider.cpp, skip it
+		if ( sc && sc->isInDupCache ( &ksr , false ) )
+			continue;
+
 		// . technically speaking we do not have any reply so we
 		//   should not be calling this! cuz we don't have all the info
 		// . see if banned or filtered, etc.
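With the Rdb::addRecord() filter removed (see the Rdb.cpp hunks above), the dup check now runs here at the producer, before the outlink's SpiderRequest is serialized into the metalist and replicated, which is what keeps twins in sync. A minimal sketch of a dup cache of this shape; the types are hypothetical and SpiderColl::isInDupCache is the real mechanism:

// Minimal sketch of a producer-side dup cache (hypothetical types):
// remember recently emitted spider-request keys for a short TTL and
// skip re-serializing them.
#include <ctime>
#include <string>
#include <unordered_map>

struct DupCache {
    long ttlSecs;
    std::unordered_map<std::string, time_t> seen; // key -> time added

    explicit DupCache(long ttl) : ttlSecs(ttl) {}

    // true if key was added within the TTL; otherwise record it now
    bool isDupAndRemember(const std::string &key) {
        time_t now = time(nullptr);
        auto it = seen.find(key);
        if (it != seen.end() && now - it->second < ttlSecs) return true;
        seen[key] = now;
        return false;
    }
};

// usage in an outlink loop, mirroring the hunk's shape:
//   if ( dupCache.isDupAndRemember(urlKey) ) continue; // skip dup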
@@ -22987,6 +22993,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
 	//if ( priority == SPIDER_PRIORITY_BANNED ) {
 	//	linksBanned++; continue; }
 
+
 	// serialize into the buffer
 	long need = ksr.getRecSize();
 	// is that what we thought it would be?