Fixed a bug where gb was not saving.

This commit is contained in:
mwells 2015-02-22 13:11:20 -07:00
parent 4e485b6649
commit 692c2932e8
4 changed files with 90 additions and 16 deletions

View File

@ -1340,6 +1340,7 @@ bool Process::save ( ) {
logf(LOG_INFO,"db: Entering lock mode for saving.");
m_mode = LOCK_MODE; // SAVE_MODE;
m_urgent = false;
m_calledSave = false;
return save2();
}
@ -1360,6 +1361,8 @@ bool Process::shutdown ( bool urgent ,
m_mode = EXIT_MODE;
m_urgent = urgent;
m_calledSave = false;
// check memory buffers for overruns/underruns to see if that
// caused this core
if ( urgent ) g_mem.printBreeches(false);
@ -1738,16 +1741,30 @@ bool Process::saveRdbTrees ( bool useThread , bool shuttingDown ) {
if ( g_conf.m_readOnlyMode ) return true;
// no thread if shutting down
if ( shuttingDown ) useThread = false;
// debug note
log("gb: shuttingdown=%i",(int)shuttingDown);
// turn off statsdb until everyone is done
//g_statsdb.m_disabled = true;
// loop over all Rdbs and save them
for ( int32_t i = 0 ; ! m_calledSave && i < m_numRdbs ; i++ ) {
for ( int32_t i = 0 ; i < m_numRdbs ; i++ ) {
if ( m_calledSave ) {
log("gb: already saved trees, skipping.");
break;
}
Rdb *rdb = m_rdbs[i];
// if we save doledb while spidering it screws us up
// because Spider.cpp can not directly write into the
// rdb tree and it expects that to always be available!
if ( ! shuttingDown && rdb->m_rdbId == RDB_DOLEDB )
continue;
// note it
if ( ! rdb->m_dbname || ! rdb->m_dbname[0] )
log("gb: calling save tree for rdbid %i",
(int)rdb->m_rdbId);
else
log("gb: calling save tree for %s",
rdb->m_dbname);
rdb->saveTree ( useThread );
}

View File

@ -162,7 +162,9 @@ void RdbTree::reset ( ) {
strcmp(m_dbname,"accessdb") &&
strcmp(m_dbname,"statsdb") ) {
//strcmp(m_dbname,"doledb") ) {
log("rdb: Resetting unsaved tree %s.",m_dbname);
log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
// when DELETING a collection from pagecrawlbot.cpp
// it calls Collectiondb::deleteRec() which calls
// SpiderColl::reset() which calls m_waitingTree.reset()

View File

@ -1878,6 +1878,14 @@ bool SpiderColl::updateSiteNumInlinksTable ( int32_t siteHash32,
// the count in m_doleIpTable here
bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
////
//
// skip if not assigned to us for doling
//
////
if ( ! isAssignedToUs ( srep->m_firstIp ) )
return true;
/////////
//
// remove the lock here
@ -1891,8 +1899,10 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
time_t nowGlobal = getTimeGlobal();
if ( g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: scheduled lock removal in 5 secs for "
"lockKey=%"UINT64"", lockKey );
logf(LOG_DEBUG,"spider: removing lock uh48=%"INT64" "
"lockKey=%"UINT64"",
srep->getUrlHash48(),
lockKey );
// test it
//if ( m_nowGlobal == 0 && lock )
@ -1927,14 +1937,6 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
logf(LOG_DEBUG,"spider: rdb: lockKey=%"UINT64" "
"was not in lock table",lockKey);
////
//
// skip if not assigned to us for doling
//
////
if ( ! isAssignedToUs ( srep->m_firstIp ) )
return true;
// now just remove it since we only spider our own urls
// and doledb is in memory
g_spiderLoop.m_lockTable.removeKey ( &lockKey );
@ -3486,7 +3488,7 @@ bool SpiderColl::evalIpLoop ( ) {
&cachedTimestamp , // rec timestamp
true ); // promote rec?
// doleBuf could be NULL i guess...
if ( inCache && doleBufSize > 0 ) {
if ( inCache ) { // && doleBufSize > 0 ) {
if ( g_conf.m_logDebugSpider )
log("spider: GOT %"INT32" bytes of SpiderRequests "
"from winnerlistcache for ip %s",doleBufSize,
@ -4162,6 +4164,11 @@ bool SpiderColl::scanListForWinners ( ) {
if ( g_conf.m_logDebugSpider )
log("spider: got ufn=%"INT32" for %s",ufn,sreq->m_url);
if ( g_conf.m_logDebugSpider && srep )
log("spider: lastspidered=%"UINT32"",
srep->m_spideredTime);
// spiders disabled for this row in url filters?
//if ( ! m_cr->m_spidersEnabled[ufn] ) continue;
@ -4212,6 +4219,11 @@ bool SpiderColl::scanListForWinners ( ) {
continue;
}
// debug point
// if ( ((long long)srep->m_spideredTime)*1000LL >
// nowGlobalMS - 86400LL*1000LL*30LL )
// log("spider: should not be spidering this!");
//////
//
// MDW: no, take this out now that we allow multiple urls
@ -4312,6 +4324,14 @@ bool SpiderColl::scanListForWinners ( ) {
// "firstIp". this way we can spider multiple urls from the
// same ip at the same time.
int64_t key = makeLockTableKey ( sreq );
if ( g_conf.m_logDebugSpider )
log("spider: checking uh48=%"INT64" lockkey=%"INT64" "
"used=%"INT32"",
uh48,key,
g_spiderLoop.m_lockTable.getNumUsedSlots());
// MDW
if ( g_spiderLoop.m_lockTable.isInTable ( &key ) ) {
// get it
//CrawlInfo *ci = &m_cr->m_localCrawlInfo;
@ -5179,17 +5199,41 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// allow this to add a 0 length record otherwise we keep the same
// old url in here and keep spidering it over and over again!
bool addToCache = false;
if ( skipSize && m_doleBuf.length() - skipSize >= 0 ) addToCache =true;
if ( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
// if winnertree was empty, then we might have scanned like 10M
// twitter.com urls and not wanted any of them, so we don't want to
// have to keep redoing that!
if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
RdbCache *wc = &g_spiderLoop.m_winnerListCache;
// remove from cache? if we added the last spider request in the
// cached dolebuf to doledb then remove it from cache so it's not
// a cached empty dolebuf and we recompute it not using the cache.
if ( isFromCache && skipSize && m_doleBuf.length() - skipSize == 0 ) {
if ( addToCache ) { char *xx=NULL;*xx=0; }
// let's get this working right...
//wc->removeKey ( collnum , k , start );
//wc->markDeletedRecord(start);
// i don't think we can remove keys from cache so add
// a rec with a byte size of 1 to indicate for us to ignore.
// set the timestamp to 12345 so the getRecord above will
// not get it and promote it in the linked list.
char byte = 0;
key_t cacheKey;
cacheKey.n0 = firstIp;
cacheKey.n1 = 0;
wc->addRecord ( m_collnum,
(char *)&cacheKey,
&byte ,
1 ,
12345 );//cachedTimestamp );
}
if ( addToCache ) {
key_t cacheKey;
cacheKey.n0 = firstIp;
cacheKey.n1 = 0;
RdbCache *wc = &g_spiderLoop.m_winnerListCache;
if ( g_conf.m_logDebugSpider )
log("spider: adding %"INT32" bytes of SpiderRequests "
"to winnerlistcache for ip %s",
@ -5243,6 +5287,11 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
// now that doledb is tree-only and never dumps to disk, just
// add it directly
g_doledb.m_rdb.addList ( m_collnum , &tmpList , MAX_NICENESS );
if ( g_conf.m_logDebugSpider )
log("spider: adding doledb tree node size=%"INT32"",skipSize);
// and it happens right away. just add it locally.
bool status = true;
@ -7287,6 +7336,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
true);
if ( node == -1 ) { char *xx=NULL;*xx=0; }
if ( g_conf.m_logDebugSpider )
log("spider: deleting doledb tree node %"INT32,node);
// now remove from doleiptable since we removed from doledb
m_sc->removeFromDoledbTable ( sreq->m_firstIp );
@ -7326,6 +7378,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
tmp.m_spiderOutstanding = 0;
tmp.m_confirmed = 1;
tmp.m_collnum = m_collnum;
if ( g_conf.m_logDebugSpider )
log("spider: adding lock uh48=%"INT64" lockkey=%"INT64"",
m_sreq->getUrlHash48(),lockKeyUh48);
if ( ! ht->addKey ( &lockKeyUh48 , &tmp ) )
return true;

View File

@ -20135,7 +20135,7 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
time_t spideredTime = m_oldDoc->getSpideredTime();
struct tm *timeStruct = gmtime ( &spideredTime );
char tmp[64];
strftime(tmp,64,"lastspidered=%b-%d-%Y(%H:%M:%S)",timeStruct);
strftime(tmp,64,"lastindexed=%b-%d-%Y(%H:%M:%S)",timeStruct);
sb->safePrintf("%s(%"UINT32") ", tmp,(uint32_t)spideredTime);
}