mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
fixed bug of gb not saving
This commit is contained in:
parent
4e485b6649
commit
692c2932e8
19
Process.cpp
19
Process.cpp
@ -1340,6 +1340,7 @@ bool Process::save ( ) {
|
||||
logf(LOG_INFO,"db: Entering lock mode for saving.");
|
||||
m_mode = LOCK_MODE; // SAVE_MODE;
|
||||
m_urgent = false;
|
||||
m_calledSave = false;
|
||||
return save2();
|
||||
}
|
||||
|
||||
@ -1360,6 +1361,8 @@ bool Process::shutdown ( bool urgent ,
|
||||
m_mode = EXIT_MODE;
|
||||
m_urgent = urgent;
|
||||
|
||||
m_calledSave = false;
|
||||
|
||||
// check memory buffers for overruns/underrunds to see if that
|
||||
// caused this core
|
||||
if ( urgent ) g_mem.printBreeches(false);
|
||||
@ -1738,16 +1741,30 @@ bool Process::saveRdbTrees ( bool useThread , bool shuttingDown ) {
|
||||
if ( g_conf.m_readOnlyMode ) return true;
|
||||
// no thread if shutting down
|
||||
if ( shuttingDown ) useThread = false;
|
||||
// debug note
|
||||
log("gb: shuttingdown=%i",(int)shuttingDown);
|
||||
// turn off statsdb until everyone is done
|
||||
//g_statsdb.m_disabled = true;
|
||||
// loop over all Rdbs and save them
|
||||
for ( int32_t i = 0 ; ! m_calledSave && i < m_numRdbs ; i++ ) {
|
||||
for ( int32_t i = 0 ; i < m_numRdbs ; i++ ) {
|
||||
if ( m_calledSave ) {
|
||||
log("gb: already saved trees, skipping.");
|
||||
break;
|
||||
}
|
||||
Rdb *rdb = m_rdbs[i];
|
||||
// if we save doledb while spidering it screws us up
|
||||
// because Spider.cpp can not directly write into the
|
||||
// rdb tree and it expects that to always be available!
|
||||
if ( ! shuttingDown && rdb->m_rdbId == RDB_DOLEDB )
|
||||
continue;
|
||||
// note it
|
||||
if ( ! rdb->m_dbname || ! rdb->m_dbname[0] )
|
||||
log("gb: calling save tree for rdbid %i",
|
||||
(int)rdb->m_rdbId);
|
||||
else
|
||||
log("gb: calling save tree for %s",
|
||||
rdb->m_dbname);
|
||||
|
||||
rdb->saveTree ( useThread );
|
||||
}
|
||||
|
||||
|
@ -162,7 +162,9 @@ void RdbTree::reset ( ) {
|
||||
strcmp(m_dbname,"accessdb") &&
|
||||
strcmp(m_dbname,"statsdb") ) {
|
||||
//strcmp(m_dbname,"doledb") ) {
|
||||
log("rdb: Resetting unsaved tree %s.",m_dbname);
|
||||
log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
|
||||
log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
|
||||
log("rdb: RESETTING UNSAVED TREE %s.",m_dbname);
|
||||
// when DELETING a collection from pagecrawlbot.cpp
|
||||
// it calls Collectiondb::deleteRec() which calls
|
||||
// SpiderColl::reset() which calls m_waitingTree.reset()
|
||||
|
81
Spider.cpp
81
Spider.cpp
@ -1878,6 +1878,14 @@ bool SpiderColl::updateSiteNumInlinksTable ( int32_t siteHash32,
|
||||
// the count in m_doleIpTable here
|
||||
bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
|
||||
|
||||
////
|
||||
//
|
||||
// skip if not assigned to us for doling
|
||||
//
|
||||
////
|
||||
if ( ! isAssignedToUs ( srep->m_firstIp ) )
|
||||
return true;
|
||||
|
||||
/////////
|
||||
//
|
||||
// remove the lock here
|
||||
@ -1891,8 +1899,10 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
|
||||
time_t nowGlobal = getTimeGlobal();
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
logf(LOG_DEBUG,"spider: scheduled lock removal in 5 secs for "
|
||||
"lockKey=%"UINT64"", lockKey );
|
||||
logf(LOG_DEBUG,"spider: removing lock uh48=%"INT64" "
|
||||
"lockKey=%"UINT64"",
|
||||
srep->getUrlHash48(),
|
||||
lockKey );
|
||||
|
||||
// test it
|
||||
//if ( m_nowGlobal == 0 && lock )
|
||||
@ -1927,14 +1937,6 @@ bool SpiderColl::addSpiderReply ( SpiderReply *srep ) {
|
||||
logf(LOG_DEBUG,"spider: rdb: lockKey=%"UINT64" "
|
||||
"was not in lock table",lockKey);
|
||||
|
||||
////
|
||||
//
|
||||
// skip if not assigned to us for doling
|
||||
//
|
||||
////
|
||||
if ( ! isAssignedToUs ( srep->m_firstIp ) )
|
||||
return true;
|
||||
|
||||
// now just remove it since we only spider our own urls
|
||||
// and doledb is in memory
|
||||
g_spiderLoop.m_lockTable.removeKey ( &lockKey );
|
||||
@ -3486,7 +3488,7 @@ bool SpiderColl::evalIpLoop ( ) {
|
||||
&cachedTimestamp , // rec timestamp
|
||||
true ); // promote rec?
|
||||
// doleBuf could be NULL i guess...
|
||||
if ( inCache && doleBufSize > 0 ) {
|
||||
if ( inCache ) { // && doleBufSize > 0 ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: GOT %"INT32" bytes of SpiderRequests "
|
||||
"from winnerlistcache for ip %s",doleBufSize,
|
||||
@ -4162,6 +4164,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: got ufn=%"INT32" for %s",ufn,sreq->m_url);
|
||||
|
||||
if ( g_conf.m_logDebugSpider && srep )
|
||||
log("spider: lastspidered=%"UINT32"",
|
||||
srep->m_spideredTime);
|
||||
|
||||
|
||||
// spiders disabled for this row in url filteres?
|
||||
//if ( ! m_cr->m_spidersEnabled[ufn] ) continue;
|
||||
@ -4212,6 +4219,11 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// debug point
|
||||
// if ( ((long long)srep->m_spideredTime)*1000LL >
|
||||
// nowGlobalMS - 86400LL*1000LL*30LL )
|
||||
// log("spider: should not be spidering this!");
|
||||
|
||||
//////
|
||||
//
|
||||
// MDW: no, take this out now that we allow multiple urls
|
||||
@ -4312,6 +4324,14 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
// "firstIp". this way we can spider multiple urls from the
|
||||
// same ip at the same time.
|
||||
int64_t key = makeLockTableKey ( sreq );
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: checking uh48=%"INT64" lockkey=%"INT64" "
|
||||
"used=%"INT32"",
|
||||
uh48,key,
|
||||
g_spiderLoop.m_lockTable.getNumUsedSlots());
|
||||
|
||||
// MDW
|
||||
if ( g_spiderLoop.m_lockTable.isInTable ( &key ) ) {
|
||||
// get it
|
||||
//CrawlInfo *ci = &m_cr->m_localCrawlInfo;
|
||||
@ -5179,17 +5199,41 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
|
||||
// allow this to add a 0 length record otherwise we keep the same
|
||||
// old url in here and keep spidering it over and over again!
|
||||
bool addToCache = false;
|
||||
if ( skipSize && m_doleBuf.length() - skipSize >= 0 ) addToCache =true;
|
||||
if ( skipSize && m_doleBuf.length() - skipSize > 0 ) addToCache =true;
|
||||
// if winnertree was empty, then we might have scanned like 10M
|
||||
// twitter.com urls and not wanted any of them, so we don't want to
|
||||
// have to keep redoing that!
|
||||
if ( m_doleBuf.length() == 0 && ! isFromCache ) addToCache = true;
|
||||
|
||||
RdbCache *wc = &g_spiderLoop.m_winnerListCache;
|
||||
|
||||
// remove from cache? if we added the last spider request in the
|
||||
// cached dolebuf to doledb then remove it from cache so it's not
|
||||
// a cached empty dolebuf and we recompute it not using the cache.
|
||||
if ( isFromCache && skipSize && m_doleBuf.length() - skipSize == 0 ) {
|
||||
if ( addToCache ) { char *xx=NULL;*xx=0; }
|
||||
// let's get this working right...
|
||||
//wc->removeKey ( collnum , k , start );
|
||||
//wc->markDeletedRecord(start);
|
||||
// i don't think we can remove keys from cache so add
|
||||
// a rec with a byte size of 1 to indicate for us to ignore.
|
||||
// set the timestamp to 12345 so the getRecord above will
|
||||
// not get it and promote it in the linked list.
|
||||
char byte = 0;
|
||||
key_t cacheKey;
|
||||
cacheKey.n0 = firstIp;
|
||||
cacheKey.n1 = 0;
|
||||
wc->addRecord ( m_collnum,
|
||||
(char *)&cacheKey,
|
||||
&byte ,
|
||||
1 ,
|
||||
12345 );//cachedTimestamp );
|
||||
}
|
||||
|
||||
if ( addToCache ) {
|
||||
key_t cacheKey;
|
||||
cacheKey.n0 = firstIp;
|
||||
cacheKey.n1 = 0;
|
||||
RdbCache *wc = &g_spiderLoop.m_winnerListCache;
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: adding %"INT32" bytes of SpiderRequests "
|
||||
"to winnerlistcache for ip %s",
|
||||
@ -5243,6 +5287,11 @@ bool SpiderColl::addDoleBufIntoDoledb ( bool isFromCache ,
|
||||
// now that doledb is tree-only and never dumps to disk, just
|
||||
// add it directly
|
||||
g_doledb.m_rdb.addList ( m_collnum , &tmpList , MAX_NICENESS );
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: adding doledb tree node size=%"INT32"",skipSize);
|
||||
|
||||
|
||||
// and it happens right away. just add it locally.
|
||||
bool status = true;
|
||||
|
||||
@ -7287,6 +7336,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
|
||||
true);
|
||||
if ( node == -1 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: deleting doledb tree node %"INT32,node);
|
||||
|
||||
// now remove from doleiptable since we removed from doledb
|
||||
m_sc->removeFromDoledbTable ( sreq->m_firstIp );
|
||||
|
||||
@ -7326,6 +7378,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
|
||||
tmp.m_spiderOutstanding = 0;
|
||||
tmp.m_confirmed = 1;
|
||||
tmp.m_collnum = m_collnum;
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: adding lock uh48=%"INT64" lockkey=%"INT64"",
|
||||
m_sreq->getUrlHash48(),lockKeyUh48);
|
||||
if ( ! ht->addKey ( &lockKeyUh48 , &tmp ) )
|
||||
return true;
|
||||
|
||||
|
@ -20135,7 +20135,7 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
|
||||
time_t spideredTime = m_oldDoc->getSpideredTime();
|
||||
struct tm *timeStruct = gmtime ( &spideredTime );
|
||||
char tmp[64];
|
||||
strftime(tmp,64,"lastspidered=%b-%d-%Y(%H:%M:%S)",timeStruct);
|
||||
strftime(tmp,64,"lastindexed=%b-%d-%Y(%H:%M:%S)",timeStruct);
|
||||
sb->safePrintf("%s(%"UINT32") ", tmp,(uint32_t)spideredTime);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user