mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
spiderloop active list bug fix.
change diffbot ip max from 1 to 7 again.
This commit is contained in:
parent
e1aab778e9
commit
bdebd79f4f
@ -1105,6 +1105,7 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
|
||||
|
||||
// tell spiders to re-update the active list
|
||||
g_spiderLoop.m_activeListValid = false;
|
||||
g_spiderLoop.m_activeListModified = true;
|
||||
|
||||
// a delete?
|
||||
if ( ! cr ) {
|
||||
@ -3574,7 +3575,8 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
|
||||
if ( respiderFreq <= 0.0 ) respiderFreq = 3652.5;
|
||||
|
||||
// lower from 7 to 1 since we have so many collections now
|
||||
int32_t diffbotipms = 1; // 7
|
||||
// ok, now we have much less colls so raise back to 7
|
||||
int32_t diffbotipms = 7;// 1; // 7
|
||||
|
||||
// make the gigablast regex table just "default" so it does not do any
|
||||
// filtering, but accepts all urls. we will add code to pass the urls
|
||||
|
16
Spider.cpp
16
Spider.cpp
@ -5883,6 +5883,7 @@ void SpiderLoop::startLoop ( ) {
|
||||
//m_cri = 0;
|
||||
m_crx = NULL;
|
||||
m_activeListValid = false;
|
||||
m_activeListModified = false;
|
||||
m_activeList = NULL;
|
||||
m_recalcTime = 0;
|
||||
m_recalcTimeValid = false;
|
||||
@ -5976,16 +5977,20 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
|
||||
// count these calls
|
||||
s_count++;
|
||||
|
||||
top:
|
||||
|
||||
// reset SpiderColl::m_didRound and m_nextDoledbKey if it is maxed
|
||||
// because we might have had a lock collision
|
||||
//int32_t nc = g_collectiondb.m_numRecs;
|
||||
// start again at head
|
||||
class CollectionRec *crp = g_spiderLoop.getActiveList();//m_activeList;
|
||||
class CollectionRec *crp = g_spiderLoop.getActiveList();
|
||||
|
||||
//for ( int32_t i = 0 ; i < nc ; i++ ) {
|
||||
for ( ; crp ; crp = crp->m_nextActive ) {
|
||||
// breathe
|
||||
QUICKPOLL(MAX_NICENESS);
|
||||
// if list was modified a collection was deleted/added
|
||||
if ( g_spiderLoop.m_activeListModified ) goto top;
|
||||
// // get collectionrec
|
||||
// CollectionRec *cr = g_collectiondb.getRec(i);
|
||||
// if ( ! cr ) continue;
|
||||
@ -6018,6 +6023,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
|
||||
// should be fast in those cases
|
||||
sc->populateWaitingTreeFromSpiderdb ( false );
|
||||
}
|
||||
// if list was modified a collection was deleted/added
|
||||
if ( g_spiderLoop.m_activeListModified ) goto top;
|
||||
// re-entry is false because we are entering for the first time
|
||||
sc->populateDoledbFromWaitingTree ( );
|
||||
// . skip if still loading doledb lists from disk this round
|
||||
@ -6035,6 +6042,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
|
||||
// send notifications when a crawl is basically in hiatus.
|
||||
//sc->m_encounteredDoledbRecs = false;
|
||||
//sc->m_nextDoledbKey.setMin();
|
||||
// if list was modified a collection was deleted/added
|
||||
if ( g_spiderLoop.m_activeListModified ) goto top;
|
||||
}
|
||||
|
||||
// set initial priority to the highest to start spidering there
|
||||
@ -6303,7 +6312,7 @@ void SpiderLoop::spiderDoledUrls ( ) {
|
||||
collLoop:
|
||||
|
||||
// start again at head
|
||||
if ( ! m_crx ) m_crx = getActiveList();//m_activeList;
|
||||
if ( ! m_crx ) m_crx = getActiveList();
|
||||
|
||||
bool firstTime = true;
|
||||
|
||||
@ -14281,12 +14290,13 @@ CollectionRec *SpiderLoop::getActiveList() {
|
||||
// versa. also when deleting a collection in Collectiondb.cpp. this
|
||||
// keeps the below loop fast when we have thousands of collections
|
||||
// and most are inactive or empty/deleted.
|
||||
if ( ! m_activeListValid ) {
|
||||
if ( ! m_activeListValid || m_activeListModified ) {
|
||||
buildActiveList();
|
||||
//m_crx = m_activeList;
|
||||
// recompute every 3 seconds, it seems kinda buggy!!
|
||||
m_recalcTime = nowGlobal + 3;
|
||||
m_recalcTimeValid = true;
|
||||
m_activeListModified = false;
|
||||
}
|
||||
|
||||
return m_activeList;
|
||||
|
Loading…
Reference in New Issue
Block a user