diff --git a/Collectiondb.cpp b/Collectiondb.cpp
index b15e1ffe..7aa6859a 100644
--- a/Collectiondb.cpp
+++ b/Collectiondb.cpp
@@ -636,6 +636,23 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
 }
 */
 
+// if there is an outstanding disk read thread or merge thread then
+// Spider.cpp will handle the delete in the callback.
+void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
+
+	sc->m_deleteMyself = true;
+
+	// if not currently being accessed nuke it now
+	if ( ! sc->m_msg5.m_waitingForList &&
+	     ! sc->m_msg5.m_waitingForMerge &&
+	     ! sc->m_msg5b.m_waitingForList &&
+	     ! sc->m_msg5b.m_waitingForMerge ) {
+		mdelete ( sc, sizeof(SpiderColl),"nukecr2");
+		delete ( sc );
+		return;
+	}
+}
+
 bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
 	// do not allow this if in repair mode
 	if ( g_repairMode > 0 ) {
@@ -723,10 +740,14 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
 	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
 	if ( sc ) {
 		// remove locks from lock table:
-		sc->clear();
+		sc->clearLocks();
 		//sc->m_collnum = newCollnum;
-		sc->reset();
-		mdelete ( sc, sizeof(SpiderColl),"nukecr2");
+		//sc->reset();
+		// this will put it on "death row" so it will be deleted
+		// once Msg5::m_waitingForList/Merge is NULL
+		deleteSpiderColl ( sc );
+		//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
+		//delete ( sc );
 		cr->m_spiderColl = NULL;
 	}
@@ -925,8 +946,19 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
 	// reset spider info
 	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
 	if ( sc ) {
-		sc->clear();
-		sc->m_collnum = newCollnum;
+		// remove locks from lock table:
+		sc->clearLocks();
+		// don't do this anymore, just nuke it in case
+		// m_populatingDoledb was true etc. there are too many
+		// flags to worry about
+		//sc->m_collnum = newCollnum;
+		//sc->reset();
+		// this will put it on "death row" so it will be deleted
+		// once Msg5::m_waitingForList/Merge is NULL
+		deleteSpiderColl ( sc );
+		//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
+		//delete ( sc );
+		cr->m_spiderColl = NULL;
 	}
 	// reset spider round
@@ -1903,6 +1935,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
 		// just turn off spidering. if we were to set priority to
 		// filtered it would be removed from index!
 		m_spidersEnabled [i] = 0;
+		// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
+		// which has been obsoleted, but we are running old code now!
+		m_spiderDiffbotApiUrl[i].set ( api );
 		i++;
 	}
 	// if collectiverespiderfreq is 0 or less then do not RE-spider
@@ -1916,6 +1951,9 @@
 		// just turn off spidering. if we were to set priority to
 		// filtered it would be removed from index!
 		m_spidersEnabled [i] = 0;
+		// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
+		// which has been obsoleted, but we are running old code now!
+		m_spiderDiffbotApiUrl[i].set ( api );
 		i++;
 	}
diff --git a/Collectiondb.h b/Collectiondb.h
index 9ae29e2e..043959bf 100644
--- a/Collectiondb.h
+++ b/Collectiondb.h
@@ -126,6 +126,8 @@ class Collectiondb {
 	//bool updateRec ( CollectionRec *newrec );
 	bool deleteRecs ( class HttpRequest *r ) ;
 
+	void deleteSpiderColl ( class SpiderColl *sc );
+
 	// returns false if blocked, true otherwise.
 	//bool resetColl ( char *coll , WaitEntry *we , bool purgeSeeds );
 	bool resetColl2 ( collnum_t oldCollnum,
diff --git a/DiskPageCache.cpp b/DiskPageCache.cpp
index 5f1be93a..1633d316 100644
--- a/DiskPageCache.cpp
+++ b/DiskPageCache.cpp
@@ -349,7 +349,7 @@ void DiskPageCache::getPages ( long vfd ,
 		// dumping more than what was end the tree because stuff was
 		// added to the tree while dumping!
 		log("db: pagecache: Caught get breach. "
-		    "ep=%li max=%li", ep,m_maxPagesInFile[vfd] );
+		    "ep=%li max=%li vfd=%li", ep,m_maxPagesInFile[vfd] ,vfd);
 		return;
 		//char *xx = NULL; *xx = 0;
 	}
diff --git a/DiskPageCache.h b/DiskPageCache.h
index ac637e09..0801f318 100644
--- a/DiskPageCache.h
+++ b/DiskPageCache.h
@@ -40,7 +40,8 @@
 #define MAX_PAGE_SETS 128
 
 // how many BigFiles can be using the same DiskPageCache?
-#define MAX_NUM_VFDS2 1024
+#include "File.h"
+#define MAX_NUM_VFDS2 MAX_NUM_VFDS
 
 extern void freeAllSharedMem ( long max );
 
diff --git a/File.h b/File.h
index c44c8f07..6138f3ab 100644
--- a/File.h
+++ b/File.h
@@ -21,7 +21,8 @@
 // . max # of VIRTUAL file descriptors
 // . man, chris has 958 files, lets crank it up from 2k to 5k
-#define MAX_NUM_VFDS (5*1024)
+// . boost up to 50,000 since we are hitting this limit with crawlbot
+#define MAX_NUM_VFDS (50*1024)
 
 #include     // for open/lseek
 #include     // for open
diff --git a/Json.cpp b/Json.cpp
index d4bda183..09355e35 100644
--- a/Json.cpp
+++ b/Json.cpp
@@ -95,6 +95,14 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , long niceness ) {
 	need += p - json;
 	// plus a \0 for the value and a \0 for the name of each jsonitem
 	need += 2;
+	// prevent cores for now
+	need += 10;
+	// . to prevent safebuf from reallocating do this
+	// . safeMemcpy() calls reserve(m_length+len) and reserves
+	//   tries to alloc m_length + (m_length+len) so since,
+	//   m_length+len should never be more than "need" we need to
+	//   double up here
+	need *= 2;
 	// this should be enough
 	if ( ! m_sb.reserve ( need ) ) return NULL;
 	// for testing if we realloc
diff --git a/Linkdb.cpp b/Linkdb.cpp
index ac18417c..45a9c00a 100644
--- a/Linkdb.cpp
+++ b/Linkdb.cpp
@@ -109,6 +109,8 @@ bool Linkdb::init ( ) {
 	long maxTreeNodes = maxTreeMem /(sizeof(key224_t)+16);
 	// disk page cache mem, 100MB on gk0 now
 	long pcmem = 0; // g_conf.m_linkdbMaxDiskPageCacheMem;
+	// give it a little
+	pcmem = 10000000; // 10MB
 	// keep this low if we are the tmp cluster
 	//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
 	// TODO: would be nice to just do page caching on the satellite files;
diff --git a/Loop.cpp b/Loop.cpp
index 6a5e2456..6d271e5d 100644
--- a/Loop.cpp
+++ b/Loop.cpp
@@ -1791,6 +1791,11 @@ void Loop::quickPoll(long niceness, const char* caller, long lineno) {
 	if(m_inQuickPoll) {
 		log(LOG_WARN,
 		    "admin: tried to quickpoll from inside quickpoll");
+		// this happens when handleRequest3f is called from
+		// a quickpoll and it deletes a collection and BigFile::close
+		// calls ThreadQueue::removeThreads and Msg3::doneScanning()
+		// has niceness 2 and calls quickpoll again!
+		return;
 		//if(g_conf.m_quickpollCoreOnError) {
 			char*xx=NULL;*xx=0;
 		//	}
diff --git a/Msg3.cpp b/Msg3.cpp
index 266c3d3c..8fb499aa 100644
--- a/Msg3.cpp
+++ b/Msg3.cpp
@@ -930,9 +930,12 @@ bool Msg3::doneScanning ( ) {
 				  ff->getFilename() ,
 				  m_niceness ) ) {
 			log("net: Had error while constraining list read from "
-			    "%s: %s. This is likely caused by corrupted "
+			    "%s: %s%s. vfd=%li parts=%li. "
+			    "This is likely caused by corrupted "
 			    "data on disk.",
-			    mstrerror(g_errno), ff->getFilename());
+			    mstrerror(g_errno), ff->m_dir ,
+			    ff->getFilename(), ff->m_vfd ,
+			    (long)ff->m_numParts );
 		}
 	}
diff --git a/Msg5.cpp b/Msg5.cpp
index 831e573d..00c61b67 100644
--- a/Msg5.cpp
+++ b/Msg5.cpp
@@ -22,6 +22,7 @@ long g_numCorrupt = 0;
 
 Msg5::Msg5() {
 	m_waitingForList = false;
+	m_waitingForMerge = false;
 	m_numListPtrs = 0;
 	m_mergeLists = true;
 	reset();
@@ -33,7 +34,7 @@ Msg5::~Msg5() {
 
 // frees m_treeList
 void Msg5::reset() {
-	if ( m_waitingForList ) {
+	if ( m_waitingForList || m_waitingForMerge ) {
 		log("disk: Trying to reset a class waiting for a reply.");
 		// might being doing an urgent exit (mainShutdown(1)) or
 		// g_process.shutdown(), so do not core here
@@ -1365,6 +1366,8 @@ bool Msg5::gotList2 ( ) {
 	// skip it for now
 	//goto skipThread;
 
+	m_waitingForMerge = true;
+
 	// . if size is big, make a thread
 	// . let's always make niceness 0 since it wasn't being very
 	//   aggressive before
@@ -1374,6 +1377,9 @@
 			       threadDoneWrapper ,
 			       mergeListsWrapper_r ) )
 		return false;
+
+	m_waitingForMerge = false;
+
 	// thread creation failed
 	if ( ! g_threads.areThreadsDisabled() )
 		log(LOG_INFO,
@@ -1704,6 +1710,8 @@ void Msg5::mergeLists_r ( ) {
 // . we are left with an empty list
 bool Msg5::doneMerging ( ) {
 
+	m_waitingForMerge = false;
+
 	// get base, returns NULL and sets g_errno to ENOCOLLREC on error
 	RdbBase *base;
 	if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
@@ -1722,8 +1730,8 @@ bool Msg5::doneMerging ( ) {
 	// our first merge
 	if ( m_hadCorruption ) {
 		// log it here, cuz logging in thread doesn't work too well
-		log("net: Encountered a corrupt list in rdb=%s",
-		    base->m_dbname);
+		log("net: Encountered a corrupt list in rdb=%s coll=%s",
+		    base->m_dbname,m_coll);
 		// remove error condition, we removed the bad data in thread
 		m_hadCorruption = false;
diff --git a/Msg5.h b/Msg5.h
index 3427b553..34dfb699 100644
--- a/Msg5.h
+++ b/Msg5.h
@@ -292,6 +292,7 @@ class Msg5 {
 	bool m_mergeLists;
 
 	char m_waitingForList;
+	char m_waitingForMerge; // actually part of a different algo than m_waitingForList!
 
 	unsigned long long m_waitingKey;
diff --git a/PageAddUrl.cpp b/PageAddUrl.cpp
index b94d661b..34886256 100644
--- a/PageAddUrl.cpp
+++ b/PageAddUrl.cpp
@@ -254,6 +254,10 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
 	long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
 	// make one up, like we do in PageReindex.cpp
 	long firstIp = (probDocId & 0xffffffff);
+
+	// avoid ips of 0 or -1
+	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
+
 	// . now fill it up
 	// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
 	//   m_siteNumInlinks,...)
diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp
index 09e47b90..b2dceeab 100644
--- a/PageCrawlBot.cpp
+++ b/PageCrawlBot.cpp
@@ -3841,6 +3841,9 @@ bool getSpiderRequestMetaList ( char *doc ,
 		SpiderRequest sreq;
 		sreq.reset();
 		sreq.m_firstIp = url.getHostHash32(); // fakeip!
+		// avoid ips of 0 or -1
+		if ( sreq.m_firstIp == 0 || sreq.m_firstIp == -1 )
+			sreq.m_firstIp = 1;
 		sreq.m_hostHash32 = url.getHostHash32();
 		sreq.m_domHash32 = url.getDomainHash32();
 		sreq.m_siteHash32 = url.getHostHash32();
diff --git a/PageStats.cpp b/PageStats.cpp
index a3deada1..feb1ccef 100644
--- a/PageStats.cpp
+++ b/PageStats.cpp
@@ -488,7 +488,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
 		  "Kernel Version%s\n"
 		  //"Gigablast Version%s %s\n"
 		  "Parsing Inconsistencies%li\n"
-		  "Indexdb Splits%li\n"
+		  "Indexdb Shards%li\n"
 		  //"Fully Split%li\n"
 		  //"Tfndb Extension Bits%li\n"
 		  "\n"
diff --git a/Parms.cpp b/Parms.cpp
index a0d91670..a5e22477 100644
--- a/Parms.cpp
+++ b/Parms.cpp
@@ -17531,7 +17531,12 @@ bool Parms::doParmSendingLoop ( ) {
 			       NULL, // retslot
 			       (void *)h->m_hostId , // state
 			       gotParmReplyWrapper ,
-			       4 ) ) { // timeout secs
+			       30 , // timeout secs
+			       -1 , // backoff
+			       -1 , // maxwait
+			       NULL , // replybuf
+			       0 , // replybufmaxsize
+			       0 ) ) { // niceness
 			log("parms: faild to send: %s",mstrerror(g_errno));
 			continue;
 		}
diff --git a/Posdb.cpp b/Posdb.cpp
index 510b88e0..135743cc 100644
--- a/Posdb.cpp
+++ b/Posdb.cpp
@@ -122,12 +122,12 @@ bool Posdb::init ( ) {
 	long nodeSize = (sizeof(key144_t)+12+4) + sizeof(collnum_t);
 	long maxTreeNodes = maxTreeMem / nodeSize ;
 
-	//long pageSize = GB_INDEXDB_PAGE_SIZE;
+	long pageSize = GB_INDEXDB_PAGE_SIZE;
 
 	// we now use a disk page cache as opposed to the
 	// old rec cache. i am trying to do away with the Rdb::m_cache rec
 	// cache in favor of cleverly used disk page caches, because
 	// the rec caches are not real-time and get stale.
-	long pcmem = 50000000; // 50MB
+	long pcmem = 30000000; // 30MB
 	// make sure at least 30MB
 	//if ( pcmem < 30000000 ) pcmem = 30000000;
 	// keep this low if we are the tmp cluster, 30MB
@@ -136,12 +136,12 @@ bool Posdb::init ( ) {
 	// prevent swapping
 	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
 	// save more mem!!! allow os to cache it i guess...
-	pcmem = 0;
+	// let's go back to using it
+	//pcmem = 0;
 	// disable for now... for rebuild
 	//pcmem = 0;
 	// . init the page cache
 	// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
-	/*
 	if ( ! m_pc.init ( "posdb",
 			   RDB_POSDB,
 			   pcmem ,
@@ -149,7 +149,6 @@
 			   true , // use RAM disk?
 			   false )) // minimize disk seeks?
 		return log("db: Posdb init failed.");
-	*/
 
 	// . set our own internal rdb
 	// . max disk space for bin tree is same as maxTreeMem so that we
@@ -174,7 +173,7 @@
 		// newer systems have tons of ram to use
 		// for their disk page cache. it is slower than
 		// ours but the new engine has much slower things
-		NULL,//&m_pc ,
+		&m_pc ,
 		false , // istitledb?
 		false , // preloaddiskpagecache?
 		sizeof(key144_t)
diff --git a/Process.cpp b/Process.cpp
index 19c6ae55..22259f30 100644
--- a/Process.cpp
+++ b/Process.cpp
@@ -104,7 +104,7 @@ char *g_files[] = {
 	"antiword" , // msword
 	"pdftohtml", // pdf
 	"pstotext" , // postscript
-	"ppthtml" , // powerpoint
+	//"ppthtml" , // powerpoint
 
 	//"dict/unifiedDict",
 	//"dict/thesaurus.txt",
diff --git a/RdbBase.cpp b/RdbBase.cpp
index d88c7dc6..5f34202a 100644
--- a/RdbBase.cpp
+++ b/RdbBase.cpp
@@ -360,6 +360,7 @@ bool RdbBase::init ( char *dir ,
 	// now fill up the page cache
 	// preload:
 	if ( ! preloadDiskPageCache ) return true;
+	if ( ! m_pc ) return true;
 	char buf [ 512000 ];
 	long total = m_pc->getMemMax();
 	log(LOG_DEBUG,"db: %s: Preloading page cache. Total mem to use =%lu",
diff --git a/RdbDump.cpp b/RdbDump.cpp
index 686892e1..b76222a4 100644
--- a/RdbDump.cpp
+++ b/RdbDump.cpp
@@ -204,11 +204,14 @@ void RdbDump::doneDumping ( ) {
 	    m_totalPosDumped ,
 	    m_totalNegDumped ,
 	    m_totalPosDumped + m_totalNegDumped );
-	// map verify
-	log("db: map # pos=%lli neg=%lli",
-	    m_map->getNumPositiveRecs(),
-	    m_map->getNumNegativeRecs()
-	    );
+	// . map verify
+	// . if continueDumping called us with no collectionrec, it got
+	//   deleted so RdbBase::m_map is nuked too i guess
+	if ( saved != ENOCOLLREC )
+		log("db: map # pos=%lli neg=%lli",
+		    m_map->getNumPositiveRecs(),
+		    m_map->getNumNegativeRecs()
+		    );
 	// free the list's memory
 	if ( m_list ) m_list->freeList();
@@ -1015,11 +1018,16 @@ void RdbDump::continueDumping() {
 	// if someone reset/deleted the collection we were dumping...
 	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
-	if ( ! cr ) g_errno = ENOCOLLREC;
-
+	if ( ! cr ) {
+		g_errno = ENOCOLLREC;
+		// m_file is invalid if collrec got nuked because so did
+		// the Rdbbase which has the files
+		log("db: continue dumping lost collection");
+	}
 	// bitch about errors
-	if (g_errno)log("db: Dump to %s had error writing: %s.",
-			m_file->getFilename(),mstrerror(g_errno));
+	else if (g_errno)log("db: Dump to %s had error writing: %s.",
+			     m_file->getFilename(),mstrerror(g_errno));
+
 	// go back now if we were NOT dumping a tree
 	if ( ! (m_tree || m_buckets) ) {
 		m_isDumping = false;
diff --git a/RdbMap.cpp b/RdbMap.cpp
index e4cb34e5..c4e15053 100644
--- a/RdbMap.cpp
+++ b/RdbMap.cpp
@@ -6,6 +6,8 @@
 
 RdbMap::RdbMap() {
 	m_numSegments = 0;
+	m_numSegmentPtrs = 0;
+	m_numSegmentOffs = 0;
 	reset ( );
 }
 
@@ -61,6 +63,14 @@ void RdbMap::reset ( ) {
 		m_keys [i] = NULL;
 		m_offsets[i] = NULL;
 	}
+
+	// the ptrs themselves are now a dynamic array to save mem
+	// when we have thousands of collections
+	mfree(m_keys,m_numSegmentPtrs*sizeof(char *),"MapPtrs");
+	mfree(m_offsets,m_numSegmentOffs*sizeof(short *),"MapPtrs");
+	m_numSegmentPtrs = 0;
+	m_numSegmentOffs = 0;
+
 	m_needToWrite = false;
 	m_fileStartOffset = 0LL;
 	m_numSegments = 0;
@@ -1192,6 +1202,40 @@ long long RdbMap::getMemAlloced ( ) {
 	return (long long)m_numSegments * space;
 }
 
+bool RdbMap::addSegmentPtr ( long n ) {
+	// realloc
+	if ( n >= m_numSegmentPtrs ) {
+		char **k;
+		long nn = (long)((float)n * 1.20) + 1;
+		k = (char **) mrealloc (m_keys,
+					m_numSegmentPtrs * sizeof(char *) ,
+					nn * sizeof(char *) ,
+					"MapPtrs" );
+		// failed?
+		if ( ! k ) return false;
+		// succeeded
+		m_numSegmentPtrs = nn;
+		m_keys = k;
+	}
+
+	// try offsets
+	if ( n >= m_numSegmentOffs ) {
+		short **o;
+		long nn = (long)((float)n * 1.20) + 1;
+		o = (short **) mrealloc (m_offsets,
+					 m_numSegmentOffs * sizeof(short *) ,
+					 nn * sizeof(short *) ,
+					 "MapPtrs" );
+		// failed?
+		if ( ! o ) return false;
+		// succeeded
+		m_numSegmentOffs = nn;
+		m_offsets = o;
+	}
+	return true;
+}
+
+
 // . add "n" segments
 // . returns false and sets g_errno on error
 bool RdbMap::addSegment ( ) {
@@ -1202,8 +1246,17 @@ bool RdbMap::addSegment ( ) {
 	long n = m_numSegments;
 	long pps = PAGES_PER_SEGMENT;
 	// ensure doesn't exceed the max
-	if ( n >= MAX_SEGMENTS ) return log("db: Mapped file is "
-					    "too big. Critical error.");
+	//if ( n >= MAX_SEGMENTS ) return log("db: Mapped file is "
+	//				      "too big. Critical error.");
+
+	// the array of up to MAX_SEGMENT pool ptrs is now dynamic too!
+	// because diffbot uses thousands of collections, this will save
+	// over 1GB of ram!
+	if ( ! addSegmentPtr ( n ) )
+		return log("db: Failed to allocate memory for adding seg ptr "
+			   "for map file %s.", m_file.getFilename());
+
+
 	// alloc spaces for each key segment
 	// allocate new segments now
 	//m_keys[n] = (key_t *) mmalloc ( ks * pps , "RdbMap" );
diff --git a/RdbMap.h b/RdbMap.h
index beeffb52..0d703c7e 100644
--- a/RdbMap.h
+++ b/RdbMap.h
@@ -59,7 +59,7 @@
 #define PAGES_PER_SEGMENT (2*1024)
 #define PAGES_PER_SEG (PAGES_PER_SEGMENT)
 // MAX_SEGMENTS of 16*1024 allows for 32 million pages = 256gigs of disk data
-#define MAX_SEGMENTS (16*1024)
+//#define MAX_SEGMENTS (16*1024)
 
 class RdbMap {
 
@@ -284,6 +284,8 @@ class RdbMap {
 	// . used to grow the map, too
 	//bool setMapSize ( long maxNumPages );
 
+	bool addSegmentPtr ( long n ) ;
+
 	// called by setMapSize() to increase the # of segments
 	bool addSegment ( ) ;
 
@@ -328,10 +330,17 @@ class RdbMap {
 	// . IMPORTANT: if growing m_pageSize might need to change m_offsets
 	//   from short to long
 	//key_t *m_keys [ MAX_SEGMENTS ];
-	char *m_keys [ MAX_SEGMENTS ];
+	//char *m_keys [ MAX_SEGMENTS ];
+	char **m_keys;
+	long m_numSegmentPtrs;
 	//key96_t **m_keys96; // set to m_keys
 	//key128_t **m_keys128; // set to m_keys
-	short *m_offsets [ MAX_SEGMENTS ];
+
+	//short *m_offsets [ MAX_SEGMENTS ];
+	short **m_offsets;
+	long m_numSegmentOffs;
+
+	// number of valid pages in the map.
 	long m_numPages;
diff --git a/Spider.cpp b/Spider.cpp
index ceb452ec..b5ffc198 100644
--- a/Spider.cpp
+++ b/Spider.cpp
@@ -537,7 +537,6 @@ bool Spiderdb::init ( ) {
 	long pcmem = 20000000;//g_conf.m_spiderdbMaxDiskPageCacheMem;
 	// keep this low if we are the tmp cluster
 	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
-
 	// key parser checks
 	//long ip = 0x1234;
 	char priority = 12;
@@ -571,7 +570,7 @@ bool Spiderdb::init ( ) {
 			   RDB_SPIDERDB ,
 			   pcmem ,
 			   pageSize ,
-			   true , // use shared mem?
+			   false , // use shared mem?
 			   false )) // minimizeDiskSeeks?
 		return log(LOG_INIT,"spiderdb: Init failed.");
 
@@ -1014,9 +1013,11 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
 /////////////////////////
 
 SpiderColl::SpiderColl () {
+	m_deleteMyself = false;
 	m_gettingList1 = false;
 	m_gettingList2 = false;
 	m_lastScanTime = 0;
+	m_isPopulating = false;
 	m_numAdded = 0;
 	m_numBytesScanned = 0;
 	m_lastPrintCount = 0;
@@ -1488,7 +1489,7 @@ SpiderColl::~SpiderColl () {
 }
 
 // we call this now instead of reset when Collectiondb::resetColl() is used
-void SpiderColl::clear ( ) {
+void SpiderColl::clearLocks ( ) {
 
 	// remove locks from locktable for all spiders out i guess
 	HashTableX *ht = &g_spiderLoop.m_lockTable;
@@ -1508,6 +1509,7 @@ void SpiderColl::clear ( ) {
 		goto top;
 	}
 
+	/*
 	// reset these for SpiderLoop;
 	m_nextDoledbKey.setMin();
 	m_didRound = false;
@@ -1541,6 +1543,7 @@ void SpiderColl::clear ( ) {
 	// assume the whole thing is not empty
 	m_allDoledbPrioritiesEmpty = 0;//false;
 	m_lastEmptyCheck = 0;
+	*/
 }
 
 void SpiderColl::reset ( ) {
@@ -1554,6 +1557,8 @@ void SpiderColl::reset ( ) {
 	m_twinDied = false;
 	m_lastUrlFiltersUpdate = 0;
 
+	m_isPopulating = false;
+
 	char *coll = "unknown";
 	if ( m_coll[0] ) coll = m_coll;
 	log(LOG_DEBUG,"spider: resetting spider cache coll=%s",coll);
@@ -2251,6 +2256,7 @@ bool SpiderColl::addToWaitingTree ( uint64_t spiderTimeMS , long firstIp ,
 	// what is this?
 	if ( firstIp == 0 || firstIp == -1 ) {
 		log("spider: got ip of %s. wtf?",iptoa(firstIp) );
+		return false;
 		char *xx=NULL; *xx=0;
 	}
@@ -2447,6 +2453,11 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 	if ( ! m_waitingTreeNeedsRebuild ) return;
 	// a double call? can happen if list read is slow...
 	if ( m_gettingList2 ) return;
+
+	// . borrow a msg5
+	// . if none available just return, we will be called again
+	//   by the sleep/timer function
+
 	// read in a replacement SpiderRequest to add to doledb from
 	// this ip
 	// . get the list of spiderdb records
@@ -2460,7 +2471,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 	// flag it
 	m_gettingList2 = true;
 	// make state
-	long state2 = (long)m_cr->m_collnum;
+	//long state2 = (long)m_cr->m_collnum;
 	// read the list from local disk
 	if ( ! m_msg5b.getList ( RDB_SPIDERDB ,
 				 m_cr->m_coll ,
@@ -2473,7 +2484,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
 				 0 , // max cache age
 				 0 , // startFileNum
 				 -1 , // numFiles (all)
-				 (void *)state2,//this//state
+				 this,//(void *)state2,//this//state
 				 gotSpiderdbListWrapper2 ,
 				 MAX_NICENESS , // niceness
 				 true )) // do error correct?
@@ -2774,20 +2785,35 @@ void SpiderColl::populateDoledbFromWaitingTree ( bool reentry ) {
 	// calls this function again with re-entry set to true
 	if ( ! scanSpiderdb ( true ) ) return;
 	// oom error? i've seen this happen and we end up locking up!
-	if ( g_errno ) return;
+	if ( g_errno ) {
+		log("spider: scandspiderdb: %s",mstrerror(g_errno));
+		m_isPopulating = false;
+		return;
+	}
 	// try more
 	goto loop;
 }
 
 static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
 
-	collnum_t collnum = (collnum_t)(long)state;
+	//collnum_t collnum = (collnum_t)(long)state;
+	//SpiderColl *THIS = g_spiderCache.getSpiderColl(collnum);
+	//if ( ! THIS ) {
+	//	log("spider: lost1 collnum %li while scanning spiderdb",
+	//	    (long)collnum);
+	//	return;
+	//}
 
-	SpiderColl *THIS = g_spiderCache.getSpiderColl(collnum);
+	SpiderColl *THIS = (SpiderColl *)state;
 
-	if ( ! THIS ) {
-		log("spider: lost1 collnum %li while scanning spiderdb",
-		    (long)collnum);
+	// did our collection rec get deleted? since we were doing a read
+	// the SpiderColl will have been preserved in that case but its
+	// m_deleteMyself flag will have been set.
+	if ( THIS->m_deleteMyself &&
+	     ! THIS->m_msg5b.m_waitingForMerge &&
+	     ! THIS->m_msg5b.m_waitingForList ) {
+		mdelete ( THIS , sizeof(SpiderColl),"postdel1");
+		delete ( THIS );
 		return;
 	}
 
@@ -2800,6 +2826,10 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
 	// . finish processing the list we read now
 	// . if that blocks, it will call doledWrapper
 	if ( ! THIS->scanSpiderdb ( false ) ) return;
+
+	// no longer populating doledb. we also set to false in doledwrapper
+	//THIS->m_isPopulating = false;
+
 	// . otherwise, do more from tree
 	// . re-entry is true because we just got the msg5 reply
 	THIS->populateDoledbFromWaitingTree ( true );
@@ -2807,16 +2837,29 @@ static void gotSpiderdbListWrapper2( void *state , RdbList *list , Msg5 *msg5){
 
-	collnum_t collnum = (collnum_t)(long)state;
+	//collnum_t collnum = (collnum_t)(long)state;
+	//SpiderColl *THIS = g_spiderCache.getSpiderColl(collnum);
+	//if ( ! THIS ) {
+	//	log("spider: lost2 collnum %li while scanning spiderdb",
+	//	    (long)collnum);
+	//	return;
+	//}
 
-	SpiderColl *THIS = g_spiderCache.getSpiderColl(collnum);
-	if ( ! THIS ) {
-		log("spider: lost2 collnum %li while scanning spiderdb",
-		    (long)collnum);
+	SpiderColl *THIS = (SpiderColl *)state;
+
+	// did our collection rec get deleted? since we were doing a read
+	// the SpiderColl will have been preserved in that case but its
+	// m_deleteMyself flag will have been set.
+	if ( THIS->m_deleteMyself &&
+	     ! THIS->m_msg5.m_waitingForMerge &&
+	     ! THIS->m_msg5.m_waitingForList ) {
+		mdelete ( THIS , sizeof(SpiderColl),"postdel1");
+		delete ( THIS );
 		return;
 	}
 
+	//SpiderColl *THIS = (SpiderColl *)state;
 
 	// re-entry is true because we just got the msg5 reply
 	THIS->populateWaitingTreeFromSpiderdb ( true );
@@ -2829,6 +2872,10 @@ static void doledWrapper ( void *state ) {
 	// msg4 is available again
 	THIS->m_msg4Avail = true;
 
+	// no longer populating doledb. we also set to false in
+	// gotSpiderListWrapper
+	//THIS->m_isPopulating = false;
+
 	long long now = gettimeofdayInMilliseconds();
 	long long diff = now - THIS->m_msg4Start;
 	// we add recs to doledb using msg1 to keep things fast because
@@ -2969,7 +3016,7 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
 	// flag it
 	m_gettingList1 = true;
 	// make state
-	long state2 = (long)m_cr->m_collnum;
+	//long state2 = (long)m_cr->m_collnum;
 	// . read the list from local disk
 	// . if a niceness 0 intersect thread is taking a LONG time
 	//   then this will not complete in a long time and we
@@ -2987,7 +3034,7 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
 				0 , // max cache age
 				0 , // startFileNum
 				-1 , // numFiles (all)
-				(void *)state2,//this,//state
+				this,//(void *)state2,//this,//state
 				gotSpiderdbListWrapper ,
 				MAX_NICENESS , // niceness
 				true )) // do error correct?
@@ -9346,6 +9393,10 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
 		     to_lower_a(ext[2]) == 'm' &&
 		     to_lower_a(ext[3]) == 'v' )
 			goto gotOne;
+		if ( to_lower_a(ext[1]) == 'w' &&
+		     to_lower_a(ext[2]) == 'a' &&
+		     to_lower_a(ext[3]) == 'v' )
+			goto gotOne;
 		if ( to_lower_a(ext[1]) == 'j' &&
 		     to_lower_a(ext[2]) == 'p' &&
 		     to_lower_a(ext[3]) == 'g' )
 			goto gotOne;
diff --git a/Spider.h b/Spider.h
index 00cb2744..ab95f7ed 100644
--- a/Spider.h
+++ b/Spider.h
@@ -981,7 +981,7 @@ class SpiderColl {
 	~SpiderColl ( );
 	SpiderColl ( ) ;
 
-	void clear();
+	void clearLocks();
 
 	// called by main.cpp on exit to free memory
 	void reset();
@@ -1125,6 +1125,8 @@ class SpiderColl {
 	long m_scanningIp;
 	bool m_gotNewRequestsForScanningIp;
 
+	char m_deleteMyself;
+
 	// start key for reading doledb
 	key_t m_msg5StartKey;
diff --git a/Threads.cpp b/Threads.cpp
index fb4628b8..1e0657a5 100644
--- a/Threads.cpp
+++ b/Threads.cpp
@@ -284,7 +284,7 @@ bool Threads::init ( ) {
 	//   with high niceness cuz it would hold up high priority ones!
 	// . TODO: is there a better way? cancel it when UdpServer calls
 	//   Threads::suspendLowPriorityThreads() ?
-	if ( ! g_threads.registerType ( MERGE_THREAD , 2/*maxThreads*/,100) )
+	if ( ! g_threads.registerType ( MERGE_THREAD , 2/*maxThreads*/,1000) )
 		return log("thread: Failed to register thread type." );
 	// will raising this from 1 to 2 make it faster too?
 	// i raised since global specs new servers have 2 (hyperthreaded?) cpus
@@ -1120,7 +1120,7 @@ void makeCallback ( ThreadEntry *t ) {
 	// then set it
 	if ( t->m_niceness >= 1 ) g_niceness = 1;
 	else g_niceness = 0;
-	
+
 	t->m_callback ( t->m_state , t );
 
 	// time it?
diff --git a/Titledb.cpp b/Titledb.cpp
index a20788d2..5a73cf7b 100644
--- a/Titledb.cpp
+++ b/Titledb.cpp
@@ -51,20 +51,18 @@ bool Titledb::init ( ) {
 	// . just hard-code 30MB for now
 	long pcmem = 30000000; // = g_conf.m_titledbMaxDiskPageCacheMem;
 	// fuck that we need all the mem!
-	pcmem = 0;
+	//pcmem = 0;
 	// do not use any page cache if doing tmp cluster in order to
 	// prevent swapping
 	if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
-	//long pageSize = GB_INDEXDB_PAGE_SIZE;
+	long pageSize = GB_INDEXDB_PAGE_SIZE;
 	// init the page cache
 	// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
-	/*
 	if ( ! m_pc.init ( "titledb",
 			   RDB_TITLEDB,
 			   pcmem ,
 			   pageSize ) )
 		return log("db: Titledb init failed.");
-	*/
 
 	// each entry in the cache is usually just a single record, no lists
 	//long maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);
@@ -90,7 +88,7 @@ bool Titledb::init ( ) {
 		   0,//maxCacheNodes ,
 		   false ,// half keys?
 		   false ,// g_conf.m_titledbSav
-		   NULL,//&m_pc , // page cache ptr
+		   &m_pc , // page cache ptr
 		   true ) )// is titledb?
 		return false;
 	return true;
diff --git a/ppthtml b/ppthtml
deleted file mode 100755
index 57bd055a..00000000
Binary files a/ppthtml and /dev/null differ
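
Note on the Collectiondb/Spider changes above: SpiderColl teardown now goes through a deferred "death row". Collectiondb::deleteSpiderColl() frees the object immediately only when neither of its Msg5 objects has a read (m_waitingForList) or merge (m_waitingForMerge) outstanding; otherwise it just sets m_deleteMyself, and the gotSpiderdbListWrapper/gotSpiderdbListWrapper2 callbacks perform the mdelete/delete once the last in-flight operation completes. The sketch below is a minimal, standalone illustration of that pattern only; Scanner, requestDelete() and onIoDone() are hypothetical names for this example, not part of the codebase.

    // deferred-deletion sketch (illustrative names, C++11)
    #include <cstdio>

    struct Scanner {
            bool m_waitingForList  = false; // a disk read is in flight
            bool m_waitingForMerge = false; // a merge thread is in flight
            bool m_deleteMyself    = false; // set when the owner wants us gone
            bool busy() const { return m_waitingForList || m_waitingForMerge; }
    };

    // owner side: analogous to Collectiondb::deleteSpiderColl()
    void requestDelete ( Scanner *s ) {
            s->m_deleteMyself = true;
            // nothing outstanding? safe to free right now
            if ( ! s->busy() ) { delete s; return; }
            // otherwise the completion callback below owns the free
    }

    // completion side: analogous to the gotSpiderdbListWrapper callbacks
    void onIoDone ( Scanner *s ) {
            s->m_waitingForList  = false;
            s->m_waitingForMerge = false;
            // we were put on "death row" while the i/o was in flight
            if ( s->m_deleteMyself && ! s->busy() ) { delete s; return; }
            // ... otherwise keep scanning as usual ...
    }

    int main ( ) {
            Scanner *s = new Scanner();
            s->m_waitingForList = true; // pretend a read is outstanding
            requestDelete ( s );        // deferred: flag set, no free yet
            onIoDone ( s );             // read completes; object freed here
            printf("done\n");
            return 0;
    }

Both paths re-check the in-flight flags before freeing; that is why deleteSpiderColl() and the wrapper callbacks each test the Msg5 m_waitingForList/m_waitingForMerge flags before calling mdelete(), so neither side can free the SpiderColl while the other still holds a pointer to it.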