tons of changes from live github on neo.

lots of core fixes.
took out the ppthtml powerpoint converter; it hangs.
dynamic RdbMap segment arrays to save memory per coll.
fixed disk page cache logic and brought it back.
Matt Wells 2014-01-17 21:01:43 -08:00
parent 724af442d4
commit 4e803210ee
26 changed files with 264 additions and 62 deletions


@ -636,6 +636,23 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
}
*/
// if there is an outstanding disk read thread or merge thread then
// Spider.cpp will handle the delete in the callback.
void Collectiondb::deleteSpiderColl ( SpiderColl *sc ) {
sc->m_deleteMyself = true;
// if not currently being accessed nuke it now
if ( ! sc->m_msg5.m_waitingForList &&
! sc->m_msg5.m_waitingForMerge &&
! sc->m_msg5b.m_waitingForList &&
! sc->m_msg5b.m_waitingForMerge ) {
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
delete ( sc );
return;
}
}
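The other half of this deferred ("death row") delete lives in the Msg5 read callbacks further down in this diff: when a list read or merge finishes, the callback re-checks m_deleteMyself and the waiting flags and frees the SpiderColl if nothing still references it. A minimal sketch of that callback-side check (the helper name is hypothetical; the real checks are inlined in gotSpiderdbListWrapper/gotSpiderdbListWrapper2 below):

// sketch: call at the end of a Msg5 callback to finish a deferred delete
static void finishDeferredDelete ( SpiderColl *sc ) {
	// only SpiderColls that deleteSpiderColl() flagged
	if ( ! sc->m_deleteMyself ) return;
	// still referenced by an outstanding read or merge? let the
	// next callback do the delete instead
	if ( sc->m_msg5.m_waitingForList   ) return;
	if ( sc->m_msg5.m_waitingForMerge  ) return;
	if ( sc->m_msg5b.m_waitingForList  ) return;
	if ( sc->m_msg5b.m_waitingForMerge ) return;
	mdelete ( sc , sizeof(SpiderColl) , "postdel1" );
	delete ( sc );
}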
bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
// do not allow this if in repair mode
if ( g_repairMode > 0 ) {
@ -723,10 +740,14 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(collnum);
if ( sc ) {
// remove locks from lock table:
sc->clear();
sc->clearLocks();
//sc->m_collnum = newCollnum;
sc->reset();
mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
deleteSpiderColl ( sc );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
}
@ -925,8 +946,19 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
sc->clear();
sc->m_collnum = newCollnum;
// remove locks from lock table:
sc->clearLocks();
// don't do this anymore, just nuke it in case
// m_populatingDoledb was true etc. there are too many
// flags to worry about
//sc->m_collnum = newCollnum;
//sc->reset();
// this will put it on "death row" so it will be deleted
// once Msg5::m_waitingForList/Merge is NULL
deleteSpiderColl ( sc );
//mdelete ( sc, sizeof(SpiderColl),"nukecr2");
//delete ( sc );
cr->m_spiderColl = NULL;
}
// reset spider round
@ -1903,6 +1935,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
m_spidersEnabled [i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
m_spiderDiffbotApiUrl[i].set ( api );
i++;
}
// if collectiverespiderfreq is 0 or less then do not RE-spider
@ -1916,6 +1951,9 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
m_spidersEnabled [i] = 0;
// temp hack so it processes in xmldoc.cpp::getUrlFilterNum()
// which has been obsoleted, but we are running old code now!
m_spiderDiffbotApiUrl[i].set ( api );
i++;
}


@ -126,6 +126,8 @@ class Collectiondb {
//bool updateRec ( CollectionRec *newrec );
bool deleteRecs ( class HttpRequest *r ) ;
void deleteSpiderColl ( class SpiderColl *sc );
// returns false if blocked, true otherwise.
//bool resetColl ( char *coll , WaitEntry *we , bool purgeSeeds );
bool resetColl2 ( collnum_t oldCollnum,


@ -349,7 +349,7 @@ void DiskPageCache::getPages ( long vfd ,
// dumping more than what was end the tree because stuff was
// added to the tree while dumping!
log("db: pagecache: Caught get breach. "
"ep=%li max=%li", ep,m_maxPagesInFile[vfd] );
"ep=%li max=%li vfd=%li", ep,m_maxPagesInFile[vfd] ,vfd);
return;
//char *xx = NULL; *xx = 0;
}


@ -40,7 +40,8 @@
#define MAX_PAGE_SETS 128
// how many BigFiles can be using the same DiskPageCache?
#define MAX_NUM_VFDS2 1024
#include "File.h"
#define MAX_NUM_VFDS2 MAX_NUM_VFDS
extern void freeAllSharedMem ( long max );

File.h

@ -21,7 +21,8 @@
// . max # of VIRTUAL file descriptors
// . man, chris has 958 files, lets crank it up from 2k to 5k
#define MAX_NUM_VFDS (5*1024)
// . boost up to 50,000 since we are hitting this limit with crawlbot
#define MAX_NUM_VFDS (50*1024)
#include <sys/types.h> // for open/lseek
#include <sys/stat.h> // for open


@ -95,6 +95,14 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , long niceness ) {
need += p - json;
// plus a \0 for the value and a \0 for the name of each jsonitem
need += 2;
// prevent cores for now
need += 10;
// . to prevent safebuf from reallocating do this
// . safeMemcpy() calls reserve(m_length+len) and reserves
// tries to alloc m_length + (m_length+len) so since,
// m_length+len should never be more than "need" we need to
// double up here
need *= 2;
// this should be enough
if ( ! m_sb.reserve ( need ) ) return NULL;
// for testing if we realloc
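The doubling above checks out with a quick bound, taking the comment's description of SafeBuf at face value: each safeMemcpy() of len bytes calls reserve(m_length+len), which tries to allocate m_length + (m_length+len). If everything appended stays at or below the original "need", then m_length <= need and m_length+len <= need at every call, so m_length + (m_length+len) <= need + need = 2*need. Reserving 2*need up front therefore means reserve() never has to grow the buffer, so nothing pointing into m_sb gets invalidated by a realloc during the parse.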


@ -109,6 +109,8 @@ bool Linkdb::init ( ) {
long maxTreeNodes = maxTreeMem /(sizeof(key224_t)+16);
// disk page cache mem, 100MB on gk0 now
long pcmem = 0; // g_conf.m_linkdbMaxDiskPageCacheMem;
// give it a little
pcmem = 10000000; // 10MB
// keep this low if we are the tmp cluster
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// TODO: would be nice to just do page caching on the satellite files;


@ -1791,6 +1791,11 @@ void Loop::quickPoll(long niceness, const char* caller, long lineno) {
if(m_inQuickPoll) {
log(LOG_WARN,
"admin: tried to quickpoll from inside quickpoll");
// this happens when handleRequest3f is called from
// a quickpoll and it deletes a collection and BigFile::close
// calls ThreadQueue::removeThreads and Msg3::doneScanning()
// has niceness 2 and calls quickpoll again!
return;
//if(g_conf.m_quickpollCoreOnError) {
char*xx=NULL;*xx=0;
// }
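The net effect is a plain re-entrancy guard: the nested call now logs and returns, and the intentional-crash line after the return is left unreachable. A condensed sketch of the guard shape, assuming m_inQuickPoll is set and cleared around the body of the poll:

void Loop::quickPoll ( long niceness , const char *caller , long lineno ) {
	// re-entry guard: a callback run from quickpoll (e.g. a collection
	// delete that closes a BigFile and drains the thread queue) can
	// end up calling quickpoll again
	if ( m_inQuickPoll ) {
		log(LOG_WARN,"admin: tried to quickpoll from inside quickpoll");
		return;
	}
	m_inQuickPoll = true;
	// ... run the low-priority callbacks / poll the loop ...
	m_inQuickPoll = false;
}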


@ -930,9 +930,12 @@ bool Msg3::doneScanning ( ) {
ff->getFilename() ,
m_niceness ) ) {
log("net: Had error while constraining list read from "
"%s: %s. This is likely caused by corrupted "
"%s: %s%s. vfd=%li parts=%li. "
"This is likely caused by corrupted "
"data on disk.",
mstrerror(g_errno), ff->getFilename());
mstrerror(g_errno), ff->m_dir ,
ff->getFilename(), ff->m_vfd ,
(long)ff->m_numParts );
}
}


@ -22,6 +22,7 @@ long g_numCorrupt = 0;
Msg5::Msg5() {
m_waitingForList = false;
m_waitingForMerge = false;
m_numListPtrs = 0;
m_mergeLists = true;
reset();
@ -33,7 +34,7 @@ Msg5::~Msg5() {
// frees m_treeList
void Msg5::reset() {
if ( m_waitingForList ) {
if ( m_waitingForList || m_waitingForMerge ) {
log("disk: Trying to reset a class waiting for a reply.");
// might being doing an urgent exit (mainShutdown(1)) or
// g_process.shutdown(), so do not core here
@ -1365,6 +1366,8 @@ bool Msg5::gotList2 ( ) {
// skip it for now
//goto skipThread;
m_waitingForMerge = true;
// . if size is big, make a thread
// . let's always make niceness 0 since it wasn't being very
// aggressive before
@ -1374,6 +1377,9 @@ bool Msg5::gotList2 ( ) {
threadDoneWrapper ,
mergeListsWrapper_r ) )
return false;
m_waitingForMerge = false;
// thread creation failed
if ( ! g_threads.areThreadsDisabled() )
log(LOG_INFO,
@ -1704,6 +1710,8 @@ void Msg5::mergeLists_r ( ) {
// . we are left with an empty list
bool Msg5::doneMerging ( ) {
m_waitingForMerge = false;
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_coll))) return true;
@ -1722,8 +1730,8 @@ bool Msg5::doneMerging ( ) {
// our first merge
if ( m_hadCorruption ) {
// log it here, cuz logging in thread doesn't work too well
log("net: Encountered a corrupt list in rdb=%s",
base->m_dbname);
log("net: Encountered a corrupt list in rdb=%s coll=%s",
base->m_dbname,m_coll);
// remove error condition, we removed the bad data in thread
m_hadCorruption = false;
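Taken together, the new m_waitingForMerge handling gives the flag a clear lifecycle, which is what Collectiondb::deleteSpiderColl() and the reset() warning above rely on. A condensed view, with the thread dispatch shown schematically rather than as the literal call:

m_waitingForMerge = true;      // about to hand the merge to a thread
if ( /* thread call with threadDoneWrapper / mergeListsWrapper_r */ )
	return false;          // thread launched; flag stays set until doneMerging()
m_waitingForMerge = false;     // thread creation failed; merge will run inline
// Msg5::doneMerging() clears it again once the merge completes, so the flag
// is only true while a merge thread may still be touching this Msg5.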

Msg5.h

@ -292,6 +292,7 @@ class Msg5 {
bool m_mergeLists;
char m_waitingForList;
char m_waitingForMerge;
// actually part of a different algo than m_waitingForList!
unsigned long long m_waitingKey;


@ -254,6 +254,10 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
// avoid ips of 0 or -1
if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
// . now fill it up
// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
// m_siteNumInlinks,...)


@ -3841,6 +3841,9 @@ bool getSpiderRequestMetaList ( char *doc ,
SpiderRequest sreq;
sreq.reset();
sreq.m_firstIp = url.getHostHash32(); // fakeip!
// avoid ips of 0 or -1
if ( sreq.m_firstIp == 0 || sreq.m_firstIp == -1 )
sreq.m_firstIp = 1;
sreq.m_hostHash32 = url.getHostHash32();
sreq.m_domHash32 = url.getDomainHash32();
sreq.m_siteHash32 = url.getHostHash32();
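This is the same guard added in PageAddUrl.cpp above: a fake firstIp is derived from a hash, and the two values the spider code rejects as invalid (0 and -1, see the addToWaitingTree() check later in this diff) are remapped to 1. As a hypothetical helper (not in the source), the shared pattern is just:

// hypothetical helper illustrating the fake-firstIp guard used at both call sites
static long makeFakeFirstIp ( long hash32 ) {
	long firstIp = hash32;
	// 0 and -1 are treated as "no ip" / invalid downstream
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	return firstIp;
}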


@ -488,7 +488,7 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
"<tr><td><b>Kernel Version</b></td><td>%s</td></tr>\n"
//"<tr><td><b>Gigablast Version</b></td><td>%s %s</td></tr>\n"
"<tr><td><b>Parsing Inconsistencies</b></td><td>%li</td>\n"
"<tr><td><b>Indexdb Splits</b></td><td>%li</td>\n"
"<tr><td><b>Indexdb Shards</b></td><td>%li</td>\n"
//"<tr><td><b>Fully Split</b></td><td>%li</td>\n"
//"<tr><td><b>Tfndb Extension Bits</b></td><td>%li</td>\n"
"</tr>\n"


@ -17531,7 +17531,12 @@ bool Parms::doParmSendingLoop ( ) {
NULL, // retslot
(void *)h->m_hostId , // state
gotParmReplyWrapper ,
4 ) ) { // timeout secs
30 , // timeout secs
-1 , // backoff
-1 , // maxwait
NULL , // replybuf
0 , // replybufmaxsize
0 ) ) { // niceness
log("parms: faild to send: %s",mstrerror(g_errno));
continue;
}


@ -122,12 +122,12 @@ bool Posdb::init ( ) {
long nodeSize = (sizeof(key144_t)+12+4) + sizeof(collnum_t);
long maxTreeNodes = maxTreeMem / nodeSize ;
//long pageSize = GB_INDEXDB_PAGE_SIZE;
long pageSize = GB_INDEXDB_PAGE_SIZE;
// we now use a disk page cache as opposed to the
// old rec cache. i am trying to do away with the Rdb::m_cache rec
// cache in favor of cleverly used disk page caches, because
// the rec caches are not real-time and get stale.
long pcmem = 50000000; // 50MB
long pcmem = 30000000; // 30MB
// make sure at least 30MB
//if ( pcmem < 30000000 ) pcmem = 30000000;
// keep this low if we are the tmp cluster, 30MB
@ -136,12 +136,12 @@ bool Posdb::init ( ) {
// prevent swapping
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// save more mem!!! allow os to cache it i guess...
pcmem = 0;
// let's go back to using it
//pcmem = 0;
// disable for now... for rebuild
//pcmem = 0;
// . init the page cache
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
/*
if ( ! m_pc.init ( "posdb",
RDB_POSDB,
pcmem ,
@ -149,7 +149,6 @@ bool Posdb::init ( ) {
true , // use RAM disk?
false )) // minimize disk seeks?
return log("db: Posdb init failed.");
*/
// . set our own internal rdb
// . max disk space for bin tree is same as maxTreeMem so that we
@ -174,7 +173,7 @@ bool Posdb::init ( ) {
// newer systems have tons of ram to use
// for their disk page cache. it is slower than
// ours but the new engine has much slower things
NULL,//&m_pc ,
&m_pc ,
false , // istitledb?
false , // preloaddiskpagecache?
sizeof(key144_t)
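With the init block un-commented and &m_pc passed in place of NULL, Posdb again owns a small disk page cache instead of relying purely on the OS page cache (Titledb gets the same treatment further down). Consolidating what this hunk shows into one sketch; the pageSize argument position is assumed from the Spiderdb::init() call later in this diff:

long pageSize = GB_INDEXDB_PAGE_SIZE;
long pcmem    = 30000000;                  // 30MB for posdb's page cache
if ( g_hostdb.m_useTmpCluster ) pcmem = 0; // avoid swapping on the tmp cluster
if ( ! m_pc.init ( "posdb" ,
                   RDB_POSDB ,
                   pcmem ,
                   pageSize ,
                   true ,                  // use RAM disk / shared mem?
                   false ) )               // minimize disk seeks?
	return log("db: Posdb init failed.");
// ... and the Rdb init above now receives &m_pc rather than NULL ...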


@ -104,7 +104,7 @@ char *g_files[] = {
"antiword" , // msword
"pdftohtml", // pdf
"pstotext" , // postscript
"ppthtml" , // powerpoint
//"ppthtml" , // powerpoint
//"dict/unifiedDict",
//"dict/thesaurus.txt",


@ -360,6 +360,7 @@ bool RdbBase::init ( char *dir ,
// now fill up the page cache
// preload:
if ( ! preloadDiskPageCache ) return true;
if ( ! m_pc ) return true;
char buf [ 512000 ];
long total = m_pc->getMemMax();
log(LOG_DEBUG,"db: %s: Preloading page cache. Total mem to use =%lu",


@ -204,11 +204,14 @@ void RdbDump::doneDumping ( ) {
m_totalPosDumped , m_totalNegDumped ,
m_totalPosDumped + m_totalNegDumped );
// map verify
log("db: map # pos=%lli neg=%lli",
m_map->getNumPositiveRecs(),
m_map->getNumNegativeRecs()
);
// . map verify
// . if continueDumping called us with no collectionrec, it got
// deleted so RdbBase::m_map is nuked too i guess
if ( saved != ENOCOLLREC )
log("db: map # pos=%lli neg=%lli",
m_map->getNumPositiveRecs(),
m_map->getNumNegativeRecs()
);
// free the list's memory
if ( m_list ) m_list->freeList();
@ -1015,11 +1018,16 @@ void RdbDump::continueDumping() {
// if someone reset/deleted the collection we were dumping...
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) g_errno = ENOCOLLREC;
if ( ! cr ) {
g_errno = ENOCOLLREC;
// m_file is invalid if collrec got nuked because so did
// the Rdbbase which has the files
log("db: continue dumping lost collection");
}
// bitch about errors
if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));
else if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));
// go back now if we were NOT dumping a tree
if ( ! (m_tree || m_buckets) ) {
m_isDumping = false;


@ -6,6 +6,8 @@
RdbMap::RdbMap() {
m_numSegments = 0;
m_numSegmentPtrs = 0;
m_numSegmentOffs = 0;
reset ( );
}
@ -61,6 +63,14 @@ void RdbMap::reset ( ) {
m_keys [i] = NULL;
m_offsets[i] = NULL;
}
// the ptrs themselves are now a dynamic array to save mem
// when we have thousands of collections
mfree(m_keys,m_numSegmentPtrs*sizeof(char *),"MapPtrs");
mfree(m_offsets,m_numSegmentOffs*sizeof(short *),"MapPtrs");
m_numSegmentPtrs = 0;
m_numSegmentOffs = 0;
m_needToWrite = false;
m_fileStartOffset = 0LL;
m_numSegments = 0;
@ -1192,6 +1202,40 @@ long long RdbMap::getMemAlloced ( ) {
return (long long)m_numSegments * space;
}
bool RdbMap::addSegmentPtr ( long n ) {
// realloc
if ( n >= m_numSegmentPtrs ) {
char **k;
long nn = (long)((float)n * 1.20) + 1;
k = (char **) mrealloc (m_keys,
m_numSegmentPtrs * sizeof(char *) ,
nn * sizeof(char *) ,
"MapPtrs" );
// failed?
if ( ! k ) return false;
// succeeded
m_numSegmentPtrs = nn;
m_keys = k;
}
// try offsets
if ( n >= m_numSegmentOffs ) {
short **o;
long nn = (long)((float)n * 1.20) + 1;
o = (short **) mrealloc (m_offsets,
m_numSegmentOffs * sizeof(short *) ,
nn * sizeof(short *) ,
"MapPtrs" );
// failed?
if ( ! o ) return false;
// succeeded
m_numSegmentOffs = nn;
m_offsets = o;
}
return true;
}
// . add "n" segments
// . returns false and sets g_errno on error
bool RdbMap::addSegment ( ) {
@ -1202,8 +1246,17 @@ bool RdbMap::addSegment ( ) {
long n = m_numSegments;
long pps = PAGES_PER_SEGMENT;
// ensure doesn't exceed the max
if ( n >= MAX_SEGMENTS ) return log("db: Mapped file is "
"too big. Critical error.");
//if ( n >= MAX_SEGMENTS ) return log("db: Mapped file is "
// "too big. Critical error.");
// the array of up to MAX_SEGMENT pool ptrs is now dynamic too!
// because diffbot uses thousands of collections, this will save
// over 1GB of ram!
if ( ! addSegmentPtr ( n ) )
return log("db: Failed to allocate memory for adding seg ptr "
"for map file %s.", m_file.getFilename());
// alloc spaces for each key segment
// allocate new segments now
//m_keys[n] = (key_t *) mmalloc ( ks * pps , "RdbMap" );
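The savings claimed in the comment above are easy to ballpark: with MAX_SEGMENTS at 16*1024, the old fixed arrays cost 16K pointers for m_keys plus 16K pointers for m_offsets, i.e. roughly 16384 * (8 + 8) = 256KB per RdbMap on a 64-bit build, even for a nearly empty map. Each on-disk Rdb file carries its own map and there are several rdbs per collection, so a few thousand collections put this into the gigabyte range, consistent with the "over 1GB" figure above. The dynamic arrays instead grow by about 20% per addSegmentPtr() call, so a small map only pays for the segment pointers it actually uses.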


@ -59,7 +59,7 @@
#define PAGES_PER_SEGMENT (2*1024)
#define PAGES_PER_SEG (PAGES_PER_SEGMENT)
// MAX_SEGMENTS of 16*1024 allows for 32 million pages = 256gigs of disk data
#define MAX_SEGMENTS (16*1024)
//#define MAX_SEGMENTS (16*1024)
class RdbMap {
@ -284,6 +284,8 @@ class RdbMap {
// . used to grow the map, too
//bool setMapSize ( long maxNumPages );
bool addSegmentPtr ( long n ) ;
// called by setMapSize() to increase the # of segments
bool addSegment ( ) ;
@ -328,10 +330,17 @@ class RdbMap {
// . IMPORTANT: if growing m_pageSize might need to change m_offsets
// from short to long
//key_t *m_keys [ MAX_SEGMENTS ];
char *m_keys [ MAX_SEGMENTS ];
//char *m_keys [ MAX_SEGMENTS ];
char **m_keys;
long m_numSegmentPtrs;
//key96_t **m_keys96; // set to m_keys
//key128_t **m_keys128; // set to m_keys
short *m_offsets [ MAX_SEGMENTS ];
//short *m_offsets [ MAX_SEGMENTS ];
short **m_offsets;
long m_numSegmentOffs;
// number of valid pages in the map.
long m_numPages;


@ -537,7 +537,6 @@ bool Spiderdb::init ( ) {
long pcmem = 20000000;//g_conf.m_spiderdbMaxDiskPageCacheMem;
// keep this low if we are the tmp cluster
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// key parser checks
//long ip = 0x1234;
char priority = 12;
@ -571,7 +570,7 @@ bool Spiderdb::init ( ) {
RDB_SPIDERDB ,
pcmem ,
pageSize ,
true , // use shared mem?
false , // use shared mem?
false )) // minimizeDiskSeeks?
return log(LOG_INIT,"spiderdb: Init failed.");
@ -1014,9 +1013,11 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
/////////////////////////
SpiderColl::SpiderColl () {
m_deleteMyself = false;
m_gettingList1 = false;
m_gettingList2 = false;
m_lastScanTime = 0;
m_isPopulating = false;
m_numAdded = 0;
m_numBytesScanned = 0;
m_lastPrintCount = 0;
@ -1488,7 +1489,7 @@ SpiderColl::~SpiderColl () {
}
// we call this now instead of reset when Collectiondb::resetColl() is used
void SpiderColl::clear ( ) {
void SpiderColl::clearLocks ( ) {
// remove locks from locktable for all spiders out i guess
HashTableX *ht = &g_spiderLoop.m_lockTable;
@ -1508,6 +1509,7 @@ void SpiderColl::clear ( ) {
goto top;
}
/*
// reset these for SpiderLoop;
m_nextDoledbKey.setMin();
m_didRound = false;
@ -1541,6 +1543,7 @@ void SpiderColl::clear ( ) {
// assume the whole thing is not empty
m_allDoledbPrioritiesEmpty = 0;//false;
m_lastEmptyCheck = 0;
*/
}
void SpiderColl::reset ( ) {
@ -1554,6 +1557,8 @@ void SpiderColl::reset ( ) {
m_twinDied = false;
m_lastUrlFiltersUpdate = 0;
m_isPopulating = false;
char *coll = "unknown";
if ( m_coll[0] ) coll = m_coll;
log(LOG_DEBUG,"spider: resetting spider cache coll=%s",coll);
@ -2251,6 +2256,7 @@ bool SpiderColl::addToWaitingTree ( uint64_t spiderTimeMS , long firstIp ,
// what is this?
if ( firstIp == 0 || firstIp == -1 ) {
log("spider: got ip of %s. wtf?",iptoa(firstIp) );
return false;
char *xx=NULL; *xx=0;
}
@ -2447,6 +2453,11 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
if ( ! m_waitingTreeNeedsRebuild ) return;
// a double call? can happen if list read is slow...
if ( m_gettingList2 ) return;
// . borrow a msg5
// . if none available just return, we will be called again
// by the sleep/timer function
// . read in a replacement SpiderRequest to add to doledb from
// this ip
// . get the list of spiderdb records
@ -2460,7 +2471,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
// flag it
m_gettingList2 = true;
// make state
long state2 = (long)m_cr->m_collnum;
//long state2 = (long)m_cr->m_collnum;
// read the list from local disk
if ( ! m_msg5b.getList ( RDB_SPIDERDB ,
m_cr->m_coll ,
@ -2473,7 +2484,7 @@ void SpiderColl::populateWaitingTreeFromSpiderdb ( bool reentry ) {
0 , // max cache age
0 , // startFileNum
-1 , // numFiles (all)
(void *)state2,//this//state
this,//(void *)state2,//this//state
gotSpiderdbListWrapper2 ,
MAX_NICENESS , // niceness
true )) // do error correct?
@ -2774,20 +2785,35 @@ void SpiderColl::populateDoledbFromWaitingTree ( bool reentry ) {
// calls this function again with re-entry set to true
if ( ! scanSpiderdb ( true ) ) return;
// oom error? i've seen this happen and we end up locking up!
if ( g_errno ) return;
if ( g_errno ) {
log("spider: scandspiderdb: %s",mstrerror(g_errno));
m_isPopulating = false;
return;
}
// try more
goto loop;
}
static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
collnum_t collnum = (collnum_t)(long)state;
//collnum_t collnum = (collnum_t)(long)state;
//SpiderColl *THIS = g_spiderCache.getSpiderColl(collnum);
//if ( ! THIS ) {
// log("spider: lost1 collnum %li while scanning spiderdb",
// (long)collnum);
// return;
//}
SpiderColl *THIS = g_spiderCache.getSpiderColl(collnum);
SpiderColl *THIS = (SpiderColl *)state;
if ( ! THIS ) {
log("spider: lost1 collnum %li while scanning spiderdb",
(long)collnum);
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( THIS->m_deleteMyself &&
! THIS->m_msg5b.m_waitingForMerge &&
! THIS->m_msg5b.m_waitingForList ) {
mdelete ( THIS , sizeof(SpiderColl),"postdel1");
delete ( THIS );
return;
}
@ -2800,6 +2826,10 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
// . finish processing the list we read now
// . if that blocks, it will call doledWrapper
if ( ! THIS->scanSpiderdb ( false ) ) return;
// no longer populating doledb. we also set to false in doledwrapper
//THIS->m_isPopulating = false;
// . otherwise, do more from tree
// . re-entry is true because we just got the msg5 reply
THIS->populateDoledbFromWaitingTree ( true );
@ -2807,16 +2837,29 @@ static void gotSpiderdbListWrapper ( void *state , RdbList *list , Msg5 *msg5){
static void gotSpiderdbListWrapper2( void *state , RdbList *list , Msg5 *msg5){
collnum_t collnum = (collnum_t)(long)state;
//collnum_t collnum = (collnum_t)(long)state;
//SpiderColl *THIS = g_spiderCache.getSpiderColl(collnum);
//if ( ! THIS ) {
// log("spider: lost2 collnum %li while scanning spiderdb",
// (long)collnum);
// return;
//}
SpiderColl *THIS = g_spiderCache.getSpiderColl(collnum);
if ( ! THIS ) {
log("spider: lost2 collnum %li while scanning spiderdb",
(long)collnum);
SpiderColl *THIS = (SpiderColl *)state;
// did our collection rec get deleted? since we were doing a read
// the SpiderColl will have been preserved in that case but its
// m_deleteMyself flag will have been set.
if ( THIS->m_deleteMyself &&
! THIS->m_msg5.m_waitingForMerge &&
! THIS->m_msg5.m_waitingForList ) {
mdelete ( THIS , sizeof(SpiderColl),"postdel1");
delete ( THIS );
return;
}
//SpiderColl *THIS = (SpiderColl *)state;
// re-entry is true because we just got the msg5 reply
THIS->populateWaitingTreeFromSpiderdb ( true );
@ -2829,6 +2872,10 @@ static void doledWrapper ( void *state ) {
// msg4 is available again
THIS->m_msg4Avail = true;
// no longer populating doledb. we also set to false in
// gotSpiderListWrapper
//THIS->m_isPopulating = false;
long long now = gettimeofdayInMilliseconds();
long long diff = now - THIS->m_msg4Start;
// we add recs to doledb using msg1 to keep things fast because
@ -2969,7 +3016,7 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
// flag it
m_gettingList1 = true;
// make state
long state2 = (long)m_cr->m_collnum;
//long state2 = (long)m_cr->m_collnum;
// . read the list from local disk
// . if a niceness 0 intersect thread is taking a LONG time
// then this will not complete in a long time and we
@ -2987,7 +3034,7 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
0 , // max cache age
0 , // startFileNum
-1 , // numFiles (all)
(void *)state2,//this,//state
this,//(void *)state2,//this,//state
gotSpiderdbListWrapper ,
MAX_NICENESS , // niceness
true )) // do error correct?
@ -9346,6 +9393,10 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
to_lower_a(ext[2]) == 'm' &&
to_lower_a(ext[3]) == 'v' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'w' &&
to_lower_a(ext[2]) == 'a' &&
to_lower_a(ext[3]) == 'v' )
goto gotOne;
if ( to_lower_a(ext[1]) == 'j' &&
to_lower_a(ext[2]) == 'p' &&
to_lower_a(ext[3]) == 'g' )


@ -981,7 +981,7 @@ class SpiderColl {
~SpiderColl ( );
SpiderColl ( ) ;
void clear();
void clearLocks();
// called by main.cpp on exit to free memory
void reset();
@ -1125,6 +1125,8 @@ class SpiderColl {
long m_scanningIp;
bool m_gotNewRequestsForScanningIp;
char m_deleteMyself;
// start key for reading doledb
key_t m_msg5StartKey;


@ -284,7 +284,7 @@ bool Threads::init ( ) {
// with high niceness cuz it would hold up high priority ones!
// . TODO: is there a better way? cancel it when UdpServer calls
// Threads::suspendLowPriorityThreads() ?
if ( ! g_threads.registerType ( MERGE_THREAD , 2/*maxThreads*/,100) )
if ( ! g_threads.registerType ( MERGE_THREAD , 2/*maxThreads*/,1000) )
return log("thread: Failed to register thread type." );
// will raising this from 1 to 2 make it faster too?
// i raised since global specs new servers have 2 (hyperthreaded?) cpus


@ -51,20 +51,18 @@ bool Titledb::init ( ) {
// . just hard-code 30MB for now
long pcmem = 30000000; // = g_conf.m_titledbMaxDiskPageCacheMem;
// fuck that we need all the mem!
pcmem = 0;
//pcmem = 0;
// do not use any page cache if doing tmp cluster in order to
// prevent swapping
if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
//long pageSize = GB_INDEXDB_PAGE_SIZE;
long pageSize = GB_INDEXDB_PAGE_SIZE;
// init the page cache
// . MDW: "minimize disk seeks" not working otherwise i'd enable it!
/*
if ( ! m_pc.init ( "titledb",
RDB_TITLEDB,
pcmem ,
pageSize ) )
return log("db: Titledb init failed.");
*/
// each entry in the cache is usually just a single record, no lists
//long maxCacheNodes = g_conf.m_titledbMaxCacheMem / (10*1024);
@ -90,7 +88,7 @@ bool Titledb::init ( ) {
0,//maxCacheNodes ,
false ,// half keys?
false ,// g_conf.m_titledbSav
NULL,//&m_pc , // page cache ptr
&m_pc , // page cache ptr
true ) )// is titledb?
return false;
return true;

ppthtml (binary file not shown)