Zak Betz 2015-08-31 23:19:45 -06:00
commit b199c67355
74 changed files with 4300 additions and 2483 deletions

View File

@ -36,12 +36,12 @@ BigFile::BigFile () {
m_permissions = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH ;
m_flags = O_RDWR ; // | O_DIRECT;
// NULLify all ptrs to files
for ( int32_t i = 0 ; i < MAX_PART_FILES ; i++ ) m_files[i] = NULL;
//for ( int32_t i = 0 ; i < MAX_PART_FILES ; i++ ) m_files[i] = NULL;
m_maxParts = 0;
m_numParts = 0;
m_pc = NULL;
m_vfd = -1;
m_vfdAllowed = false;
//m_vfdAllowed = false;
m_fileSize = -1;
m_lastModified = -1;
m_numThreads = 0;
@ -49,29 +49,57 @@ BigFile::BigFile () {
g_lastDiskReadStarted = 0;
g_lastDiskReadCompleted = 0;
g_diskIsStuck = false;
//memset ( m_littleBuf , 0 , LITTLEBUFSIZE );
// avoid a malloc for small files.
// this way we can save in memory RdbMaps upon a core, even malloc/free
// related cores, cuz we won't have to do a malloc to save!
//m_fileBuf.setBuf ( m_littleBuf,LITTLEBUFSIZE,0,false);
// for this make the length always equal the capacity so when we
// call reserve it builds on the whole thing
//m_fileBuf.setLength ( m_fileBuf.getCapacity() );
}
// we alternate parts into "dirname" and "stripeDir"
// . return false and set g_errno on error
bool BigFile::set ( char *dir , char *baseFilename , char *stripeDir ) {
// reset filesize
m_fileSize = -1;
m_lastModified = -1;
// m_baseFilename contains the "dir" in it
//sprintf(m_baseFilename ,"%s/%s", dirname , baseFilename );
strcpy ( m_baseFilename , baseFilename );
strcpy ( m_dir , dir );
if ( stripeDir ) strcpy ( m_stripeDir , stripeDir );
else m_stripeDir[0] = '\0';
m_dir.reset();
m_baseFilename.reset();
m_dir .setLabel("bfd");
m_baseFilename.setLabel("bfbf");
// use this 32 byte char buf to avoid a malloc if possible
m_baseFilename.setBuf (m_tmpBaseBuf,32,0,false);
if ( ! m_dir.safeStrcpy ( dir ) ) return false;
if ( ! m_baseFilename.safeStrcpy ( baseFilename ) ) return false;
//strcpy ( m_baseFilename , baseFilename );
//strcpy ( m_dir , dir );
//if ( stripeDir ) strcpy ( m_stripeDir , stripeDir );
//else m_stripeDir[0] = '\0';
// reset # of parts
m_numParts = 0;
m_maxParts = 0;
m_filePtrsBuf.reset();
// now add parts from both directories
if ( ! addParts ( m_dir ) ) return false;
if ( ! addParts ( m_stripeDir ) ) return false;
if ( ! addParts ( dir ) ) return false;
//if ( ! addParts ( m_stripeDir ) ) return false;
return true;
}
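// a minimal sketch (illustrative only, not part of this commit) of the
// stack-backed SafeBuf idiom used above: point the SafeBuf at a member
// buffer first so short names never hit the allocator; safeStrcpy()
// only mallocs if the string outgrows the 32 bytes.
//
//   SafeBuf name;
//   char    tmp[32];
//   name.setBuf ( tmp , 32 , 0 , false ); // false = don't free tmp
//   if ( ! name.safeStrcpy ( "posdb0001.dat" ) ) return false;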
bool BigFile::reset ( ) {
// RdbMap calls BigFile (m_file)::reset() so we need to free
// the files and their safebufs for their filename and dir.
close ();
// reset filesize
m_fileSize = -1;
m_lastModified = -1;
@ -82,18 +110,19 @@ bool BigFile::reset ( ) {
//if ( stripeDir ) strcpy ( m_stripeDir , stripeDir );
//else m_stripeDir[0] = '\0';
// reset # of parts
m_numParts = 0;
m_maxParts = 0;
//m_numParts = 0;
//m_maxParts = 0;
// now add parts from both directories
if ( ! addParts ( m_dir ) ) return false;
if ( ! addParts ( m_stripeDir ) ) return false;
// MDW: why is this in reset() function? remove...
//if ( ! addParts ( m_dir.getBufStart() ) ) return false;
//if ( ! addParts ( m_stripeDir ) ) return false;
return true;
}
bool BigFile::addParts ( char *dirname ) {
// if dirname is NULL or empty return true
if ( ! dirname[0] ) return true;
if ( ! dirname || ! dirname[0] ) return true;
// . now set the names of all the Files that we consist of
// . get the directory entry and find out what parts we have
Dir dir;
@ -102,9 +131,9 @@ bool BigFile::addParts ( char *dirname ) {
if (!dir.open()) return log("disk: openDir (\"%s\") failed",dirname);
// match files with this pattern in the directory
char pattern[256];
sprintf(pattern,"%s*", m_baseFilename );
sprintf(pattern,"%s*", m_baseFilename.getBufStart() );
// length of the base filename
int32_t blen = gbstrlen ( m_baseFilename );
int32_t blen = gbstrlen ( m_baseFilename.getBufStart() );
// . set our m_files array
// . addFile() will return false on problems
// . the lower the fileId the older the file (w/ exception of #0)
@ -127,12 +156,12 @@ bool BigFile::addParts ( char *dirname ) {
}
else part = atoi ( filename + blen + 5 );
// ensure not too big
if ( part >= MAX_PART_FILES ) {
log ("disk: Part number of %"INT32" is too big for "
"\"%s\". Should be less than %"INT32".",
(int32_t)part,filename,(int32_t)MAX_PART_FILES);
continue;
}
// if ( part >= MAX_PART_FILES ) {
// log ("disk: Part number of %"INT32" is too big for "
// "\"%s\". Should be less than %"INT32".",
// (int32_t)part,filename,(int32_t)MAX_PART_FILES);
// continue;
// }
// make this part file
if ( ! addPart ( part ) ) return false;
}
@ -142,23 +171,78 @@ bool BigFile::addParts ( char *dirname ) {
return true;
}
// WE CAN'T REALLOC the safebuf because there might be a thread
// referencing the file ptr. so let's just keep the m_filePtrs[] array
// and realloc on that.
bool BigFile::addPart ( int32_t n ) {
if ( n >= MAX_PART_FILES )
return log("disk: Part number %"INT32" > %"INT32".",
n,(int32_t)MAX_PART_FILES);
// if ( n >= MAX_PART_FILES )
// return log("disk: Part number %"INT32" > %"INT32".",
// n,(int32_t)MAX_PART_FILES);
// . grow our dynamic array and return ptr to last element
// . n's come in NOT necessarily in order!!!
int32_t need = (n+1) * sizeof(File *);
// capacity must be length always for this
if ( m_filePtrsBuf.getCapacity() != m_filePtrsBuf.getLength() ) {
char *xx=NULL;*xx=0;}
File *f ;
try { f = new (File); }
catch ( ... ) {
g_errno = ENOMEM;
return log("BigFile: new(%i): %s",(int)sizeof(File),
mstrerror(g_errno));
// init using tiny buf to save a malloc for small files
if ( m_filePtrsBuf.getCapacity() == 0 ) {
memset (m_tinyBuf,0,8);
m_filePtrsBuf.setBuf ( m_tinyBuf,8,0,false);
m_filePtrsBuf.setLength ( m_filePtrsBuf.getCapacity() );
}
// how much more mem do we need?
int32_t delta = need - m_filePtrsBuf.getLength();
// . make sure our CAPACITY is increased by what we need
// . SafeBuf::reserve() ADDS this much to current capacity
// . true = clear new mem so the new file ptrs are NULL, because
// there may be gaps, or parts may not exist because the BigFile
// was being merged.
if ( delta > 0 && ! m_filePtrsBuf.reserve ( delta ,"bfbuf",true ) ) {
log("file: failed to reserve %i more mem for part",delta);
return false;
}
// make length the capacity. so if buf is resized in call to
// SafeBuf::reserve() it will copy over all of the old buf to new buf
m_filePtrsBuf.setLength ( m_filePtrsBuf.getCapacity() );
File **filePtrs = (File **)m_filePtrsBuf.getBufStart();
//File *f = filesPtrs[n];
// sanity to ensure we do not breach the buffer
//char *fend = ((char *)f) + sizeof(File);
//if ( fend > m_fileBuf.getBuf() ) { char *xx=NULL;*xx=0; }
// we have to call constructor ourself then
//f->constructor();
File *f = NULL;
if ( m_numParts == 0 ) {
f = (File *)m_littleBuf;
if ( LITTLEBUFSIZE < sizeof(File) ) {
log("file: littlebufsize too small.");
char *xx=NULL;*xx=0;
}
f->constructor();
}
else {
try { f = new (File); }
catch ( ... ) {
g_errno = ENOMEM;
return log("BigFile: new(%i): %s",(int)sizeof(File),
mstrerror(g_errno));
}
mnew ( f , sizeof(File) , "BigFile" );
}
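// illustrative note: because the first File lives inside m_littleBuf
// rather than on the heap, teardown must pair this with destructor()
// instead of delete -- deleting a non-heap object would corrupt the
// allocator. close() below does exactly that:
//
//   if ( f == (File *)m_littleBuf ) f->destructor();
//   else { mdelete ( f , sizeof(File) , "BigFile" ); delete f; }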
mnew ( f , sizeof(File) , "BigFile" );
char buf[1024];
makeFilename_r ( m_baseFilename , NULL, n , buf );
// make the filename for this new File class
makeFilename_r ( m_baseFilename.getBufStart() , NULL, n , buf , 1024 );
// and set it with that
f->set ( buf );
m_files [ n ] = f;
// store the ptr to it in m_filePtrs
filePtrs [ n ] = f;
m_numParts++;
// set maxPart
if ( n+1 > m_maxParts ) m_maxParts = n+1;
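// a minimal sketch (not the committed code) of the invariant the
// comments above describe: keep length == capacity at all times so
// SafeBuf::reserve() copies the WHOLE old array on realloc, and pass
// true so the new tail is zeroed and absent parts read back as NULL.
//
//   int32_t need  = (n+1) * sizeof(File *);
//   int32_t delta = need - buf.getLength();
//   if ( delta > 0 && ! buf.reserve ( delta , "bfbuf" , true ) )
//           return false;
//   buf.setLength ( buf.getCapacity() );
//   ((File **)buf.getBufStart())[n] = f;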
@ -171,11 +255,16 @@ bool BigFile::doesExist ( ) {
// if we can open it with a valid fd, then it exists
bool BigFile::doesPartExist ( int32_t n ) {
if ( n >= MAX_PART_FILES ) return false;
bool exists = (bool)m_files[n];
return exists;
//if ( n >= MAX_PART_FILES ) return false;
if ( n >= m_maxParts ) return false;
// f will be null if part does not exist
File *f = getFile2(n);
if ( f ) return true;
return false;
}
static int64_t s_vfd = 0;
// . override File::open so we can set m_numParts
// . set maxFileSize when opening a new file for writing and using
// DiskPageCache
@ -192,10 +281,10 @@ bool BigFile::open ( int flags , class DiskPageCache *pc ,
// . this returns our "virtual fd", not the same as File::m_vfd
// . returns -1 and sets g_errno on failure
// . we pass m_vfd to getPages() and addPages()
if ( m_pc ) {
if ( maxFileSize == -1 ) maxFileSize = getFileSize();
m_vfd = m_pc->getVfd ( maxFileSize, m_vfdAllowed );
g_errno = 0;
if ( m_pc && m_vfd == -1 ) {
//if ( maxFileSize == -1 ) maxFileSize = getFileSize();
m_vfd = ++s_vfd;
//g_errno = 0;
}
return true;
}
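// note: s_vfd is a process-wide monotonic counter, so a vfd is never
// recycled; a stale vfd from a deleted BigFile can not collide with a
// live one, and its cached pages presumably just age out of the
// underlying RdbCache (see the rmVfd() removal further down).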
@ -204,15 +293,31 @@ bool BigFile::open ( int flags , class DiskPageCache *pc ,
void BigFile::makeFilename_r ( char *baseFilename ,
char *baseFilenameDir ,
int32_t n ,
char *buf ) {
char *dir = m_dir;
char *buf ,
int32_t bufSize ) {
char *dir = m_dir.getBufStart();
if ( baseFilenameDir && baseFilenameDir[0] ) dir = baseFilenameDir;
int32_t r;
// ensure we do not breach the buffer
// int32_t dirLen = gbstrlen(dir);
// int32_t baseLen = gbstrlen(baseFilename);
// int32_t need = dirLen + 1 + baseLen + 1;
// if ( need < bufSize ) { char *xx=NULL;*xx=0; }
//static char s[1024];
if ( (n % 2) == 0 || ! m_stripeDir[0] )
sprintf ( buf, "%s/%s", dir , baseFilename );
else sprintf ( buf, "%s/%s", m_stripeDir, baseFilename );
if ( n == 0 ) return ;
sprintf ( buf + gbstrlen(buf) , ".part%"INT32"", n );
// if ( (n % 2) == 0 || ! m_stripeDir[0] )
// sprintf ( buf, "%s/%s", dir , baseFilename );
// else sprintf ( buf, "%s/%s", m_stripeDir, baseFilename );
if ( n == 0 ) {
r = snprintf ( buf, bufSize, "%s/%s",dir,baseFilename);
if ( r < bufSize ) return;
// truncation is bad
char *xx=NULL; *xx=0;
}
// return if it fit into "buf"
r = snprintf ( buf, bufSize, "%s/%s.part%"INT32,dir,baseFilename,n);
if ( r < bufSize ) return;
// truncation is bad
char *xx=NULL; *xx=0;
}
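// worked example of the truncation test above: snprintf() returns the
// length the output WOULD have had, so r >= bufSize means the name was
// cut off. e.g. snprintf(buf,8,"%s","posdb0001.dat") returns 13 and
// leaves buf holding "posdb00\0".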
//int BigFile::getfdByOffset ( int64_t offset ) {
@ -221,30 +326,34 @@ void BigFile::makeFilename_r ( char *baseFilename ,
// . get the fd of the nth file
// . will try to open the file if it hasn't yet been opened
int BigFile::getfd ( int32_t n , bool forReading , int32_t *vfd ) {
int BigFile::getfd ( int32_t n , bool forReading ) { // , int64_t *vfd ) {
// boundary check
if ( n >= MAX_PART_FILES )
return log("disk: Part number %"INT32" > %"INT32". fd not available.",
n,(int32_t)MAX_PART_FILES) - 1;
if ( n >= m_maxParts && ! addPart ( n ) ) {
log("disk: Part number %"INT32" > %"INT32". fd "
"not available.",
n,m_maxParts);
// return -1 to indicate can't do it
return -1;
}
// get the File ptr from the table
File *f = m_files[n];
// if part does not exist then create it!
File *f = getFile2(n);
// if part does not exist then create it! addPart(n) will do that?
if ( ! f ) {
// don't create File if we're getting it for reading
if ( forReading ) return -1;
if ( ! addPart (n) ) return -1;
f = m_files[n];
}
// open it if not opened
if ( ! f->isOpen() ) {
if ( ! f->calledOpen() ) {
if ( ! f->open ( m_flags , m_permissions ) ) {
log("disk: Failed to open file part #%"INT32".",n);
return -1;
}
}
// set it virtual fd, too
if ( vfd ) *vfd = f->m_vfd;
//if ( vfd ) *vfd = f->m_vfd;
// get its file descriptor
int fd = f->getfd ( ) ;
if ( fd >= -1 ) return fd;
@ -264,11 +373,18 @@ int64_t BigFile::getFileSize ( ) {
// add up the sizes of each file
int64_t totalSize = 0;
for ( int32_t n = 0 ; n < m_maxParts ; n++ ) {
// we can have headless big files... count the heads
if ( ! m_files[n] ) { totalSize += MAX_PART_SIZE; continue; }
// shortcut
File *f = getFile2(n);
// we can have headless big files... count the heads.
// this can happen if the first Files were deleted because
// of an ongoing merge operation.
if ( ! f ) {
totalSize += MAX_PART_SIZE;
continue;
}
// . returns -2 on error, -1 if does not exist
// . TODO: it returns 0 if does not exist! FIX...
int32_t size = m_files[n]->getFileSize();
int32_t size = f->getFileSize();
if ( size == -2 ) return -2;
if ( size == -1 ) break;
totalSize += size;
@ -288,10 +404,12 @@ time_t BigFile::getLastModifiedTime ( ) {
// add up the sizes of each file
time_t min = -1;
for ( int32_t n = 0 ; n < m_maxParts ; n++ ) {
// shortcut
File *f = getFile2(n);
// we can have headless big files... count the heads
if ( ! m_files[n] ) continue;
if ( ! f ) continue;
// returns -1 on error, 0 if file does not exist
time_t date = m_files[n]->getLastModifiedTime();
time_t date = f->getLastModifiedTime();
if ( date == -1 ) return -2;
if ( date == 0 ) break;
// check min
@ -388,7 +506,7 @@ bool BigFile::readwrite ( void *buf ,
log(LOG_LOGIC,"disk: readwrite() offset is %"INT64" "
"< 0. filename=%s/%s. dumping core. try deleting "
"the .map file for it and restarting.",offset,
m_dir,m_baseFilename);
m_dir.getBufStart(),m_baseFilename.getBufStart());
char *xx = NULL; *xx = 0;
}
// if we're not blocking use a fake fstate
@ -406,29 +524,32 @@ bool BigFile::readwrite ( void *buf ,
int32_t allocSize;
// reset this
fstate->m_errno = 0;
fstate->m_inPageCache = false;
// . try to get as much as we can from page cache first
// . the vfd of the big file will be the vfd of its last File class
if ( ! doWrite && m_pc && allowPageCache ) {
int32_t oldOff = offset;
//int32_t oldOff = offset;
// we have to set these so RdbScan doesn't freak out if we
// have it all cached and return without hitting disk
fstate->m_bytesDone = size;
fstate->m_bytesToGo = size;
// sanity
if ( m_vfd == -1 ) { char *xx=NULL;*xx=0; }
//log("getting pages off=%"INT64" size=%"INT32"",offset,size);
// now we pass in a ptr to the buf ptr, because if buf is NULL
// this will allocate one for us if it has some pages in the
// cache that we can use.
m_pc->getPages (m_vfd,(char **)&buf,size,offset,&size,&offset,
&allocBuf,&allocSize,allocOff);
char *readBuf = m_pc->getPages ( m_vfd, offset, size );
//log("got pages off=%"INT64" size=%"INT32"",offset,size);
bufOff = offset - oldOff;
//bufOff = offset - oldOff;
// comment out for test
if ( size == 0 ) {
if ( readBuf ) {
// let caller/RdbScan know about the newly alloc'd buf
fstate->m_buf = (char *)buf;
fstate->m_allocBuf = allocBuf;
fstate->m_allocSize = allocSize;
fstate->m_allocOff = allocOff;
fstate->m_buf = (char *)readBuf;
fstate->m_allocBuf = readBuf;
fstate->m_allocSize = size;
fstate->m_allocOff = 0;
fstate->m_inPageCache = true;
return true;
}
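// sketch of the simplified read path this enables (assuming getPages()
// only returns a buffer on a FULL hit and NULL otherwise):
//
//   char *hit = m_pc->getPages ( m_vfd , offset , size );
//   if ( hit ) { /* serve from cache; caller owns hit */ }
//   else      { /* fall through to pread(), then addPages() */ }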
// check
@ -494,8 +615,8 @@ bool BigFile::readwrite ( void *buf ,
// &fstate->m_vfd2);
fstate->m_fd1 = -3;
fstate->m_fd2 = -3;
fstate->m_vfd1 = -3;
fstate->m_vfd2 = -3;
// fstate->m_vfd1 = -3;
// fstate->m_vfd2 = -3;
// . if we are writing, prevent these fds from being closed on us
// by File::closedLeastUsed(), because the fd could then be re-opened
// by someone else doing a write and we end up writing to THAT FILE!
@ -504,14 +625,12 @@ bool BigFile::readwrite ( void *buf ,
if ( doWrite ) {
// actually have to do the open here for writing so it
// can prevent the fds from being closed on us
fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite,
&fstate->m_vfd1);
fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite,
&fstate->m_vfd2);
fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite);
fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite);
//File *f1 = m_files [ fstate->m_filenum1 ];
//File *f2 = m_files [ fstate->m_filenum2 ];
enterWriteMode( fstate->m_vfd1 );
enterWriteMode( fstate->m_vfd2 );
enterWriteMode( fstate->m_fd1 );
enterWriteMode( fstate->m_fd2 );
fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );
}
@ -603,10 +722,8 @@ bool BigFile::readwrite ( void *buf ,
// come here if we haven't spawned a thread
skipThread:
// if there was no room in the thread queue, then we must do this here
fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite ,
&fstate->m_vfd1);
fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite ,
&fstate->m_vfd2);
fstate->m_fd1 = getfd ( fstate->m_filenum1 , !doWrite );
fstate->m_fd2 = getfd ( fstate->m_filenum2 , !doWrite );
fstate->m_closeCount1 = getCloseCount_r ( fstate->m_fd1 );
fstate->m_closeCount2 = getCloseCount_r ( fstate->m_fd2 );
// clear g_errno from the failed thread spawn
@ -720,8 +837,8 @@ bool BigFile::readwrite ( void *buf ,
//File *f2 = m_files [ fstate->m_filenum2 ];
//f1->exitWriteMode();
//f2->exitWriteMode();
exitWriteMode( fstate->m_vfd1 );
exitWriteMode( fstate->m_vfd2 );
exitWriteMode( fstate->m_fd1 );
exitWriteMode( fstate->m_fd2 );
}
// set this up here
@ -765,9 +882,9 @@ bool BigFile::readwrite ( void *buf ,
// store read/written pages into page cache
if ( ! g_errno && fstate->m_pc )
fstate->m_pc->addPages ( fstate->m_vfd ,
fstate->m_buf ,
fstate->m_bytesDone ,
fstate->m_offset ,
fstate->m_bytesDone ,
fstate->m_buf ,
fstate->m_niceness );
// now log our stuff here
if ( g_errno && g_errno != EBADENGINEER )
@ -823,8 +940,8 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
//File *f2 = THIS->m_files [ fstate->m_filenum2 ];
//f1->exitWriteMode();
//f2->exitWriteMode();
exitWriteMode( fstate->m_vfd1 );
exitWriteMode( fstate->m_vfd2 );
exitWriteMode( fstate->m_fd1 );
exitWriteMode( fstate->m_fd2 );
}
// if it read less than 8MB/s bitch
int64_t took = fstate->m_doneTime - fstate->m_startTime;
@ -849,9 +966,9 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
// reference it...
if ( ! g_errno && fstate->m_pc )
fstate->m_pc->addPages ( fstate->m_vfd ,
fstate->m_buf ,
fstate->m_bytesDone ,
fstate->m_offset ,
fstate->m_bytesDone ,
fstate->m_buf ,
fstate->m_niceness );
// add the stat
@ -908,12 +1025,13 @@ void doneWrapper ( void *state , ThreadEntry *t ) {
if ( g_errno && g_errno != EDISKSTUCK ) {
//int fd1 = fstate->m_fd1;
//int fd2 = fstate->m_fd2;
int vfd1 = fstate->m_vfd1;
int vfd2 = fstate->m_vfd2;
int ofd1 = getfdFromVfd(vfd1);
int ofd2 = getfdFromVfd(vfd2);
log(tt,"disk: vfd1=%i s_fds[%i]=%i.",vfd1,vfd1,ofd1);
log(tt,"disk: vfd2=%i s_fds[%i]=%i.",vfd2,vfd2,ofd2);
//int vfd1 = fstate->m_vfd1;
//int vfd2 = fstate->m_vfd2;
//int ofd1 = getfdFromVfd(vfd1);
//int ofd2 = getfdFromVfd(vfd2);
//log(tt,"disk: vfd1=%i s_fds[%i].",vfd1,vfd1);//,ofd1);
//log(tt,"disk: vfd2=%i s_fds[%i].",vfd2,vfd2);//,ofd2);
log("disk: nondstuckerr=%s",mstrerror(g_errno));
}
// . this EBADENGINEER can happen right after a merge if
// the file is renamed because the fd may have changed from
@ -1005,13 +1123,14 @@ void *readwriteWrapper_r ( void *state , ThreadEntry *t ) {
//pthread_testcancel();
// get the two files
File *f1 = NULL;
File *f2 = NULL;
// when we exit, m_this is invalid!!!
if ( fstate->m_filenum1 < fstate->m_this->m_maxParts )
f1 = fstate->m_this->m_files[fstate->m_filenum1];
if ( fstate->m_filenum2 < fstate->m_this->m_maxParts )
f2 = fstate->m_this->m_files[fstate->m_filenum2];
// mdw: no we can't access bigfile it might be deleted!
// File *f1 = NULL;
// File *f2 = NULL;
// // when we exit, m_this is invalid!!!
// if ( fstate->m_filenum1 < fstate->m_this->m_maxParts )
// f1 = fstate->m_this->getFile2(fstate->m_filenum1);
// if ( fstate->m_filenum2 < fstate->m_this->m_maxParts )
// f2 = fstate->m_this->getFile2(fstate->m_filenum2);
// . if open count changed on us our file got unlinked from under us
// and another file was opened with that same fd!!!
@ -1025,16 +1144,20 @@ void *readwriteWrapper_r ( void *state , ThreadEntry *t ) {
// i saw this happen on gk153... i preserved the core/gb on there
//if ( (getCloseCount_r (fstate->m_fd1) != fstate->m_closeCount1 ||
// getCloseCount_r (fstate->m_fd2) != fstate->m_closeCount2 )) {
if ( ! f1 ||
! f2 ||
f1->m_closeCount != fstate->m_closeCount1 ||
f2->m_closeCount != fstate->m_closeCount2 ) {
int32_t cc1 = -1;
int32_t cc2 = -1;
if ( f1 ) cc1 = f1->m_closeCount;
if ( f2 ) cc2 = f2->m_closeCount;
log("file: c1a=%"INT32" c1b=%"INT32" c2a=%"INT32" c2b=%"INT32"",
// get current close counts. we can't access BigFile because it
// might have been deleted or closed on us, i saw this before.
int32_t cc1 = getCloseCount_r ( fstate->m_fd1 );
int32_t cc2 = getCloseCount_r ( fstate->m_fd2 );
if ( //! f1 ||
//! f2 ||
cc1 != fstate->m_closeCount1 ||
cc2 != fstate->m_closeCount2 ) {
// int32_t cc1 = -1;
// int32_t cc2 = -1;
// if ( f1 ) cc1 = f1->m_closeCount;
// if ( f2 ) cc2 = f2->m_closeCount;
log("file: c1a=%"INT32" c1b=%"INT32" "
"c2a=%"INT32" c2b=%"INT32"",
cc1,fstate->m_closeCount1,
cc2,fstate->m_closeCount2);
@ -1171,6 +1294,36 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
if ( doWrite ) n = pwrite ( fd , p , len , localOffset );
else n = pread ( fd , p , len , localOffset );
// debug msg
if ( g_conf.m_logDebugDisk ) {
char *s = "read";
if ( fstate->m_doWrite ) s = "wrote";
char *t = "no"; // are we blocking?
if ( fstate->m_flags & O_NONBLOCK ) t = "yes";
// this is bad for real-time threads cuz our unlink() routine
// may have been called by RdbMerge and our m_files may be
// altered
// MDW: don't access m_this in case bigfile was deleted
// since we are in a thread
log("disk::readwrite: %s %i bytes of %i @ offset %i "
//"from BASEfile=%s "
"(nonBlock=%s) "
"fd %i "
"cc1=%i=?%i cc2=%i=?%i errno=%s",
s,n,len,localOffset,
//fstate->m_this->getFilename(),
t,
fd,
(int)fstate->m_closeCount1 ,
(int)getCloseCount_r ( fstate->m_fd1 ) ,
(int)fstate->m_closeCount2 ,
(int)getCloseCount_r ( fstate->m_fd2 ) ,
mstrerror(errno) );
//log("disk::readwrite_r: %s %"INT32" bytes (nonBlock=%s)",
//s,n,t);
//log("disk::readwrite_r: did %"INT32" bytes", n);
}
// interrupted system call?
if ( n < 0 && errno == EINTR )
goto retry25;
@ -1178,28 +1331,20 @@ bool readwrite_r ( FileState *fstate , ThreadEntry *t ) {
// this is thread safe...
g_lastDiskReadCompleted = g_now; // gettimeofdayInMilliseconds_r();
// debug msg
//char *s = "read";
//if ( fstate->m_doWrite ) s = "wrote";
//char *t = "no"; // are we blocking?
//if ( fstate->m_this->getFlags() & O_NONBLOCK ) t = "yes";
// this is bad for real-time threads cuz our unlink() routine may
// have been called by RdbMerge and our m_files may be altered
//log("disk::readwrite: %s %"INT32" bytes from %s(nonBlock=%s)",s,n,
// m_files[filenum]->getFilename(),t);
//log("disk::readwrite_r: %s %"INT32" bytes (nonBlock=%s)", s,n,t);
//log("disk::readwrite_r: did %"INT32" bytes", n);
// . if n is 0 that's strange!!
// . i think the fd will have been closed and re-opened on us if this
// happens... usually
if (n==0 && len > 0 ) {
log("disk: Read of %"INT32" bytes at offset %"INT64" for %s "
"failed because file is too int16_t for that "
// MDW: don't access m_this in case bigfile was deleted
// since we are in a thread
log("disk: Read of %"INT32" bytes at offset %"INT64" "
" failed because file is too short for that "
"offset? Our fd was probably stolen from us by another "
"thread. Will retry. error=%s.",
(int32_t)len,fstate->m_offset,
fstate->m_this->getFilename(),mstrerror(errno));
//fstate->m_this->getDir(),
//fstate->m_this->getFilename(),
mstrerror(errno));
errno = EBADENGINEER;
return false; // log("disk::read/write: offset too big");
}
@ -1259,7 +1404,7 @@ bool BigFile::unlink ( ) {
}
bool BigFile::move ( char *newDir ) {
return rename ( m_baseFilename , newDir );
return rename ( m_baseFilename.getBufStart() , newDir );
}
bool BigFile::rename ( char *newBaseFilename , char *newBaseFilenameDir ) {
@ -1330,6 +1475,21 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
// into the trash subdir, so we must preserve the full path
char *s ;
while( (s=strchr(newBaseFilename,'/'))) newBaseFilename = s+1;
// now this is dynamic to save mem when we have 100,000+ files
m_newBaseFilename .reset();
m_newBaseFilenameDir.reset();
m_newBaseFilename .setLabel("nbfn");
m_newBaseFilenameDir.setLabel("nbfnd");
if ( ! m_newBaseFilename.safeStrcpy ( newBaseFilename ) )
return false;
if ( ! m_newBaseFilenameDir.safeStrcpy ( newBaseFilenameDir ) )
return false;
// in case newBaseFilenameDir was NULL
m_newBaseFilenameDir.nullTerm();
// close all files -- they close themselves when we call rename
// close ();
// . set a new base filename for us
@ -1338,12 +1498,12 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
// done (doneWrapper) it will call File::set.
// . when all renames have completed then
// m_bigFile::m_baseFilename will be set to m_newBaseFilename
strcpy ( m_newBaseFilename , newBaseFilename );
//strcpy ( m_newBaseFilename , newBaseFilename );
// save this guy
if ( newBaseFilenameDir )
strcpy ( m_newBaseFilenameDir , newBaseFilenameDir );
else
m_newBaseFilenameDir[0] = '\0';
//if ( newBaseFilenameDir )
// strcpy ( m_newBaseFilenameDir , newBaseFilenameDir );
//else
// m_newBaseFilenameDir[0] = '\0';
// set the op flag
m_isUnlink = false;
}
@ -1371,7 +1531,7 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
// break out if we should only unlink one part
if ( m_part >= 0 && i != m_part ) break;
// get the ith file to rename/unlink
File *f = m_files[i];
File *f = getFile2(i);
if ( ! f ) {
// one less part to do
m_partsRemaining--;
@ -1442,7 +1602,8 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
if ( m_isUnlink && part == -1 ) {
// release it first, cuz the removeThreads() below
// may call QUICKPOLL() and we end up reading from same file!
if ( m_pc ) m_pc->rmVfd ( m_vfd );
// this is no longer needed since we use rdbcache basically now
//if ( m_pc ) m_pc->rmVfd ( m_vfd );
// remove all queued threads that point to us that have not
// yet been launched
g_threads.m_threadQueues[DISK_THREAD].removeThreads(this);
@ -1452,7 +1613,9 @@ bool BigFile::unlinkRename ( // non-NULL for renames, NULL for unlinks
// if one blocked, we block, but never return false if !useThread
if ( m_numThreads > 0 && useThread ) return false;
// . if we launched no threads update OUR base filename right now
if ( ! m_isUnlink ) strcpy ( m_baseFilename , m_newBaseFilename );
//if ( ! m_isUnlink ) strcpy ( m_baseFilename , m_newBaseFilename );
if ( ! m_isUnlink )
m_baseFilename.set ( m_newBaseFilename.getBufStart() );
// we did not block
return true;
}
@ -1473,15 +1636,17 @@ void *renameWrapper_r ( void *state , ThreadEntry *t ) {
// . get the new full name for this file
// . based on m_dir/m_stripeDir and m_baseFilename
char newFilename [ 1024 ];
THIS->makeFilename_r ( THIS->m_newBaseFilename ,
THIS->m_newBaseFilenameDir ,
THIS->makeFilename_r ( THIS->m_newBaseFilename.getBufStart() ,
THIS->m_newBaseFilenameDir.getBufStart() ,
i ,
newFilename );
newFilename ,
1024 );
char oldFilename [ 1024 ];
THIS->makeFilename_r ( THIS->m_baseFilename ,
THIS->makeFilename_r ( THIS->m_baseFilename.getBufStart() ,
NULL ,
i ,
oldFilename );
oldFilename ,
1024 );
//if ( m_files[i]->rename ( newFilename ) ) continue;
// this returns 0 on success
if ( ::rename ( oldFilename , newFilename ) ) {
@ -1552,15 +1717,17 @@ void doneRenameWrapper ( void *state , ThreadEntry *t ) {
THIS->getFilename(),mstrerror(g_errno));
// get the ith file we just unlinked
int32_t i = f->m_i;
File *fi = THIS->getFile2 ( i );
// rename the part if it checks out
if ( f == THIS->m_files[i] ) {
if ( f == fi ) {
// set his new name
char newFilename [ 1024 ];
THIS->makeFilename_r ( THIS->m_newBaseFilename,
THIS->m_newBaseFilenameDir,
i,
newFilename);
THIS->m_files[i]->set ( newFilename );
THIS->makeFilename_r (THIS->m_newBaseFilename.getBufStart(),
THIS->m_newBaseFilenameDir.getBufStart(),
i,
newFilename ,
1024 );
fi->set ( newFilename );
}
// otherwise bitch about it
else log(LOG_LOGIC,"disk: Rename had bad file ptr.");
@ -1571,7 +1738,10 @@ void doneRenameWrapper ( void *state , ThreadEntry *t ) {
// return if more to do
if ( THIS->m_partsRemaining > 0 ) return;
// update OUR base filename now after all Files are renamed
strcpy ( THIS->m_baseFilename , THIS->m_newBaseFilename );
//strcpy ( THIS->m_baseFilename , THIS->m_newBaseFilename );
THIS->m_baseFilename.reset();
THIS->m_baseFilename.setLabel("nbfnn");
THIS->m_baseFilename.safeStrcpy(THIS->m_newBaseFilename.getBufStart());
// . all done, call the main callback
// . this is NULL if we were not called in a thread
if ( THIS->m_callback ) THIS->m_callback ( THIS->m_state );
@ -1595,7 +1765,8 @@ void doneUnlinkWrapper ( void *state , ThreadEntry *t ) {
int32_t i = f->m_i;
// . remove the part if it checks out
// . this will also close the file when it deletes it
if ( f == THIS->m_files[i] ) THIS->removePart ( i );
File *fi = THIS->getFile2(i);
if ( f == fi ) THIS->removePart ( i );
// otherwise bitch about it
else log(LOG_LOGIC,"disk: Unlink had bad file ptr.");
// bail if more to do
@ -1608,22 +1779,26 @@ void doneUnlinkWrapper ( void *state , ThreadEntry *t ) {
}
void BigFile::removePart ( int32_t i ) {
File *f = m_files[i];
//File *f = getFile2(i);
File **filePtrs = (File **)m_filePtrsBuf.getBufStart();
File *f = filePtrs[i];
// . thread should have stored the filename for unlinking
// . now delete it from memory
//f->destructor();
mdelete ( f , sizeof(File) , "BigFile" );
delete (f);
// and clear from our table
m_files[i] = NULL;
filePtrs[i] = NULL;
// we have one less part
m_numParts--;
// max part num may be different
if ( m_maxParts != i+1 ) return;
// set m_maxParts
int32_t j;
for ( j = i ; j >= 0 ; j-- )
if ( m_files[j] ) { m_maxParts = j+1; break; }
for ( j = i ; j >= 0 ; j-- ) {
File *fj = filePtrs[j];
if ( fj ) { m_maxParts = j+1; break; }
}
// may have no more part files left which means no max part num
if ( j < 0 ) m_maxParts = 0;
}
@ -1633,8 +1808,9 @@ void BigFile::removePart ( int32_t i ) {
// doesn't work.
bool BigFile::closeFds ( ) {
for ( int32_t i = 0 ; i < m_maxParts ; i++ ) {
if ( ! m_files[i] ) continue;
m_files[i]->close();
File *f = getFile2(i);
if ( ! f ) continue;
f->close();
}
return true;
}
@ -1645,28 +1821,41 @@ bool BigFile::close ( ) {
// this end up being called again through a sequence of like 20
// subroutines, so put a stop to that circle
m_isClosing = true;
File **filePtrs = (File **)m_filePtrsBuf.getBufStart();
for ( int32_t i = 0 ; i < m_maxParts ; i++ ) {
if ( ! m_files[i] ) continue;
m_files[i]->close();
mdelete ( m_files[i] , sizeof(File) , "BigFile" );
delete (m_files[i]);
m_files[i] = NULL;
File *f = filePtrs[i];
if ( ! f ) continue;
// remove from our array of File ptrs
filePtrs[i] = NULL;
// the destructor calls close, no need to call here
//f->close();
//f->destructor();
// if we were using the stack buf in BigFile then just
// call File::destructor()
if ( f == (File *)m_littleBuf ) {
f->destructor();
continue;
}
// otherwise, delete as we normally would
mdelete ( f , sizeof(File) , "BigFile" );
delete ( f );
}
m_numParts = 0;
m_maxParts = 0;
// save vfd and pc because removeThreads() actually ends up calling
// the done wrapper, sending back an error reply, shutting down the
// udp server, calling main.cpp::resetAll(), which resets the Rdb and
// free this big file
DiskPageCache *pc = m_pc;
int32_t vfd = m_vfd;
//DiskPageCache *pc = m_pc;
//int32_t vfd = m_vfd;
// remove all queued threads that point to us that have not
// yet been launched
g_threads.m_threadQueues[DISK_THREAD].removeThreads(this);
// release our pages from the DiskPageCache
//if ( m_pc ) m_pc->rmVfd ( m_vfd );
if ( pc ) pc->rmVfd ( vfd );
//if ( pc ) pc->rmVfd ( vfd );
return true;
}

View File

@ -23,10 +23,16 @@ ssize_t gbpwrite(int fd, const void *buf, size_t count, off_t offset);
//#define MAX_PART_SIZE (32LL*1024LL*1024LL)
// have enough part files to do a 2048gig file
#define MAX_PART_FILES (((2048LL*1000LL*1000LL*1000LL)/MAX_PART_SIZE)+1LL)
//#define MAX_PART_FILES (((2048LL*1000LL*1000LL*1000LL)/MAX_PART_SIZE)+1LL)
// HACK to save mem. support a 128GB file
//#define MAX_PART_FILES (((128LL*1000LL*1000LL*1000LL)/MAX_PART_SIZE)+1LL)
// debug define
//#define MAX_PART_FILES 100
#define LITTLEBUFSIZE 210
// use this state class for doing non-blocking reads/writes
#ifdef ASYNCIO
#include <aio.h> // TODO: use kaio, uses only 4 threads
@ -53,6 +59,8 @@ public:
void (*m_callback) ( void *state ) ;
// goes from 0 to 1, the lower the niceness, the higher the priority
int32_t m_niceness;
// was it found in the disk page cache?
char m_inPageCache;
// . if signal is still pending we need to know if BigFile got deleted
// . m_files must be NULL terminated
//class BigFile **m_files;
@ -76,15 +84,15 @@ public:
class DiskPageCache *m_pc;
// this is just used for accessing the DiskPageCache, m_pc, it is
// a "virtual fd" for this whole file
int32_t m_vfd;
int64_t m_vfd;
// test parms
//int32_t m_osize;
//char *m_obuf;
// for avoiding unlink/reopens while doing a threaded read
int32_t m_closeCount1 ;
int32_t m_closeCount2 ;
int32_t m_vfd1;
int32_t m_vfd2;
//int32_t m_vfd1;
//int32_t m_vfd2;
//char m_baseFilename[32];
int32_t m_flags;
@ -147,7 +155,9 @@ class BigFile {
int64_t getSize ( ) { return getFileSize(); };
// use the base filename as our filename
char *getFilename() { return m_baseFilename; };
char *getFilename() { return m_baseFilename.getBufStart(); };
char *getDir() { return m_dir.getBufStart(); };
// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -217,12 +227,12 @@ class BigFile {
// . opens the nth file if necessary to get it's fd
// . returns -1 if none, >=0 on success
int getfd ( int32_t n , bool forReading , int32_t *vfd = NULL );
int getfd ( int32_t n , bool forReading );//, int32_t *vfd = NULL );
// public for wrapper to call
//bool readwrite_r ( FileState *fstate );
int64_t m_currentOffset;
//int64_t m_currentOffset;
DiskPageCache *getDiskPageCache ( ) { return m_pc; };
int32_t getVfd ( ) { return m_vfd; };
@ -230,13 +240,12 @@ class BigFile {
// WARNING: some may have been unlinked from call to chopHead()
int32_t getNumParts ( ) { return m_numParts; };
File *getFile ( int32_t n ) { return m_files[n]; };
// makes the filename of part file #n
void makeFilename_r ( char *baseFilename ,
char *baseFilenameDir ,
int32_t n ,
char *buf );
char *buf ,
int32_t maxBufSize );
void removePart ( int32_t i ) ;
@ -253,17 +262,16 @@ class BigFile {
// number of parts remaining to be unlinked/renamed
int32_t m_partsRemaining;
// rename stores the new name here so we can rename the m_files[i]
// after the rename has completed and the rename thread returns
char m_newBaseFilename [256];
// if first char in this dir is 0 then use m_dir
char m_newBaseFilenameDir [256];
char m_tinyBuf[8];
// store our base filename here
char m_baseFilename [256];
// to hold the array of Files
SafeBuf m_filePtrsBuf;
// enough mem for our first File so we can avoid a malloc
char m_littleBuf[LITTLEBUFSIZE];
// ptrs to the part files
File *m_files [ MAX_PART_FILES ];
//File *m_files ;//[ MAX_PART_FILES ];
// private:
@ -299,8 +307,17 @@ class BigFile {
//bool unlinkPart ( int32_t n , bool block );
File *getFile2 ( int32_t n ) {
if ( n >= m_maxParts ) return NULL;
File **filePtrs = (File **)m_filePtrsBuf.getBufStart();
File *f = filePtrs[n];
//if ( ! f ->calledSet() ) return NULL;
// this will be NULL if addPart(n) never called
return f;
};
// if part file not created, will create it
File *getPartFile ( int32_t n ) { return m_files[n]; };
//File *getPartFile2 ( int32_t n ) { return getFile2(n); }
// . put a signal on the queue to do reading/writing
// . we call readwrite ( FileState *) when we handle the signal
@ -308,9 +325,19 @@ class BigFile {
bool reset ( );
// store our base filename here
char m_dir [256];
char m_stripeDir [256];
// for basefilename to avoid an alloc
char m_tmpBaseBuf[32];
// our most important the directory and filename
SafeBuf m_dir ;// [256];
SafeBuf m_baseFilename ;//[256];
// rename stores the new name here so we can rename the m_files[i]
// after the rename has completed and the rename thread returns
SafeBuf m_newBaseFilename ;// [256];
// if first char in this dir is 0 then use m_dir
SafeBuf m_newBaseFilenameDir ;//[256];
int32_t m_permissions;
int32_t m_flags;
@ -322,7 +349,7 @@ class BigFile {
class DiskPageCache *m_pc;
int32_t m_vfd;
bool m_vfdAllowed;
//bool m_vfdAllowed;
// prevent circular calls to BigFile::close() with this
char m_isClosing;

View File

@ -40,7 +40,7 @@ bool Blaster::init(){
log("blaster::hashinit failed" ); return 0; }
// init the memory class after conf since it gets maxMem from Conf
if ( ! g_mem.init ( 200000000 ) ) {
if ( ! g_mem.init ( ) ) {//200000000 ) ) {
log("blaster::Mem init failed" ); return 0; }
// start up log file
if ( ! g_log.init( "/tmp/blasterLog" ) ) {

View File

@ -41,9 +41,7 @@ bool Cachedb::init ( ) {
if ( ! m_pc.init ( m_name ,
m_rdbId, // RDB_CACHEDB,
pcmem ,
pageSize ,
true , // use shared mem?
false )) // minimizeDiskSeeks?
pageSize ))
return log("db: %s init failed.",m_name);
// init the rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,

View File

@ -36,6 +36,7 @@ Collectiondb::Collectiondb ( ) {
m_numRecs = 0;
m_numRecsUsed = 0;
m_numCollsSwappedOut = 0;
m_initializing = false;
//m_lastUpdateTime = 0LL;
m_needsSave = false;
// sanity
@ -88,16 +89,30 @@ bool Collectiondb::init ( bool isDump ) {
}
*/
extern bool g_inAutoSave;
// . save to disk
// . returns false if blocked, true otherwise
bool Collectiondb::save ( ) {
if ( g_conf.m_readOnlyMode ) return true;
if ( g_inAutoSave && m_numRecsUsed > 20 && g_hostdb.m_hostId != 0 )
return true;
// which collection rec needs a save
for ( int32_t i = 0 ; i < m_numRecs ; i++ ) {
if ( ! m_recs[i] ) continue;
// temp debug message
//logf(LOG_DEBUG,"admin: SAVING collection #%"INT32" ANYWAY",i);
if ( ! m_recs[i]->m_needsSave ) continue;
// if we core in malloc we won't be able to save the
// coll.conf files
if ( m_recs[i]->m_isCustomCrawl &&
g_inMemFunction &&
g_hostdb.m_hostId != 0 )
continue;
//log(LOG_INFO,"admin: Saving collection #%"INT32".",i);
m_recs[i]->save ( );
}
@ -111,6 +126,9 @@ bool Collectiondb::save ( ) {
//
///////////
bool Collectiondb::loadAllCollRecs ( ) {
m_initializing = true;
char dname[1024];
// MDW: sprintf ( dname , "%s/collections/" , g_hostdb.m_dir );
sprintf ( dname , "%s" , g_hostdb.m_dir );
@ -172,6 +190,8 @@ bool Collectiondb::loadAllCollRecs ( ) {
0 );
}
m_initializing = false;
// note it
//log(LOG_INFO,"db: Loaded data for %"INT32" collections. Ranging from "
// "collection #0 to #%"INT32".",m_numRecsUsed,m_numRecs-1);
@ -246,6 +266,26 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
char *xx=NULL;*xx=0;
}
// also try by #, i've seen this happen too
CollectionRec *ocr = getRec ( i );
if ( ocr ) {
g_errno = EEXIST;
log("admin: Collection id %i is in use already by "
"%s, so we can not add %s. moving %s to trash."
,(int)i,ocr->m_coll,coll,coll);
SafeBuf cmd;
int64_t now = gettimeofdayInMilliseconds();
cmd.safePrintf ( "mv coll.%s.%i trash/coll.%s.%i.%"UINT64
, coll
,(int)i
, coll
,(int)i
, now );
//log("admin: %s",cmd.getBufStart());
gbsystem ( cmd.getBufStart() );
return true;
}
// create the record in memory
CollectionRec *cr = new (CollectionRec);
if ( ! cr )
@ -312,6 +352,10 @@ bool Collectiondb::addNewColl ( char *coll ,
// to add the same collnum to every shard
collnum_t newCollnum ) {
//do not send add/del coll request until we are in sync with shard!!
// just return ETRYAGAIN for the parmlist...
// ensure coll name is legit
char *p = coll;
for ( ; *p ; p++ ) {
@ -996,7 +1040,9 @@ bool Collectiondb::deleteRec2 ( collnum_t collnum ) { //, WaitEntry *we ) {
bf.set ( bu.getBufStart() );
if ( bf.doesExist() ) bf.unlink();
}
// now remove from list of collections that might need a disk merge
removeFromMergeLinkedList ( cr );
//////
//
@ -1061,6 +1107,8 @@ bool Collectiondb::growRecPtrBuf ( collnum_t collnum ) {
return true;
}
m_recPtrBuf.setLabel ("crecptrb");
// . true here means to clear the new space to zeroes
// . this shit works based on m_length not m_capacity
if ( ! m_recPtrBuf.reserve ( need2 ,NULL, true ) ) {
@ -1684,6 +1732,8 @@ static CollectionRec g_default;
CollectionRec::CollectionRec() {
m_nextLink = NULL;
m_prevLink = NULL;
m_spiderCorruptCount = 0;
m_collnum = -1;
m_coll[0] = '\0';
@ -1900,7 +1950,7 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
gbmemcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
if ( ! g_conf.m_doingCommandLine )
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Loaded %s (%"INT32") local hasurlsready=%"INT32"",
m_coll,
(int32_t)m_collnum,
@ -1947,7 +1997,7 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// it is binary now
gbmemcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );
if ( ! g_conf.m_doingCommandLine )
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Loaded %s (%"INT32") global hasurlsready=%"INT32"",
m_coll,
(int32_t)m_collnum,
@ -1987,6 +2037,15 @@ bool CollectionRec::load ( char *coll , int32_t i ) {
// always turn off gigabits so &s=1000 can do summary skipping
if ( m_isCustomCrawl ) m_docsToScanForTopics = 0;
// make min to merge smaller than normal since most collections are
// small and we want to reduce the # of vfds (files) we have
if ( m_isCustomCrawl ) {
m_posdbMinFilesToMerge = 6;
m_titledbMinFilesToMerge = 4;
m_linkdbMinFilesToMerge = 3;
m_tagdbMinFilesToMerge = 2;
}
// always turn on distributed spider locking because otherwise
// we end up calling Msg50 which calls Msg25 for the same root url
// at the same time, thereby wasting massive resources. it is also
@ -3890,7 +3949,7 @@ bool CollectionRec::rebuildUrlFiltersDiffbot() {
// . it is also called on load of the collection at startup
bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! g_conf.m_doingCommandLine )
if ( ! g_conf.m_doingCommandLine && ! g_collectiondb.m_initializing )
log("coll: Rebuilding url filters for %s ufp=%s",m_coll,
m_urlFiltersProfile.getBufStart());

View File

@ -174,6 +174,7 @@ class Collectiondb {
int32_t m_numCollsSwappedOut;
bool m_initializing;
//int64_t m_lastUpdateTime;
};
@ -420,6 +421,9 @@ class CollectionRec {
int32_t m_dailyMergeStarted; // time_t
int32_t m_dailyMergeTrigger;
class CollectionRec *m_nextLink;
class CollectionRec *m_prevLink;
char m_dailyMergeDOWList[48];
int32_t m_treeCount;
@ -531,6 +535,7 @@ class CollectionRec {
char m_recycleContent ;
char m_recycleCatdb ;
char m_getLinkInfo ; // turn off to save seeks
char m_computeSiteNumInlinks ;
//char m_recycleLinkInfo2 ; // ALWAYS recycle linkInfo2?
//char m_useLinkInfo2ForQuality ;
char m_indexInlinkNeighborhoods;

View File

@ -12,6 +12,8 @@ Conf g_conf;
Conf::Conf ( ) {
m_save = true;
m_doingCommandLine = false;
// set max mem to 16GB at least until we load on disk
m_maxMem = 16000000000;
}
// . does this requester have ROOT admin privileges???
@ -285,9 +287,12 @@ bool Conf::init ( char *dir ) { // , int32_t hostId ) {
//}
// make sure g_mem.maxMem is big enough temporarily
if ( g_mem.m_maxMem < 10000000 ) g_mem.m_maxMem = 10000000;
g_conf.m_maxMem = 8000000000; // 8gb temp
bool status = g_parms.setFromFile ( this , fname , NULL , OBJ_CONF );
if ( g_conf.m_maxMem < 10000000 ) g_conf.m_maxMem = 10000000;
// if not there, create it!
if ( ! status ) {
log("gb: Creating %s from defaults.",fname);
@ -323,7 +328,7 @@ bool Conf::init ( char *dir ) { // , int32_t hostId ) {
// update g_mem
//g_mem.m_maxMem = g_conf.m_maxMem;
if ( ! g_mem.init ( g_conf.m_maxMem ) ) return false;
if ( ! g_mem.init ( ) ) return false;
// always turn this off
g_conf.m_testMem = false;
// and this, in case you forgot to turn it off
@ -527,7 +532,9 @@ bool Conf::save ( ) {
g_conf.m_testMem = false;
//char fname[1024];
//sprintf ( fname , "%sgb.conf.saving", g_hostdb.m_dir );
SafeBuf fn;
// fix so if we core in malloc/free we can still save conf
char fnbuf[1024];
SafeBuf fn(fnbuf,1024);
fn.safePrintf("%sgb.conf",g_hostdb.m_dir);
bool status = g_parms.saveToXml ( (char *)this ,
fn.getBufStart(),

1
Conf.h
View File

@ -653,6 +653,7 @@ class Conf {
bool m_logDebugDb ;
bool m_logDebugDirty ;
bool m_logDebugDisk ;
bool m_logDebugDiskPageCache;
bool m_logDebugDns ;
bool m_logDebugDownloads;
bool m_logDebugFacebook;

View File

@ -50,7 +50,10 @@ bool Dir::open ( ) {
close ( );
if ( ! m_dirname ) return false;
retry8:
// opendir() calls malloc
g_inMemFunction = true;
m_dir = opendir ( m_dirname );
g_inMemFunction = false;
// interrupted system call
if ( ! m_dir && errno == EINTR ) goto retry8;
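// note (inferred from usage): g_inMemFunction brackets libc calls that
// may malloc, like opendir() above, so the core-dump save path (see
// Collectiondb::save) knows the allocator may be in an inconsistent
// state and skips work that would re-enter it.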

File diff suppressed because it is too large.

View File

@ -1,42 +1,13 @@
// Matt Wells, Copyright Jan 2004
// Matt Wells, Copyright Jan 2004-2015
// . each Rdb has its own m_pageCache member
// . a ptr to this class is passed to all File::open() calls
// . that ptr is stored in the File class as File::m_pageCachePtr
// . the File class uses the virtual file descriptor, vfd, for use with
// the pageCache since we tend to open and close files a lot when we run
// out of actual fds
// . every subsequent read/write to that file will then use the pageCache
// . before doing a read in File::read() we try to increase the offset
// by filling the beginning of the buffer with data from the page cache.
// We also try to decrease the bytes to read by filling the end of the
// buffer. What is left to actually read, if anything, is the middle.
// . after File::read() completes it call DiskPageCache::storePages (buf,size,off)
// to fill the page cache.
// . when maxMem is reached, the DiskPageCache will kick out
// infrequently used pages using a linked list
// . when File class releases its vfd it must call m_pageCachePtr->close(vfd)
// . now we just use RdbCache
// . when a BigFile is first opened we assign it a unique 'vfd' (virtual fd)
// . to make the rdbcache key we hash this vfd with the read offset and size
// . we use PAGESIZE defined in RdbMap.h as our page size
// . TODO: convert PAGESIZE to 8000 not 8192
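// a hedged sketch (an assumption, not necessarily the shipped code) of
// the keying scheme described above, assuming gb's 96-bit key_t
// (int64_t n0 / int32_t n1) and the hash64() helper:
//
//   key_t makeCacheKey ( int64_t vfd , int64_t off , int64_t size ) {
//           key_t k;
//           k.n0 = hash64 ( (char *)&off  , 8 , (uint64_t)vfd );
//           k.n0 = hash64 ( (char *)&size , 8 , k.n0 );
//           k.n1 = (int32_t)vfd;
//           return k;
//   }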
#ifndef PAGECACHE_H
#define PAGECACHE_H
#ifndef _PAGECACHE_H_
#define _PAGECACHE_H_
// . use 128 disk megabytes per set of pages
// . this MUST be a multiple of (PAGE_SIZE+HEADERSIZE) now
//#define PAGE_SET_SIZE (128*1024*1024)
//#define PHSIZE (GB_PAGE_SIZE+HEADERSIZE)
//#define PAGE_SET_SIZE (((128*1024*1024)/PHSIZE)*PHSIZE)
// how many page sets can we have?
#define MAX_PAGE_SETS 128
// how many BigFiles can be using the same DiskPageCache?
#include "File.h"
#define MAX_NUM_VFDS2 MAX_NUM_VFDS
extern void freeAllSharedMem ( int32_t max );
#include "RdbCache.h"
class DiskPageCache {
@ -48,180 +19,39 @@ class DiskPageCache {
// returns false and sets g_errno if unable to alloc the memory,
// true otherwise
bool init ( const char *dbname ,
char rdbId, // use 0 for none
int32_t maxMem ,
int32_t pageSize,
bool useRAMDisk = false,
bool minimizeDiskSeeks = false );
// int32_t maxMem ,
// void (*getPages2)(DiskPageCache*, int32_t, char*, int32_t,
// int64_t, int32_t*, int64_t*) = NULL,
// void (*addPages2)(DiskPageCache*, int32_t, char*, int32_t,
// int64_t) = NULL,
// int32_t (*getVfd2)(DiskPageCache*, int64_t) = NULL,
// void (*rmVfd2)(DiskPageCache*, int32_t) = NULL );
bool initRAMDisk( const char *dbname, int32_t maxMem );
int32_t getMemUsed () ;
int32_t getMemAlloced () { return m_memAlloced; };
int32_t getMemMax () { return m_maxMem; };
int64_t getNumHits () { return m_hits; };
int64_t getNumMisses () { return m_misses; };
void resetStats () { m_hits = 0 ; m_misses = 0; };
// verify each page in cache for this file is what is on disk
bool verifyData ( class BigFile *f );
bool verifyData2 ( int32_t vfd );
void disableCache ( ) { m_enabled = false; };
void enableCache ( ) { m_enabled = true; };
// . grow/shrink m_memOff[] which maps vfd/page to a mem offset
// . returns false and sets g_errno on error
// . called by DiskPageCache::open()/close() respectively
// . maxFileSize is so we can alloc m_memOff[vfd] big enough for all
// pages that are in or will be in the file (if it is being created)
int32_t getVfd ( int64_t maxFileSize, bool vfdAllowed );
void rmVfd ( int32_t vfd );
bool init ( const char *dbname ,
char rdbId ,
int64_t maxMem ,
int32_t pageSize );
// . this returns a malloc'd buffer iff the ENTIRE read is in
// the page cache, NULL otherwise (see BigFile::readwrite)
void getPages ( int32_t vfd ,
char **buf ,
int32_t numBytes ,
int64_t offset ,
int32_t *newNumBytes ,
int64_t *newOffset ,
char **allocBuf , //we alloc this if buf==NULL
int32_t *allocSize , //size of the alloc
int32_t allocOff );
char *getPages ( int64_t vfd ,
int64_t offset ,
int64_t readSize );
// after you read/write from/to disk, copy into the page cache
void addPages ( int32_t vfd, char *buf , int32_t numBytes, int64_t offset,
int32_t niceness );
// used for minimize disk seeks
bool m_minimizeDiskSeeks;
int32_t m_diskPageSize;
private:
void addPage (int32_t vfd,int32_t pageNum,char *page,int32_t size,int32_t skip);
void enhancePage ( int32_t poff,char *page,int32_t size,int32_t skip) ;
void promotePage ( int32_t poff , bool isNew ) ;
void excisePage ( int32_t poff ) ;
bool growCache ( int32_t mem );
//bool needsMerge();
void writeToCache ( int32_t memOff, int32_t memPageOff, void *inBuf,
int32_t size );
void readFromCache( void *outBuf, int32_t memOff, int32_t memPageOff,
int32_t size );
char *getMemPtrFromMemOff ( int32_t off );
// . the pages are here
// . there are 1024 page sets
// . each page set can have up to 128 megabytes of pages
// . much more than that and pthread_create() fails
char *m_pageSet [ MAX_PAGE_SETS ];
int32_t m_pageSetSize [ MAX_PAGE_SETS ];
int32_t m_numPageSets;
// . next available page offset
// . when storing a page we read from disk into a pageSet we first
// try to get a memory offset from m_availMemOff, if none are there
// then we use m_nextMemOff and increment it by PAGE_SIZE+HEADERSIZE
// . if m_nextMemOff would breech m_upperMemOff then we call
// growCache to increase m_upperMemOff
// . we try to grow 100k with each call to growCache
// . if m_upperMemOff would breech m_maxMem, then we kick out the
// least used page using
// . we store a linked list in bytes 4-12 of each page in memory
int32_t m_nextMemOff; // next available mem offset to hold a page
int32_t m_upperMemOff; // how many bytes are allocated in page sets?
int32_t m_maxMem; // max we can allocate
// . available offsets of released pages
// . offsets are into the page sets, m_pageSet[]
int32_t *m_availMemOff;
int32_t m_numAvailMemOffs;
int32_t m_maxAvailMemOffs;
// . m_memOffFromDiskPage[vfd][diskPageNum] --> memOff
// . maps a vfd and disk page number to a memory offset
// . maps to -1 if not in page cache
// . try to keep the number of pages down, under 100,000
// . 100,000 pages would be about 800 megabytes
// . I am only planning on using this for tfndb and Checksumdb so
// we should be under or around this limit
int32_t *m_memOffFromDiskPage [ MAX_NUM_VFDS2 ];
// . how many offsets are in m_memOffFromDiskPage?
// . we have one offset per page in the file
int32_t m_maxPagesInFile [ MAX_NUM_VFDS2 ];
// max number of pages that this file shall have
int32_t m_maxPagesPerFile [ MAX_NUM_VFDS2 ];
// max number of pages of file currently in the cache
int32_t m_numPagesPresentOfFile[ MAX_NUM_VFDS2 ];
// mem that has not been used
int32_t m_memFree;
// how much memory is currently allocated?
int32_t m_memAlloced;
// stats (partial hits/misses supported)
int64_t m_hits;
int64_t m_misses;
// . linked list boundary info
// . linked list is actually stored in bytes 2-8 (next/prev) of pages
// in memory
int32_t m_headOff;
int32_t m_tailOff;
// for selecting the next vfd in line and preventing sudden closing
// and opening of a vfd, resulting in a thread returning and calling
// addPages() for the wrong file!!
int32_t m_nexti;
bool addPages ( int64_t vfd ,
int64_t offset ,
int64_t readSize ,
char *buf ,
char niceness );
void enableCache () { m_enabled = true ; };
void disableCache() { m_enabled = false; };
bool m_enabled;
int32_t m_maxPageSetSize;
const char *m_dbname;
int32_t m_pageSize;
char m_rdbId;
bool *m_switch;
char m_dbname[64];
char m_memTag[16];
RdbCache m_rc;
//bool m_useRAMDisk;
//bool m_useSHM;
//int m_ramfd;
//int m_shmids [ 4096 ];
//int32_t m_shmidSize [ 4096 ];
//int32_t m_numShmids;
//int32_t m_maxAllocSize;
//int32_t m_spageSize;
// for overriding the disk page cache with custom functions
//bool m_isOverriden;
//void (*m_getPages2)(DiskPageCache*, int32_t, char*, int32_t, int64_t,
// int32_t*, int64_t*);
//void (*m_addPages2)(DiskPageCache*, int32_t, char*, int32_t, int64_t);
//int32_t (*m_getVfd2)(DiskPageCache*, int64_t);
//void (*m_rmVfd2)(DiskPageCache*, int32_t);
int64_t getNumHits () { return m_rc.getNumHits(); }
int64_t getNumMisses () { return m_rc.getNumMisses(); }
int64_t getMemUsed () { return m_rc.getMemOccupied(); }
int64_t getMemAlloced() { return m_rc.getMemAlloced(); }
};
#endif

View File

@ -194,6 +194,7 @@ case EJSONMISSINGLASTCURLY: return "JSON was missing last curly bracket";
case EADMININTERFERENCE: return "Administrative interference";
case EDNSERROR : return "DNS lookup error";
case ETHREADSDISABLED:return "Threads Disabled";
case EMALFORMEDQUERY: return "Malformed query";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

View File

@ -198,6 +198,7 @@ enum {
EJSONMISSINGLASTCURLY,
EADMININTERFERENCE,
EDNSERROR ,
ETHREADSDISABLED
ETHREADSDISABLED,
EMALFORMEDQUERY
};
#endif

569
File.cpp

File diff suppressed because it is too large.

38
File.h
View File

@ -23,7 +23,8 @@
// . man, chris has 958 files, lets crank it up from 2k to 5k
// . boost up to 50,000 since we are hitting this limit with crawlbot
// . we are hitting again with crawlbot, boost to 200k from 50k
#define MAX_NUM_VFDS (200*1024)
// . TODO: make this dynamically allocate based on need
//#define MAX_NUM_VFDS (1024*1024)
#include <sys/types.h> // for open/lseek
#include <sys/stat.h> // for open
@ -31,18 +32,21 @@
#include <sys/stat.h> // for stat
#include "Mem.h" // for g_mem
#include "Loop.h" // for g_loop.setNonBlocking(int fd)
#include "SafeBuf.h"
int64_t getFileSize ( char *filename ) ;
int64_t getFileSize_cygwin ( char *filename ) ;
// for avoiding unlink/opens that mess up our threaded read
int32_t getCloseCount_r ( int fd );
// prevent fd from being closed on us when we are writing
void enterWriteMode ( int32_t vfd ) ;
void exitWriteMode ( int32_t vfd ) ;
void enterWriteMode ( int fd ) ;
void exitWriteMode ( int fd ) ;
// error correction routine used by BigFile.cpp
void releaseVfd ( int32_t vfd ) ;
int getfdFromVfd ( int32_t vfd ) ;
//void releaseVfd ( int32_t vfd ) ;
//int getfdFromVfd ( int32_t vfd ) ;
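// illustrative usage of the close-count guard (see BigFile.cpp's
// readwrite path): snapshot the count before handing the fd to a
// thread, re-check after; a mismatch means the fd was closed -- and
// possibly re-opened on a different file -- mid-read.
//
//   int32_t cc = getCloseCount_r ( fd );
//   // ... threaded pread() on fd ...
//   if ( getCloseCount_r ( fd ) != cc ) { /* retry with fresh fd */ }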
class File {
@ -56,6 +60,9 @@ class File {
File ( );
~File ( );
void constructor();
void destructor ();
// . if you don't need to do a full open then just set the filename
// . useful for unlink/rename/reserve/...
// . IMPORTANT: if bytes were already reserved can only increase the
@ -66,8 +73,8 @@ class File {
// returns false and sets errno on error, returns true on success
bool rename ( char *newFilename );
// if m_vfd is negative it's never been opened
bool isOpen () { return ( m_vfd >= 0 ); };
bool calledOpen () { return m_calledOpen; };
bool calledSet () { return m_calledSet; };
bool isNonBlocking () ;
@ -149,11 +156,15 @@ class File {
// return -1 if not opened, otherwise, return the opened fd
int getfdNoOpen ( ) ;
//char *getFilename ( ) { return m_filename.getBufStart(); };
char *getFilename ( ) { return m_filename; };
// our filename allocated with strdup
// we publicize for ease of use
char m_filename [ MAX_FILENAME_LEN ];
//SafeBuf m_filename;
//char m_filenameBuf [ MAX_FILENAME_LEN ];
// File::rename() uses this
//char m_oldFilename [ MAX_FILENAME_LEN ];
@ -174,18 +185,25 @@ class File {
bool closeLeastUsed ( );
// THIS file's VIRTUAL descriptor
int m_vfd;
//int m_vfd;
// now just the real fd. is -1 if not opened
int m_fd;
// save the permission and flag sets in case of re-opening
int m_flags;
int m_permissions;
char m_calledOpen;
char m_calledSet;
time_t m_st_mtime; // file last mod date
int32_t m_st_size; // file size
time_t getLastModifiedDate ( ) ;
class File *m_nextActive;
class File *m_prevActive;
//class File *m_nextActive;
//class File *m_prevActive;
};

View File

@ -457,6 +457,9 @@ bool HashTableX::load ( char *dir, char *filename, char **tbuf, int32_t *tsize )
// bogus key size?
if ( ks <= 0 ) {
// is very common for this file so skip it
if ( strstr(filename,"ipstouseproxiesfor.dat") )
return false;
log("htable: reading hashtable from %s%s: "
"bogus keysize of %"INT32"",
dir,filename,ks );

View File

@ -2633,6 +2633,9 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("# List of hosts. Limited to 512 from MAX_HOSTS in Hostdb.h. Increase that\n");
sb.safePrintf("# if you want more.\n");
sb.safePrintf("#\n");
/*
sb.safePrintf("# Format:\n");
sb.safePrintf("#\n");
sb.safePrintf("# first column: hostID (starts at 0 and increments from there)\n");
@ -2672,6 +2675,7 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("# The working directory is the last string on each line. That is where the\n");
sb.safePrintf("# 'gb' binary resides.\n");
sb.safePrintf("#\n");
*/
sb.safePrintf("#\n");
sb.safePrintf("# Example of a four-node distributed search index running on a single\n");
@ -2680,7 +2684,7 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("# different ports for each gb instance since they are all on the same\n");
sb.safePrintf("# server.\n");
sb.safePrintf("#\n");
sb.safePrintf("# Use './gb 2' to run as the host on IP 1.2.3.8 for example.\n");
//sb.safePrintf("# Use './gb 2' to run as the host on IP 1.2.3.8 for example.\n");
sb.safePrintf("#\n");
sb.safePrintf("#0 5998 7000 8000 9000 1.2.3.4 1.2.3.5 /home/mwells/host0/\n");
sb.safePrintf("#1 5997 7001 8001 9001 1.2.3.4 1.2.3.5 /home/mwells/host1/\n");
@ -2707,6 +2711,7 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("#5 5998 7000 8000 9000 se5 se5b /home/mwells/gigablast/\n");
sb.safePrintf("#6 5998 7000 8000 9000 se6 se6b /home/mwells/gigablast/\n");
sb.safePrintf("#7 5998 7000 8000 9000 se7 se7b /home/mwells/gigablast/\n");
/*
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("# Proxies\n");
@ -2732,6 +2737,7 @@ bool Hostdb::createHostsConf( char *cwd ) {
sb.safePrintf("# Example:\n");
sb.safePrintf("# A proxy will be running on 10.5.66.18:\n");
sb.safePrintf("#proxy 6001 7001 8001 9001 10.5.66.18\n");
*/
log("%shosts.conf does not exist, creating.",cwd);
sb.save ( cwd , "hosts.conf" );


@ -1543,7 +1543,7 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// case, as it is only set to true in TcpServer::readSocketWrapper()
// which should never be called by TcpServer::sendMsg() above.
// so let cleanUp know it is no longer valid
if ( ! f->isOpen() ) f->open( O_RDONLY );
if ( ! f->calledOpen() ) f->open( O_RDONLY );
int fd = f->getfd();
cleanUp ( f , NULL/*TcpSocket */ );
// . AND we need to do this ourselves here
@ -2249,7 +2249,7 @@ int32_t getMsgPiece ( TcpSocket *s ) {
char *p = s->m_sendBuf;
char *pend = p + s->m_sendBufUsed;
// skip if not a doc.234567 filename format
if ( ! gb_strcasestr(f->m_filename,"/doc." ) ) p = pend;
if ( ! gb_strcasestr(f->getFilename(),"/doc." ) ) p = pend;
// do the replace
for ( ; p < pend ; p++ ) {
if ( strncasecmp(p,"google",6)) continue;


@ -112,9 +112,7 @@ bool Indexdb::init ( ) {
if ( ! m_pc.init ( "indexdb",
RDB_INDEXDB,
pcmem ,
pageSize ,
true , // use RAM disk?
false )) // minimize disk seeks?
pageSize ))
return log("db: Indexdb init failed.");
// . set our own internal rdb


@ -378,6 +378,9 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , int32_t niceness ) {
if ( mem != memEnd )
log("json: json parser reallocated buffer. inefficient.");
// return NULL if no json items were found
if ( m_sb.length() <= 0 ) return NULL;
return (JsonItem *)m_sb.getBufStart();
}

Json.h

@ -90,6 +90,7 @@ class Json {
JsonItem *m_stack[MAXJSONPARENTS];
int32_t m_stackPtr;
class JsonItem *m_prev;
void reset() { m_sb.purge(); };
};
#endif


@ -120,9 +120,7 @@ bool Linkdb::init ( ) {
if ( ! m_pc.init ( "linkdb" ,
RDB_LINKDB,
pcmem ,
pageSize ,
true , // use shared mem?
false )) // minimizeDiskSeeks?
pageSize ))
return log("db: Linkdb init failed.");
// init the rdb
return m_rdb.init ( g_hostdb.m_dir ,
@ -716,6 +714,14 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
// used by sendReply()
req->m_udpSlot = slot;
if ( g_conf.m_logDebugLinkInfo && req->m_mode == MODE_SITELINKINFO ) {
log("linkdb: got msg25 request sitehash64=%"INT64" "
"site=%s "
,req->m_siteHash64
,req->ptr_site
);
}
// set up the hashtable if our first time
if ( ! g_lineTable.isInitialized() )
g_lineTable.set ( 8,sizeof(Msg25Request *),256,
@ -740,7 +746,8 @@ void handleRequest25 ( UdpSlot *slot , int32_t netnice ) {
req->m_next = head->m_next;
head->m_next = req;
// note it for debugging
log("build: msg25 request waiting in line for %s slot=0x%"PTRFMT"",
log("build: msg25 request waiting in line for %s "
"udpslot=0x%"PTRFMT"",
req->ptr_url,(PTRTYPE)slot);
// we will send a reply back for this guy when done
// getting the reply for the head msg25request
@ -1118,9 +1125,9 @@ bool Msg25::doReadLoop ( ) {
if ( g_conf.m_logDebugLinkInfo ) {
char *ms = "page";
if ( m_mode == MODE_SITELINKINFO ) ms = "site";
log("msg25: getting full linkinfo mode=%s site=%s url=%s "
"docid=%"INT64"",
ms,m_site,m_url,m_docId);
log("msg25: reading linkdb list mode=%s site=%s url=%s "
"docid=%"INT64" linkdbstartkey=%s",
ms,m_site,m_url,m_docId,KEYSTR(&startKey,LDBKS));
}
m_gettingList = true;
@ -2310,8 +2317,9 @@ bool Msg25::gotLinkText ( Msg20Request *req ) { // LinkTextReply *linkText ) {
}
// debug
if ( g_conf.m_logDebugLinkInfo ) {
log("linkdb: recalling round=%"INT32" for %s=%s",
m_round,ms,m_site);
log("linkdb: recalling round=%"INT32" for %s=%s "
"req=0x%"PTRFMT" numlinkerreplies=%"INT32,
m_round,ms,m_site,(PTRTYPE)m_req25,m_numReplyPtrs);
}
// and re-call. returns true if did not block.
// returns true with g_errno set on error.

Log.cpp

@ -222,6 +222,8 @@ bool Log::shouldLog ( int32_t type , char *msg ) {
return true;
}
bool g_loggingEnabled = true;
// 1GB max log file size
#define MAXLOGFILESIZE 1000000000
// for testing:
@ -233,6 +235,8 @@ bool Log::logR ( int64_t now , int32_t type , char *msg , bool asterisk ,
// filter if we should
//if ( forced ) goto skipfilter;
if ( ! g_loggingEnabled )
return true;
// return true if we should not log this
if ( ! forced && ! shouldLog ( type , msg ) ) return true;
// skipfilter:
@ -367,7 +371,7 @@ bool Log::logR ( int64_t now , int32_t type , char *msg , bool asterisk ,
// . if filesize would be too big then make a new log file
// . should make a new m_fd
if ( m_logFileSize + tlen+1 > MAXLOGFILESIZE )
if ( m_logFileSize + tlen+1 > MAXLOGFILESIZE && g_conf.m_runAsDaemon )
makeNewLogFile();
if ( m_fd >= 0 ) {
@ -398,9 +402,17 @@ bool Log::logR ( int64_t now , int32_t type , char *msg , bool asterisk ,
}
bool Log::makeNewLogFile ( ) {
// prevent deadlock. don't log since we are in the middle of logging.
// otherwise, safebuf, which is used when renaming files, might
// call logR().
g_loggingEnabled = false;
// . rename old log file like log000 to log000-2013_11_04-18:19:32
// . returns false on error
if ( ! renameCurrentLogFile() ) return false;
bool status = renameCurrentLogFile();
// re-enable logging since nothing below should call logR() indirectly
g_loggingEnabled = true;
if ( ! status ) return false;
// close old fd
if ( m_fd >= 0 ) ::close ( m_fd );
// invalidate
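
The g_loggingEnabled toggle above is a reentrancy guard: renaming the log file goes through SafeBuf, SafeBuf can log, and a log call in the middle of rotation would recurse back into logR(). The same pattern in isolation, as a minimal sketch (renameAndReopen() is a hypothetical stand-in for the rename logic):

    static bool s_rotating = false;

    bool logLine ( const char *msg ) {
        // drop messages emitted while rotating, otherwise rotation
        // would recurse into the logger it is rotating
        if ( s_rotating ) return true;
        // ... format msg and write() it to the log fd ...
        return true;
    }

    bool rotateLog ( ) {
        s_rotating = true;
        bool status = renameAndReopen(); // hypothetical; may log
        s_rotating = false;
        return status;
    }
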


@ -1017,6 +1017,11 @@ void printStackTrace ( int signum , siginfo_t *info , void *ptr ) {
logf(LOG_DEBUG,"gb: seg fault. printing stack trace. use "
"'addr2line -e gb' to decode the hex below.");
if ( g_inMemFunction ) {
logf(LOG_DEBUG,"gb: in mem function not doing backtrace");
return;
}
static void *s_bt[200];
int sz = backtrace(s_bt, 200);
//char **strings = backtrace_symbols(s_bt, sz);


@ -32,7 +32,7 @@ OBJS = UdpSlot.o Rebalance.o \
Msg39.o Msg3.o \
Msg22.o \
Msg20.o Msg2.o \
Msg1.o Msg35.o \
Msg1.o \
Msg0.o Mem.o Matches.o Loop.o \
Log.o Lang.o \
Indexdb.o Posdb.o Clusterdb.o IndexList.o Revdb.o \
@ -86,7 +86,10 @@ STATIC :=
XMLDOCOPT := -O2
else
OS_DEB := true
STATIC := -static
# let's remove static now by default to be safe because we don't always
# detect red hat installs like on aws. do 'make static' to build statically.
#STATIC := -static
STATIC :=
# MDW: i get some parsing inconsistencies when running the first qa injection
# test if this is -O3. strange.
# now debian jesse doesn't like -O3, it will core right away when spidering
@ -110,11 +113,13 @@ LIBS = ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a
# are we a 32-bit architecture? use different libraries then
else ifeq ($(ARCH), i686)
CPPFLAGS= -m32 -g -Wall -pipe -fno-stack-protector -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -DPTHREADS -Wno-unused-but-set-variable $(STATIC)
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
#LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
LIBS= -lm -lpthread -lssl -lcrypto ./libiconv.a ./libz.a
else ifeq ($(ARCH), i386)
CPPFLAGS= -m32 -g -Wall -pipe -fno-stack-protector -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -DPTHREADS -Wno-unused-but-set-variable $(STATIC)
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
#LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
LIBS= -lm -lpthread -lssl -lcrypto ./libiconv.a ./libz.a
else
#
@ -190,6 +195,9 @@ vclean:
gb: vclean $(OBJS) main.o $(LIBFILES)
$(CC) $(DEFS) $(CPPFLAGS) -o $@ main.o $(OBJS) $(LIBS)
static: vclean $(OBJS) main.o $(LIBFILES)
$(CC) $(DEFS) $(CPPFLAGS) -static -o gb main.o $(OBJS) $(LIBS)
# use this for compiling on CYGWIN:
# only for 32bit cygwin right now and

Mem.cpp

@ -12,6 +12,9 @@
//#include "Stats.h"
#include "Pages.h"
// uncomment this #define to electric fence just on umsg00 buffers:
//#define SPECIAL
// put me back
//#define EFENCE
//#define EFENCE_SIZE 50000
@ -27,6 +30,8 @@
#undef calloc
#undef realloc
bool g_inMemFunction = false;
// from malloc.c (dlmalloc)
//void *dlmalloc(size_t);
//void dlfree(void*);
@ -68,7 +73,7 @@ extern bool g_isYippy;
bool freeCacheMem();
#if defined(EFENCE) || defined(EFENCE_SIZE)
#if defined(EFENCE) || defined(EFENCE_SIZE) || defined(SPECIAL)
static void *getElecMem ( int32_t size ) ;
static void freeElecMem ( void *p ) ;
#endif
@ -246,14 +251,21 @@ void * operator new (size_t size) throw (std::bad_alloc) {
//if ( ! g_stats.m_gotLock || g_threads.amThread() ) mutexLock();
//else unlock = false;
// hack so hostid #0 can use more mem
int64_t max = g_conf.m_maxMem;
//if ( g_hostdb.m_hostId == 0 ) max += 2000000000;
// don't go over max
if ( g_mem.m_used + (int32_t)size >= g_mem.m_maxMem &&
g_mem.m_maxMem > 1000000 ) {
if ( g_mem.m_used + (int32_t)size >= max &&
g_conf.m_maxMem > 1000000 ) {
log("mem: new(%"UINT32"): Out of memory.", (uint32_t)size );
//if ( unlock ) mutexUnlock();
throw std::bad_alloc();
//throw 1;
}
g_inMemFunction = true;
#ifdef EFENCE
void *mem = getElecMem(size);
#elif EFENCE_SIZE
@ -266,6 +278,9 @@ void * operator new (size_t size) throw (std::bad_alloc) {
//void *mem = dlmalloc ( size );
void *mem = sysmalloc ( size );
#endif
g_inMemFunction = false;
int32_t memLoop = 0;
newmemloop:
//void *mem = s_pool.malloc ( size );
@ -334,13 +349,20 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
// // return NULL; }
//}
// hack so hostid #0 can use more mem
int64_t max = g_conf.m_maxMem;
//if ( g_hostdb.m_hostId == 0 ) max += 2000000000;
// don't go over max
if ( g_mem.m_used + (int32_t)size >= g_mem.m_maxMem &&
g_mem.m_maxMem > 1000000 ) {
if ( g_mem.m_used + (int32_t)size >= max &&
g_conf.m_maxMem > 1000000 ) {
log("mem: new(%"UINT32"): Out of memory.", (uint32_t)size );
throw std::bad_alloc();
//throw 1;
}
g_inMemFunction = true;
#ifdef EFENCE
void *mem = getElecMem(size);
#elif EFENCE_SIZE
@ -354,6 +376,9 @@ void * operator new [] (size_t size) throw (std::bad_alloc) {
void *mem = sysmalloc ( size );
#endif
g_inMemFunction = false;
int32_t memLoop = 0;
newmemloop:
//void *mem = s_pool.malloc ( size );
@ -406,7 +431,7 @@ newmemloop:
Mem::Mem() {
m_used = 0;
// assume large max until this gets set for real
m_maxMem = 50000000;
//m_maxMem = 50000000;
m_numAllocated = 0;
m_numTotalAllocated = 0;
m_maxAlloc = 0;
@ -447,17 +472,16 @@ pid_t Mem::getPid() {
return s_pid;
}
bool Mem::init ( int64_t maxMem ) {
bool Mem::init ( ) { // int64_t maxMem ) {
// set main process pid
s_pid = getpid();
// . don't swap our memory out, man...
// . damn, linux 2.4.17 seems to crash the kernel sometimes w/ this
//if ( mlockall( MCL_CURRENT | MCL_FUTURE ) == -1 ) {
// log("Mem::init: mlockall: %s" , strerror(errno) );
// errno = 0;
//}
m_maxMem = maxMem;
//m_maxMem = maxMem;
// set it
//struct rlimit lim;
//lim.rlim_max = maxMem;
@ -530,9 +554,9 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
//validate();
// if ( note && note[0] == 'S' && note[1] == 'a' &&
// note[2] == 'f' && size == 13371521 )
// log("mem: got mystery safebuf");
// if ( note && note[0] == 'S' && note[1] == 'a' &&
// note[2] == 'f' && size == 1179 )
// log("mem: got mystery safebuf");
//m_memtablesize = 0;//DMEMTABLESIZE;
@ -542,7 +566,8 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
if ( ! s_initialized ) {
//m_memtablesize = m_maxMem / 6510;
// support 1.2M ptrs for now. good for about 8GB
m_memtablesize = 3000*1024;//m_maxMem / 6510;
// raise from 3000 to 8194 to fix host #1
m_memtablesize = 8194*1024;//m_maxMem / 6510;
//if ( m_maxMem < 8000000000 ) { char *xx=NULL;*xx=0; }
}
@ -600,7 +625,18 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
*xx = 0;
}
if ( ! isnew ) {
// umsg00
bool useElectricFence = false;
#ifdef SPECIAL
if ( note[0] == 'u' &&
note[1] == 'm' &&
note[2] == 's' &&
note[3] == 'g' &&
note[4] == '0' &&
note[5] == '0' )
useElectricFence = true;
#endif
if ( ! isnew && ! useElectricFence ) {
for ( int32_t i = 0 ; i < UNDERPAD ; i++ )
((char *)mem)[0-i-1] = MAGICCHAR;
for ( int32_t i = 0 ; i < OVERPAD ; i++ )
@ -608,7 +644,8 @@ void Mem::addMem ( void *mem , int32_t size , const char *note , char isnew ) {
}
// hey!
if ( s_pid == -1 && m_numTotalAllocated >1000 ) {
log(LOG_WARN, "pid is %i and numAllocs is %i", s_pid, m_numTotalAllocated);
log(LOG_WARN, "pid is %i and numAllocs is %i", (int)s_pid,
(int)m_numTotalAllocated);
//char *xx=NULL;*xx=0;}
// if ( s_pid == -1 && m_numTotalAllocated >1000 ) { char *xx=NULL;*xx=0;}
}
@ -961,9 +998,10 @@ bool Mem::rmMem ( void *mem , int32_t size , const char *note ) {
if ( size == 0 ) return true;
// hey!
if ( s_pid == -1 && m_numTotalAllocated >1000 ) {
log(LOG_WARN, "pid is %i and numAllocs is %i", s_pid, m_numTotalAllocated);
log(LOG_WARN, "pid is %i and numAllocs is %i",
(int)s_pid, (int)m_numTotalAllocated);
//char *xx=NULL;*xx=0;}
}
}
// threads can't be here!
if ( s_pid != -1 && getpid() != s_pid ) {
log("mem: rmMem: Called from thread.");
@ -1145,6 +1183,18 @@ int Mem::printBreech ( int32_t i , char core ) {
if ( s_labels[i*16+0] == 'T' &&
s_labels[i*16+1] == 'h' &&
!strcmp(&s_labels[i*16 ],"ThreadStack" ) ) return 0;
#ifdef SPECIAL
// for now this is efence. umsg00
bool useElectricFence = false;
if ( s_labels[i*16+0] == 'u' &&
s_labels[i*16+1] == 'm' &&
s_labels[i*16+2] == 's' &&
s_labels[i*16+3] == 'g' &&
s_labels[i*16+4] == '0' &&
s_labels[i*16+5] == '0' )
useElectricFence = true;
if ( useElectricFence ) return 0;
#endif
char flag = 0;
// check for underruns
char *mem = (char *)s_mptrs[i];
@ -1270,6 +1320,9 @@ int Mem::printBreeches ( char core ) {
if ( ! s_mptrs ) return 0;
// do not bother if no padding at all
if ( (int32_t)UNDERPAD == 0 && (int32_t)OVERPAD == 0 ) return 0;
log("mem: checking mem for breeches");
// loop through the whole mem table
for ( int32_t i = 0 ; i < (int32_t)m_memtablesize ; i++ )
// only check if non-empty
@ -1346,8 +1399,13 @@ void *Mem::gbmalloc ( int size , const char *note ) {
}
retry:
// hack so hostid #0 can use more mem
int64_t max = g_conf.m_maxMem;
//if ( g_hostdb.m_hostId == 0 ) max += 2000000000;
// don't go over max
if ( m_used + size + UNDERPAD + OVERPAD >= m_maxMem ) {
if ( m_used + size + UNDERPAD + OVERPAD >= max ) {
// try to free temp mem. returns true if it freed some.
if ( freeCacheMem() ) goto retry;
g_errno = ENOMEM;
@ -1363,6 +1421,8 @@ void *Mem::gbmalloc ( int size , const char *note ) {
void *mem;
g_inMemFunction = true;
// to find bug that cores on malloc do this
//printBreeches(true);
//g_errno=ENOMEM;return (void *)log("Mem::malloc: reached mem limit");}
@ -1375,11 +1435,32 @@ void *Mem::gbmalloc ( int size , const char *note ) {
mem = getElecMem(size+0+0);
else
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
#else
#else
#ifdef SPECIAL
// debug where tagrec in xmldoc.cpp's msge0 tag list is overrunning
// for umsg00
bool useElectricFence = false;
if ( note[0] == 'u' &&
note[1] == 'm' &&
note[2] == 's' &&
note[3] == 'g' &&
note[4] == '0' &&
note[5] == '0' )
useElectricFence = true;
if ( useElectricFence ) {
mem = getElecMem(size+0+0);
addMem ( (char *)mem + 0 , size , note , 0 );
return (char *)mem + 0;
}
#endif
//void *mem = dlmalloc ( size );
mem = (void *)sysmalloc ( size + UNDERPAD + OVERPAD );
#endif
g_inMemFunction = false;
// initialization debug
//char *pend = (char *)mem + UNDERPAD + size;
//for ( char *p = (char *)mem + UNDERPAD ; p < pend ; p++ )
@ -1406,7 +1487,7 @@ mallocmemloop:
static int64_t s_lastTime;
static int32_t s_missed = 0;
int64_t now = gettimeofdayInMillisecondsLocal();
int64_t avail = (int64_t)m_maxMem -
int64_t avail = (int64_t)g_conf.m_maxMem -
(int64_t)m_used;
if ( now - s_lastTime >= 1000LL ) {
log("mem: system malloc(%i,%s) availShouldBe=%"INT64": "
@ -1510,8 +1591,13 @@ void *Mem::gbrealloc ( void *ptr , int oldSize , int newSize ,
// return NULL;
//}
retry:
// hack so hostid #0 can use more mem
int64_t max = g_conf.m_maxMem;
//if ( g_hostdb.m_hostId == 0 ) max += 2000000000;
// don't go over max
if ( m_used + newSize - oldSize >= m_maxMem ) {
if ( m_used + newSize - oldSize >= max ) {
// try to free temp mem. returns true if it freed some.
if ( freeCacheMem() ) goto retry;
g_errno = ENOMEM;
@ -1536,6 +1622,34 @@ void *Mem::gbrealloc ( void *ptr , int oldSize , int newSize ,
return mem;
#endif
#ifdef SPECIAL
int32_t slot = g_mem.getMemSlot ( ptr );
// debug where tagrec in xmldoc.cpp's msge0 tag list is overrunning
// for umsg00
if ( slot >= 0 ) {
char *label = &s_labels[slot*16];
bool useElectricFence = false;
if ( label[0] == 'u' &&
label[1] == 'm' &&
label[2] == 's' &&
label[3] == 'g' &&
label[4] == '0' &&
label[5] == '0' )
useElectricFence = true;
if ( useElectricFence ) {
// just make a new buf
mem = (char *)mmalloc ( newSize , note );
if ( ! mem ) return NULL;
// copy over to it
gbmemcpy ( mem , ptr , oldSize );
// free the old
mfree ( ptr , oldSize , note );
return mem;
}
}
#endif
// assume it will be successful. we can't call rmMem() after
// calling sysrealloc() because it will mess up our MAGICCHAR buf
rmMem ( ptr , oldSize , note );
@ -1626,11 +1740,38 @@ void Mem::gbfree ( void *ptr , int size , const char *note ) {
}
#endif
#ifdef SPECIAL
g_inMemFunction = true;
// debug where tagrec in xmldoc.cpp's msge0 tag list is overrunning
// for umsg00
bool useElectricFence = false;
char *label = &s_labels[slot*16];
if ( label[0] == 'u' &&
label[1] == 'm' &&
label[2] == 's' &&
label[3] == 'g' &&
label[4] == '0' &&
label[5] == '0' )
useElectricFence = true;
if ( useElectricFence ) {
// this calls rmMem() itself
freeElecMem ((char *)ptr - 0 );
g_inMemFunction = false;
// if this returns false it was an unbalanced free
//if ( ! rmMem ( ptr , size , note ) ) return;
return;
}
g_inMemFunction = false;
#endif
// if this returns false it was an unbalanced free
if ( ! rmMem ( ptr , size , note ) ) return;
g_inMemFunction = true;
if ( isnew ) sysfree ( (char *)ptr );
else sysfree ( (char *)ptr - UNDERPAD );
g_inMemFunction = false;
}
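
The six-byte "umsg00" label test is repeated verbatim in addMem(), printBreech(), gbmalloc(), gbrealloc() and gbfree() above; a small helper, not in this commit, could keep the five copies in sync:

    // hypothetical: true if an allocation label names a umsg00
    // buffer, the tag this commit singles out for electric-fence
    // style debugging under #ifdef SPECIAL
    static bool isUmsg00Label ( const char *label ) {
        return label[0] == 'u' && label[1] == 'm' &&
               label[2] == 's' && label[3] == 'g' &&
               label[4] == '0' && label[5] == '0';
    }
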
int32_t getLowestLitBitLL ( uint64_t bits ) {
@ -2062,6 +2203,7 @@ void *getElecMem ( int32_t size ) {
// store the ptrs
*(char **)(returnMem- sizeof(char *)) = realMem;
*(char **)(returnMem- sizeof(char *)*2) = realMemEnd;
//log("protect2 0x%"PTRFMT"\n",(PTRTYPE)protMem);
// protect that after we wrote our ptr
if ( mprotect ( protMem , MEMPAGESIZE , PROT_NONE) < 0 )
log("mem: mprotect failed: %s",mstrerror(errno));
@ -2113,6 +2255,7 @@ void *getElecMem ( int32_t size ) {
*(char **)(returnMem- sizeof(char *)*2) = realMemEnd;
// sanity
if ( returnMem - sizeof(char *)*2 < realMem ) { char *xx=NULL;*xx=0; }
//log("protect3 0x%"PTRFMT"\n",(PTRTYPE)protMem);
// protect that after we wrote our ptr
if ( mprotect ( protMem , MEMPAGESIZE , PROT_NONE) < 0 )
log("mem: mprotect failed: %s",mstrerror(errno));
@ -2165,6 +2308,9 @@ void freeElecMem ( void *fakeMem ) {
char *oldProtMem = cp + fakeSize;
#endif
// hack
//oldProtMem -= 4;
//log("unprotect1 0x%"PTRFMT"\n",(PTRTYPE)oldProtMem);
// unprotect it
if ( mprotect ( oldProtMem , MEMPAGESIZE, PROT_READ|PROT_WRITE) < 0 )
log("mem: munprotect failed: %s",mstrerror(errno));
@ -2186,6 +2332,7 @@ void freeElecMem ( void *fakeMem ) {
// sanity
if ( protMem < realMem ) { char *xx=NULL;*xx=0; }
if ( protMem - realMem > (int32_t)MEMPAGESIZE) { char *xx=NULL;*xx=0; }
//log("protect1 0x%"PTRFMT"\n",(PTRTYPE)protMem);
// before adding it into the ring, protect it
if ( mprotect ( protMem , protEnd-protMem, PROT_NONE) < 0 )
log("mem: mprotect2 failed: %s",mstrerror(errno));
@ -2199,6 +2346,8 @@ void freeElecMem ( void *fakeMem ) {
g_mem.rmMem ( s_freeCursor->m_fakeMem,
s_freeCursor->m_fakeSize,
s_freeCursor->m_note );
// log("unprotect2 0x%"PTRFMT"\n",
// (PTRTYPE)s_freeCursor->m_protMem);
// unprotect it
if ( mprotect (s_freeCursor->m_protMem,
s_freeCursor->m_protSize,
@ -2237,6 +2386,8 @@ void freeElecMem ( void *fakeMem ) {
g_mem.rmMem ( s_freeCursor->m_fakeMem,
s_freeCursor->m_fakeSize,
s_freeCursor->m_note );
// log("unprotect3 0x%"PTRFMT"\n",
// (PTRTYPE)s_freeCursor->m_protMem);
// unprotect it
if ( mprotect (s_freeCursor->m_protMem,
s_freeCursor->m_protSize,

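getElecMem()/freeElecMem() implement electric-fence style debugging: each allocation is backed by page-aligned memory with an mprotect()ed guard page butted against it, so an overrun faults at the offending instruction instead of silently corrupting a neighbor. A standalone sketch of the core idea, assuming anonymous mmap (the real code above also recycles freed regions through a protected ring and stashes bookkeeping pointers before the buffer; freeing is elided here):

    #include <sys/mman.h>
    #include <unistd.h>

    // allocate "size" bytes followed immediately by an inaccessible
    // page, so any overrun triggers SIGSEGV at the fault point
    void *guardedAlloc ( size_t size ) {
        size_t page  = (size_t)sysconf ( _SC_PAGESIZE );
        size_t total = ( ( size + page - 1 ) / page ) * page + page;
        char *mem = (char *)mmap ( NULL , total ,
                                   PROT_READ | PROT_WRITE ,
                                   MAP_PRIVATE | MAP_ANONYMOUS , -1 , 0 );
        if ( mem == MAP_FAILED ) return NULL;
        char *guard = mem + total - page;
        if ( mprotect ( guard , page , PROT_NONE ) < 0 ) return NULL;
        // right-align the user buffer against the guard page; note
        // this sacrifices alignment when size is not a multiple of 16
        return guard - size;
    }
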
Mem.h

@ -17,6 +17,8 @@
//#include <dmalloc.h>
//#endif
extern bool g_inMemFunction;
// we share malloc between threads, so you need to get the lock
//void mutexLock ( );
//void mutexUnlock ( );
@ -81,7 +83,7 @@ class Mem {
Mem();
~Mem();
bool init ( int64_t maxMem );
bool init ( );//int64_t maxMem );
void setPid();
pid_t getPid();
@ -130,7 +132,7 @@ class Mem {
int printBreeches ( char core ) ;
// print mem usage stats
int printMem ( ) ;
void addMem ( void *mem , int32_t size , const char *note , char isnew ) ;
void addMem ( void *mem , int32_t size , const char *note, char isnew);
bool rmMem ( void *mem , int32_t size , const char *note ) ;
bool lblMem ( void *mem , int32_t size , const char *note );
@ -161,7 +163,7 @@ class Mem {
int64_t m_maxAlloced; // at any one time
int64_t m_maxAlloc; // the biggest single alloc ever done
const char *m_maxAllocBy; // the biggest single alloc ever done
int64_t m_maxMem;
//int64_t m_maxMem;
// shared mem used
int64_t m_sharedUsed;


@ -26,9 +26,7 @@ bool Monitordb::init ( ) {
if ( ! m_pc.init ( "monitordb" ,
RDB_MONITORDB,
pcmem ,
pageSize ,
true , // use shared mem?
false )) // minimizeDiskSeeks?
pageSize ))
return log("db: Monitordb init failed.");
// init the rdb
return m_rdb.init ( g_hostdb.m_dir ,

Msg13.cpp

@ -2300,7 +2300,8 @@ bool getTestDoc ( char *u , TcpSocket *ts , Msg13Request *r ) {
// log it for now
//if ( g_conf.m_logDebugSpider )
log("test: GOT doc in test cache: %s (%"UINT64")",u,h);
log("test: GOT doc in test cache: %s (qa/doc.%"UINT64".html)",
u,h);
//fprintf(stderr,"scp gk252:/e/test-spider/doc.%"UINT64".* /home/mwells/gigablast/test-parser/\n",h);
@ -3132,6 +3133,8 @@ bool addToHammerQueue ( Msg13Request *r ) {
// we gotta update the crawldelay here in case we modified
// it in the above logic.
r->m_crawlDelayMS = crawlDelayMS;
// when we stored it in the hammer queue
r->m_stored = nowms;
// add it to queue
if ( ! s_hammerQueueHead ) {
s_hammerQueueHead = r;
@ -12053,3 +12056,99 @@ char *getRandUserAgent ( int32_t urlIp , int32_t proxyIp , int32_t proxyPort ) {
return s_agentList[n];
}
bool printHammerQueueTable ( SafeBuf *sb ) {
char *title = "Queued Download Requests";
sb->safePrintf (
"<table %s>"
"<tr class=hdrow><td colspan=19>"
"<center>"
"<b>%s</b>"
"</td></tr>"
"<tr bgcolor=#%s>"
"<td><b>#</td>"
"<td><b>age</td>"
"<td><b>first ip found</td>"
"<td><b>actual ip</td>"
"<td><b>crawlDelayMS</td>"
"<td><b># proxies banning</td>"
"<td><b>coll</td>"
"<td><b>url</td>"
"</tr>\n"
, TABLE_STYLE
, title
, DARK_BLUE
);
Msg13Request *r = s_hammerQueueHead ;
int32_t count = 0;
int64_t nowms = gettimeofdayInMilliseconds();
loop:
if ( ! r ) return true;
// print row
sb->safePrintf( "<tr bgcolor=#%s>"
"<td>%i</td>" // #
"<td>%ims</td>" // age in hammer queue
"<td>%s</td>"
,LIGHT_BLUE
,(int)count
,(int)(nowms - r->m_stored)
,iptoa(r->m_firstIp)
);
sb->safePrintf("<td>%s</td>" // actual ip
, iptoa(r->m_urlIp));
// print crawl delay as link to robots.txt
sb->safePrintf( "<td><a href=\"");
Url cu;
cu.set ( r->ptr_url );
bool isHttps = false;
if ( cu.m_url && cu.m_url[4] == 's' ) isHttps = true;
if ( isHttps ) sb->safeStrcpy ( "https://");
else sb->safeStrcpy ( "http://" );
sb->safeMemcpy ( cu.getHost() , cu.getHostLen() );
int32_t port = cu.getPort();
int32_t defPort = 80;
if ( isHttps ) defPort = 443;
if ( port != defPort ) sb->safePrintf ( ":%"INT32"",port );
sb->safePrintf ( "/robots.txt\">"
"%i"
"</a>"
"</td>" // crawl delay MS
"<td>%i</td>" // proxies banning
, r->m_crawlDelayMS
, r->m_numBannedProxies
);
// show collection name as a link, also truncate to 32 chars
CollectionRec *cr = g_collectiondb.getRec ( r->m_collnum );
char *coll = "none";
if ( cr ) coll = cr->m_coll;
sb->safePrintf("<td>");
if ( cr ) {
sb->safePrintf("<a href=/admin/sockets?c=");
sb->urlEncode(coll);
sb->safePrintf(">");
}
sb->safeTruncateEllipsis ( coll , 32 );
if ( cr ) sb->safePrintf("</a>");
sb->safePrintf("</td>");
// then the url itself
sb->safePrintf("<td><a href=%s>",r->ptr_url);
sb->safeTruncateEllipsis ( r->ptr_url , 128 );
sb->safePrintf("</a></td>");
sb->safePrintf("</tr>\n");
// print next entry now
r = r->m_nextLink;
goto loop;
}


@ -16,6 +16,7 @@
#define MAX_PROXYCRAWLDELAYMS 60000
void resetMsg13Caches ( ) ;
bool printHammerQueueTable ( SafeBuf *sb ) ;
extern char *g_fakeReply;
@ -55,6 +56,9 @@ public:
int64_t m_urlHash48;
int32_t m_firstIp;
// when it was stored in the hammer queue
int64_t m_stored;
// a tmp hack var referencing into m_url[] below
char *m_proxiedUrl;
int32_t m_proxiedUrlLen;


@ -829,7 +829,7 @@ bool Msg3::doneScanning ( ) {
if ( now - s_time > 5 || g_errno != ENOTHREADSLOTS ) {
log("net: Had error reading %s: %s. Retrying. "
"(retry #%"INT32")",
base->m_dbname,mstrerror(g_errno) , m_retryNum );
base->m_dbname,mstrerror(m_errno) , m_retryNum );
s_time = now;
}
// send email alert if in an infinite loop, but don't send
@ -928,19 +928,23 @@ bool Msg3::doneScanning ( ) {
// . this returns false and sets g_errno on error
// . like if data is corrupt
BigFile *ff = base->getFile(m_fileNums[i]);
// if we did a merge really quick and delete one of the
// files we were reading, i've seen 'ff' be NULL
char *filename = "lostfilename";
if ( ff ) filename = ff->getFilename();
if ( ! m_lists[i].constrain ( m_startKey ,
m_constrainKey , // m_endKey
mrs , // m_minRecSizes
m_hintOffsets[i] ,
//m_hintKeys [i] ,
&m_hintKeys [i*m_ks] ,
ff->getFilename() ,
filename,//ff->getFilename() ,
m_niceness ) ) {
log("net: Had error while constraining list read from "
"%s: %s/%s. vfd=%"INT32" parts=%"INT32". "
"This is likely caused by corrupted "
"data on disk.",
mstrerror(g_errno), ff->m_dir ,
mstrerror(g_errno), ff->getDir(),
ff->getFilename(), ff->m_vfd ,
(int32_t)ff->m_numParts );
}


@ -1247,6 +1247,13 @@ bool Msg3a::mergeLists ( ) {
int32_t need = nd * (8+sizeof(double)+
sizeof(key_t)+sizeof(DocIdScore *)+1);
if ( need < 0 ) {
log("msg3a: need is %i, nd = %i is too many docids",
(int)need,(int)nd);
g_errno = EBUFTOOSMALL;
return true;
}
// allocate it
m_finalBuf = (char *)mmalloc ( need , "finalBuf" );
m_finalBufSize = need;
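
The need < 0 test above catches signed 32-bit overflow: with enough docids, nd * (8+sizeof(double)+sizeof(key_t)+sizeof(DocIdScore *)+1) wraps negative and the mmalloc() below would be handed a garbage size. A sketch of the same guard computed in 64 bits first (perDocBytes is a hypothetical name for that per-docid constant):

    // compute in 64 bits so an oversized docid count is caught by
    // comparison rather than by signed wraparound
    int64_t need64 = (int64_t)nd * perDocBytes;
    if ( need64 > 0x7fffffffLL ) {
        g_errno = EBUFTOOSMALL;
        return true;
    }
    int32_t need = (int32_t)need64;
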


@ -5971,7 +5971,7 @@ bool Msg40::printCSVHeaderRow ( SafeBuf *sb ) {
SafeBuf nameBuf (tmp2, 1024);
int32_t ct = 0;
if ( msg20s[0] ) ct = msg20s[0]->m_r->m_contentType;
if ( msg20s[0] && msg20s[0]->m_r ) ct = msg20s[0]->m_r->m_contentType;
CollectionRec *cr =g_collectiondb.getRec(m_firstCollnum);

OldDiskPageCache.cpp Normal file

File diff suppressed because it is too large

OldDiskPageCache.h Normal file

@ -0,0 +1,227 @@
// Matt Wells, Copyright Jan 2004
// . each Rdb has its own m_pageCache member
// . a ptr to this class is passed to all File::open() calls
// . that ptr is stored in the File class as File::m_pageCachePtr
// . the File class uses the virtual file descriptor, vfd, for use with
// the pageCache since we tend to open and close files a lot when we run
// out of actual fds
// . every subsequent read/write to that file will then use the pageCache
// . before doing a read in File::read() we try to increase the offset
// by filling the beginning of the buffer with data from the page cache.
// We also try to decrease the bytes to read by filling the end of the
// buffer. What is left to actually read, if anything, is the middle.
// . after File::read() completes it calls DiskPageCache::storePages(buf,size,off)
// to fill the page cache.
// . when maxMem is reached, the DiskPageCache will evict infrequently used
// pages by using a linked list
// . when File class releases its vfd it must call m_pageCachePtr->close(vfd)
// . we use PAGESIZE defined in RdbMap.h as our page size
// . TODO: convert PAGESIZE to 8000 not 8192
#ifndef _PAGECACHE_H_
#define _PAGECACHE_H_
// . use 128 disk megabytes per set of pages
// . this MUST be a multiple of (PAGE_SIZE+HEADERSIZE) now
//#define PAGE_SET_SIZE (128*1024*1024)
//#define PHSIZE (GB_PAGE_SIZE+HEADERSIZE)
//#define PAGE_SET_SIZE (((128*1024*1024)/PHSIZE)*PHSIZE)
// how many page sets can we have?
#define MAX_PAGE_SETS 128
// how many BigFiles can be using the same DiskPageCache?
#include "File.h"
#define MAX_NUM_VFDS2 MAX_NUM_VFDS
extern void freeAllSharedMem ( int32_t max );
class DiskPageCache {
public:
DiskPageCache();
~DiskPageCache();
void reset();
// returns false and sets g_errno if unable to alloc the memory,
// true otherwise
bool init ( const char *dbname ,
char rdbId, // use 0 for none
int32_t maxMem ,
int32_t pageSize,
bool useRAMDisk = false,
bool minimizeDiskSeeks = false );
// int32_t maxMem ,
// void (*getPages2)(DiskPageCache*, int32_t, char*, int32_t,
// int64_t, int32_t*, int64_t*) = NULL,
// void (*addPages2)(DiskPageCache*, int32_t, char*, int32_t,
// int64_t) = NULL,
// int32_t (*getVfd2)(DiskPageCache*, int64_t) = NULL,
// void (*rmVfd2)(DiskPageCache*, int32_t) = NULL );
bool initRAMDisk( const char *dbname, int32_t maxMem );
int32_t getMemUsed () ;
int32_t getMemAlloced () { return m_memAlloced; };
int32_t getMemMax () { return m_maxMem; };
int64_t getNumHits () { return m_hits; };
int64_t getNumMisses () { return m_misses; };
void resetStats () { m_hits = 0 ; m_misses = 0; };
// verify each page in cache for this file is what is on disk
bool verifyData ( class BigFile *f );
bool verifyData2 ( int32_t vfd );
void disableCache ( ) { m_enabled = false; };
void enableCache ( ) { m_enabled = true; };
// . grow/shrink m_memOff[] which maps vfd/page to a mem offset
// . returns false and sets g_errno on error
// . called by DiskPageCache::open()/close() respectively
// . maxFileSize is so we can alloc m_memOff[vfd] big enough for all
// pages that are in or will be in the file (if it is being created)
int32_t getVfd ( int64_t maxFileSize, bool vfdAllowed );
void rmVfd ( int32_t vfd );
// . this returns true iff the entire read was copied into
// "buf" from the page cache
// . it will move the used pages to the head of the linked list
void getPages ( int32_t vfd ,
char **buf ,
int32_t numBytes ,
int64_t offset ,
int32_t *newNumBytes ,
int64_t *newOffset ,
char **allocBuf , //we alloc this if buf==NULL
int32_t *allocSize , //size of the alloc
int32_t allocOff );
// after you read/write from/to disk, copy into the page cache
void addPages ( int32_t vfd, char *buf , int32_t numBytes, int64_t offset,
int32_t niceness );
// used for minimize disk seeks
bool m_minimizeDiskSeeks;
int32_t m_diskPageSize;
private:
void addPage (int32_t vfd,int32_t pageNum,char *page,int32_t size,int32_t skip);
void enhancePage ( int32_t poff,char *page,int32_t size,int32_t skip) ;
void promotePage ( int32_t poff , bool isNew ) ;
void excisePage ( int32_t poff ) ;
bool growCache ( int32_t mem );
//bool needsMerge();
void writeToCache ( int32_t memOff, int32_t memPageOff, void *inBuf,
int32_t size );
void readFromCache( void *outBuf, int32_t memOff, int32_t memPageOff,
int32_t size );
char *getMemPtrFromMemOff ( int32_t off );
// . the pages are here
// . there are 1024 page sets
// . each page set can have up to 128 megabytes of pages
// . much more than that and pthread_create() fails
char *m_pageSet [ MAX_PAGE_SETS ];
int32_t m_pageSetSize [ MAX_PAGE_SETS ];
int32_t m_numPageSets;
// . next available page offset
// . when storing a page we read from disk into a pageSet we first
// try to get a memory offset from m_availMemOff, if none are there
// then we use m_nextMemOff and increment it by PAGE_SIZE+HEADERSIZE
// . if m_nextMemOff would breech m_upperMemOff then we call
// growCache to increase m_upperMemOff
// . we try to grow 100k with each call to growCache
// . if m_upperMemOff would breech m_maxMem, then we kick out the
// least used page using
// . we store a linked list in bytes 4-12 of each page in memory
int32_t m_nextMemOff; // next available mem offset to hold a page
int32_t m_upperMemOff; // how many bytes are allocated in page sets?
int32_t m_maxMem; // max we can allocate
// . available offsets of released pages
// . offsets are into the page sets, m_pageSet[]
int32_t *m_availMemOff;
int32_t m_numAvailMemOffs;
int32_t m_maxAvailMemOffs;
// . m_memOffFromDiskPage[vfd][diskPageNum] --> memOff
// . maps a vfd and disk page number to a memory offset
// . maps to -1 if not in page cache
// . try to keep the number of pages down, under 100,000
// . 100,000 pages would be about 800 megabytes
// . I am only planning on using this for tfndb and Checksumdb so
// we should be under or around this limit
int32_t *m_memOffFromDiskPage [ MAX_NUM_VFDS2 ];
// . how many offsets are in m_memOffFromDiskPage?
// . we have one offset per page in the file
int32_t m_maxPagesInFile [ MAX_NUM_VFDS2 ];
// max number of pages that this file shall have
int32_t m_maxPagesPerFile [ MAX_NUM_VFDS2 ];
// max number of pages of file currently in the cache
int32_t m_numPagesPresentOfFile[ MAX_NUM_VFDS2 ];
// mem that has not been used
int32_t m_memFree;
// how much memory is currently allocated?
int32_t m_memAlloced;
// stats (partial hits/misses supported)
int64_t m_hits;
int64_t m_misses;
// . linked list boundary info
// . linked list is actually stored in bytes 2-8 (next/prev) of pages
// in memory
int32_t m_headOff;
int32_t m_tailOff;
// for selecting the next vfd in line and preventing sudden closing
// and opening of a vfd, resulting in a thread returning and calling
// addPages() for the wrong file!!
int32_t m_nexti;
bool m_enabled;
int32_t m_maxPageSetSize;
const char *m_dbname;
char m_rdbId;
bool *m_switch;
char m_memTag[16];
//bool m_useRAMDisk;
//bool m_useSHM;
//int m_ramfd;
//int m_shmids [ 4096 ];
//int32_t m_shmidSize [ 4096 ];
//int32_t m_numShmids;
//int32_t m_maxAllocSize;
//int32_t m_spageSize;
// for overriding the disk page cache with custom functions
//bool m_isOverriden;
//void (*m_getPages2)(DiskPageCache*, int32_t, char*, int32_t, int64_t,
// int32_t*, int64_t*);
//void (*m_addPages2)(DiskPageCache*, int32_t, char*, int32_t, int64_t);
//int32_t (*m_getVfd2)(DiskPageCache*, int64_t);
//void (*m_rmVfd2)(DiskPageCache*, int32_t);
};
#endif
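
Per the comments above, a cached read is a three-step dance: trim the front and back of the request with pages already cached, read only the remaining middle from disk, then publish the freshly read pages. A sketch of that flow as File::read() would drive it (simplified; pc, vfd, fd, buf, bytesToRead and offset are assumed context, and error handling is elided):

    int32_t newBytes;  int64_t newOff;
    char *allocBuf = NULL;  int32_t allocSize = 0;
    // 1. satisfy what we can from the cache; shrinks the disk read
    pc->getPages ( vfd , &buf , bytesToRead , offset ,
                   &newBytes , &newOff , &allocBuf , &allocSize , 0 );
    // 2. read only the uncached middle portion from disk
    if ( newBytes > 0 )
        pread ( fd , buf + (newOff - offset) , newBytes , newOff );
    // 3. copy the pages we just read back into the cache
    pc->addPages ( vfd , buf + (newOff - offset) , newBytes , newOff , 0 );
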


@ -3545,6 +3545,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</td><td>"
"<a href=/crawlbot/download/%s_urls.csv>"
"csv</a>"
" <a href=/v3/crawl/download/%s_urls.csv>"
"new csv format</a>"
"</td>"
"</tr>"
@ -3613,6 +3617,10 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//, cr->m_coll
//, cr->m_coll
// urls.csv old
, cr->m_coll
// urls.csv new format v3
, cr->m_coll
// latest objects in html
@ -3623,7 +3631,6 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cr->m_coll
, rand64
// latest products in html
, cr->m_coll
, rand64


@ -3753,14 +3753,15 @@ bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
if ( firstTime ) {
sb->safePrintf("<font size=-1>");
sb->safePrintf("<table border=1>"
"<tr><td colspan=3>"
"<tr><td colspan=10>"
"<center>"
"<b>Inlinks with Query Terms</b>"
"</center>"
"</td></tr>"
"<tr>"
"<td>Inlink Text</td>"
"<td>From</td>"
"<td>From Site</td>"
"<td>Site IP</td>"
"<td>Site Rank</td>"
"</tr>"
);
@ -3780,7 +3781,13 @@ bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
char *host = getHostFast(k->getUrl(),&hostLen,NULL);
sb->safePrintf("</td><td>");
if ( host ) sb->safeMemcpy(host,hostLen);
sb->safePrintf("</td><td>%"INT32"</td></tr>",(int32_t)k->m_siteRank);
sb->safePrintf("</td><td>");
sb->safePrintf("<a href=/search?c=%s&q=ip%%3A%s"
"+gbsortbyint%%3Agbsitenuminlinks&n=100>"
,si->m_cr->m_coll,iptoa(k->m_ip));
sb->safePrintf("%s</a>",iptoa(k->m_ip));
sb->safePrintf("</td><td>%"INT32"</td></tr>"
,(int32_t)k->m_siteRank);
//sb->safePrintf("<br>");
printedInlinkText = true;
*numPrinted = *numPrinted + 1;
@ -6212,8 +6219,8 @@ bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps ,
//int64_t tf1 = ps->m_termFreq1;//sz1 / 10;
//int64_t tf2 = ps->m_termFreq2;//sz2 / 10;
QueryTerm *qt1 = &msg40->m_msg3a.m_q->m_qterms[qtn1];
QueryTerm *qt2 = &msg40->m_msg3a.m_q->m_qterms[qtn2];
QueryTerm *qt1 = &q->m_qterms[qtn1];
QueryTerm *qt2 = &q->m_qterms[qtn2];
//int64_t tf1 = msg40->m_msg3a.m_termFreqs[qtn1];
//int64_t tf2 = msg40->m_msg3a.m_termFreqs[qtn2];
@ -6935,7 +6942,7 @@ bool printSingleScore ( SafeBuf *sb ,
//int64_t tf = ss->m_termFreq;//ss->m_listSize;
int32_t qtn = ss->m_qtermNum;
//int64_t tf = msg40->m_msg3a.m_termFreqs[qtn];
QueryTerm *qt = &msg40->m_msg3a.m_q->m_qterms[qtn];
QueryTerm *qt = &q->m_qterms[qtn];
int64_t tf = qt->m_termFreq;
float tfw = ss->m_tfWeight;


@ -51,6 +51,9 @@ bool sendPageSockets ( TcpSocket *s , HttpRequest *r ) {
printUdpTable(&p,"Udp Server (dns)", &g_dns.m_udpServer,
coll,NULL,s->m_ip,true/*isDns?*/);
// from msg13.cpp print the queued url download requests
printHammerQueueTable ( &p );
// get # of disks per machine
int32_t count = 0;
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts(); i++ ) {
@ -221,11 +224,7 @@ void printTcpTable ( SafeBuf* p, char *title, TcpServer *server ) {
"<td>%s</td>" // ip
"<td>%hu</td>" // port
"<td>%s</td>" // state
"<td>%"INT32"</td>" // bytes read
"<td>%"INT32"</td>" // bytes to read
"<td>%"INT32"</td>" // bytes sent
"<td>%"INT32"</td>" // bytes to send
"</tr>\n" ,
,
bg ,
i,
s->m_sd ,
@ -234,11 +233,46 @@ void printTcpTable ( SafeBuf* p, char *title, TcpServer *server ) {
//s->m_timeout ,
iptoa(s->m_ip) ,
s->m_port ,
st ,
s->m_readOffset ,
st );
// tool tip to show top 500 bytes of send buf
if ( s->m_readOffset && s->m_readBuf ) {
p->safePrintf("<td><a title=\"");
SafeBuf tmp;
tmp.safeTruncateEllipsis ( s->m_readBuf ,
s->m_readOffset ,
500 );
p->htmlEncode ( tmp.getBufStart() );
p->safePrintf("\">");
p->safePrintf("<u>%"INT32"</u></td>",s->m_readOffset);
}
else
p->safePrintf("<td>0</td>");
p->safePrintf( "<td>%"INT32"</td>" // bytes to read
"<td>%"INT32"</td>" // bytes sent
,
s->m_totalToRead ,
s->m_sendOffset ,
s->m_totalToSend );
s->m_sendOffset
);
// tool tip to show top 500 bytes of send buf
if ( s->m_totalToSend && s->m_sendBuf ) {
p->safePrintf("<td><a title=\"");
SafeBuf tmp;
tmp.safeTruncateEllipsis ( s->m_sendBuf ,
s->m_totalToSend ,
500 );
p->htmlEncode ( tmp.getBufStart() );
p->safePrintf("\">");
p->safePrintf("<u>%"INT32"</u></td>",s->m_totalToSend);
}
else
p->safePrintf("<td>0</td>");
p->safePrintf("</tr>\n");
}
// end the table
p->safePrintf ("</table><br>\n" );
@ -358,7 +392,7 @@ void printUdpTable ( SafeBuf *p, char *title, UdpServer *server ,
"<center>"
//"<font size=+1>"
"<b>%s</b> (%"INT32" transactions)"
"(%"INT32" reads ready)"
"(%"INT32" requests waiting to processed)"
//"</font>"
"</td></tr>"
"<tr bgcolor=#%s>"


@ -175,9 +175,17 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
"<tr class=poo><td><b>current allocations</b>"
"</td>"
"<td>%"INT32"</td></tr>\n"
"<tr class=poo><td><b>max allocations</b>"
"</td>"
"<td>%"INT32"</td></tr>\n"
"<tr class=poo><td><b>total allocations</b></td>"
"<td>%"INT64"</td></tr>\n" ,
g_mem.getNumAllocated() ,
g_mem.m_memtablesize ,
(int64_t)g_mem.getNumTotalAllocated() );
}

Parms.cpp

@ -385,6 +385,13 @@ bool CommandAddColl ( char *rec , char customCrawl ) {
return true;
}
// if ( ! g_parms.m_inSyncWithHost0 ) {
// log("parms: can not add coll #%i %s until in sync with host 0",
// (int)newCollnum,collName);
// g_errno = EBADENGINEER;
// return true;
// }
// this saves it to disk! returns false and sets g_errno on error.
if ( ! g_collectiondb.addNewColl ( collName,
customCrawl ,
@ -421,6 +428,14 @@ bool CommandResetProxyTable ( char *rec ) {
// . returns false if would block
bool CommandDeleteColl ( char *rec , WaitEntry *we ) {
collnum_t collnum = getCollnumFromParmRec ( rec );
// if ( ! g_parms.m_inSyncWithHost0 ) {
// log("parms: can not del collnum %i until in sync with host 0",
// (int)collnum);
// g_errno = EBADENGINEER;
// return true;
// }
// the delete might block because the tree is saving and we can't
// remove our collnum recs from it while it is doing that
if ( ! g_collectiondb.deleteRec2 ( collnum ) )
@ -436,6 +451,14 @@ bool CommandDeleteColl2 ( char *rec , WaitEntry *we ) {
char *data = rec + sizeof(key96_t) + 4;
char *coll = (char *)data;
collnum_t collnum = g_collectiondb.getCollnum ( coll );
// if ( ! g_parms.m_inSyncWithHost0 ) {
// log("parms: can not del collnum %i until in sync with host 0",
// (int)collnum);
// g_errno = EBADENGINEER;
// return true;
// }
if ( collnum < 0 ) {
g_errno = ENOCOLLREC;
return true;
@ -671,44 +694,52 @@ bool CommandSpiderTestCont ( char *rec ) {
// some of these can block a little. if threads are off, a lot!
bool CommandMerge ( char *rec ) {
forceMergeAll ( RDB_POSDB ,1);
forceMergeAll ( RDB_TITLEDB ,1);
forceMergeAll ( RDB_TAGDB ,1);
forceMergeAll ( RDB_SPIDERDB ,1);
forceMergeAll ( RDB_LINKDB ,1);
// most of these are probably already in good shape
//g_checksumdb.getRdb()->attemptMerge (1,true);
g_clusterdb.getRdb()->attemptMerge (1,true); // niceness, force?
g_tagdb.getRdb()->attemptMerge (1,true);
g_catdb.getRdb()->attemptMerge (1,true);
//g_tfndb.getRdb()->attemptMerge (1,true);
g_spiderdb.getRdb()->attemptMerge (1,true);
// these 2 will probably need the merge the most
g_indexdb.getRdb()->attemptMerge (1,true);
g_datedb.getRdb()->attemptMerge (1,true);
g_titledb.getRdb()->attemptMerge (1,true);
//g_sectiondb.getRdb()->attemptMerge (1,true);
g_statsdb.getRdb()->attemptMerge (1,true);
g_linkdb .getRdb()->attemptMerge (1,true);
// g_clusterdb.getRdb()->attemptMerge (1,true); // niceness, force?
// g_tagdb.getRdb()->attemptMerge (1,true);
// g_catdb.getRdb()->attemptMerge (1,true);
// //g_tfndb.getRdb()->attemptMerge (1,true);
// g_spiderdb.getRdb()->attemptMerge (1,true);
// // these 2 will probably need the merge the most
// g_indexdb.getRdb()->attemptMerge (1,true);
// g_datedb.getRdb()->attemptMerge (1,true);
// g_titledb.getRdb()->attemptMerge (1,true);
// //g_sectiondb.getRdb()->attemptMerge (1,true);
// g_statsdb.getRdb()->attemptMerge (1,true);
// g_linkdb .getRdb()->attemptMerge (1,true);
return true;
}
bool CommandMergePosdb ( char *rec ) {
g_posdb.getRdb()->attemptMerge (1,true);
forceMergeAll ( RDB_POSDB ,1);
// set this for each posdb base
return true;
}
bool CommandMergeSectiondb ( char *rec ) {
g_sectiondb.getRdb()->attemptMerge (1,true); // nice , force
//g_sectiondb.getRdb()->attemptMerge (1,true); // nice , force
return true;
}
bool CommandMergeTitledb ( char *rec ) {
g_titledb.getRdb()->attemptMerge (1,true);
forceMergeAll ( RDB_TITLEDB ,1);
//g_titledb.getRdb()->attemptMerge (1,true);
return true;
}
bool CommandMergeSpiderdb ( char *rec ) {
g_spiderdb.getRdb()->attemptMerge (1,true);
forceMergeAll ( RDB_SPIDERDB ,1);
//g_spiderdb.getRdb()->attemptMerge (1,true);
return true;
}
@ -942,6 +973,7 @@ private:
Parms::Parms ( ) {
m_isDefaultLoaded = false;
m_inSyncWithHost0 = false;
m_triedToSync = false;
}
void Parms::detachSafeBufs ( CollectionRec *cr ) {
@ -3834,8 +3866,9 @@ bool Parms::saveToXml ( char *THIS , char *f , char objType ) {
if ( g_conf.m_readOnlyMode ) return true;
// print into buffer
// "seeds" can be pretty big so go with safebuf now
//char buf[MAX_CONF_SIZE];
SafeBuf sb;
// fix so if we core in malloc/free we can still save conf
char tmpbuf[200000];
SafeBuf sb(tmpbuf,200000);
//char *p = buf;
//char *pend = buf + MAX_CONF_SIZE;
int32_t len ;
@ -5082,18 +5115,6 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "max mem";
m->m_desc = "Mem available to this process. May be exceeded due "
"to fragmentation.";
m->m_off = (char *)&g_conf.m_maxMem - g;
m->m_def = "8000000000";
m->m_cgi = "maxmem";
m->m_obj = OBJ_CONF;
m->m_page = PAGE_NONE;
m->m_type = TYPE_LONG_LONG;
m->m_flags = PF_NOAPI;
m++;
/*
m->m_title = "indexdb split";
m->m_desc = "Number of times to split indexdb across groups. "
@ -9918,6 +9939,18 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "max mem";
m->m_desc = "Mem available to this process. May be exceeded due "
"to fragmentation.";
m->m_cgi = "maxmem";
m->m_off = (char *)&g_conf.m_maxMem - g;
m->m_def = "8000000000";
m->m_obj = OBJ_CONF;
m->m_page = PAGE_MASTER; // PAGE_NONE;
m->m_type = TYPE_LONG_LONG;
//m->m_flags = PF_NOAPI;
m++;
m->m_title = "max total spiders";
m->m_desc = "What is the maximum number of web "
@ -12401,15 +12434,15 @@ void Parms::init ( ) {
m->m_group = 0;
m++;
m->m_title = "do synchronous writes";
m->m_title = "flush disk writes";
m->m_desc = "If enabled then all writes will be flushed to disk. "
"This is generally a good thing.";
"If not enabled, then gb uses the Linux disk write cache.";
m->m_cgi = "fw";
m->m_off = (char *)&g_conf.m_flushWrites - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m->m_group = 0;
@ -16094,6 +16127,7 @@ void Parms::init ( ) {
m->m_title = "home page";
static SafeBuf s_tmpBuf;
s_tmpBuf.setLabel("stmpb1");
s_tmpBuf.safePrintf (
"Html to display for the home page. "
"Leave empty for default home page. "
@ -16170,6 +16204,7 @@ void Parms::init ( ) {
m->m_title = "html head";
static SafeBuf s_tmpBuf2;
s_tmpBuf2.setLabel("stmpb2");
s_tmpBuf2.safePrintf("Html to display before the search results. ");
char *fff = "Leave empty for default. "
"Convenient "
@ -16280,6 +16315,7 @@ void Parms::init ( ) {
m->m_title = "html tail";
static SafeBuf s_tmpBuf3;
s_tmpBuf3.setLabel("stmpb3");
s_tmpBuf3.safePrintf("Html to display after the search results. ");
s_tmpBuf3.safeStrcpy(fff);
s_tmpBuf3.htmlEncode (
@ -17279,6 +17315,21 @@ void Parms::init ( ) {
m->m_obj = OBJ_COLL;
m++;
m->m_title = "compute site num inlinks";
m->m_desc = "If this is true Gigablast will "
"compute the number of site inlinks for the sites it "
"indexes. It will cache them in tagdb for some time. "
"The greater the number of inlinks, the longer the cached "
"time, because the site is considered more stable.";
m->m_cgi = "csni";
m->m_off = (char *)&cr.m_computeSiteNumInlinks - x;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_flags = PF_CLONE|PF_API;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "do link spam checking";
m->m_desc = "If this is true, do not allow spammy inlinks to vote. "
"This check is "
@ -19358,6 +19409,16 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug disk page cache";
m->m_cgi = "ldpc";
m->m_off = (char *)&g_conf.m_logDebugDiskPageCache - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_priv = 1;
m->m_page = PAGE_LOG;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "log debug dns messages";
m->m_cgi = "lddns";
m->m_off = (char *)&g_conf.m_logDebugDns - g;
@ -20547,7 +20608,17 @@ bool Parms::addCurrentParmToList2 ( SafeBuf *parmList ,
//int32_t occNum = -1;
key96_t key = makeParmKey ( collnum , m , occNum );
/*
// debug it
log("parms: adding parm collnum=%i title=%s "
"key=%s datasize=%i data=%s hash=%"UINT32
,(int)collnum
,m->m_title
,KEYSTR(&key,sizeof(key))
,(int)dataSize
,data
,(uint32_t)hash32(data,dataSize));
*/
// then key
if ( ! parmList->safeMemcpy ( &key , sizeof(key) ) )
return false;
@ -21684,20 +21755,37 @@ void handleRequest3f ( UdpSlot *slot , int32_t niceness ) {
// have with ETRYAGAIN in Msg4.cpp
void tryToSyncWrapper ( int fd , void *state ) {
g_parms.syncParmsWithHost0();
}
// host #0 just sends back an empty reply, but it will hit us with
// 0x3f parmlist requests. that way it uses the same mechanism and can
// guarantee ordering of the parm update requests
void gotReplyFromHost0Wrapper ( void *state , UdpSlot *slot ) {
// ignore his reply unless error?
if ( g_errno )
log("parms: got error syncing with host 0: %s",
if ( g_errno ) {
log("parms: got error syncing with host 0: %s. Retrying.",
mstrerror(g_errno));
// re-try it!
g_parms.m_triedToSync = false;
}
else {
log("parms: synced with host #0");
// do not re-call
g_loop.unregisterSleepCallback(NULL,tryToSyncWrapper);
}
g_errno = 0;
}
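
The error path above clears m_triedToSync so the sleep callback keeps re-driving syncParmsWithHost0() until a reply comes back clean, at which point the success path unregisters the callback. A sketch of the arming side, assuming g_loop.registerSleepCallback() mirrors the unregister call above (tick in ms, state, callback):

    // at startup: poll every second until synced with host #0.
    // tryToSyncWrapper() is a no-op once m_triedToSync is set, and
    // gotReplyFromHost0Wrapper() clears that flag to force a retry.
    if ( ! g_loop.registerSleepCallback ( 1000 , NULL , tryToSyncWrapper ) )
        log("parms: failed to register sync callback");
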
// returns false and sets g_errno on error, true otherwise
bool Parms::syncParmsWithHost0 ( ) {
if ( m_triedToSync ) return true;
m_triedToSync = true;
m_inSyncWithHost0 = false;
// dont sync with ourselves
@ -21730,6 +21818,8 @@ bool Parms::syncParmsWithHost0 ( ) {
Host *h = g_hostdb.getHost(0);
log("parms: trying to sync with host #0");
// . send it off. use 3e i guess
// . host #0 will reply using msg4 really
// . msg4 guarantees ordering of requests
@ -21798,6 +21888,9 @@ void handleRequest3e ( UdpSlot *slot , int32_t niceness ) {
// get collnum
collnum_t c = *(collnum_t *)p;
p += sizeof(collnum_t);
// then coll NAME hash
uint32_t collNameHash32 = *(int32_t *)p;
p += 4;
// sanity check. -1 means g_conf. i guess.
if ( c < -1 ) { char *xx=NULL;*xx=0; }
// and parm hash
@ -21807,6 +21900,14 @@ void handleRequest3e ( UdpSlot *slot , int32_t niceness ) {
// him to delete it!
CollectionRec *cr = NULL;
if ( c >= 0 ) cr = g_collectiondb.getRec ( c );
// if collection names are different delete it
if ( cr && collNameHash32 != hash32n ( cr->m_coll ) ) {
log("sync: host had collnum %i but wrong name, "
"name not %s like it should be",(int)c,cr->m_coll);
cr = NULL;
}
if ( c >= 0 && ! cr ) {
// note in log
logf(LOG_INFO,"sync: telling host #%"INT32" to delete "
@ -21854,7 +21955,8 @@ void handleRequest3e ( UdpSlot *slot , int32_t niceness ) {
if ( cr->m_isCustomCrawl == 2 ) cmdStr = "addBulk";
// note in log
logf(LOG_INFO,"sync: telling host #%"INT32" to add "
"collnum %"INT32"", hostId,(int32_t)cr->m_collnum);
"collnum %"INT32" coll=%s", hostId,(int32_t)cr->m_collnum,
cr->m_coll);
// add the parm rec as a parm cmd
if ( ! g_parms.addNewParmToList1 ( &replyBuf,
(collnum_t)i,
@ -21905,17 +22007,25 @@ bool Parms::makeSyncHashList ( SafeBuf *hashList ) {
// first do g_conf, collnum -1!
for ( int32_t i = -1 ; i < g_collectiondb.m_numRecs ; i++ ) {
// shortcut
CollectionRec *cr = NULL;
if ( i >= 0 ) cr = g_collectiondb.m_recs[i];
// skip if empty
if ( i >=0 && ! g_collectiondb.m_recs[i] ) continue;
if ( i >=0 && ! cr ) continue;
// clear since last time
tmp.reset();
// g_conf?
// g_conf? if i is -1 do g_conf
if ( ! addAllParmsToList ( &tmp , i ) )
return false;
// store collnum first as 4 bytes
if ( ! hashList->safeMemcpy ( &i , sizeof(collnum_t) ) )
return false;
// hash that shit
// then store the collection name hash, 32 bit hash
uint32_t collNameHash32 = 0;
if ( cr ) collNameHash32 = hash32n ( cr->m_coll );
if ( ! hashList->safeMemcpy ( &collNameHash32, 4 ) )
return false;
// hash the parms
int64_t h64 = hash64 ( tmp.getBufStart(),tmp.length() );
// and store it
if ( ! hashList->pushLongLong ( h64 ) )

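Each entry written here is a fixed-size record: the collnum, a 32-bit hash of the collection name, and a 64-bit hash of all of that collection's parms, matching what handleRequest3e() reads back out above. A sketch of walking such a list (compareAndQueue() is a hypothetical stand-in for the mismatch handling):

    // hash list layout: [collnum_t][32-bit name hash][64-bit parm hash]...
    char *p    = hashList->getBufStart();
    char *pend = p + hashList->length();
    while ( p < pend ) {
        collnum_t c     = *(collnum_t *)p; p += sizeof(collnum_t);
        uint32_t  nhash = *(uint32_t  *)p; p += 4;
        int64_t   phash = *(int64_t   *)p; p += 8;
        // queue add/delete/update parm commands for any mismatch
        compareAndQueue ( c , nhash , phash ); // hypothetical
    }
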

@ -516,6 +516,7 @@ class Parms {
//
bool m_inSyncWithHost0;
bool m_triedToSync;
bool m_isDefaultLoaded;


@ -3221,7 +3221,8 @@ void doneSendingNotifyEmailWrapper ( void *state ) {
EmailInfo *ei = (EmailInfo *)state;
ei->m_notifyBlocked--;
// error?
log("build: email notification status: %s",mstrerror(g_errno));
log("build: email notification status (count=%i) (ei=0x%"PTRFMT"): %s",
(int)ei->m_notifyBlocked,(PTRTYPE)ei,mstrerror(g_errno));
// ignore it for rest
g_errno = 0;
// wait for post url to get done
@ -3236,7 +3237,8 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
EmailInfo *ei = (EmailInfo *)state;
ei->m_notifyBlocked--;
// error?
log("build: url notification status: %s",mstrerror(g_errno));
log("build: url notification status (count=%i) (ei=0x%"PTRFMT"): %s",
(int)ei->m_notifyBlocked,(PTRTYPE)ei,mstrerror(g_errno));
// wait for email to get done
if ( ei->m_notifyBlocked > 0 ) return;
// unmark it
@ -3253,6 +3255,10 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
// or maxToProcess limitation.
bool sendNotification ( EmailInfo *ei ) {
// disable for now
//log("ping: NOT SENDING NOTIFICATION -- DEBUG!!");
//return true;
if ( ei->m_inUse ) { char *xx=NULL;*xx=0; }
// caller must set this, as well as m_finalCallback/m_finalState


@ -148,9 +148,7 @@ bool Posdb::init ( ) {
if ( ! m_pc.init ( "posdb",
RDB_POSDB,
pcmem ,
pageSize ,
true , // use RAM disk?
false )) // minimize disk seeks?
pageSize ))
return log("db: Posdb init failed.");
// . set our own internal rdb


@ -53,6 +53,7 @@
// normally in seo.cpp, but here so it compiles
SafeBuf g_qbuf;
int32_t g_qbufNeedSave = 0;
bool g_inAutoSave;
// for resetAll()
//#include "Msg6.h"
@ -467,6 +468,7 @@ Process::Process ( ) {
}
bool Process::init ( ) {
g_inAutoSave = false;
// -1 means unknown
m_diskUsage = -1.0;
m_diskAvail = -1LL;
@ -1331,7 +1333,9 @@ void processSleepWrapper ( int fd , void *state ) {
g_process.m_lastSaveTime = nextLastSaveTime;//now;
// save everything
logf(LOG_INFO,"db: Autosaving.");
g_inAutoSave = 1;
g_process.save();
g_inAutoSave = 0;
}
bool Process::save ( ) {
@ -1874,9 +1878,10 @@ bool Process::saveBlockingFiles1 ( ) {
if ( g_hostdb.m_myHost && g_hostdb.m_myHost->m_isProxy )
g_proxy.saveUserBufs();
// save the Conf file now
// save the gb.conf file now
g_conf.save();
// save the conf files
// if autosave and we have over 20 colls, just make host #0 do it
g_collectiondb.save();
// . save repair state
// . this is repeated above too


@ -618,7 +618,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
qw->m_userTypePhrase == 'a' ) continue;
nqt++;
}
// count phrase terms too!!!
// count single terms
for ( int32_t i = 0 ; i < m_numWords; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord &&
@ -705,7 +705,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
}
//char u8Buf[256];
// count phrase terms
for ( int32_t i = 0 ; i < m_numWords ; i++ ) {
// break out if no more explicit bits!
/*
@ -1019,6 +1019,13 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
if (fieldLen > 0) {
qt->m_term = m_qwords[fieldStart].m_word;
qt->m_termLen = fieldLen;
// fix for query
// text:"" foo bar ""
if ( pw-1 < i ) {
log("query: bad query %s",m_orig);
g_errno = EMALFORMEDQUERY;
return false;
}
// skip past the end of the field value
i = pw-1;
}
@ -2702,6 +2709,19 @@ bool Query::setQWords ( char boolFlag ,
for ( ; s < send && *s != '-' ; s++ );
// stop if not hyphen
if ( *s != '-' ) break;
// If the first character is a hyphen, check
// if it's part of a negative number. If it is,
// don't consider it a hyphen
if ( sav == s && is_digit(s[1]) ) {
// Read the entire negative number
char *s2 = s + 1;
for ( ; s2 < send && is_digit(s2[0]); s2++);
// If there's a hyphen after the negative
// number, use that as the hyphen separator
if ( *s2 == '-' ) s = s2;
}
// skip hyphen
s++;
// must be a digit or . or - or *
@ -2746,6 +2766,23 @@ bool Query::setQWords ( char boolFlag ,
for ( ; s < send && *s != '-' ; s++ );
// stop if not hyphen
if ( *s != '-' ) break;
// If the first character is a hyphen, check
// if it's part of a negative number. If it is,
// don't consider it a hyphen
if ( sav == s && (is_digit(s[1]) ||
(s[1] == '.' &&
s + 2 < send &&
is_digit(s[2]))) ) {
// Read the entire negative number
char *s2 = s + 1;
for ( ; s2 < send &&
(is_digit(s2[0]) || s2[0] == '.'); s2++);
// If there's a hyphen after the negative
// number, use that as the hyphen separator
if ( *s2 == '-' ) s = s2;
}
// save that
char *cma = s;
// skip hyphen
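
Both loops above apply the same rule: a leading hyphen followed by a digit is a sign, not a separator, so a range like -5--2 splits at the second hyphen and parses as [-5,-2]. A standalone sketch of that scan (hypothetical helper; the real code works in place on the query buffer and, as in the second loop, also admits a leading "-." form):

    #include <ctype.h>

    // find the hyphen separating a numeric range like "-5--2",
    // skipping a hyphen that is really the sign of the first number
    const char *findRangeHyphen ( const char *s , const char *send ) {
        if ( s < send && *s == '-' &&
             s+1 < send && isdigit((unsigned char)s[1]) ) {
            const char *s2 = s + 1;
            while ( s2 < send &&
                    (isdigit((unsigned char)*s2) || *s2 == '.') ) s2++;
            s = s2; // resume after the signed number
        }
        while ( s < send && *s != '-' ) s++;
        return s; // the separator hyphen, or send if none
    }
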

Rdb.cpp

@ -1267,6 +1267,7 @@ bool Rdb::dumpTree ( int32_t niceness ) {
if ( m_isTitledb && max > 240 ) max = 240;
// . keep the number of files down
// . dont dump all the way up to the max, leave one open for merging
/*
for ( int32_t i = 0 ; i < getNumBases() ; i++ ) {
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
@ -1279,7 +1280,7 @@ bool Rdb::dumpTree ( int32_t niceness ) {
break;
}
}
*/
// . wait for all unlinking and renaming activity to flush out
// . we do not want to dump to a filename in the middle of being
// unlinked
@ -1450,7 +1451,6 @@ bool Rdb::dumpCollLoop ( ) {
loop:
// if no more, we're done...
if ( m_dumpCollnum >= getNumBases() ) return true;
// the only way g_errno can be set here is from a previous dump
// error?
if ( g_errno ) {
@ -1548,7 +1548,7 @@ bool Rdb::dumpCollLoop ( ) {
m_dbname,mstrerror(g_errno) );
log(LOG_INFO,"build: Dumping to %s/%s for coll \"%s\".",
base->m_files[m_fn]->m_dir,
base->m_files[m_fn]->getDir(),
base->m_files[m_fn]->getFilename() ,
g_collectiondb.getCollName ( m_dumpCollnum ) );
// . append it to "sync" state we have in memory
@ -1667,8 +1667,51 @@ bool Rdb::dumpCollLoop ( ) {
goto loop;
}
static CollectionRec *s_mergeHead = NULL;
static CollectionRec *s_mergeTail = NULL;
static bool s_needsBuild = true;
void addCollnumToLinkedListOfMergeCandidates ( collnum_t dumpCollnum ) {
// add this collection to the linked list of merge candidates
CollectionRec *cr = g_collectiondb.getRec ( dumpCollnum );
if ( ! cr ) return;
// do not double add it, if already there just return
if ( cr->m_nextLink ) return;
if ( cr->m_prevLink ) return;
if ( s_mergeTail && cr ) {
s_mergeTail->m_nextLink = cr;
cr ->m_nextLink = NULL;
cr ->m_prevLink = s_mergeTail;
s_mergeTail = cr;
}
else if ( cr ) {
cr->m_prevLink = NULL;
cr->m_nextLink = NULL;
s_mergeHead = cr;
s_mergeTail = cr;
}
}
// this is also called in Collectiondb::deleteRec2()
void removeFromMergeLinkedList ( CollectionRec *cr ) {
CollectionRec *prev = cr->m_prevLink;
CollectionRec *next = cr->m_nextLink;
cr->m_prevLink = NULL;
cr->m_nextLink = NULL;
if ( prev ) prev->m_nextLink = next;
if ( next ) next->m_prevLink = prev;
if ( s_mergeTail == cr ) s_mergeTail = prev;
if ( s_mergeHead == cr ) s_mergeHead = next;
}
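
One caveat worth a hedged sketch: a list whose only element is cr has both links NULL, so the guard above would not catch re-adding the head. Comparing against s_mergeHead as well covers that case (hypothetical helper, same statics):

// Sketch: treat "has a link, or is the head" as list membership so a
// single-element list is also detected as already added.
static bool isInMergeList(CollectionRec *cr) {
    return cr->m_nextLink || cr->m_prevLink || s_mergeHead == cr;
}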
void doneDumpingCollWrapper ( void *state ) {
Rdb *THIS = (Rdb *)state;
// we just finished dumping to a file,
// so allow it to try to merge again.
//RdbBase *base = THIS->getBase(THIS->m_dumpCollnum);
//if ( base ) base->m_checkedForMerge = false;
// return if the loop blocked
if ( ! THIS->dumpCollLoop() ) return;
// otherwise, call big wrapper
@ -1717,66 +1760,173 @@ void Rdb::doneDumping ( ) {
attemptMergeAll(0,NULL);
}
void forceMergeAll ( char rdbId , char niceness ) {
// set flag on all RdbBases
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
// we need this quickpoll for when we got 20,000+ collections
QUICKPOLL ( niceness );
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
RdbBase *base = cr->getBase ( rdbId );
if ( ! base ) continue;
base->m_nextMergeForced = true;
}
// rebuild the linked list
s_needsBuild = true;
// and try to merge now
attemptMergeAll2 ();
}
// this should be called every few seconds by the sleep callback, too
void attemptMergeAll ( int fd , void *state ) {
if ( state && g_conf.m_logDebugDb ) state = NULL;
//g_checksumdb.getRdb()->attemptMerge ( 1 , false , !state);
g_linkdb.getRdb()->attemptMerge ( 1 , false , !state);
//g_sectiondb.getRdb()->attemptMerge ( 1 , false , !state);
//g_indexdb.getRdb()->attemptMerge ( 1 , false , !state);
g_posdb.getRdb()->attemptMerge ( 1 , false , !state);
//g_datedb.getRdb()->attemptMerge ( 1 , false , !state);
g_titledb.getRdb()->attemptMerge ( 1 , false , !state);
//g_tfndb.getRdb()->attemptMerge ( 1 , false , !state);
g_tagdb.getRdb()->attemptMerge ( 1 , false , !state);
g_catdb.getRdb()->attemptMerge ( 1 , false , !state);
g_clusterdb.getRdb()->attemptMerge ( 1 , false , !state);
g_statsdb.getRdb()->attemptMerge ( 1 , false , !state);
g_syncdb.getRdb()->attemptMerge ( 1 , false , !state);
//g_placedb.getRdb()->attemptMerge ( 1 , false , !state);
g_doledb.getRdb()->attemptMerge ( 1 , false , !state);
//g_revdb.getRdb()->attemptMerge ( 1 , false , !state);
g_spiderdb.getRdb()->attemptMerge ( 1 , false , !state);
g_cachedb.getRdb()->attemptMerge ( 1 , false , !state);
g_serpdb.getRdb()->attemptMerge ( 1 , false , !state);
g_monitordb.getRdb()->attemptMerge ( 1 , false , !state);
// if we got a rebuild going on
g_spiderdb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_checksumdb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_indexdb2.getRdb()->attemptMerge ( 1 , false , !state);
g_posdb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_datedb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_sectiondb2.getRdb()->attemptMerge ( 1 , false , !state);
g_titledb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_tfndb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_tagdb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_catdb2.getRdb()->attemptMerge ( 1 , false , !state);
g_clusterdb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_statsdb2.getRdb()->attemptMerge ( 1 , false , !state);
g_linkdb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_placedb2.getRdb()->attemptMerge ( 1 , false , !state);
//g_revdb2.getRdb()->attemptMerge ( 1 , false , !state);
attemptMergeAll2 ( );
}
// called by main.cpp
void Rdb::attemptMerge ( int32_t niceness , bool forced , bool doLog ) {
// . TODO: if rdbbase::attemptMerge() needs to launch a merge but can't
// then do NOT remove from linked list. maybe set a flag like 'needsMerge'
void attemptMergeAll2 ( ) {
for ( int32_t i = 0 ; i < getNumBases() ; i++ ) {
// wait for any current merge to stop!
if ( g_merge.isMerging() ) return;
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
// if swapped out, this will be NULL, so skip it
RdbBase *base = cr->getBasePtr(m_rdbId);
//RdbBase *base = getBase(i);
if ( ! base ) continue;
base->attemptMerge(niceness,forced,doLog);
// stop if we got unlink/rename threads out from a merge
// in RdbBase.cpp because the merge can't go until this is 0
// lest we have 2000 collections all trying to merge tagdb
// at the same time!!!! this happened once...
if ( g_numThreads > 0 ) break;
int32_t niceness = MAX_NICENESS;
collnum_t s_lastCollnum = 0;
int32_t count = 0;
tryLoop:
// if a collection got deleted, reset this to 0
if ( s_lastCollnum >= g_collectiondb.m_numRecs )
s_lastCollnum = 0;
// limit to 1000 checks to save the cpu since we call this once
// every 2 seconds.
if ( ++count >= 1000 ) return;
CollectionRec *cr = g_collectiondb.m_recs[s_lastCollnum];
if ( ! cr ) goto tryLoop;
bool force = false;
RdbBase *base ;
// args = niceness, forceMergeAll, doLog, minToMergeOverride
// if RdbBase::attemptMerge() returns true that means it
// launched a merge and it will call attemptMergeAll2() when
// the merge completes.
base = cr->getBasePtr(RDB_POSDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB_TITLEDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB_TAGDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB_LINKDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB_SPIDERDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
// try next collection
s_lastCollnum++;
goto tryLoop;
/*
MDW: linked list approach is too prone to error. just try to
merge 1000 collection recs in a call and keep a cursor.
CollectionRec *last = NULL;
CollectionRec *cr;
rebuild:
//
// . if the first time then build the linked list
// . or if we set s_needsBuild to false, like above, re-build it
//
if ( s_needsBuild ) {
s_mergeHead = NULL;
s_mergeTail = NULL;
}
for ( int32_t i=0 ; s_needsBuild && i<g_collectiondb.m_numRecs ; i++) {
// we need this quickpoll for when we got 20,000+ collections
QUICKPOLL ( niceness );
cr = g_collectiondb.getRec(i);//m_recs[i];
if ( ! cr ) continue;
// add it
if ( ! s_mergeHead ) s_mergeHead = cr;
if ( last ) last->m_nextLink = cr;
cr->m_prevLink = last;
cr->m_nextLink = NULL;
s_mergeTail = cr;
last = cr;
}
s_needsBuild = false;
bool force = false;
// . just scan the linked list that we now maintain
// . if a collection is deleted then we remove it from this list too!
cr = s_mergeHead;
while ( cr ) {
QUICKPOLL(niceness);
// this is a requirement in RdbBase::attemptMerge() so check
// for it here so we can bail out early
if ( g_numThreads > 0 ) break;
// sanity
CollectionRec *vr = g_collectiondb.getRec(cr->m_collnum);
if ( vr != cr ) {
log("rdb: attemptmergeall: bad collnum %i. how "
"did this happen?",
(int)cr->m_collnum);
s_needsBuild = true;
goto rebuild;
}
// pre advance
CollectionRec *next = cr->m_nextLink;
// try to merge the next guy in line, in the linked list
RdbBase *base ;
base = cr->getBasePtr(RDB_POSDB);
// args = niceness, forceMergeAll, doLog, minToMergeOverride
// if RdbBase::attemptMerge() returns true that means it
// launched a merge and it will call attemptMergeAll2() when
// the merge completes.
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB_TITLEDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB_TAGDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB_LINKDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
base = cr->getBasePtr(RDB_SPIDERDB);
if ( base && base->attemptMerge(niceness,force,true) )
return;
// hey, why was it in the list? remove it. we also remove
// guys if the collection gets deleted in Collectiondb.cpp,
// so this is a function.
removeFromMergeLinkedList ( cr );
cr = next;
}
// every 60 seconds try to merge collectionless rdbs
static int32_t s_count = 0;
if ( ++s_count == 30 ) {
s_count = 0;
// try to merge collectionless rdbs like statsdb/catdb
// RdbBase *base1 = g_catdb.getRdb()->getBase(0);
// if ( base1 ) base1->attemptMerge(niceness,force,true);
// RdbBase *base2 = g_statsdb.getRdb()->getBase(0);
// if ( base2 ) base2->attemptMerge(niceness,force,true);
}
*/
}
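
A self-contained sketch of the cursor pattern above, with generic names standing in for g_collectiondb and the per-base attemptMerge calls: resume where the last tick stopped and cap the number of checks so a 20,000-collection install stays cheap on the 2-second timer.

#include <vector>
#include <cstddef>

struct Coll;                                    // stand-in for CollectionRec
static bool tryMerge(Coll *) { return false; }  // stub for base->attemptMerge(...)

static void attemptMergeTick(std::vector<Coll*> &colls) {
    static size_t s_cursor = 0;    // persists across ticks, like s_lastCollnum
    const size_t kBudget = 1000;   // cap per tick, like the "count" check above
    for (size_t n = 0; n < kBudget && !colls.empty(); n++) {
        if (s_cursor >= colls.size()) s_cursor = 0;
        Coll *cr = colls[s_cursor++];
        if (cr && tryMerge(cr)) return;  // a launched merge re-arms us on completion
    }
}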
// . return false and set g_errno on error
@ -3152,7 +3302,10 @@ RdbBase *getRdbBase ( uint8_t rdbId , char *coll ) {
collnum = (collnum_t) 0;
else
collnum = g_collectiondb.getCollnum ( coll );
if(collnum == -1) return NULL;
if(collnum == -1) {
g_errno = ENOCOLLREC;
return NULL;
}
//return rdb->m_bases [ collnum ];
return rdb->getBase(collnum);
}
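
With ENOCOLLREC set here, callers can tell a deleted collection apart from other NULL returns. A hedged fragment of the intended caller contract (compare the RdbMerge.cpp hunk below, which had to set ENOCOLLREC itself):

RdbBase *base = getRdbBase(rdbId, coll);
if (!base) {
    if (g_errno == ENOCOLLREC) return true;  // collection gone: finish cleanly
    return false;                            // anything else: report the error
}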

9
Rdb.h
View File

@ -15,6 +15,9 @@
bool makeTrashDir() ;
void removeFromMergeLinkedList ( class CollectionRec *cr ) ;
void addCollnumToLinkedListOfMergeCandidates ( collnum_t dumpCollnum ) ;
// . each Rdb instance has an ID
// . these ids are also return values for getIdFromRdb()
#define RDB_START 1
@ -78,8 +81,10 @@ char *getDbnameFromId ( uint8_t rdbId ) ;
char getKeySizeFromRdbId ( uint8_t rdbId );
// and this is -1 if dataSize is variable
int32_t getDataSizeFromRdbId ( uint8_t rdbId );
void forceMergeAll ( char rdbId , char niceness ) ;
// main.cpp calls this
void attemptMergeAll ( int fd , void *state ) ;
void attemptMergeAll2 ( );
class Rdb {
@ -277,8 +282,8 @@ class Rdb {
// private:
void attemptMerge ( int32_t niceness , bool forceMergeAll ,
bool doLog = true );
//void attemptMerge ( int32_t niceness , bool forceMergeAll ,
// bool doLog = true );
bool gotTokenForDump ( ) ;
//void gotTokenForMerge ( ) ;

View File

@ -1,7 +1,7 @@
#include "gb-include.h"
#include "Rdb.h"
#include "Msg35.h"
//#include "Msg35.h"
//#include "Tfndb.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
@ -99,6 +99,7 @@ void RdbBase::reset ( ) {
m_hasMergeFile = false;
m_isUnlinking = false;
m_numThreads = 0;
m_checkedForMerge = false;
}
RdbBase::~RdbBase ( ) {
@ -340,6 +341,11 @@ bool RdbBase::init ( char *dir ,
// load any saved tree
//if ( ! loadTree ( ) ) return false;
// now diskpagecache is much simpler, just basically rdbcache...
return true;
/*
// . init BigFile::m_fileSize and m_lastModifiedTime
// . m_lastModifiedTime is now used by the merge to select older
// titledb files to merge
@ -423,6 +429,7 @@ bool RdbBase::init ( char *dir ,
//int32_t n = f.write ( buf , 128*1024*5+10 , 0 );
//fprintf(stderr,"n=%"INT32"\n",n);
return true;
*/
}
// . move all files into trash subdir
@ -710,38 +717,67 @@ bool RdbBase::setFiles ( ) {
// return the fileNum we added it to in the array
// return -1 and set g_errno on error
int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t id2 ,
bool converting ) {
int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum ,
int32_t id2 , bool converting ) {
int32_t n = m_numFiles;
// can't exceed this
if ( n >= MAX_RDB_FILES ) {
g_errno = ETOOMANYFILES;
log(LOG_LOGIC,
"db: Can not have more than %"INT32" files. File add failed.",
(int32_t)MAX_RDB_FILES);
"db: Can not have more than %"INT32" files. File add "
"failed.",(int32_t)MAX_RDB_FILES);
return -1;
}
// HACK: skip to avoid an OOM lockup. if RdbBase cannot dump
// its data to disk it can backlog everyone and memory will
// never get freed up.
int64_t mm = g_mem.m_maxMem;
g_mem.m_maxMem = 0x0fffffffffffffffLL;
int64_t mm = g_conf.m_maxMem;
g_conf.m_maxMem = 0x0fffffffffffffffLL;
BigFile *f ;
try { f = new (BigFile); }
catch ( ... ) {
g_mem.m_maxMem = mm;
g_conf.m_maxMem = mm;
g_errno = ENOMEM;
log("RdbBase: new(%i): %s",
(int)sizeof(BigFile),mstrerror(g_errno));
return -1;
}
mnew ( f , sizeof(BigFile) , "RdbBFile" );
// set the data file's filename
char name[512];
if ( mergeNum <= 0 && m_isTitledb )
snprintf(name,511,"%s%04"INT32"-%03"INT32".dat",
m_dbname,id,id2 );
else if ( mergeNum <= 0 )
snprintf ( name ,511,"%s%04"INT32".dat" , m_dbname, id );
else if ( m_isTitledb )
snprintf ( name ,511,"%s%04"INT32"-%03"INT32".%03"INT32".dat",
m_dbname, id , id2, mergeNum );
else
snprintf(name,511,"%s%04"INT32".%03"INT32".dat",
m_dbname,id,mergeNum);
f->set ( getDir() , name , NULL ); // getStripeDir() );
// if new, ensure it does not exist
if ( isNew && f->doesExist() ) {
log("rdb: creating NEW file %s/%s which already exists!",
f->getDir(),
f->getFilename());
mdelete ( f , sizeof(BigFile),"RdbBFile");
delete (f);
return -1;
char *xx=NULL;*xx=0;
}
RdbMap *m ;
try { m = new (RdbMap); }
catch ( ... ) {
g_mem.m_maxMem = mm;
g_conf.m_maxMem = mm;
g_errno = ENOMEM;
log("RdbBase: new(%i): %s",
(int)sizeof(RdbMap),mstrerror(g_errno));
@ -751,43 +787,23 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t
}
mnew ( m , sizeof(RdbMap) , "RdbBMap" );
// reinstate the memory limit
g_mem.m_maxMem = mm;
g_conf.m_maxMem = mm;
// sanity check
if ( id2 < 0 && m_isTitledb ) { char *xx = NULL; *xx = 0; }
CollectionRec *cr = NULL;
// set the data file's filename
char name[256];
// if we're converting, just add to m_fileIds and m_fileIds2
if ( converting ) {
log("*-*-*-* Converting titledb files to new file name format");
goto skip;
}
if ( mergeNum <= 0 && m_isTitledb )
sprintf ( name , "%s%04"INT32"-%03"INT32".dat" , m_dbname, id , id2 );
else if ( mergeNum <= 0 )
sprintf ( name , "%s%04"INT32".dat" , m_dbname, id );
else if ( m_isTitledb )
sprintf ( name , "%s%04"INT32"-%03"INT32".%03"INT32".dat",
m_dbname, id , id2, mergeNum );
else
sprintf ( name , "%s%04"INT32".%03"INT32".dat", m_dbname, id , mergeNum);
f->set ( getDir() , name , NULL ); // getStripeDir() );
// if new, ensure it does not exist
if ( isNew && f->doesExist() ) {
log("rdb: creating NEW file %s/%s which already exists!",
f->m_dir,
f->getFilename());
char *xx=NULL;*xx=0;
}
// debug help
if ( isNew )
log("rdb: adding new file %s/%s",// m_numFiles=%"INT32"",
f->m_dir,f->getFilename());//,m_numFiles);
f->getDir(),f->getFilename());//,m_numFiles);
// rename bug fix?
/*
@ -805,13 +821,15 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t
// if not a new file sanity check it
for ( int32_t j = 0 ; ! isNew && j < f->m_maxParts - 1 ; j++ ) {
File *ff = f->m_files[j];
// might be headless
File *ff = f->getFile2(j);//m_files[j];
if ( ! ff ) continue;
if ( ff->getFileSize() == MAX_PART_SIZE ) continue;
log ( "db: File %s has length %"INT64", but it should be %"INT64". "
log ( "db: File %s/%s has length %"INT64", but it should be %"INT64". "
"You should move it to a temporary directory "
"and restart. It probably happened when the power went "
"out and a file delete operation failed to complete.",
f->getDir(),
ff->getFilename() ,
(int64_t)ff->getFileSize(),
(int64_t)MAX_PART_SIZE);
@ -870,7 +888,8 @@ int32_t RdbBase::addFile ( int32_t id , bool isNew , int32_t mergeNum , int32_t
// these writes because it is not initialized yet and will
// cause this write to fail!
g_statsdb.m_disabled = true;
bool status = m->writeMap();
// true = alldone
bool status = m->writeMap( true );
g_statsdb.m_disabled = false;
if ( ! status ) return log("db: Save failed.");
}
@ -1002,7 +1021,7 @@ bool RdbBase::incorporateMerge ( ) {
// exit merge mode
m_isMerging = false;
// return the merge token, no need for a callback
g_msg35.releaseToken ( );
//g_msg35.releaseToken ( );
//return true;
}
// file #x is the merge file
@ -1026,7 +1045,8 @@ bool RdbBase::incorporateMerge ( ) {
log(LOG_INFO,"db: Writing map %s.",m_maps[x]->getFilename());
// . ensure we can save the map before deleting other files
// . sets g_errno and return false on error
m_maps[x]->writeMap();
// . allDone = true
m_maps[x]->writeMap( true );
// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
@ -1102,7 +1122,7 @@ bool RdbBase::incorporateMerge ( ) {
if ( ! m_files[i] ) continue;
// debug msg
log(LOG_INFO,"merge: Unlinking merged file %s/%s (#%"INT32").",
m_files[i]->m_dir,m_files[i]->getFilename(),i);
m_files[i]->getDir(),m_files[i]->getFilename(),i);
// . append it to "sync" state we have in memory
// . when host #0 sends a OP_SYNCTIME signal we dump to disk
//g_sync.addOp ( OP_UNLINK , m_files[i] , 0 );
@ -1226,7 +1246,8 @@ void RdbBase::doneWrapper2 ( ) {
void doneWrapper3 ( void *state ) {
RdbBase *THIS = (RdbBase *)state;
log("rdb: thread completed rename operation for collnum=%"INT32" "
"#threads=%"INT32"",(int32_t)THIS->m_collnum,THIS->m_numThreads);
"#thisbaserenamethreads=%"INT32"",
(int32_t)THIS->m_collnum,THIS->m_numThreads-1);
THIS->doneWrapper4 ( );
}
@ -1243,7 +1264,7 @@ void RdbBase::doneWrapper4 ( ) {
if ( --m_numThreads > 0 ) return;
}
// some int16_thand variable notation
// some shorthand variable notation
int32_t a = m_mergeStartFileNum;
int32_t b = m_mergeStartFileNum + m_numFilesToMerge;
@ -1290,7 +1311,7 @@ void RdbBase::doneWrapper4 ( ) {
// exit merge mode
m_isMerging = false;
// return the merge token, no need for a callback
g_msg35.releaseToken ( );
//g_msg35.releaseToken ( );
// the rename has completed at this point, so tell sync table in mem
//g_sync.addOp ( OP_CLOSE , m_files[x] , 0 );
// unlink old merge filename from sync table
@ -1304,6 +1325,7 @@ void RdbBase::doneWrapper4 ( ) {
//attemptMerge ( 1/*niceness*/ , false /*don't force it*/ ) ;
// try all in case they were waiting (and not using tokens)
//g_tfndb.getRdb()->attemptMerge ( 1 , false );
/*
g_clusterdb.getRdb()->attemptMerge ( 1 , false );
g_linkdb.getRdb()->attemptMerge ( 1 , false );
//g_sectiondb.getRdb()->attemptMerge ( 1 , false );
@ -1323,6 +1345,10 @@ void RdbBase::doneWrapper4 ( ) {
g_posdb.getRdb()->attemptMerge ( 1 , false );
//g_datedb.getRdb()->attemptMerge ( 1 , false );
g_spiderdb.getRdb()->attemptMerge ( 1 , false );
*/
// try to merge more when we are done
attemptMergeAll2 ( );
}
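
Taken together, every completion path now funnels back into this single scan instead of the per-rdb fan-out commented out above; as visible in this commit:

// dump done   -> Rdb::doneDumping() -> attemptMergeAll(0,NULL) -> attemptMergeAll2()
// merge done  -> RdbBase::doneWrapper4() -> attemptMergeAll2()
// forced      -> forceMergeAll() -> attemptMergeAll2()
// sleep timer -> attemptMergeAll(fd,state) -> attemptMergeAll2()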
void RdbBase::buryFiles ( int32_t a , int32_t b ) {
@ -1360,28 +1386,33 @@ void attemptMergeWrapper ( int fd , void *state ) {
}
*/
static void gotTokenForMergeWrapper ( void *state ) ;
//static void gotTokenForMergeWrapper ( void *state ) ;
// the DailyMerge.cpp will set minToMergeOverride for titledb, and this
// overrides "forceMergeAll" which is the same as setting "minToMergeOverride"
// to "2". (i.e. perform a merge if you got 2 or more files)
void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
// . the DailyMerge.cpp will set minToMergeOverride for titledb, and this
// overrides "forceMergeAll" which is the same as setting
// "minToMergeOverride" to "2". (i.e. perform a merge if you got 2 or more
// files)
// . now return true if we started a merge, false otherwise
// . TODO: fix Rdb::attemptMergeAll() to not remove from linked list if
// we had an error in addNewFile() or rdbmerge.cpp's call to rdbbase::addFile
bool RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
int32_t minToMergeOverride ) {
// don't do merge if we're in read only mode
if ( g_conf.m_readOnlyMode ) return ;
if ( g_conf.m_readOnlyMode ) return false;
// or if we are copying our files to a new host
//if ( g_hostdb.m_syncHost == g_hostdb.m_myHost ) return;
// nor if EITHER of the merge classes are suspended
if ( g_merge.m_isSuspended ) return;
if ( g_merge2.m_isSuspended ) return;
if ( g_merge.m_isSuspended ) return false;
if ( g_merge2.m_isSuspended ) return false;
// shutting down? do not start another merge then
if ( g_process.m_mode == EXIT_MODE ) return;
if ( g_process.m_mode == EXIT_MODE ) return false;
// sanity checks
if ( g_loop.m_inQuickPoll ) {
log("rdb: cant attempt merge in quickpoll");
return;
return false;
}
if ( niceness == 0 ) { char *xx=NULL;*xx=0; }
@ -1398,7 +1429,7 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
if ( doLog )
log(LOG_INFO,"db: Can not merge titledb while it "
"is dumping.");
return;
return false;
}
// or if in repair mode, do not mess with any files in any coll
@ -1433,7 +1464,7 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
log(LOG_INFO,"merge: Waiting for unlink/rename "
"operations to finish before attempting merge "
"for %s. (collnum=%"INT32")",m_dbname,(int32_t)m_collnum);
return;
return false;
}
if ( g_numThreads > 0 ) {
@ -1442,7 +1473,7 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
"collection's unlink/rename "
"operations to finish before attempting merge "
"for %s (collnum=%"INT32").",m_dbname,(int32_t)m_collnum);
return;
return false;
}
@ -1574,7 +1605,7 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
if ( m->isMerging() )
return;
return false;
// if we are tfndb and someone else is merging, do not merge unless
// we have 3 or more files
@ -1598,7 +1629,7 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
// this triggers the negative rec concentration msg below and
// tries to merge on one file...
if ( ! resuming && m_numFiles <= 1 ) return;
if ( ! resuming && m_numFiles <= 1 ) return false;
// what percent of recs in the collections' rdb are negative?
// the rdbmaps hold this info
@ -1633,7 +1664,14 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
// . don't merge if we don't have the min # of files
// . but skip this check if there is a merge to be resumed from b4
if ( ! resuming && ! forceMergeAll && numFiles < minToMerge ) return;
if ( ! resuming && ! forceMergeAll && numFiles < minToMerge ) {
// now we no longer have to check this collection rdb for
// merging. this will save a lot of cpu time when we have
// 20,000+ collections. if we dump a file to disk for it
// then we set this flag back to false in Rdb.cpp.
m_checkedForMerge = true;
return false;
}
// bail if already merging THIS class
if ( m_isMerging ) {
@ -1641,14 +1679,14 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
log(LOG_INFO,
"merge: Waiting for other merge to complete "
"before merging %s.",m_dbname);
return;
return false;
}
// bail if already waiting for it
if ( m_waitingForTokenForMerge ) {
if ( doLog )
log(LOG_INFO,"merge: Already requested token. "
"Request for %s pending.",m_dbname);
return;
return false;
}
// score it
m_waitingForTokenForMerge = true;
@ -1704,15 +1742,15 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
// gotTokenForMergeWrapper() may be called multiple times
// . if a host is always in urgent mode he may starve another host
// who is too, but whose old request has a low priority.
int32_t priority = 0;
//int32_t priority = 0;
// save this so gotTokenForMerge() can use it
m_doLog = doLog;
//if ( m_mergeUrgent ) priority = 2;
//else priority = 0;
// tfndb doesn't need token, since titledb merge writes tfndb recs
if ( //m_rdb != g_tfndb.getRdb() &&
! g_msg35.getToken ( this , gotTokenForMergeWrapper, priority ) )
return ;
// if ( //m_rdb != g_tfndb.getRdb() &&
// ! g_msg35.getToken ( this , gotTokenForMergeWrapper, priority))
// return ;
// bitch if we got token because there was an error somewhere
if ( g_errno ) {
log(LOG_LOGIC,"merge: attemptMerge: %s failed: %s",
@ -1722,13 +1760,14 @@ void RdbBase::attemptMerge ( int32_t niceness, bool forceMergeAll, bool doLog ,
// undo request
m_waitingForTokenForMerge = false;
// we don't have the token, so we're fucked...
return;
return false;
}
// debug msg
//if ( doLog )
//log(LOG_INFO,"merge: Got merge token for %s without blocking.",
// m_dbname);
// if did not block
/*
gotTokenForMerge ( );
}
@ -1738,14 +1777,16 @@ void gotTokenForMergeWrapper ( void *state ) {
}
void RdbBase::gotTokenForMerge ( ) {
*/
// debug msg
//log("RdbBase::gotTokenForMerge: for %s",m_dbname);
// don't repeat
m_waitingForTokenForMerge = false;
// if a dump is happening it will always be the last file, do not
// include it in the merge
int32_t numFiles = m_numFiles;
if ( numFiles > 0 && m_dump->isDumping() ) numFiles--;
//int32_t numFiles = m_numFiles;
//if ( numFiles > 0 && m_dump->isDumping() ) numFiles--;
// . if we are significantly over our m_minToMerge limit
// then set m_mergeUrgent to true so merge disk operations will
// starve any spider disk reads (see Threads.cpp for that)
@ -1760,7 +1801,7 @@ void RdbBase::gotTokenForMerge ( ) {
g_numUrgentMerges++;
}
// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
//RdbMerge *m = &g_merge;
//if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// sanity check
if ( m_isMerging || m->isMerging() ) {
@ -1769,11 +1810,11 @@ void RdbBase::gotTokenForMerge ( ) {
//"merge: Someone already merging. Waiting for "
//"merge token "
//"in order to merge %s.",m_dbname);
return;
return false;
}
// or if # threads out is positive
if ( m_numThreads > 0 ) return;
if ( m_numThreads > 0 ) return false;
// clear for take-off
//m_inWaiting = false;
@ -1793,7 +1834,7 @@ void RdbBase::gotTokenForMerge ( ) {
int32_t mini ;
bool minOld ;
int32_t id2 = -1;
int32_t minToMerge;
//int32_t minToMerge;
bool overide = false;
//int32_t smini = - 1;
//int32_t sn ;
@ -1807,7 +1848,7 @@ void RdbBase::gotTokenForMerge ( ) {
// goto skip;
//}
char rdbId = getIdFromRdb ( m_rdb );
//char rdbId = getIdFromRdb ( m_rdb );
// if one file is even #'ed then we were merging into that, but
// got interrupted and restarted. maybe the power went off or maybe
@ -1838,8 +1879,8 @@ void RdbBase::gotTokenForMerge ( ) {
if ( n <= 1 ) {
log(LOG_LOGIC,"merge: attemptMerge: Resuming. bad "
"engineer");
g_msg35.releaseToken();
return;
//g_msg35.releaseToken();
return false;
}
// make a log note
log(LOG_INFO,"merge: Resuming killed merge for %s coll=%s.",
@ -1903,7 +1944,7 @@ void RdbBase::gotTokenForMerge ( ) {
//File *mf = m_maps[j]->getFile();
m_maps[j]->rename(fbuf);
log("merge: renaming final merged file %s",fbuf);
return;
return false;
}
// resume the merging
@ -2116,8 +2157,8 @@ void RdbBase::gotTokenForMerge ( ) {
if ( mini == -1 ) {
log(LOG_LOGIC,"merge: gotTokenForMerge: Bad engineer. mini "
"is -1.");
g_msg35.releaseToken();
return;
//g_msg35.releaseToken();
return false;
}
// . merge from file #mini through file #(mini+n)
// . these files should all have ODD fileIds so we can sneak a new
@ -2133,8 +2174,8 @@ void RdbBase::gotTokenForMerge ( ) {
log(LOG_LOGIC,"merge: attemptMerge: could not add "
"new file for titledb. No avail ids.");
g_errno = 0;
g_msg35.releaseToken();
return;
//g_msg35.releaseToken();
return false;
}
}
// . make a filename for the merge
@ -2148,8 +2189,8 @@ void RdbBase::gotTokenForMerge ( ) {
if ( mergeFileNum < 0 ) {
log(LOG_LOGIC,"merge: attemptMerge: Could not add new file.");
g_errno = 0;
g_msg35.releaseToken();
return;
//g_msg35.releaseToken();
return false;
}
// we just opened a new file
//g_sync.addOp ( OP_OPEN , m_files[mergeFileNum] , 0 );
@ -2167,8 +2208,8 @@ void RdbBase::gotTokenForMerge ( ) {
if ( n <= 1 && ! overide ) {
log(LOG_LOGIC,"merge: gotTokenForMerge: Not merging %"INT32" files.",
n);
g_msg35.releaseToken();
return;
//g_msg35.releaseToken();
return false;
}
// . save the # of files we're merging for the cleanup process
@ -2176,7 +2217,7 @@ void RdbBase::gotTokenForMerge ( ) {
m_numFilesToMerge = n ; // numFiles - 1;
m_mergeStartFileNum = mergeFileNum + 1; // 1
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
char *coll = "";
if ( cr ) coll = cr->m_coll;
@ -2224,7 +2265,9 @@ void RdbBase::gotTokenForMerge ( ) {
m_niceness ,
m_pc ,
mint /*maxTargetFileSize*/ ,
m_ks ) ) return;
m_ks ) )
// we started the merge so return true here
return true;
// hey, we're no longer merging i guess
m_isMerging = false;
// decrement this count
@ -2243,9 +2286,12 @@ void RdbBase::gotTokenForMerge ( ) {
m_dbname,mstrerror(g_errno));
g_errno = 0;
// give token back
g_msg35.releaseToken();
//g_msg35.releaseToken();
// try again
m_rdb->attemptMerge( m_niceness, false , true );
//m_rdb->attemptMerge( m_niceness, false , true );
// how did this happen?
log("merge: did not block for some reason.");
return true;
}
// . use the maps and tree to estimate the size of this list w/o hitting disk
@ -2480,16 +2526,18 @@ void RdbBase::saveMaps ( bool useThread ) {
log("base: map for file #%i is null",i);
continue;
}
m_maps[i]->writeMap ( );
m_maps[i]->writeMap ( false );
}
}
void RdbBase::verifyDiskPageCache ( ) {
if ( !m_pc ) return;
for ( int32_t i = 0; i < m_numFiles; i++ ){
BigFile *f = m_files[i];
m_pc->verifyData(f);
}
// disable for now
return;
// for ( int32_t i = 0; i < m_numFiles; i++ ){
// BigFile *f = m_files[i];
// m_pc->verifyData(f);
// }
}
bool RdbBase::verifyFileSharding ( ) {

View File

@ -231,13 +231,15 @@ class RdbBase {
// private:
void attemptMerge ( int32_t niceness , bool forceMergeAll ,
// returns true if merge was started, false if no merge could
// be launched right now for some reason.
bool attemptMerge ( int32_t niceness , bool forceMergeAll ,
bool doLog = true ,
// -1 means to not override it
int32_t minToMergeOverride = -1 );
bool gotTokenForDump ( ) ;
void gotTokenForMerge ( ) ;
//bool gotTokenForDump ( ) ;
//void gotTokenForMerge ( ) ;
// called after merge completed
bool incorporateMerge ( );
@ -420,7 +422,9 @@ class RdbBase {
// key size
char m_ks;
bool m_checkedForMerge;
int32_t m_pageSize;
// are we waiting on another merge/dump to complete before our turn?

View File

@ -1802,7 +1802,9 @@ void RdbBuckets::cleanBuckets ( ) {
for ( int32_t i = 0; i < m_numBuckets; i++ ) {
RdbBucket *b = m_buckets[i];
collnum_t collnum = b->getCollnum();
CollectionRec *cr = g_collectiondb.m_recs[collnum];
CollectionRec *cr = NULL;
if ( collnum < g_collectiondb.m_numRecs )
cr = g_collectiondb.m_recs[collnum];
if ( cr ) continue;
// count # deleted
count += b->getNumKeys();
@ -2244,14 +2246,16 @@ int64_t RdbBuckets::fastLoadColl( BigFile *f,
m_dbname = dbname;
if ( g_errno ) return -1;
if ( g_errno )
return -1;
for (int32_t i = 0; i < numBuckets; i++ ) {
m_buckets[i] = bucketFactory();
if(m_buckets[i] == NULL) return -1;
offset = m_buckets[i]->fastLoad(f, offset);
// returns -1 on error
if ( offset < 0 ) return -1;
if ( offset < 0 )
return -1;
m_numBuckets++;
}
return offset;
@ -2312,7 +2316,7 @@ int64_t RdbBucket::fastSave_r(int fd, int64_t offset) {
}
int64_t RdbBucket::fastLoad(BigFile *f, int64_t offset) {
errno = 0;
//errno = 0;
f->read ( &m_collnum,sizeof(collnum_t), offset );
offset += sizeof(collnum_t);
@ -2333,7 +2337,10 @@ int64_t RdbBucket::fastLoad(BigFile *f, int64_t offset) {
offset += recSize*m_numKeys;
m_endKey = m_keys + endKeyOffset;
if ( g_errno ) {
log("bucket: fastload %s",mstrerror(g_errno));
return -1;
}
if(errno) return -1;
return offset;
}

View File

@ -28,8 +28,8 @@
// allocating if the record size is 256k or more. Copying 256k only
// takes .1 ms on the P4 2.60GHz. This is on the TODO list.
#ifndef _RDBCACHE_H_
#define _RDBCACHE_H_
#ifndef RDBCACHE_H
#define RDBCACHE_H
// . TODO:
// . if size of added rec is ABOVE this, then don't use our memory buffer

View File

@ -229,8 +229,15 @@ void RdbDump::doneDumping ( ) {
// did collection get deleted/reset from under us?
if ( saved == ENOCOLLREC ) return;
// save the map to disk
if ( m_map ) m_map->writeMap();
// save the map to disk. true = allDone
if ( m_map ) m_map->writeMap( true );
// now try to merge this collection/db again
// if not already in the linked list. but do not add to linked list
// if it is statsdb or catdb.
if ( m_rdb && ! m_rdb->m_isCollectionLess )
addCollnumToLinkedListOfMergeCandidates ( m_collnum );
#ifdef GBSANITYCHECK
// sanity check
log("DOING SANITY CHECK FOR MAP -- REMOVE ME");
@ -678,10 +685,11 @@ bool RdbDump::doneDumpingList ( bool addToMap ) {
// note it
log(LOG_LOGIC,"db: setting fd for vfd to -1.");
// mark our fd as not there...
int32_t i = (m_offset - m_bytesToWrite) / MAX_PART_SIZE;
//int32_t i=(m_offset-m_bytesToWrite) / MAX_PART_SIZE;
// sets s_fds[vfd] to -1
if ( m_file->m_files[i] )
releaseVfd ( m_file->m_files[i]->m_vfd );
// MDW: no, can't do this now
// if ( m_file->m_files[i] )
// releaseVfd ( m_file->m_files[i]->m_vfd );
}
//log("RdbDump::doneDumpingList: retrying.");
return dumpList ( m_list , m_niceness , true );

View File

@ -48,13 +48,14 @@ void RdbMap::set ( char *dir , char *mapFilename,
bool RdbMap::close ( bool urgent ) {
bool status = true;
if ( /*mdw m_numPages > 0 &&*/ m_needToWrite ) status = writeMap ( );
if ( /*mdw m_numPages > 0 &&*/ m_needToWrite ) status=writeMap(false);
// clears and frees everything
if ( ! urgent ) reset ();
return status;
}
void RdbMap::reset ( ) {
m_reducedMem = false;
m_generatingMap = false;
int32_t pps = PAGES_PER_SEGMENT;
if ( m_newPagesPerSegment > 0 ) pps = m_newPagesPerSegment;
@ -70,8 +71,8 @@ void RdbMap::reset ( ) {
// the ptrs themselves are now a dynamic array to save mem
// when we have thousands of collections
mfree(m_keys,m_numSegmentPtrs*sizeof(char *),"MapPtrs");
mfree(m_offsets,m_numSegmentOffs*sizeof(int16_t *),"MapPtrs");
mfree(m_keys,m_numSegmentPtrs*sizeof(char *),"MapPtrs1");
mfree(m_offsets,m_numSegmentOffs*sizeof(int16_t *),"MapPtrs2");
m_numSegmentPtrs = 0;
m_numSegmentOffs = 0;
@ -93,10 +94,12 @@ void RdbMap::reset ( ) {
m_lastLogTime = 0;
m_badKeys = 0;
m_needVerify = false;
m_file.reset();
}
bool RdbMap::writeMap ( ) {
bool RdbMap::writeMap ( bool allDone ) {
if ( g_conf.m_readOnlyMode ) return true;
// return true if nothing to write out
// mdw if ( m_numPages <= 0 ) return true;
@ -112,6 +115,8 @@ bool RdbMap::writeMap ( ) {
// . close map
// . no longer since we use BigFile
//m_file.close ( );
// map is done so save some memory
if ( allDone ) reduceMemFootPrint () ;
// return status
return status;
}
@ -236,7 +241,7 @@ bool RdbMap::verifyMap ( BigFile *dataFile ) {
"db: Map file %s says that file %s should be %"INT64" bytes "
"long, but it is %"INT64" bytes.",
m_file.getFilename(),
dataFile->m_baseFilename ,
dataFile->getFilename() ,
m_offset - m_fileStartOffset ,
dataFile->getFileSize() );
// we let headless files squeak by on this because we cannot
@ -290,7 +295,7 @@ bool RdbMap::verifyMap ( BigFile *dataFile ) {
dataFile->doesPartExist ( numMissingParts-1 ) )
numMissingParts--;
if ( numMissingParts > 0 ) {
File *f = dataFile->getFile ( numMissingParts );
File *f = dataFile->getFile2 ( numMissingParts );
if ( f ) log("db: Missing part file before %s.",
f->getFilename());
}
@ -328,7 +333,7 @@ bool RdbMap::verifyMap2 ( ) {
"Map or data file is "
"corrupt, but it is probably the data file. Please "
"delete the map file and restart.",
m_file.m_dir,m_file.getFilename() ,
m_file.getDir(),m_file.getFilename() ,
i,(int64_t)m_pageSize*(int64_t)i+getOffset(i));
//log("db: oldk.n1=%08"XINT32" n0=%016"XINT64"",
@ -341,13 +346,16 @@ bool RdbMap::verifyMap2 ( ) {
SafeBuf cmd;
cmd.safePrintf("mv %s/%s %s/trash/",
m_file.m_dir,
m_file.getDir(),
m_file.getFilename(),
g_hostdb.m_dir);
log("db: %s",cmd.getBufStart() );
gbsystem ( cmd.getBufStart() );
exit(0);
//exit(0);
// make the bash shell restart us by returning a 1 error code
exit(1);
//char *xx=NULL;*xx=0;
// was k too small?
//if ( i + 1 < m_numPages && lastKey <= getKey(i+1) ) {
@ -504,6 +512,7 @@ int64_t RdbMap::readSegment ( int32_t seg , int64_t offset , int32_t fileSize )
bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
// calculate size of the whole slot
//int32_t size = sizeof(key_t) ;
if ( m_reducedMem ) { char *xx=NULL;*xx=0; }
// include the dataSize, 4 bytes, for each slot if it's not fixed
//if ( m_fixedDataSize == -1 ) size += 4;
// include the data
@ -554,7 +563,7 @@ bool RdbMap::addRecord ( char *key, char *rec , int32_t recSize ) {
//pageNum > 0 && getKey(pageNum-1) > getKey(pageNum) ) {
log(LOG_LOGIC,"build: RdbMap: added key out of order. "
"count=%"INT64" file=%s/%s.",m_badKeys,
m_file.m_dir,m_file.getFilename());
m_file.getDir(),m_file.getFilename());
//log(LOG_LOGIC,"build: k.n1=%"XINT32" %"XINT64" lastKey.n1=%"XINT32" %"XINT64"",
// key.n1,key.n0,m_lastKey.n1,m_lastKey.n0 );
log(LOG_LOGIC,"build: offset=%"INT64"",
@ -661,6 +670,9 @@ bool RdbMap::prealloc ( RdbList *list ) {
if ( list->m_ks != m_ks ) { char *xx = NULL; *xx = 0; }
// bail now if it's empty
if ( list->isEmpty() ) return true;
if ( m_reducedMem ) { char *xx=NULL;*xx=0; }
// what is the last page we touch?
int32_t lastPageNum = (m_offset + list->getListSize() - 1) / m_pageSize;
// . need to pre-alloc up here so malloc does not fail mid stream
@ -690,6 +702,9 @@ bool RdbMap::addList ( RdbList *list ) {
// what is the last page we touch?
int32_t lastPageNum = (m_offset + list->getListSize() - 1) / m_pageSize;
if ( m_reducedMem ) { char *xx=NULL;*xx=0; }
// . need to pre-alloc up here so malloc does not fail mid stream
// . TODO: only do it if list is big enough
while ( lastPageNum + 2 >= m_maxNumPages ) {
@ -759,6 +774,8 @@ bool RdbMap::addIndexList ( IndexList *list ) {
// return now if empty
if ( list->isEmpty() ) return true;
if ( m_reducedMem ) { char *xx=NULL;*xx=0; }
// we need to call writeMap() before we exit
m_needToWrite = true;
@ -1231,6 +1248,7 @@ int64_t RdbMap::getMemAlloced ( ) {
}
bool RdbMap::addSegmentPtr ( int32_t n ) {
if ( m_reducedMem ) { char *xx=NULL;*xx=0; }
// realloc
if ( n >= m_numSegmentPtrs ) {
char **k;
@ -1238,7 +1256,7 @@ bool RdbMap::addSegmentPtr ( int32_t n ) {
k = (char **) mrealloc (m_keys,
m_numSegmentPtrs * sizeof(char *) ,
nn * sizeof(char *) ,
"MapPtrs" );
"MapPtrs1" );
// failed?
if ( ! k ) return false;
// succeeded
@ -1253,7 +1271,7 @@ bool RdbMap::addSegmentPtr ( int32_t n ) {
o = (int16_t **) mrealloc (m_offsets,
m_numSegmentOffs * sizeof(int16_t *) ,
nn * sizeof(int16_t *) ,
"MapPtrs" );
"MapPtrs2" );
// failed?
if ( ! o ) return false;
// succeeded
@ -1267,6 +1285,24 @@ bool RdbMap::addSegmentPtr ( int32_t n ) {
void RdbMap::reduceMemFootPrint () {
if ( m_numSegments != 1 ) return;
if ( m_numPages >= 100 ) return;
// if already reduced, return now
if ( m_newPagesPerSegment > 0 ) return;
// if it is like posdb0054.map then it is being merged into and
// we'll resume a killed merge, so don't mess with it, we'll need to
// add more pages.
char *s = m_file.getFilename();
for ( ; s && *s && ! is_digit(*s) ; s++ );
int id = 0;
if ( s ) id = atoi(s);
if ( id && (id % 2) == 0 ) return;
// log("map: reducing mem footprint for %s/%s",
// m_file.getDir(),
// m_file.getFilename());
// seems kinda buggy now..
m_reducedMem = true;
//return;
char *oldKeys = m_keys[0];
short *oldOffsets = m_offsets[0];
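
A hedged sketch of the parity test above as a hypothetical standalone helper; per the merge logic elsewhere in this commit, finished files get odd ids and merge targets even ones, so an even id means the map may still grow and must not be shrunk.

#include <cctype>
#include <cstdlib>

// Sketch: parse the first number in "posdb0054.map" and report whether
// this map belongs to a merge target (even id).
static bool isMergeTargetMap(const char *fname) {
    const char *s = fname;
    while (*s && !isdigit((unsigned char)*s)) s++;  // skip "posdb"
    int id = atoi(s);                               // 54 from "0054"
    return id != 0 && (id % 2) == 0;
}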
@ -1295,6 +1331,8 @@ bool RdbMap::addSegment ( ) {
//if ( n >= MAX_SEGMENTS ) return log("db: Mapped file is "
// "too big. Critical error.");
if ( m_reducedMem ) { char *xx=NULL;*xx=0; }
// the array of up to MAX_SEGMENT pool ptrs is now dynamic too!
// because diffbot uses thousands of collections, this will save
// over 1GB of ram!
@ -1391,7 +1429,7 @@ bool RdbMap::generateMap ( BigFile *f ) {
reset();
if ( g_conf.m_readOnlyMode ) return false;
log("db: Generating map for %s/%s",f->m_dir,f->getFilename());
log("db: Generating map for %s/%s",f->getDir(),f->getFilename());
// we don't support headless datafiles right now
if ( ! f->doesPartExist(0) ) {
@ -1450,6 +1488,11 @@ bool RdbMap::generateMap ( BigFile *f ) {
mfree ( buf , bufSize , "RdbMap");
return true;
}
// debug msg
//fprintf(stderr,"reading map @ off=%"INT64" size=%"INT64"\n"
// , offset , readSize );
// otherwise, read it in
if ( ! f->read ( buf , readSize , offset ) ) {
mfree ( buf , bufSize , "RdbMap");
@ -1640,7 +1683,7 @@ bool RdbMap::truncateFile ( BigFile *f ) {
int32_t numParts = f->getNumParts();
// what part num are we on?
int32_t partnum = f->getPartNum ( m_offset );
File *p = f->getFile ( partnum );
File *p = f->getFile2 ( partnum );
if ( ! p ) return log("db: Unable to get part file.");
// get offset relative to the part file
int32_t newSize = m_offset % (int64_t)MAX_PART_SIZE;
@ -1661,7 +1704,7 @@ bool RdbMap::truncateFile ( BigFile *f ) {
// MAX_TRUNC_SIZE bytes big
File *p2 = NULL;
if ( partnum == numParts-2 ) {
p2 = f->getFile ( partnum + 1 );
p2 = f->getFile2 ( partnum + 1 );
if ( ! p2 ) return log("db: Could not get next part in line.");
if ( p2->getFileSize() > MAX_TRUNC_SIZE )
return log("db: Next part file is bigger than %"INT32" "

View File

@ -111,7 +111,7 @@ class RdbMap {
// . this is totally MTUnsafe
// . don't be calling addRecord while this is dumping
// . flushes when done
bool writeMap ( );
bool writeMap ( bool allDone );
bool writeMap2 ( );
int64_t writeSegment ( int32_t segment , int64_t offset );
@ -342,7 +342,7 @@ class RdbMap {
int16_t **m_offsets;
int32_t m_numSegmentOffs;
bool m_reducedMem;
// number of valid pages in the map.
int32_t m_numPages;

View File

@ -303,7 +303,14 @@ bool RdbMerge::getNextList ( ) {
// no chop threads
m_numThreads = 0;
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *base; if (!(base=getRdbBase(m_rdbId,m_collnum))) return true;
RdbBase *base = getRdbBase(m_rdbId,m_collnum);
if ( ! base ) {
// hmmm, it doesn't set g_errno so we set it here now;
// otherwise we sometimes loop infinitely if the collection
// rec for this collnum was deleted
g_errno = ENOCOLLREC;
return true;
}
// . if a contributor has just surpassed a "part" in his BigFile
// then we can delete that part from the BigFile and the map
for ( int32_t i = m_startFileNum ; i < m_startFileNum + m_numFiles; i++ ){

View File

@ -203,6 +203,7 @@ void gotListWrapper ( void *state ) {
void RdbScan::gotList ( ) {
char *allocBuf = m_fstate.m_allocBuf;
int32_t allocOff = m_fstate.m_allocOff; //buf=allocBuf+allocOff
int32_t allocSize = m_fstate.m_allocSize;
// do not free the allocated buf for when the actual thread
// does the read and finally completes in this case. we free it
@ -226,7 +227,6 @@ void RdbScan::gotList ( ) {
if ( m_fstate.m_allocBuf ) {
// get the buffer info for setting the list
//char *allocBuf = m_fstate.m_allocBuf;
int32_t allocOff = m_fstate.m_allocOff; //buf=allocBuf+allocOff
//int32_t allocSize = m_fstate.m_allocSize;
int32_t bytesDone = m_fstate.m_bytesDone;
// sanity checks
@ -248,16 +248,21 @@ void RdbScan::gotList ( ) {
m_useHalfKeys ,
m_ks );
}
// this was bitching a lot when running on a multinode cluster,
// so i effectively disabled it by changing to _GBSANITYCHECK2_
#ifdef GBSANITYCHECK2
//#ifdef GBSANITYCHECK2
// this first test, tests to make sure the read from cache worked
DiskPageCache *pc = m_file->getDiskPageCache();
if ( pc && ! g_errno ) {
if ( pc &&
! g_errno &&
g_conf.m_logDebugDiskPageCache &&
// if we got it from the page cache, verify with disk
m_fstate.m_inPageCache ) {
// ensure threads disabled
bool on = ! g_threads.areThreadsDisabled();
if ( on ) g_threads.disableThreads();
pc->disableCache();
//pc->disableCache();
FileState fstate;
// ensure we don't mess around
fstate.m_allocBuf = NULL;
@ -274,7 +279,7 @@ void RdbScan::gotList ( ) {
NULL , // callback state
gotListWrapper , // FAKE callback
MAX_NICENESS , // niceness
false, // m_allowPageCache ,
false, // m_allowPageCache ,... not for test!
m_hitDisk ,
16 + m_off );
//char *allocBuf = fstate.m_allocBuf;
@ -289,16 +294,21 @@ void RdbScan::gotList ( ) {
if ( m_bytesToRead != m_list->getListSize() ) {
char *xx = NULL; *xx = 0; }
}
// compare
if ( memcmp ( allocBuf+allocOff, bb , m_bytesToRead ) ) {
log("db: failed diskpagecache verify");
char *xx=NULL;*xx=0;
}
//mfree ( allocBuf , allocSize , "RS" );
mfree ( bb , m_bytesToRead , "RS" );
if ( on ) g_threads.enableThreads();
pc->enableCache();
//pc->enableCache();
// . this test tests to make sure the page stores worked
// . go through each page in page cache and verify on disk
pc->verifyData ( m_file );
//pc->verifyData ( m_file );
}
skip:
#endif
//#endif
// assume we did not shift it
m_shifted = 0;//false;
// if we were doing a cache only read, and got nothing, bail now
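
A hedged sketch of the verification enabled above, assuming the BigFile::read(buf,size,offset) form used elsewhere in this commit: re-read the same range straight from disk and compare it to what the page cache served.

#include <cstring>
#include <cstdlib>

// Sketch (hypothetical helper): true if the cached bytes match disk.
// The real code also disables threads around the re-read and asserts
// on a mismatch instead of returning.
static bool verifyAgainstDisk(BigFile *f, const char *cached,
                              long len, long long off) {
    char *fresh = (char *)malloc(len);
    if (!fresh) return true;              // cannot verify; don't fail the read
    bool ok = f->read(fresh, len, off) && memcmp(cached, fresh, len) == 0;
    free(fresh);
    return ok;
}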

View File

@ -302,8 +302,8 @@ bool Rebalance::saveRebalanceFile ( ) {
binToHex ( (unsigned char *)&m_nextKey , MAX_KEY_BYTES , keyStr );
//log("db: saving rebalance.txt");
SafeBuf sb;
char tmp[30000];
SafeBuf sb(tmp,30000);
sb.safePrintf (
"myshard: %"INT32"\n"
"numshards: %"INT32"\n"

View File

@ -77,6 +77,10 @@ SafeBuf::SafeBuf(char *heapBuf, int32_t bufMax, int32_t bytesInUse, bool ownData
}
SafeBuf::~SafeBuf() {
destructor();
}
void SafeBuf::destructor() {
if(!m_usingStack && m_buf)
mfree(m_buf, m_capacity, "SafeBuf");
m_buf = NULL;
@ -475,7 +479,8 @@ int32_t SafeBuf::safeSave (char *filename ) {
retry22:
// first write to tmp file
SafeBuf fn;
char tmp[1024];
SafeBuf fn(tmp,1024);
fn.safePrintf( "%s.saving",filename );
int32_t fd = open ( fn.getBufStart() ,

View File

@ -20,6 +20,7 @@ public:
SafeBuf(int32_t initSize, char *label = NULL);
void constructor();
void destructor ();
//be careful with passing in a stackBuf! it could go out
//of scope independently of the safebuf.
@ -53,6 +54,8 @@ public:
//ACCESSORS
char *getBuf() { return m_buf + m_length; }
char *getBufPtr() { return m_buf + m_length; }
char *getBufCursor() { return m_buf + m_length; }
char *getBufStart() { return m_buf; }
char *getBufEnd() { return m_buf + m_capacity; }
int32_t getCapacity() { return m_capacity; }
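
The stack-backed pattern this enables, as used in the Rebalance.cpp and safeSave() hunks in this commit; a small hedged sketch:

// Sketch: small writes never malloc; SafeBuf only moves to the heap if
// the content outgrows tmp. tmp must outlive the SafeBuf (see the
// stack-buffer warning above).
char tmp[1024];
SafeBuf fn(tmp, 1024);
fn.safePrintf("%s.saving", filename);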

View File

@ -362,7 +362,9 @@ bool SiteGetter::gotSiteList ( ) {
// mark it so caller knows
m_errno = g_errno;
// so try again without increasing m_pathDepth
m_tryAgain = true;
// i've seen a host return EBADRDBID for some reason
// and put host #0 in an infinite log spam loop so stop it
if ( g_errno != EBADRDBID ) m_tryAgain = true;
return true;
}
// how many urls at this path depth?

View File

@ -652,9 +652,7 @@ bool Spiderdb::init ( ) {
if ( ! m_pc.init ( "spiderdb",
RDB_SPIDERDB ,
pcmem ,
pageSize ,
false , // use shared mem?
false )) // minimizeDiskSeeks?
pageSize ))
return log(LOG_INIT,"spiderdb: Init failed.");
// initialize our own internal rdb
@ -854,9 +852,7 @@ bool Doledb::init ( ) {
if ( ! m_pc.init ( "doledb" ,
RDB_DOLEDB ,
pcmem ,
pageSize ,
true , // use shared mem?
false )) // minimizeDiskSeeks?
pageSize ))
return log(LOG_INIT,"doledb: Init failed.");
// initialize our own internal rdb
@ -6340,6 +6336,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
subloop:
QUICKPOLL(MAX_NICENESS);
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
// or if trying to exit
@ -6420,6 +6418,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
subloopNextPriority:
QUICKPOLL(MAX_NICENESS);
// wrap it if we should
//if ( m_cri >= g_collectiondb.m_numRecs ) m_cri = 0;
// get rec
@ -6679,6 +6679,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
loop:
QUICKPOLL(MAX_NICENESS);
// shortcut
//CrawlInfo *ci = &cr->m_localCrawlInfo;
ci = &cr->m_localCrawlInfo;
@ -7534,7 +7536,7 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
// this causes us to dead lock when spiders use up all the mem, and
// file merge operation can not get any, and spiders need to add to
// titledb but can not until the merge completes!!
if ( g_mem.m_maxMem - g_mem.m_used < 25*1024*1024 ) {
if ( g_conf.m_maxMem - g_mem.m_used < 25*1024*1024 ) {
static int32_t s_lastTime = 0;
static int32_t s_missed = 0;
s_missed++;
@ -7543,7 +7545,7 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
if ( now - s_lastTime > 10 ) {
log("spider: Need 25MB of free mem to launch spider, "
"only have %"INT64". Failed to launch %"INT32" times so "
"far.", g_mem.m_maxMem - g_mem.m_used , s_missed );
"far.", g_conf.m_maxMem - g_mem.m_used , s_missed );
s_lastTime = now;
}
}
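
The low-memory check above uses a throttled-warning pattern worth isolating; a self-contained hedged sketch:

#include <cstdio>
#include <ctime>

// Sketch: count every miss, but log at most once per 10 seconds so a
// hot retry path cannot flood the log.
static void warnLowMem() {
    static time_t s_lastTime = 0;
    static long   s_missed   = 0;
    s_missed++;
    time_t now = time(NULL);
    if (now - s_lastTime > 10) {
        fprintf(stderr, "spider: low mem, missed %ld launches so far\n", s_missed);
        s_lastTime = now;
    }
}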

View File

@ -249,6 +249,10 @@ bool resetProxyStats ( ) {
// save the stats
bool saveSpiderProxyStats ( ) {
// do not save if coring in a malloc/free because we may call
// malloc/free below to save stuff
if ( g_inMemFunction ) return true;
// save hashtable
s_proxyBannedTable.save(g_hostdb.m_dir,"proxybantable.dat");

View File

@ -1349,10 +1349,10 @@ void Syncdb::syncStart_r ( bool amThread ) {
for ( int32_t m = 0 ; m < f->m_numParts ; m++ ) {
// get part file
File *p = f->m_files[m];
File *p = f->getFile2(m);//m_files[m];
// copy that
sprintf ( cmd , "rcp %s %s:%scoll.%s.%"INT32"/'",
p->m_filename,ips,dir,coll,collnum);
p->getFilename(),ips,dir,coll,collnum);
// execute
log ( LOG_INFO, "sync: %s", cmd );
// MDW: take out for now

View File

@ -182,7 +182,10 @@ class TagRec {
// advance
current += recSize;
// sanity check
if ( recSize > 500000 ) { char *xx=NULL;*xx=0;}
if ( recSize > 500000 || recSize < 12 ) {
log("tagdb: corrupt tag recsize %i",(int)recSize);
return NULL;
char *xx=NULL;*xx=0;}
// breach list?
if ( current < m_listPtrs[i]->m_listEnd) return (Tag *)current;
// advance list

View File

@ -268,7 +268,7 @@ void timePollWrapper ( int fd , void *state ) {
THIS->closeLeastUsed( 60 );
}
bool TcpServer::testBind ( uint16_t port ) {
bool TcpServer::testBind ( uint16_t port , bool printMsg ) {
// assign port for the test
m_port = port;
// sockaddr_in provides interface to sockaddr
@ -321,6 +321,8 @@ retry19:
g_errno = errno;
//if ( g_errno == EINVAL ) { port++; goto again; }
close ( m_sock );
if ( ! printMsg )
return false;
fprintf(stderr,"Failed to bind socket on port %"INT32": %s."
"\n"
"Are you already running gb?\n"
@ -2855,7 +2857,10 @@ int TcpServer::sslHandshake ( TcpSocket *s ) {
SSL_set_connect_state(s->m_ssl);
}
// SSL_connect() calls malloc()
g_inMemFunction = true;
int r = SSL_connect(s->m_ssl);
g_inMemFunction = false;
if ( g_conf.m_logDebugTcp )
log("tcp: ssl handshake on sd=%"INT32" r=%i",

View File

@ -65,7 +65,7 @@ class TcpServer {
//int32_t maxReadBufSize = 128*1024 ,
//int32_t maxSendBufSize = 128*1024 );
bool testBind ( uint16_t port ) ;
bool testBind ( uint16_t port , bool printMsg ) ;
// . returns false if blocked, true otherwise
// . sets errno on error
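
A hedged usage sketch of the new flag (port and message hypothetical): probe quietly and let the caller decide what, if anything, to print.

// Sketch: with printMsg=false, testBind() fails silently and the caller
// logs its own message instead of the built-in stderr complaint.
if (!server.testBind(8000, false))
    log("port 8000 busy; is another gb instance running?");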

View File

@ -1984,13 +1984,13 @@ bool ThreadQueue::launchThread2 ( ThreadEntry *te ) {
// . we know the stored File is still around because of that
bool doWrite = fs->m_doWrite;
BigFile *bb = fs->m_this;
fs->m_fd1 = bb->getfd (fs->m_filenum1, !doWrite, &fs->m_vfd1);
fs->m_fd2 = bb->getfd (fs->m_filenum2, !doWrite, &fs->m_vfd2);
fs->m_fd1 = bb->getfd (fs->m_filenum1,!doWrite);//&fs->m_vfd1);
fs->m_fd2 = bb->getfd (fs->m_filenum2,!doWrite);//&fs->m_vfd2);
// is this bad?
if ( fs->m_fd1 < 0 ) log("disk: fd1 is %i for %s",
fs->m_fd1,bb->m_baseFilename);
fs->m_fd1,bb->getFilename());
if ( fs->m_fd2 < 0 ) log("disk: fd2 is %i for %s.",
fs->m_fd2,bb->m_baseFilename);
fs->m_fd2,bb->getFilename());
fs->m_closeCount1 = getCloseCount_r ( fs->m_fd1 );
fs->m_closeCount2 = getCloseCount_r ( fs->m_fd2 );
}

View File

@ -17,7 +17,8 @@
//#define TITLEREC_CURRENT_VERSION 118
// add new link stats into LinkInfo
//#define TITLEREC_CURRENT_VERSION 119
#define TITLEREC_CURRENT_VERSION 120
//#define TITLEREC_CURRENT_VERSION 120
#define TITLEREC_CURRENT_VERSION 121
#include "Rdb.h"
#include "Url.h"

View File

@ -114,6 +114,7 @@ Users::~Users(){
}
bool Users::save(){
return true;
if ( ! m_needsSave ) return true;
if ( ! m_loginTable.save(g_hostdb.m_dir,"userlogin.dat",NULL,0) )
return log("users: userlogin.dat save failed");

160
Xml.cpp
View File

@ -427,14 +427,152 @@ bool Xml::set ( char *s ,
}
// ok, we got a <script> tag now
m_numNodes++;
// use this for parsing consistency when deleting records
// so they equal what we added.
bool newVersion = true;
if ( version <= 120 ) newVersion = false;
//newVersion = false;
// retry:
// scan for </script>
char *pstart = &m_xml[i];
char *p = pstart;
char *pend = &m_xml[0] + m_xmlLen;
bool inDoubles = false;
bool inSingles = false;
bool inComment1 = false;
bool inComment2 = false;
bool inComment3 = false;
bool inComment4 = false;
bool escaped = false;
//bool newLine = false;
// bool foo = false;
// if ( m_xmlLen == 13257 ) { //pstart - m_xml == 88881 ) {
// foo = true;
// }
// scan -- 5 continues -- node 1570 is text of script
for ( ; p < pend ; p++ ) {
// breathe
QUICKPOLL(m_niceness);
//
// adding these new quote checks may cause a few
// parsing inconsistencies for a handful of pages
//
// windows-based html pages use 13 sometimes and no
// \n at all...
if ( p[0] =='\n' || p[0] == 13 ) { // ^m = 13 = CR
//newLine = true;
inComment1 = false;
}
if ( p[0] == '\\' ) {
escaped = ! escaped;
continue;
}
//if ( newLine && is_wspace_a(p[0]) )
// continue;
if ( p[0] == '<' && p[1] == '!' &&
p[2] == '-' && p[3] == '-' &&
! inSingles && ! inDoubles &&
! inComment1 &&
! inComment2 &&
! inComment4 )
inComment3 = true;
if ( p[0] == '-' && p[1] == '-' &&
p[2] == '>' &&
inComment3 )
inComment3 = false;
// no. i saw <script>//</script> and </script> was
// not considered to be in a comment
if ( p[0] == '/' && p[1]=='/'&&
! inSingles && ! inDoubles &&
! inComment2 &&
! inComment3 &&
// allow for "//<![CDATA[..." to end in
// "//]]>" so ignore if inComment4 is true.
// i'd say these are the weaker of all 4
// comment types in that regard.
! inComment4 )
inComment1 = true;
// handle /* */ comments
if ( p[0] == '/' && p[1]=='*' &&
! inSingles && ! inDoubles &&
! inComment1 &&
! inComment3 &&
! inComment4 )
inComment2 = true;
// <![CDATA[...]]> "comments" in <script> tags
// are common. CDATA tags seem to prevail even if
// within another comment tag, like i am seeing
// "//<![CDATA[..." a lot.
if ( p[0] == '<' &&
p[1] == '!' &&
p[2] == '[' &&
p[3] == 'C' &&
p[4] == 'D' &&
p[5] == 'A' &&
p[6] == 'T' &&
p[7] == 'A' &&
p[8] == '['
//! inComment1 &&
//! inComment2 &&
//! inComment3 )
)
inComment4 = true;
if ( p[0] == ']' &&
p[1] == ']' &&
p[2] == '>' )
inComment4 = false;
if ( p[0] == '*' &&
p[1]=='/' &&
! inComment4 )
inComment2 = false;
// no longer the start of a newLine
//newLine = false;
// don't check for quotes or </script> if in comment
// no, i've seen <script>//</script> on ibm.com pages,
// so just ignore ' and " for // comments
if ( inComment1 && newVersion ) {
escaped = false;
//continue;
}
if ( inComment2 && newVersion ) {
escaped = false;
continue;
}
if ( inComment3 && newVersion ) {
escaped = false;
continue;
}
if ( inComment4 && newVersion ) {
escaped = false;
continue;
}
// if an unescaped double quote
if ( p[0] == '\"' && ! escaped && ! inSingles &&
// i've seen <script>//</script> on ibm.com pages,
// so just ignore ' and " for // comments
! inComment1 )
inDoubles = ! inDoubles;
// if an unescaped single quote.
if ( p[0] == '\'' && ! escaped && ! inDoubles &&
// i've seen <script>//</script> on ibm.com pages,
// so just ignore ' and " for // comments
! inComment1 )
inSingles = ! inSingles;
// no longer escaped
escaped = false;
// if ( foo ) {
// fprintf(stderr,"%c [%lu](inDoubles=%i,"
// "inSingles=%i)\n",*p,
// (unsigned long)(uint8_t)*p,
// (int)inDoubles,
// (int)inSingles);
// }
// if ( inSingles )
// continue;
// if ( inDoubles )
// continue;
// keep going if not a tag
if ( p[0] != '<' ) continue;
// </script> or </gbframe> stops it
@ -444,8 +582,11 @@ bool Xml::set ( char *s ,
to_lower_a(p[4]) == 'r' &&
to_lower_a(p[5]) == 'i' &&
to_lower_a(p[6]) == 'p' &&
to_lower_a(p[7]) == 't' )
to_lower_a(p[7]) == 't' ) {
if((inDoubles||inSingles)&& newVersion)
continue;
break;
}
if ( to_lower_a(p[2]) == 'g' &&
to_lower_a(p[3]) == 'b' &&
to_lower_a(p[4]) == 'f' &&
@ -460,14 +601,29 @@ bool Xml::set ( char *s ,
to_lower_a(p[3]) == 'r' &&
to_lower_a(p[4]) == 'i' &&
to_lower_a(p[5]) == 'p' &&
to_lower_a(p[6]) == 't' )
to_lower_a(p[6]) == 't' ) {
if ( (inDoubles || inSingles) && newVersion )
continue;
break;
}
}
// if ( foo )
// log("done");
// make sure we do not breach! i saw this happen once!
if ( m_numNodes >= m_maxNumNodes ) break;
// was it like <script></script>? then emit no scripttext tag
if ( p - pstart == 0 )
continue;
// none found? allow for </script> in quotes then, maybe
// they were unbalanced quotes. also allow for </script>
// in a comment. do we need to do this? just enable it if
// we find a page that needs it.
// if ( p == pend && newVersion ) {
// newVersion = false;
// goto retry;
// }
XmlNode *xn = &m_nodes[m_numNodes++];
xn->m_nodeId = TAG_SCRIPTTEXT;//0; // TEXT NODE
xn->m_node = pstart;
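
A heavily simplified, hedged sketch of the scan above as a hypothetical standalone function: only treat "</script" as the end of the tag when it sits outside quotes, // comments, and /* */ comments. The real loop also tracks <!-- --> and <![CDATA[ ]]> state plus the version-dependent quirks noted in its comments.

#include <strings.h>

const char *findScriptEnd(const char *p, const char *pend) {
    bool inS = false, inD = false, inLine = false, inBlock = false, esc = false;
    for (; p < pend; p++) {
        char c = *p;
        if (esc)       { esc = false; continue; }
        if (c == '\\') { esc = true;  continue; }
        if (inLine)    { if (c == '\n' || c == '\r') inLine = false; continue; }
        if (inBlock) {
            if (c == '*' && p + 1 < pend && p[1] == '/') { inBlock = false; p++; }
            continue;
        }
        if (inS) { if (c == '\'') inS = false; continue; }
        if (inD) { if (c == '"')  inD = false; continue; }
        if (c == '/' && p + 1 < pend && p[1] == '/') { inLine  = true; continue; }
        if (c == '/' && p + 1 < pend && p[1] == '*') { inBlock = true; continue; }
        if (c == '\'') { inS = true; continue; }
        if (c == '"')  { inD = true; continue; }
        if (c == '<' && p + 8 <= pend && p[1] == '/' &&
             strncasecmp(p + 2, "script", 6) == 0)
            return p;       // a real close tag, outside quotes and comments
    }
    return pend;            // unterminated: script text runs to document end
}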

View File

@ -2505,6 +2505,10 @@ bool XmlDoc::indexDoc ( ) {
if ( g_errno == ESHUTTINGDOWN )
return true;
// i saw this on shard 9, how is it happening
if ( g_errno == EBADRDBID )
return true;
// if docid not found when trying to do a query reindex...
// this really shouldn't happen but i think we were adding
// additional SpiderRequests since we were using a fake first ip.
@ -6699,6 +6703,9 @@ Xml *XmlDoc::getXml ( ) {
// return it if it is set
if ( m_xmlValid ) return &m_xml;
// note it
setStatus ( "parsing html");
// get the filtered content
char **u8 = getUtf8Content();
if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8;
@ -6707,8 +6714,6 @@ Xml *XmlDoc::getXml ( ) {
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Xml *)ct;
// note it
setStatus ( "getting xml");
// set it
if ( ! m_xml.set ( *u8 ,
u8len ,
@ -7501,6 +7506,8 @@ Sections *XmlDoc::getImpliedSections ( ) {
// add in Section::m_sentFlags bits having to do with our voting tables
Sections *XmlDoc::getSections ( ) {
setStatus("getting sections");
// get the sections without implied sections
Sections *ss = getImpliedSections();
if ( ! ss || ss==(void *)-1) return (Sections *)ss;
@ -10014,6 +10021,8 @@ char *XmlDoc::getIsDup ( ) {
return &m_isDup;
}
setStatus ( "checking for dups" );
// BUT if we are already indexed and a crawlbot/bulk diffbot job
// then do not kick us out just because another indexed doc is
// a dup of us because it messes up the TestOnlyProcessIfNew smoketests
@ -10047,8 +10056,6 @@ char *XmlDoc::getIsDup ( ) {
// sanity. must be posdb list.
if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;}
setStatus ( "checking for dups" );
// . see if there are any pages that seem like they are dups of us
// . they must also have a HIGHER score than us, for us to be
// considered the dup
@ -13794,6 +13801,30 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
// sanity check
if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;}
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// hack for speed. computeSiteNumInlinks is true by default,
// but if the user turns it off then just use sitelinks.txt
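// e.g. for a url on example.com (no explicit subdomain): look up
// the host hash in the tagdb table built from sitelinks.txt, retry
// with the "www." variant, and fall back to 0 inlinks if absent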
if ( ! cr->m_computeSiteNumInlinks ) {
int32_t hostHash32 = getHostHash32a();
int32_t min = g_tagdb.getMinSiteInlinks ( hostHash32 );
// try with www if not there
if ( min < 0 && ! m_firstUrl.hasSubdomain() ) {
int32_t wwwHash32 = m_firstUrl.getHash32WithWWW();
min = g_tagdb.getMinSiteInlinks ( wwwHash32 );
}
// if still not in sitelinks.txt, just use 0
if ( min < 0 ) {
m_siteNumInlinksValid = true;
m_siteNumInlinks = 0;
return &m_siteNumInlinks;
}
m_siteNumInlinks = min;
m_siteNumInlinksValid = true;
return &m_siteNumInlinks;
}
setStatus ( "getting site num inlinks");
// get it from the tag rec if we can
@ -13831,9 +13862,6 @@ int32_t *XmlDoc::getSiteNumInlinks ( ) {
if ( ! wfts ) return NULL;
if ( wfts == -1 ) return (int32_t *)-1;
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
setStatus ( "getting site num inlinks");
// check the tag first
Tag *tag = gr->getTag ("sitenuminlinks");
@ -15423,7 +15451,7 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
// set the mime
HttpMime mime;
if ( s->m_readOffset>0 &&
if ( ! hadError && s && s->m_readOffset>0 &&
// set location url to "null"
! mime.set ( s->m_readBuf , s->m_readOffset , NULL ) ) {
// g_errno should be set
@ -19316,6 +19344,9 @@ File *XmlDoc::getUtf8ContentInFile ( int64_t *fileSizeArg ) {
//int32_t loaded = tmp.load ( "/home/mwells/.config/internetarchive.yml");
int32_t loaded = tmp.load ( "auth/internetarchive.yml");
if(loaded <= 0) {
log("gb: failed to load auth/internetarchive.yml");
g_errno = EDOCTOOBIG;
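// (EDOCTOOBIG looks like a stand-in error code here; see the
// FIXME below)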
return NULL;
// FIXME
char *xx=NULL;*xx=0;
}
@ -19394,6 +19425,8 @@ char **XmlDoc::getUtf8Content ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
setStatus("getting utf8 content");
// recycle?
if ( cr->m_recycleContent || m_recycleContent ||
// if trying to delete from index, load from old titlerec

View File

@ -37,12 +37,12 @@ int startup ( void *state ) {
int bytes = read(-9,buf,5);
//errno = 7; // E2BIG;
//assert ( errno && bytes == -1 );
g_errno = errno;
//g_errno = errno;
}
int main() {
errno = 10; // ECHILD (EINVAL is 22);
//errno = 10; // ECHILD;
g_errno = 10;
char stack[10000];
pid_t pid = clone( startup ,
@ -53,8 +53,8 @@ int main() {
int status;
waitpid ( pid , &status, 0 );
if ( s_called ) fprintf(stderr,"__errno_location() was called %i "
"times\n",s_called);
fprintf(stderr,"__errno_location() was called %i "
"times\n",s_called);
if ( errno != 10 ) fprintf(stderr,"errno=%i (failed)\n",errno);
else fprintf(stderr,"errno=%i (success)\n",errno);

View File

@ -77,7 +77,7 @@
#include "Msg9b.h"
#include "Msg17.h"
//#include "Msg34.h"
#include "Msg35.h"
//#include "Msg35.h"
//#include "Msg24.h"
//#include "Msg28.h"
//#include "Msg30.h"
@ -373,6 +373,7 @@ extern void resetQuery ( );
extern void resetStopWords ( );
extern void resetUnicode ( );
extern void tryToSyncWrapper ( int fd , void *state ) ;
#if 0
void stack_test();
@ -1357,6 +1358,18 @@ int main2 ( int argc , char *argv[] ) {
}
*/
if ( strcmp ( cmd ,"isportinuse") == 0 ) {
if ( cmdarg+1 >= argc ) goto printHelp;
int port = atol ( argv[cmdarg+1] );
// make sure port is available. returns false if in use.
if ( ! g_httpServer.m_tcp.testBind(port,false) )
// and we should return with 1 so the keep alive
// script will exit
exit (1);
// port is not in use, return 0
exit(0);
}
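// assumed usage: "./gb isportinuse 8000" exits 0 if port 8000 is
// free and 1 if something is already bound to it; the keep-alive
// loop in the install script below keys off that exit status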
// need threads here for tests?
// gb thrutest <testDir> <fileSize>
@ -1805,7 +1818,7 @@ int main2 ( int argc , char *argv[] ) {
// Load categories and generate country table
char structureFile[256];
g_conf.m_maxMem = 1000000000LL; // 1G
g_mem.m_maxMem = 1000000000LL; // 1G
//g_mem.m_maxMem = 1000000000LL; // 1G
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
g_categories = &g_categories1;
if (g_categories->loadCategories(structureFile) != 0) {
@ -2396,7 +2409,7 @@ int main2 ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "freecache" ) == 0 ) {
int32_t max = 7000000;
if ( cmdarg + 1 < argc ) max = atoi ( argv[cmdarg+1] );
freeAllSharedMem( max );
//freeAllSharedMem( max );
return true;
}
@ -3047,7 +3060,8 @@ int main2 ( int argc , char *argv[] ) {
// make sure port is available, no use loading everything up then
// failing because another process is already running using this port
//if ( ! g_udpServer.testBind ( g_hostdb.getMyPort() ) )
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort))
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort,
true)) // printmsg?
return 1;
int32_t *ips;
@ -3453,9 +3467,15 @@ int main2 ( int argc , char *argv[] ) {
//}
// test all collection dirs for write permission -- metalincs' request
int32_t pcount = 0;
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
if ( ++pcount >= 100 ) {
log("rdb: not checking directory permissions for "
"more than the first 100 collections, to save time.");
break;
}
char tt[1024 + MAX_COLL_LEN ];
sprintf ( tt , "%scoll.%s.%"INT32"",
g_hostdb.m_dir, cr->m_coll , (int32_t)cr->m_collnum );
@ -3838,7 +3858,8 @@ int main2 ( int argc , char *argv[] ) {
// . put this in here instead of Rdb.cpp because we don't want
// generator commands merging on us
// . the (void *)1 prevents gb from logging merge info every 2 seconds
if ( ! g_loop.registerSleepCallback(2000,(void *)1,attemptMergeAll))
// . niceness is 1
if ( ! g_loop.registerSleepCallback(2000,(void *)1,attemptMergeAll,1))
log("db: Failed to init merge sleep callback.");
// SEO MODULE
@ -3848,7 +3869,9 @@ int main2 ( int argc , char *argv[] ) {
! g_loop.registerSleepCallback(2000,(void *)1,runSEOQueryLoop))
log("db: Failed to register seo query loop");
// try to sync parms (and collection recs) with host 0
if ( ! g_loop.registerSleepCallback(1000,NULL,tryToSyncWrapper,0))
return false;
//if( !g_loop.registerSleepCallback(2000,(void *)1,controlDumpTopDocs) )
// log("db: Failed to init dump TopDocs sleep callback.");
@ -3866,11 +3889,11 @@ int main2 ( int argc , char *argv[] ) {
//msg3e.checkForNewParms();
// this stuff is similar to alden's msg3e but will sync collections
// that were added/deleted
if ( ! g_parms.syncParmsWithHost0() ) {
log("parms: error syncing parms: %s",mstrerror(g_errno));
return 0;
}
// that were added/deleted
//if ( ! g_parms.syncParmsWithHost0() ) {
// log("parms: error syncing parms: %s",mstrerror(g_errno));
// return 0;
//}
if(g_recoveryMode) {
@ -3896,6 +3919,7 @@ int main2 ( int argc , char *argv[] ) {
Json json;
json.test();
json.reset();
// . start the spiderloop
// . comment out when testing SpiderCache
@ -5191,6 +5215,23 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
"while [ \\$EXITSTATUS != 0 ]; do "
"{ "
// if gb is still running, do not try to
// run it again; we probably
// double-called './gb start'. so see if
// the port is already bound.
"./gb isportinuse %i ; "
"if [ \\$? -eq 1 ] ; then "
"echo \"gb or something else "
"is already running on "
"port %i. Not starting.\" ; "
"exit 0; "
"fi ; "
// ok, the port is available
//"echo \"Starting gb\"; "
//"exit 0; "
// in case gb was updated...
"mv -f gb.installed gb ; "
@ -5211,11 +5252,16 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
"ADDARGS='-r'\\$INC ; "
"INC=\\$((INC+1));"
"} "
"done >& /dev/null & \" %s",
//"done >& /dev/null & \" %s",
"done & \" %s",
//"\" %s",
iptoa(h2->m_ip),
h2->m_dir ,
// for ./gb isportinuse %i
h2->m_httpPort ,
h2->m_httpPort ,
// for moving log file
h2->m_hostId ,
h2->m_hostId ,
@ -5840,7 +5886,7 @@ bool registerMsgHandlers2(){
bool registerMsgHandlers3(){
Msg17 msg17; if ( ! msg17.registerHandler () ) return false;
//Msg34 msg34; if ( ! msg34.registerHandler () ) return false;
Msg35 msg35; if ( ! msg35.registerHandler () ) return false;
//Msg35 msg35; if ( ! msg35.registerHandler () ) return false;
//Msg24 msg24; if ( ! msg24.registerHandler () ) return false;
//Msg40 msg40; if ( ! msg40.registerHandler () ) return false;
//MsgB msgb; if ( ! msgb.registerHandler () ) return false;
@ -10726,7 +10772,7 @@ bool gbgunzip (char *filename) {
// time speed of inserts into RdbTree for indexdb
bool bucketstest ( char* dbname ) {
g_conf.m_maxMem = 2000000000LL; // 2G
g_mem.m_maxMem = 2000000000LL; // 2G
//g_mem.m_maxMem = 2000000000LL; // 2G
if ( dbname ) {
@ -12223,7 +12269,7 @@ void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
bool parseTest ( char *coll , int64_t docId , char *query ) {
g_conf.m_maxMem = 2000000000LL; // 2G
g_mem.m_maxMem = 2000000000LL; // 2G
//g_mem.m_maxMem = 2000000000LL; // 2G
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
@ -14546,7 +14592,8 @@ int injectFile ( char *filename , char *ips ,
int64_t startDocId = 0LL;
int64_t endDocId = MAX_DOCID;
g_mem.init ( 4000000000LL );
g_conf.m_maxMem = 4000000000LL;
g_mem.init ( );//4000000000LL );
// set up the loop
if ( ! g_loop.init() ) return log("build: inject: Loop init "
@ -16324,8 +16371,8 @@ bool memTest() {
// if ( ! g_log.init( "./memlog" ) ) {//g_hostdb.m_logFilename ) ) {
// fprintf (stderr,"db: Log file init failed.\n" ); return 1; }
//g_mem.init(0xffffffff);
g_mem.m_maxMem = 0xffffffffLL;
g_mem.init( g_mem.m_maxMem );
g_conf.m_maxMem = 0xffffffffLL;
g_mem.init( );//g_mem.m_maxMem );
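// note: g_mem.init() now takes no size arg; g_mem presumably gets
// its limit from g_conf.m_maxMem set above (the old per-call
// limits are commented out throughout this file)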
fprintf(stderr, "memtest: Testing memory bus bandwidth.\n");
@ -16343,7 +16390,7 @@ bool memTest() {
membustest ( 8000 , 100000 , true );
fprintf(stderr, "memtest: Allocating up to %"INT64" bytes\n",
g_mem.m_maxMem);
g_conf.m_maxMem);
for (i=0;i<4096;i++) {
ptrs[numPtrs] = mmalloc(1024*1024, "memtest");
if (!ptrs[numPtrs]) break;
@ -16353,7 +16400,7 @@ bool memTest() {
fprintf(stderr, "memtest: Was able to allocate %"INT64" bytes of a "
"total of "
"%"INT64" bytes of memory attempted.\n",
g_mem.m_used,g_mem.m_maxMem);
g_mem.m_used,g_conf.m_maxMem);
return true;
@ -16483,7 +16530,7 @@ void membustest ( int32_t nb , int32_t loops , bool readf ) {
bool cacheTest() {
g_conf.m_maxMem = 2000000000LL; // 2G
g_mem.m_maxMem = 2000000000LL; // 2G
//g_mem.m_maxMem = 2000000000LL; // 2G
hashinit();

qa.cpp
View File

@ -476,6 +476,9 @@ void processReply ( char *reply , int32_t replyLen ) {
g_numErrors++;
SafeBuf he;
he.htmlEncode ( s_url.getUrl() );
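// html-encode the url so it prints safely as link text in the
// report below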
g_qaOutput.safePrintf("<b style=color:red;>FAILED TEST</b><br>%s : "
"<a href=%s>%s</a> (urlhash=%"UINT32")<br>"
@ -496,7 +499,7 @@ void processReply ( char *reply , int32_t replyLen ) {
"<pre id=%"UINT32" style=background-color:0xffffff;>",
s_qt->m_testName,
s_url.getUrl(),
s_url.getUrl(),
he.getBufStart(),
urlHash32,
// input checkbox name field
@ -815,7 +818,7 @@ bool qainject1 ( ) {
}
// stop for now
// stop for now so we can analyze the index
//return true; //
//