Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

Conflicts:
	Process.cpp
mwells 2013-12-06 12:31:36 -08:00
commit adf9d807ea
90 changed files with 4547 additions and 1199 deletions

@ -799,7 +799,7 @@ bool sendPageAutoban ( TcpSocket *s , HttpRequest *r ) {
}
bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
SafeBuf sb(512 * 512);
SafeBuf sb(512 * 512,"autobbuf");
//read in all of the possible cgi parms off the bat:
//long user = g_pages.getUserType( s , r );
char *username = g_users.getUsername(r);

@ -11,6 +11,10 @@
#include "Statsdb.h"
#include "DiskPageCache.h"
#ifdef ASYNCIO
#include <aio.h>
#endif
// main.cpp will wait for this to be zero before exiting so all unlink/renames
// can complete
long g_unlinkRenameThreads = 0;
@ -530,6 +534,11 @@ bool BigFile::readwrite ( void *buf ,
// . if we're blocking then do it now
// . this should return false and set g_errno on error, true otherwise
if ( ! isNonBlocking ) goto skipThread;
#ifdef ASYNCIO
goto skipThread;
#endif
// . otherwise, spawn a thread to do this i/o
// . this returns false and sets g_errno on error, true on success
// . we should return false cuz we blocked
@ -597,7 +606,90 @@ bool BigFile::readwrite ( void *buf ,
log("disk: read buf alloc failed for %li "
"bytes.",need);
}
//
// pthread_create() is abhorrently slow. use asyncio if possible.
//
#ifdef ASYNCIO
// we only have two in the array... most likely though we only
// need one here...
aiocb *a0 = &fstate->m_aiocb[0];
aiocb *a1 = &fstate->m_aiocb[1];
// init them for the read
a0->aio_fildes = fstate->m_fd1;
a1->aio_fildes = fstate->m_fd2;
// the offset of each file
long long off1 = fstate->m_offset;
// always read at start of 2nd file
long long off2 = 0;
// how many bytes to read from each file?
long long readSize1 = size;
long long readSize2 = 0;
if ( off1 + readSize1 > MAX_PART_SIZE ) {
readSize1 = ((long long)MAX_PART_SIZE) - off1;
readSize2 = size - readSize1;
}
a0->aio_offset = off1;
a1->aio_offset = off2;
a0->aio_nbytes = readSize1;
a1->aio_nbytes = readSize2;
a0->aio_buf = fstate->m_buf;
a1->aio_buf = fstate->m_buf + readSize1;
a0->aio_reqprio = 0;
a1->aio_reqprio = 0;
a0->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
a1->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
// translate offset to a filenum and offset
long filenum = offset / MAX_PART_SIZE;
long localOffset = offset % MAX_PART_SIZE;
// read or write?
if ( doWrite ) a0->aio_lio_opcode = LIO_WRITE;
else a0->aio_lio_opcode = LIO_READ;
// different fds implies two different files we gotta read from.
long numFilesToReadFrom = 1;
if ( fstate->m_fd1 != fstate->m_fd2 ) numFilesToReadFrom = 2;
// set it up
//aioList->m_signal = ESIG;
retry77:
//
// kernel aio's io_submit() can block on kernels below 3.12 when
// reading ext4 files, so use POSIX lio_listio() here instead.
//
// this will send the signal when read/write is completed
aiocb *list[2] = { a0 , a1 };
long status = lio_listio ( LIO_NOWAIT ,
			   list ,
			   numFilesToReadFrom ,
			   &fstate->m_sigEvent );
// if status is 0, there was no error
if ( status == 0 ) {
g_errno = 0;
// assume we will get the signal later
return false;
}
// got interrupted by a signal? try again.
if ( errno == EINTR )
goto retry77;
// tell caller about the error
g_errno = errno;
log("aio: %s", mstrerror(g_errno));
// we did not block or anything
return true;
#endif
// . this returns false and sets errno on error
// . set g_errno to the errno
if ( ! readwrite_r ( fstate , NULL ) ) g_errno = errno;
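
For reference, a minimal self-contained sketch of the POSIX AIO pattern the #ifdef ASYNCIO block above is working toward: queue a read with lio_listio(LIO_NOWAIT) and get a signal when it completes. The file name, buffer size and signal number are illustrative assumptions, not from this commit; link with -lrt.

#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void readDone ( int sig , siginfo_t *si , void *ctx ) {
	// the aiocb we queued below rides along in si_value
	struct aiocb *a = (struct aiocb *)si->si_value.sival_ptr;
	// printf in a handler is not strictly signal-safe; demo only
	printf ( "read done: err=%d bytes=%ld\n" ,
		 aio_error(a) , (long)aio_return(a) );
}

int main ( ) {
	static char buf[4096];
	int fd = open ( "test.dat" , O_RDONLY ); // illustrative file
	if ( fd < 0 ) return 1;
	struct sigaction sa;
	memset ( &sa , 0 , sizeof(sa) );
	sa.sa_sigaction = readDone;
	sa.sa_flags     = SA_SIGINFO;
	sigaction ( SIGUSR1 , &sa , NULL );
	struct aiocb a;
	memset ( &a , 0 , sizeof(a) );
	a.aio_fildes     = fd;
	a.aio_offset     = 0;
	a.aio_buf        = buf;
	a.aio_nbytes     = sizeof(buf);
	a.aio_lio_opcode = LIO_READ; // required by lio_listio()
	// aio_sigevent is a struct sigevent, so set sigev_notify;
	// do not assign SIGEV_SIGNAL to the field itself
	a.aio_sigevent.sigev_notify          = SIGEV_SIGNAL;
	a.aio_sigevent.sigev_signo           = SIGUSR1;
	a.aio_sigevent.sigev_value.sival_ptr = &a;
	struct aiocb *list[1] = { &a };
	// 0 means queued ok; EINTR would mean retry, like retry77 above
	if ( lio_listio ( LIO_NOWAIT , list , 1 , NULL ) != 0 ) return 1;
	// poll until complete (sidesteps a pause() race in a demo)
	while ( aio_error ( &a ) == EINPROGRESS ) usleep ( 1000 );
	close ( fd );
	return 0;
}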

@ -95,6 +95,11 @@ public:
// m_allocOff is offset into m_allocBuf where we start reading into
// from the file
long m_allocOff;
// do not call pthread_create() for every read we do. use async io
// because it should be much much faster
#ifdef ASYNCIO
struct aiocb m_aiocb[2];
#endif
};

@ -72,6 +72,10 @@ CollectionRec::CollectionRec() {
m_lastResetCount = 0;
// regex_t types
m_hasucr = false;
m_hasupr = false;
// for diffbot caching the global spider stats
reset();
@ -91,6 +95,11 @@ void CollectionRec::setToDefaults ( ) {
}
void CollectionRec::reset() {
// regex_t types
if ( m_hasucr ) regfree ( &m_ucr );
if ( m_hasupr ) regfree ( &m_upr );
// make sure we do not leave spiders "hanging" waiting for their
// callback to be called... and it never gets called
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
@ -140,8 +149,34 @@ bool CollectionRec::load ( char *coll , long i ) {
// . accepts OBJ_COLLECTIONREC or OBJ_CONF
g_parms.setFromFile ( this , tmp2 , tmp1 );
// add default reg ex
setUrlFiltersToDefaults();
// add default reg ex IFF there are no url filters there now
if ( m_numRegExs == 0 ) setUrlFiltersToDefaults();
// compile regexs here
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx && regcomp ( &m_ucr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}
// flag as compiled only on success so reset() never calls
// regfree() on an uncompiled regex
if ( rx ) m_hasucr = true;
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx && regcomp ( &m_upr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}
if ( rx ) m_hasupr = true;
//
// LOAD the crawlinfo class in the collectionrec for diffbot
@ -392,7 +427,7 @@ bool CollectionRec::save ( ) {
g_hostdb.m_dir , m_coll , (long)m_collnum );
if ( ! g_parms.saveToXml ( (char *)this , tmp ) ) return false;
// log msg
log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
//log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
//
// save the crawlinfo class in the collectionrec for diffbot
@ -400,7 +435,7 @@ bool CollectionRec::save ( ) {
// SAVE LOCAL
sprintf ( tmp , "%scoll.%s.%li/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: saving %s",tmp);
//log("coll: saving %s",tmp);
SafeBuf sb;
//m_localCrawlInfo.print ( &sb );
// binary now
@ -413,7 +448,7 @@ bool CollectionRec::save ( ) {
// SAVE GLOBAL
sprintf ( tmp , "%scoll.%s.%li/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: saving %s",tmp);
//log("coll: saving %s",tmp);
sb.reset();
//m_globalCrawlInfo.print ( &sb );
// binary now
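
As a reference for the m_hasucr/m_hasupr bookkeeping added to load() and reset() above, here is the standard POSIX regex lifecycle in one standalone sketch; the pattern and test url are made up for illustration:

#include <regex.h>
#include <stdio.h>

int main ( ) {
	regex_t ucr;
	bool hasucr = false;
	// hypothetical pattern; flags match the ones used above
	const char *rx = "^https?://[^/]*\\.example\\.com/";
	if ( regcomp ( &ucr , rx ,
		       REG_EXTENDED|REG_ICASE|REG_NEWLINE|REG_NOSUB ) == 0 )
		hasucr = true;
	// compiled once, regexec() may run any number of times
	if ( hasucr &&
	     regexec ( &ucr , "http://www.example.com/page" ,
		       0 , NULL , 0 ) == 0 )
		printf ( "url matches crawl regex\n" );
	// free exactly once, and only if regcomp() succeeded; this is
	// what the m_hasucr/m_hasupr guards in reset() ensure
	if ( hasucr ) regfree ( &ucr );
	return 0;
}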

@ -56,7 +56,7 @@
//#define MAX_SITE_EXPRESSION_LEN 128
//#define MAX_SITE_EXPRESSIONS 256
//#include "regex.h"
#include "regex.h"
#include "Url.h" // MAX_COLL_LEN
//#include "Sync.h"
@ -108,6 +108,9 @@ class CrawlInfo {
// currently in the ready queue (doledb) to spider?
char m_sentCrawlDoneAlert;
//long m_numUrlsLaunched;
long m_dummy1;
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
//bool print (class SafeBuf *sb ) ;
//bool setFromSafeBuf (class SafeBuf *sb ) ;
@ -432,8 +435,27 @@ class CollectionRec {
//SafeBuf m_diffbotApiList;//QueryString;
//SafeBuf m_diffbotUrlCrawlPattern;
//SafeBuf m_diffbotUrlProcessPattern;
// use for all now...
SafeBuf m_diffbotApiUrl;
// only process pages whose content matches this pattern
SafeBuf m_diffbotPageProcessPattern;
// only process urls that match this pattern
SafeBuf m_diffbotUrlProcessPattern;
// only CRAWL urls that match this pattern
SafeBuf m_diffbotUrlCrawlPattern;
// regex support
SafeBuf m_diffbotUrlCrawlRegEx;
SafeBuf m_diffbotUrlProcessRegEx;
regex_t m_ucr;
regex_t m_upr;
long m_hasucr:1;
long m_hasupr:1;
char m_diffbotOnlyProcessIfNew;
//SafeBuf m_diffbotClassify;
//char m_diffbotClassify;
//char m_useDiffbot;
@ -515,6 +537,9 @@ class CollectionRec {
long m_numRegExs11;
SafeBuf m_spiderDiffbotApiUrl [ MAX_FILTERS ];
long m_numRegExs8;
char m_harvestLinks [ MAX_FILTERS ];
// dummy?
long m_numRegExs9;
@ -671,12 +696,6 @@ class CollectionRec {
class SpiderColl *m_spiderColl;
// each Rdb has a tree, so keep the pos/neg key count here so
// that RdbTree does not have to have its own array limited by
// MAX_COLLS which we did away with because we made this dynamic.
long m_numPosKeysInTree[RDB_END];
long m_numNegKeysInTree[RDB_END];
long m_overflow;
long m_overflow2;
@ -1018,6 +1037,12 @@ class CollectionRec {
// used by Parms.cpp
char m_hackFlag;
// each Rdb has a tree, so keep the pos/neg key count here so
// that RdbTree does not have to have its own array limited by
// MAX_COLLS which we did away with because we made this dynamic.
long m_numPosKeysInTree[RDB_END];
long m_numNegKeysInTree[RDB_END];
//long m_numEventsOnHost;
// do we have the doc:quality var in any url filter?

@ -247,7 +247,12 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
// MDW: ensure not created on disk since time of last load
char dname[512];
sprintf(dname, "%scoll.%s.%li/",g_hostdb.m_dir,coll,i);
if ( isNew && opendir ( dname ) ) {
DIR *dir = NULL;
if ( isNew )
dir = opendir ( dname );
if ( dir )
closedir ( dir );
if ( isNew && dir ) {
g_errno = EEXIST;
return log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",coll,dname);
@ -524,11 +529,12 @@ bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
void savingCheckWrapper1 ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
// no state?
if ( ! we ) return;
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.resetColl ( we->m_coll , we ) ) return;
if ( ! we ) { log("colldb: we1 is null"); return; }
// unregister too
g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.resetColl ( we->m_coll , we , we->m_purgeSeeds))
return;
// all done
we->m_callback ( we->m_state );
}
@ -536,11 +542,11 @@ void savingCheckWrapper1 ( int fd , void *state ) {
void savingCheckWrapper2 ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
// no state?
if ( ! we ) return;
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
if ( ! we ) { log("colldb: we2 is null"); return; }
// unregister too
g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
// all done
we->m_callback ( we->m_state );
}
@ -599,7 +605,7 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
g_errno = ENOTFOUND;
return true;
}
if ( g_process.isAnyTreeSaving() ) {
// note it
log("admin: tree is saving. waiting2.");
@ -700,7 +706,11 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
// . reset a collection
// . returns false if blocked and will call callback
bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
bool Collectiondb::resetColl ( char *coll , WaitEntry *we , bool purgeSeeds) {
// save parms in case we block
we->m_purgeSeeds = purgeSeeds;
// ensure it's not NULL
if ( ! coll ) {
log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
@ -849,11 +859,13 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
//cr->m_spiderStatusMsg = NULL;
// reset seed buf
cr->m_diffbotSeeds.purge();
// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();
if ( purgeSeeds ) {
// free the buffer of seed urls
cr->m_diffbotSeeds.purge();
// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();
}
// so XmlDoc.cpp can detect if the collection was reset since it
// launched its spider:
@ -866,6 +878,14 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
// right now we #define collnum_t short
if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }
// make a new collnum so records in transit will not be added
// to any rdb...
cr->m_collnum = newCollnum;
// Rdb::resetColl() needs to know the new cr so it can move
// the RdbBase into cr->m_bases[rdbId] array. recycling.
m_recs[newCollnum] = cr;
// . unlink all the *.dat and *.map files for this coll in its subdir
// . remove all recs from this collnum from m_tree/m_buckets
// . updates RdbBase::m_collnum
@ -879,16 +899,10 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
g_clusterdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->resetColl ( oldCollnum , newCollnum );
// make a new collnum so records in transit will not be added
// to any rdb...
cr->m_collnum = newCollnum;
// reset crawl status too!
cr->m_spiderStatus = SP_INITIALIZING;
m_recs[oldCollnum] = NULL;
m_recs[newCollnum] = cr;
// readd it to the hashtable that maps name to collnum too
long long h64 = hash64n(cr->m_coll);
@ -902,7 +916,10 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
g_hostdb.m_dir,
cr->m_coll,
(long)newCollnum);
if ( opendir ( dname ) ) {
DIR *dir = opendir ( dname );
if ( dir )
closedir ( dir );
if ( dir ) {
//g_errno = EEXIST;
log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",coll,dname);

@ -18,6 +18,7 @@ public:
void (* m_callback) (void *state);
void *m_state;
char *m_coll;
bool m_purgeSeeds;
};
class Collectiondb {
@ -94,7 +95,7 @@ class Collectiondb {
bool deleteRecs ( class HttpRequest *r ) ;
// returns false if blocked, true otherwise.
bool resetColl ( char *coll , WaitEntry *we );
bool resetColl ( char *coll , WaitEntry *we , bool purgeSeeds );
// . keep up to 128 of them, these reference into m_list
// . COllectionRec now includes m_needsSave and m_lastUpdateTime

@ -189,6 +189,7 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
//g_conf.m_testSearchEnabled = false;
/*
//
// are we running in Matt Wells's data center?
// if so, we want to be able to use the seo tools that are not part
@ -207,11 +208,16 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
if ( hh && strcmp(hh,"galileo") == 0) priv = true;
if ( hh && strcmp(hh,"sputnik") == 0) priv = true;
if ( hh && strcmp(hh,"titan") == 0) priv = true;
if ( hh[0]=='g' && hh[1]=='k' && is_digit(hh[2]) ) priv = true;
if ( hh && hh[0]=='g' && hh[1]=='k' && is_digit(hh[2]) ) priv = true;
//if(hh[0]=='s' && hh[1]=='p' && is_digit(hh[2])) ) priv = true;
if ( priv ) g_conf.m_isMattWells = true;
else g_conf.m_isMattWells = false;
*/
g_conf.m_isMattWells = false;
#ifdef MATTWELLS
g_conf.m_isMattWells = true;
#endif
// this is not possible
/*

@ -5,6 +5,7 @@
Dir::Dir ( ) {
m_dirname = NULL;
m_dir = NULL;
m_needsClose = false;
}
@ -40,7 +41,8 @@ bool Dir::set ( char *dirname ) {
}
bool Dir::close ( ) {
if ( m_dir ) closedir ( m_dir );
if ( m_dir && m_needsClose ) closedir ( m_dir );
m_needsClose = false;
return true;
}
@ -56,6 +58,7 @@ bool Dir::open ( ) {
if ( ! m_dir )
return log("disk: opendir(%s) : %s",
m_dirname,strerror( g_errno ) );
m_needsClose = true;
return true;
}
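
The m_needsClose flag ensures close() only calls closedir() on a handle this Dir actually opened, and only once. In isolation the same guarantee can be expressed as RAII; a hedged sketch (the wrapper class is ours, not part of the codebase):

#include <dirent.h>
#include <stdio.h>

class DirHandle {
public:
	DirHandle ( const char *name ) { m_dir = opendir ( name ); }
	~DirHandle ( ) { close(); }
	void close ( ) {
		if ( m_dir ) closedir ( m_dir );
		m_dir = NULL; // so a second close() is a no-op
	}
	bool ok ( ) { return m_dir != NULL; }
private:
	DIR *m_dir;
};

int main ( ) {
	DirHandle d ( "." );
	if ( ! d.ok() ) return 1;
	d.close(); // explicit close...
	d.close(); // ...and a second close is harmless
	return 0;
}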

Dir.h

@ -49,6 +49,7 @@ class Dir {
char *m_dirname;
DIR *m_dir;
bool m_needsClose;
};
#endif

@ -161,7 +161,8 @@ case EDIFFBOTMIMEERROR: return "Diffbot mime error";
case EDIFFBOTBADHTTPSTATUS: return "Diffbot reply bad http status";
case EHITCRAWLLIMIT: return "Hit the page download limit";
case EHITPROCESSLIMIT: return "Hit the page process limit";
case EINTERNALERROR: return "Internal error";
case EINTERNALERROR: return "Internal error";
case EBADJSONPARSER: return "Bad JSON parser";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -165,6 +165,7 @@ enum {
EDIFFBOTBADHTTPSTATUS,
EHITCRAWLLIMIT,
EHITPROCESSLIMIT,
EINTERNALERROR
EINTERNALERROR,
EBADJSONPARSER
};
#endif

@ -238,7 +238,7 @@ class FBRec {
#endif
// facebook id for matt wells
#define MATTWELLS 100003532411011LL
#define FB_MATTWELLS 100003532411011LL
//#define APPNAME "Event Widget"

@ -341,6 +341,7 @@ bool HashTableX::setTableSize ( long oldn , char *buf , long bufSize ) {
m_bufSize = need;
m_doFree = true;
if ( ! m_buf ) return false;
QUICKPOLL(m_niceness);
}
// save the old junk

@ -99,10 +99,10 @@ long Highlight::set ( SafeBuf *sb,
long version = TITLEREC_CURRENT_VERSION;
Bits bits;
if ( ! bits.set (&words,version,niceness) ) return 0;
if ( ! bits.set (&words,version,niceness) ) return -1;
Phrases phrases;
if ( !phrases.set(&words,&bits,true,false,version,niceness))return 0;
if ( !phrases.set(&words,&bits,true,false,version,niceness))return -1;
//SafeBuf langBuf;
//if ( !setLangVec ( &words , &langBuf , niceness )) return 0;
@ -115,7 +115,7 @@ long Highlight::set ( SafeBuf *sb,
Matches matches;
matches.setQuery ( q );
if ( ! matches.addMatches ( &words , &phrases ) ) return 0;
if ( ! matches.addMatches ( &words , &phrases ) ) return -1;
// store
m_numMatches = matches.getNumMatches();
@ -172,7 +172,7 @@ long Highlight::set ( SafeBuf *sb ,
// save room for terminating \0
//m_bufEnd = m_buf + m_bufLen - 1;
if ( ! highlightWords ( words, matches, q ) ) return 0;
if ( ! highlightWords ( words, matches, q ) ) return -1;
// null terminate
//*m_bufPtr = '\0';

@ -72,7 +72,9 @@ bool HttpRequest::copy ( class HttpRequest *r ) {
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
char *userAgent , char *proto , bool doPost ,
char *cookie , char *additionalHeader ) {
char *cookie , char *additionalHeader ,
// if posting something, how many bytes is it?
long postContentLen ) {
m_reqBufValid = false;
@ -279,6 +281,8 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
if ( doPost ) {
long contentLen = 0;
if ( postData ) contentLen = strlen(postData);
// this overrides if provided. -1 is default
if ( postContentLen >= 0 ) contentLen = postContentLen;
m_reqBuf.safePrintf ("Content-Length: %li\r\n", contentLen );
m_reqBuf.safePrintf("\r\n");
if ( postData ) m_reqBuf.safePrintf("%s",postData);
@ -633,6 +637,13 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
// matt comcast
if ( sock && strncmp(iptoa(sock->m_ip),"75.160.49.8",11) == 0)
m_isLocal = true;
// matt comcast #2
if ( sock && strncmp(iptoa(sock->m_ip),"69.181.136.143",14) == 0)
m_isLocal = true;
// titan
if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)
m_isLocal = true;
// roadrunner ip
// if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)

@ -41,7 +41,8 @@ class HttpRequest {
char *proto = "HTTP/1.0" ,
bool doPost = false ,
char *cookie = NULL ,
char *additionalHeader = NULL ); // does not incl \r\n
char *additionalHeader = NULL , // does not incl \r\n
long postContentLen = -1 ); // for content-length of POST
// use this
SafeBuf m_reqBuf;

@ -130,7 +130,8 @@ bool HttpServer::getDoc ( char *url ,
bool doPost ,
char *cookie ,
char *additionalHeader ,
char *fullRequest ) {
char *fullRequest ,
char *postContent ) {
// sanity
if ( ip == -1 )
log("http: you probably didn't mean to set ip=-1 did you? "
@ -154,6 +155,9 @@ bool HttpServer::getDoc ( char *url ,
defPort = 443;
}
long pcLen = 0;
if ( postContent ) pcLen = gbstrlen(postContent);
char *req = NULL;
long reqSize;
@ -161,9 +165,15 @@ bool HttpServer::getDoc ( char *url ,
if ( ! fullRequest ) {
if ( ! r.set ( url , offset , size , ifModifiedSince ,
userAgent , proto , doPost , cookie ,
additionalHeader ) ) return true;
additionalHeader , pcLen ) ) return true;
reqSize = r.getRequestLen();
req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
req = (char *) mmalloc( reqSize + pcLen ,"HttpServer");
if ( req )
memcpy ( req , r.getRequest() , reqSize );
if ( req && pcLen ) {
memcpy ( req + reqSize, postContent , pcLen );
reqSize += pcLen;
}
}
else {
// does not contain \0 i guess
@ -911,7 +921,8 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// "GET /crawlbot/downloadobjects"
// "GET /crawlbot/downloadpages"
if ( strncmp ( path , "/crawlbot/download/" ,19 ) == 0 ||
strncmp ( path , "/v2/crawl/download/" ,19 ) == 0 )
strncmp ( path , "/v2/crawl/download/" ,19 ) == 0 ||
strncmp ( path , "/v2/bulk/download/" ,18 ) == 0 )
return sendBackDump ( s , r );
// . is it a diffbot api request, like "GET /api/*"
@ -1542,7 +1553,9 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
*/
}
bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
char *errmsg, long rawFormat,
char *errmsg,
//long rawFormat,
char format ,
int errnum, char *content) {
// clear g_errno so the send goes through
g_errno = 0;
@ -1559,7 +1572,7 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
// sanity check
if ( strncasecmp(errmsg,"Success",7)==0 ) {char*xx=NULL;*xx=0;}
if (!rawFormat){
if ( format == FORMAT_HTML ) {
// Page content
char cbuf[1024];
sprintf (cbuf,
@ -1946,7 +1959,11 @@ long getMsgSize ( char *buf, long bufSize, TcpSocket *s ) {
totalReplySize,max);
}
// truncate the reply if we have to
if ( totalReplySize > max ) totalReplySize = max;
if ( totalReplySize > max ) {
log("http: truncating reply of %li to %li bytes",
totalReplySize,max);
totalReplySize = max;
}
// truncate if we need to
return totalReplySize;
}

@ -98,7 +98,8 @@ class HttpServer {
char *cookie = NULL ,
char *additionalHeader = NULL , // does not include \r\n
// specify your own mime and post data here...
char *fullRequest = NULL );
char *fullRequest = NULL ,
char *postContent = NULL );
bool getDoc ( long ip,
long port,
@ -134,7 +135,8 @@ class HttpServer {
long *bytesSent = NULL );
// send a "prettier" error reply, formatted in XML if necessary
bool sendQueryErrorReply ( TcpSocket *s , long error , char *errmsg,
long rawFormat, int errnum,
// FORMAT_HTML=0,FORMAT_XML,FORMAT_JSON
char format, int errnum,
char *content=NULL);

@ -4,8 +4,16 @@
class JsonItem *Json::addNewItem () {
JsonItem *ji = (JsonItem *)m_sb.getBuf();
if ( m_sb.m_length + (long)sizeof(JsonItem) > m_sb.m_capacity ) {
log("json: preventing buffer breach");
return NULL;
}
// otherwise we got room
m_sb.incrementLength(sizeof(JsonItem));
if ( m_prev ) m_prev->m_next = ji;
ji->m_prev = m_prev;
ji->m_next = NULL;
@ -53,7 +61,7 @@ JsonItem *Json::getItem ( char *name ) {
#include "Mem.h" // gbstrlen()
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , long niceness ) {
m_prev = NULL;
@ -67,9 +75,15 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
bool inQuote = false;
long need = 0;
for ( ; *p ; p++ ) {
if ( *p == '\"' && (p==json || p[-1]!='\\') )
// ignore any escaped char. also \x1234
if ( *p == '\\' ) {
if ( p[1] ) p++;
continue;
}
if ( *p == '\"' )
inQuote = ! inQuote;
if ( inQuote ) continue;
if ( inQuote )
continue;
if ( *p == '{' ||
*p == ',' ||
*p == '[' ||
@ -172,8 +186,15 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
if ( *p == '\"' ) {
// find end of quote
char *end = p + 1;
for ( ; *end ; end++ )
if ( *end == '\"' && end[-1] != '\\' ) break;
for ( ; *end ; end++ ) {
// skip two chars if escaped
if ( *end == '\\' && end[1] ) {
end++;
continue;
}
// this quote is unescaped then
if ( *end == '\"' ) break;
}
// field?
char *x = end + 1;
// skip spaces
@ -207,7 +228,8 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
// get length decoded
long curr = m_sb.length();
// store decoded string right after jsonitem
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,0))
if ( !m_sb.safeDecodeJSONToUtf8 (str,slen,
niceness ))
return NULL;
// store length decoded json
ji->m_valueLen = m_sb.length() - curr;
@ -240,7 +262,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
ji->m_valueDouble = 0;
}
// store decoded string right after jsonitem
if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,0))
if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,niceness))
return NULL;
// store length decoded json
ji->m_valueLen = m_sb.length() - curr;
@ -283,7 +305,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
// copy the number as a string as well
long curr = m_sb.length();
// store decoded string right after jsonitem
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,0))
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,niceness))
return NULL;
// store length decoded json
ji->m_valueLen = m_sb.length() - curr;
@ -323,11 +345,68 @@ void Json::test ( ) {
"in 2010\",\"18083009\":\"Apple personal digital assistants\",\"23475157\":\"Touchscreen portable media players\",\"30107877\":\"IPad\",\"9301031\":\"Apple Inc. hardware\",\"27765345\":\"IOS (Apple)\",\"26588084\":\"Tablet computers\"},\"type\":1,\"senseRank\":1,\"variety\":0.49056603773584906,\"depth\":0.5882352941176471},{\"id\":18839,\"positions\":[[1945,1950],[2204,2209]],\"name\":\"Music\",\"score\":0.7,\"contentMatch\":1,\"categories\":{\"991222\":\"Performing arts\",\"693016\":\"Entertainment\",\"691484\":\"Music\"},\"type\":1,\"senseRank\":1,\"variety\":0.22264150943396221,\"depth\":0.7058823529411764}],\"media\":[{\"pixelHeight\":350,\"link\":\"http://www.onlinemba.com/wp-content/uploads/2013/02/apple-innovates-invert-350x350.png\",\"primary\":\"true\",\"pixelWidth\":350,\"type\":\"image\"}]}";
JsonItem *ji = parseJsonStringIntoJsonItems ( json );
long niceness = 0;
JsonItem *ji = parseJsonStringIntoJsonItems ( json , niceness );
// print them out?
log("json: type0=%li",(long)ji->m_type);
return;
}
bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
// reset, but don't free mem etc. just set m_length to 0
nameBuf.reset();
// get its full compound name like "meta.twitter.title"
JsonItem *p = this;//ji;
char *lastName = NULL;
char *nameArray[20];
long numNames = 0;
for ( ; p ; p = p->m_parent ) {
// empty name?
if ( ! p->m_name ) continue;
if ( ! p->m_name[0] ) continue;
// dup? can happen with arrays. parent of string
// in object, has same name as his parent, the
// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
if ( p->m_name == lastName ) continue;
// update
lastName = p->m_name;
// add it up
nameArray[numNames++] = p->m_name;
// breach?
if ( numNames < 15 ) continue;
log("build: too many names in json tag");
break;
}
// assemble the names in reverse order which is correct order
for ( long i = 1 ; i <= numNames ; i++ ) {
// copy into our safebuf
if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) )
return false;
// separate names with periods
if ( ! nameBuf.pushChar('.') ) return false;
}
// remove last period
nameBuf.removeLastChar('.');
// and null terminate
if ( ! nameBuf.nullTerm() ) return false;
// change all :'s in names to .'s since : is reserved!
char *px = nameBuf.getBufStart();
for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';
return true;
}
// is this json item in an array of json items?
bool JsonItem::isInArray ( ) {
JsonItem *p = this;//ji;
for ( ; p ; p = p->m_parent ) {
// empty name? it's just a "value item" then, i guess.
//if ( ! p->m_name ) continue;
//if ( ! p->m_name[0] ) continue;
if ( p->m_type == JT_ARRAY ) return true;
}
return false;
}
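
Both scanning loops above apply the same rule: a backslash consumes the character after it, and only a quote that survives that filter counts. The old end[-1] != '\\' test got this wrong when a string ends in an escaped backslash, since the closing quote then follows a backslash. Factored out as a standalone sketch (the helper name is ours):

#include <stdio.h>

// find the closing unescaped quote of a JSON string whose opening
// quote is at p[0]; returns NULL if the string never terminates
static char *findStringEnd ( char *p ) {
	for ( p++ ; *p ; p++ ) {
		// a backslash eats the next char ("\\\"", "\\\\", \x12..)
		if ( *p == '\\' && p[1] ) { p++; continue; }
		// any quote that gets here is unescaped
		if ( *p == '\"' ) return p;
	}
	return NULL;
}

int main ( ) {
	char json[] = "\"a \\\"quoted\\\" word\" , ...";
	char *end = findStringEnd ( json );
	if ( end ) printf ( "string is %ld bytes\n",
			    (long)(end - json + 1) );
	return 0;
}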

Json.h

@ -51,6 +51,10 @@ class JsonItem {
return (char *)this + sizeof(JsonItem);
};
// like acme.product.offerPrice if "acme:{product:{offerprice:1.23}}"
bool getCompoundName ( SafeBuf &nameBuf ) ;
bool isInArray ( );
};
@ -59,7 +63,7 @@ class Json {
void test();
JsonItem *parseJsonStringIntoJsonItems ( char *json );
JsonItem *parseJsonStringIntoJsonItems ( char *json , long niceness );
JsonItem *getFirstItem ( ) ;

@ -354,7 +354,7 @@ Collectiondb.o: Collectiondb.cpp gb-include.h types.h fctypes.h Unicode.h \
IndexTable2.h Msg51.h Msg17.h IndexReadInfo.h Msg3a.h Stats.h \
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h zlib.h zconf.h \
HttpMime.h Users.h Pages.h HttpServer.h TcpServer.h openssl/err.h \
PageCrawlBot.h Statsdb.h Process.h Msg28.h Cachedb.h Syncdb.h PageTurk.h
PageCrawlBot.h Statsdb.h Process.h Msg28.h Cachedb.h Syncdb.h
CollectionRec.o: CollectionRec.cpp gb-include.h types.h fctypes.h \
Unicode.h UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h \
hash.h Errno.h Log.h CollectionRec.h Url.h ip.h Parms.h Xml.h XmlNode.h \
@ -374,7 +374,11 @@ CollectionRec.o: CollectionRec.cpp gb-include.h types.h fctypes.h \
RdbBuckets.h RdbCache.h Msg5.h Msg3.h RdbMerge.h Dir.h PingServer.h \
HttpServer.h TcpServer.h openssl/err.h MsgC.h UdpServer.h UdpSlot.h \
UdpProtocol.h Dns.h DnsProtocol.h Multicast.h Threads.h HttpMime.h \
Datedb.h Indexdb.h DiskPageCache.h Titledb.h Timedb.h
Datedb.h Indexdb.h DiskPageCache.h Titledb.h Timedb.h Spider.h Msg4.h \
Msg1.h Msg0.h Clusterdb.h Linkdb.h Msg2.h Query.h Msg20.h Summary.h \
matches2.h Words.h StopWords.h Bits.h Pos.h Matches.h HashTableT.h \
Domains.h CountryCode.h Tagdb.h Events.h Sections.h IndexList.h Dates.h \
Msg22.h CatRec.h Categories.h Catdb.h
Conf.o: Conf.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h iana_charset.h File.h \
@ -668,6 +672,7 @@ Entities.o: Entities.cpp gb-include.h types.h fctypes.h Unicode.h \
Errno.o: Errno.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h
errnotest.o: errnotest.cpp
Facebook.o: Facebook.cpp Facebook.h Conf.h Xml.h XmlNode.h gb-include.h \
types.h fctypes.h Unicode.h UnicodeProperties.h UCPropTable.h iconv.h \
UCNormalizer.h hash.h Errno.h Log.h Lang.h Iso8859.h iana_charset.h \
@ -1349,10 +1354,10 @@ main.o: main.cpp gb-include.h types.h fctypes.h Unicode.h \
Msge0.h Msge1.h Msg8b.h SearchInput.h Msg40.h Msg39.h Msg37.h TopTree.h \
IndexTable2.h Msg51.h Msg17.h Msg3a.h PostQueryRerank.h Sanity.h \
SiteGetter.h Title.h Address.h DailyMerge.h Speller.h Language.h Wiki.h \
Wiktionary.h Scraper.h Msg2a.h Msg9b.h Msg35.h Msg30.h Msg3e.h \
PageNetTest.h AutoBan.h TuringTest.h Msg1f.h Profiler.h Blaster.h \
Proxy.h linkspam.h sort.h Ads.h LanguagePages.h ValidPointer.h Placedb.h \
Test.h seo.h Json.h
Wiktionary.h Scraper.h Msg2a.h Msg9b.h Msg35.h Msg3e.h PageNetTest.h \
AutoBan.h TuringTest.h Msg1f.h Profiler.h Blaster.h Proxy.h linkspam.h \
sort.h Ads.h LanguagePages.h ValidPointer.h Placedb.h Test.h seo.h \
Json.h
matches2.o: matches2.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h matches2.h Titledb.h Rdb.h RdbBase.h Conf.h Xml.h XmlNode.h Lang.h \
@ -2694,7 +2699,7 @@ PageResults.o: PageResults.cpp gb-include.h types.h fctypes.h Unicode.h \
Highlight.h AutoBan.h TuringTest.h sort.h LanguageIdentifier.h \
LanguagePages.h LangList.h XmlDoc.h Phrases.h Images.h Msg13.h Msge0.h \
Msge1.h Msg8b.h SiteGetter.h Title.h Address.h Spider.h PageResults.h \
Proxy.h
Proxy.h Json.h
PageRoot.o: PageRoot.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h Indexdb.h Rdb.h RdbBase.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \

@ -77,15 +77,20 @@ ifeq ("titan","$(HOST)")
# in 2013. So it just uses clone() and does its own "threading". Unfortunately,
# the way it works is not even possible on newer kernels because they no longer
# allow you to override the _errno_location() function. -- matt
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DMATTWELLS
LIBS = ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a
else
# use -m32 to force 32-bit mode compilation.
# you might have to do apt-get install gcc-multilib to ensure that -m32 works.
# -m32 should use /usr/lib32/ as the library path.
# i also provide 32-bit libraries for linking that are not so easy to get.
#
# mdw. 11/17/2013. i took out the -D_PTHREADS_ flag (and -lpthread).
# trying to use good ole' clone() again because it seems the errno location
# thing is fixed by just ignoring it.
#
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DPTHREADS -Wno-unused-but-set-variable
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
endif
# if you have seo.cpp link that in. This is not part of the open source

@ -462,6 +462,10 @@ bool Mem::init ( long long maxMem ) {
// this is called by C++ classes' constructors to register mem
void Mem::addMem ( void *mem , long size , const char *note , char isnew ) {
// enforce safebuf::setLabel being called
//if ( size>=100000 && note && strcmp(note,"SafeBuf")==0 ) {
// char *xx=NULL;*xx=0; }
//validate();
// sanity check

Msg13.cpp

@ -15,6 +15,9 @@ long filterRobotsTxt ( char *reply , long replySize , HttpMime *mime ,
bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts );
void gotIframeExpandedContent ( void *state ) ;
void scanHammerQueue ( int fd , void *state );
void downloadTheDocForReals ( Msg13Request *r ) ;
// utility functions
bool getTestSpideredDate ( Url *u , long *origSpiderDate , char *testDir ) ;
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) ;
@ -111,6 +114,11 @@ bool Msg13::registerHandler ( ) {
if ( ! s_rt.set ( 8 , 4 , 0 , NULL , 0 , true,0,"wait13tbl") )
return false;
if ( ! g_loop.registerSleepCallback(10,NULL,scanHammerQueue) )
return log("build: Failed to register timer callback for "
"hammer queue.");
// success
return true;
}
@ -419,6 +427,8 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
RdbCache s_hammerCache;
static bool s_flag = false;
Msg13Request *s_hammerQueueHead = NULL;
Msg13Request *s_hammerQueueTail = NULL;
// . only return false if you want slot to be nuked w/o replying
// . MUST always call g_udpServer::sendReply() or sendErrorReply()
@ -486,15 +496,6 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// temporary hack
if ( r->m_parent ) { char *xx=NULL;*xx=0; }
// use the default agent unless scraping
// force to event guru bot for now
//char *agent = "Mozilla/5.0 (compatible; ProCogSEOBot/1.0; +http://www.procog.com/ )";
//char *agent = "Mozilla/5.0 (compatible; GigaBot/1.0; +http://www.gigablast.com/ )";
char *agent = g_conf.m_spiderUserAgent;
if ( r->m_isScraping )
agent = "Mozilla/4.0 "
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// assume we do not add it!
r->m_addToTestCache = false;
@ -515,18 +516,53 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// we skip it if its a frame page, robots.txt, root doc or some other
// page that is a "child" page of the main page we are spidering
if ( ! r->m_skipHammerCheck ) {
// make sure we are not hammering an ip
// . make sure we are not hammering an ip
// . returns 0 if currently downloading a url from that ip
// . returns -1 if not found
long long last=s_hammerCache.getLongLong(0,r->m_firstIp,
30,true);
// get time now
long long nowms = gettimeofdayInMilliseconds();
// how long has it been since last download START time?
long long waited = nowms - last;
bool queueIt = false;
if ( last > 0 && waited < r->m_crawlDelayMS ) queueIt = true;
// a "last" of 0 means currently downloading
if ( r->m_crawlDelayMS > 0 && last == 0LL ) queueIt = true;
// a last of -1 means not found. so first time i guess.
if ( last == -1 ) queueIt = false;
// . queue it up if we haven't waited long enough
// . then the function, scanHammerQueue(), will re-eval all
// the download requests in this hammer queue every 10ms.
// . it will just lookup the lastdownload time in the cache,
// which will store maybe a -1 if currently downloading...
if ( queueIt ) {
// debug
//log("spider: adding %s to crawldelayqueue",r->m_url);
// save this
r->m_udpSlot = slot;
r->m_nextLink = NULL;
// add it to queue
if ( ! s_hammerQueueHead ) {
s_hammerQueueHead = r;
s_hammerQueueTail = r;
}
else {
s_hammerQueueTail->m_nextLink = r;
s_hammerQueueTail = r;
}
return;
}
// if we had it in cache check the wait time
if ( last > 0 && waited < 400 ) {
if ( last > 0 && waited < r->m_crawlDelayMS ) {
log("spider: hammering firstIp=%s url=%s "
"only waited %lli ms",
iptoa(r->m_firstIp),r->m_url,waited);
"only waited %lli ms of %li ms",
iptoa(r->m_firstIp),r->m_url,waited,
r->m_crawlDelayMS);
// this guy has too many redirects and it fails us...
// BUT do not core if running live, only if for test
// collection
@ -536,14 +572,14 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// char*xx = NULL; *xx = 0; }
}
// store time now
s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
//s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
// note it
if ( g_conf.m_logDebugSpider )
log("spider: adding download end time of %llu for "
"firstIp=%s "
"url=%s "
"to msg13::hammerCache",
nowms,iptoa(r->m_firstIp),r->m_url);
//if ( g_conf.m_logDebugSpider )
// log("spider: adding download end time of %llu for "
// "firstIp=%s "
// "url=%s "
// "to msg13::hammerCache",
// nowms,iptoa(r->m_firstIp),r->m_url);
// clear error from that if any, not important really
g_errno = 0;
}
@ -616,26 +652,71 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
}
// do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
downloadTheDocForReals ( r );
}
void downloadTheDocForReals ( Msg13Request *r ) {
// are we the first?
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
g_udpServer.sendErrorReply(slot,g_errno);
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
return;
}
// this means our callback will be called
if ( ! firstInLine ) return;
if ( ! firstInLine ) {
//log("spider: inlining %s",r->m_url);
return;
}
// . store time now
// . no, now we store 0 to indicate in progress, then we
// will overwrite it with a timestamp when the download completes
// . but if measuring crawldelay from beginning of the download then
// store the current time
// . do NOT do this when downloading robots.txt etc. type files
// which should have skipHammerCheck set to true
if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck ) {
s_hammerCache.addLongLong(0,r->m_firstIp, 0LL);//nowms);
}
else if ( ! r->m_skipHammerCheck ) {
// get time now
long long nowms = gettimeofdayInMilliseconds();
s_hammerCache.addLongLong(0,r->m_firstIp, nowms);
}
// note it
if ( g_conf.m_logDebugSpider )
log("spider: adding special \"in-progress\" time of %lli for "
"firstIp=%s "
"url=%s "
"to msg13::hammerCache",
-1LL,iptoa(r->m_firstIp),r->m_url);
// do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
// flag this
r->m_addToTestCache = true;
// note it here
if ( g_conf.m_logDebugSpider )
log("spider: downloading %s (%s)",
r->m_url,iptoa(r->m_urlIp) );
log("spider: downloading %s (%s) (skiphammercheck=%li)",
r->m_url,iptoa(r->m_urlIp) ,
(long)r->m_skipHammerCheck);
// use the default agent unless scraping
// force to event guru bot for now
//char *agent = "Mozilla/5.0 (compatible; ProCogSEOBot/1.0; +http://www.procog.com/ )";
//char *agent = "Mozilla/5.0 (compatible; GigaBot/1.0; +http://www.gigablast.com/ )";
char *agent = g_conf.m_spiderUserAgent;
if ( r->m_isScraping )
agent = "Mozilla/4.0 "
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// download it
if ( ! g_httpServer.getDoc ( r->m_url ,
r->m_urlIp ,
@ -702,6 +783,21 @@ void gotHttpReply2 ( void *state ,
"for %s at ip %s",
mstrerror(g_errno),r->m_url,iptoa(r->m_urlIp));
// get time now
long long nowms = gettimeofdayInMilliseconds();
// . now store the current time in the cache
// . do NOT do this for robots.txt etc. where we skip hammer check
if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck )
s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
// note it
if ( g_conf.m_logDebugSpider )
log("spider: adding final download end time of %lli for "
"firstIp=%s "
"url=%s "
"to msg13::hammerCache",
nowms,iptoa(r->m_firstIp),r->m_url);
// sanity. this was happening from iframe download
//if ( g_errno == EDNSTIMEDOUT ) { char *xx=NULL;*xx=0; }
@ -2086,5 +2182,48 @@ void gotIframeExpandedContent ( void *state ) {
delete ( xd );
}
// call this once every 10ms to launch queued up download requests so that
// we respect crawl delay for sure
void scanHammerQueue ( int fd , void *state ) {
Msg13Request *r = s_hammerQueueHead;
if ( ! r ) return;
long long nowms = gettimeofdayInMilliseconds();
Msg13Request *prev = NULL;
long long waited = -1LL;
// scan down the linked list of queued of msg13 requests
for ( ; r ; prev = r , r = r->m_nextLink ) {
long long last;
last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
// is one from this ip outstanding?
if ( last == 0LL && r->m_crawlDelayFromEnd ) continue;
// download finished?
if ( last > 0 ) {
waited = nowms - last;
// but skip if haven't waited long enough
if ( waited < r->m_crawlDelayMS ) continue;
}
// debug
//log("spider: downloading %s from crawldelay queue "
// "waited=%llims crawldelay=%lims",
// r->m_url,waited,r->m_crawlDelayMS);
// good to go
downloadTheDocForReals ( r );
//
// remove from future scans
//
if ( prev )
prev->m_nextLink = r->m_nextLink;
if ( s_hammerQueueHead == r )
s_hammerQueueHead = r->m_nextLink;
if ( s_hammerQueueTail == r )
s_hammerQueueTail = prev;
// try to download some more i guess...
}
}
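
The crawl-delay bookkeeping above packs three states into one cached value per IP: -1 means never seen, 0 means a download is in flight, and a positive value is the start time of the last download in milliseconds. The enqueue decision then reduces to a small predicate; a sketch of that logic in isolation:

#include <stdio.h>

// last: cached value for this firstIp (-1 none, 0 in flight,
// >0 last download start in ms); crawlDelayMS: -1 if none
static bool mustQueue ( long long last , long crawlDelayMS ,
			long long nowms ) {
	if ( last == -1 )        return false; // first contact: go now
	if ( crawlDelayMS <= 0 ) return false; // no delay requested
	if ( last == 0 )         return true;  // one already in flight
	// otherwise queue only if we have not waited long enough
	return ( nowms - last ) < crawlDelayMS;
}

int main ( ) {
	// waited 100ms of a 250ms crawl delay: must queue (prints 1)
	printf ( "%d\n" , (int)mustQueue ( 1000 , 250 , 1100 ) );
	return 0;
}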

@ -25,6 +25,10 @@ public:
long m_maxCacheAge;
long m_maxTextDocLen;
long m_maxOtherDocLen;
// in milliseconds. use -1 if none or unknown.
long m_crawlDelayMS;
// for linked list, this is the hammer queue
class Msg13Request *m_nextLink;
// if doing spider compression, compute contentHash32 of document
// downloaded, and if it matches this then send back EDOCUNCHANGED
long m_contentHash32;
@ -50,7 +54,8 @@ public:
long m_addToTestCache:1;
long m_skipHammerCheck:1;
long m_attemptedIframeExpansion:1;
long m_forEvents;
long m_crawlDelayFromEnd:1;
long m_forEvents:1;
//long m_testParserEnabled:1;
//long m_testSpiderEnabled:1;
//long m_isPageParser:1;
@ -83,6 +88,7 @@ public:
memset (this,0,(char *)m_url - (char *)this + 1);
m_maxTextDocLen = -1; // no limit
m_maxOtherDocLen = -1; // no limit
m_crawlDelayMS = -1; // unknown or none
};
};

@ -224,6 +224,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
hostdb )) {
// sendto() sometimes returns "Network is down" so i guess
// we just had an "error reply".
log("msg20: error sending mcast %s",mstrerror(g_errno));
m_gotReply = true;
return true;
}
@ -428,6 +429,12 @@ Msg20Reply::Msg20Reply ( ) {
// this is free in destructor, so clear it here
//ptr_eventSummaryLines = NULL;
m_tmp = 0;
// seems to be an issue... caused a core with bogus size_dbuf
long *sizePtr = &size_tbuf;
long *sizeEnd = &size_note;
for ( ; sizePtr <= sizeEnd ; sizePtr++ )
*sizePtr = 0;
}

@ -13,7 +13,8 @@ static void sendReply ( UdpSlot *slot ,
Msg39 *msg39 ,
char *reply ,
long replySize ,
long replyMaxSize );
long replyMaxSize ,
bool hadError );
// called when Msg2 has got all the termlists
static void gotListsWrapper ( void *state ) ;
// thread wrappers
@ -66,7 +67,7 @@ void handleRequest39 ( UdpSlot *slot , long netnice ) {
catch ( ... ) {
g_errno = ENOMEM;
log("msg39: new(%i): %s", sizeof(Msg39),mstrerror(g_errno));
sendReply ( slot , NULL , NULL , 0 , 0 );
sendReply ( slot , NULL , NULL , 0 , 0 ,true);
return;
}
mnew ( THIS , sizeof(Msg39) , "Msg39" );
@ -79,12 +80,15 @@ void handleRequest39 ( UdpSlot *slot , long netnice ) {
// this must always be called sometime AFTER handleRequest() is called
void sendReply ( UdpSlot *slot , Msg39 *msg39 , char *reply , long replyLen ,
long replyMaxSize ) {
long replyMaxSize , bool hadError ) {
// debug msg
if ( g_conf.m_logDebugQuery || (msg39&&msg39->m_debug) )
logf(LOG_DEBUG,"query: msg39: [%lu] Sending reply len=%li.",
(long)msg39,replyLen);
// sanity
if ( hadError && ! g_errno ) { char *xx=NULL;*xx=0; }
// no longer in use. msg39 will be NULL if ENOMEM or something
if ( msg39 ) msg39->m_inUse = false;
@ -140,7 +144,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
g_errno = EBADREQUESTSIZE;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
@ -176,7 +180,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
g_errno = ENOCOLLREC;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
@ -185,7 +189,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
g_errno = ENOCOLLREC;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
@ -199,7 +203,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
m_r->m_useQueryStopWords ) ) {
log(LOG_LOGIC,"query: msg39: setQuery: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
@ -217,7 +221,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
,m_tmpq.m_orig
,(long)m_r->m_language
);
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
// debug
@ -286,7 +290,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
if ( g_errno ) {
log(LOG_LOGIC,"query: msg39: doDocIdSplitLoop: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
// it might not have blocked! if all lists in tree and used no thread
@ -327,11 +331,13 @@ bool Msg39::doDocIdSplitLoop ( ) {
if ( d0 >= d1 ) break;
// use this
//m_debug = true;
//log("call1");
// . get the lists
// . i think this always should block!
// . it will also intersect the termlists to get the search
// results and accumulate the winners into the "tree"
if ( ! getLists() ) return false;
//log("call2 g_errno=%li",(long)g_errno);
// if there was an error, stop!
if ( g_errno ) break;
}
@ -339,7 +345,7 @@ bool Msg39::doDocIdSplitLoop ( ) {
// return error reply if we had an error
if ( g_errno ) {
log("msg39: Had error3: %s.", mstrerror(g_errno));
sendReply (m_slot,this,NULL,0,0);
sendReply (m_slot,this,NULL,0,0 , true);
return true;
}
@ -507,6 +513,7 @@ bool Msg39::getLists () {
"sign=%c "
"numPlusses=%hhu "
"required=%li "
"fielcode=%li "
"ebit=0x%0llx "
"impBits=0x%0llx "
@ -534,6 +541,7 @@ bool Msg39::getLists () {
sign , //c ,
0 ,
(long)qt->m_isRequired,
(long)qt->m_fieldCode,
(long long)qt->m_explicitBit ,
(long long)qt->m_implicitBits ,
@ -623,6 +631,16 @@ bool Msg39::getLists () {
m_blocked = true;
return false;
}
// error?
if ( g_errno ) {
log("msg39: Had error getting termlists2: %s.",
mstrerror(g_errno));
// don't bail out here because we are in docIdSplitLoop()
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
return gotLists ( true );
}
@ -630,7 +648,16 @@ void gotListsWrapper ( void *state ) {
Msg39 *THIS = (Msg39 *) state;
// . hash the lists into our index table
// . this will send back a reply or recycle and read more list data
THIS->gotLists ( true );
if ( ! THIS->gotLists ( true ) ) return;
// . if he did not block and there was an errno we send reply
// otherwise if there was NO error he will have sent the reply
// . if gotLists() was called in the ABOVE function and it returns
// true then the docIdLoop() function will send back the reply.
if ( g_errno ) {
log("msg39: sending back error reply = %s",mstrerror(g_errno));
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
}
}
// . now come here when we got the necessary index lists
@ -641,7 +668,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
if ( g_errno ) {
log("msg39: Had error getting termlists: %s.",
mstrerror(g_errno));
sendReply (m_slot,this,NULL,0,0);
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
// timestamp log
@ -681,7 +709,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
// . actually we were using it before for rat=0/bool queries but
// i got rid of NO_RAT_SLOTS
if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
sendReply ( m_slot , this , NULL , 0 , 0 );
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply ( m_slot , this , NULL , 0 , 0 , true);
return true;
}
@ -690,7 +719,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
if ( ! m_posdbTable.allocWhiteListTable() ) {
log("msg39: Had error allocating white list table: %s.",
mstrerror(g_errno));
sendReply (m_slot,this,NULL,0,0);
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
@ -703,7 +733,6 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
// . we have to re-set the QueryTermInfos with each docid range split
// since it will set the list ptrs from the msg2 lists
if ( m_r->m_useNewAlgo && ! m_posdbTable.setQueryTermInfo () ) {
sendReply ( m_slot , this , NULL , 0 , 0 );
return true;
}
@ -856,7 +885,7 @@ bool Msg39::addedLists ( ) {
m_posdbTable.freeMem();
g_errno = m_posdbTable.m_errno;
log("query: posdbtable had error = %s",mstrerror(g_errno));
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 ,true);
return true;
}
@ -899,7 +928,7 @@ bool Msg39::setClusterRecs ( ) {
// on error, return true, g_errno should be set
if ( ! m_buf ) {
log("query: msg39: Failed to alloc buf for clustering.");
sendReply(m_slot,this,NULL,0,0);
sendReply(m_slot,this,NULL,0,0,true);
return true;
}
@ -981,7 +1010,7 @@ void Msg39::gotClusterRecs ( ) {
m_clusterLevels )) {
m_errno = g_errno;
// send back an error reply
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 ,true);
return;
}
@ -1146,7 +1175,7 @@ void Msg39::estimateHits ( ) {
if ( ! reply ) {
log("query: Could not allocated memory "
"to hold reply of docids to send back.");
sendReply(m_slot,this,NULL,0,0);
sendReply(m_slot,this,NULL,0,0,true);
return ;
}
topDocIds = (long long *) mr.ptr_docIds;
@ -1233,6 +1262,6 @@ void Msg39::estimateHits ( ) {
}
// now send back the reply
sendReply(m_slot,this,reply,replySize,replySize);
sendReply(m_slot,this,reply,replySize,replySize,false);
return;
}
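
The hadError flag threaded through sendReply() above enforces gb's usual async contract: a routine returns false when it blocked (its callback fires later) and true when done, with g_errno set exactly when something failed. A toy illustration of a wrapper under that contract (the stub functions are ours, not gb's):

#include <stdio.h>

static int g_errno = 0; // stand-in for gb's global error code

// pretend async op: false = blocked, callback will re-enter later;
// true = finished now, with g_errno nonzero iff it failed
static bool getListsStub ( ) { g_errno = 0; return true; }

static void gotListsWrapperStub ( ) {
	if ( ! getListsStub() ) return; // blocked; nothing to send yet
	if ( g_errno ) {
		printf ( "send error reply %d\n" , g_errno );
		return;
	}
	printf ( "send normal reply\n" );
}

int main ( ) { gotListsWrapperStub ( ); return 0; }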

@ -1107,7 +1107,7 @@ bool Msg40::launchMsg20s ( bool recalled ) {
req.m_bigSampleMaxLen = bigSampleMaxLen;
req.m_titleMaxLen = 256;
req.m_titleMaxLen = cr->m_titleMaxLen;
if(m_si->m_isAdmin && m_si->m_xml == 0)
if(m_si->m_isAdmin && m_si->m_format == FORMAT_HTML )
req.m_getGigabitVector = true;
else req.m_getGigabitVector = false;
req.m_flags = 0;
@ -1222,6 +1222,7 @@ bool Msg40::gotSummary ( ) {
if ( m_numReplies < m_numRequests )
return false;
doAgain:
// do we need to launch another batch of summary requests?
if ( m_numRequests < m_msg3a.m_numDocIds ) {
@ -1235,7 +1236,12 @@ bool Msg40::gotSummary ( ) {
// it returned true, so m_numRequests == m_numReplies and
// we don't need to launch any more! but that does NOT
// make sense because m_numContiguous < m_msg3a.m_numDocIds
char *xx=NULL; *xx=0;
// . i guess the launch can fail because of oom... and
// end up returning true here... seen it happen, and
// we had full requests/replies for m_msg3a.m_numDocIds
log("msg40: got all replies i guess");
goto doAgain;
//char *xx=NULL; *xx=0;
}
@ -1895,9 +1901,10 @@ bool Msg40::gotSummary ( ) {
}
// take this out for now...
#ifdef GB_PQR
// run post query reranks for this query
long wanted = m_si->m_docsWanted + m_si->m_firstResultNum + 1;
if ( m_postQueryRerank.isEnabled() &&
m_postQueryRerank.set2(wanted)){
if ( ! m_postQueryRerank.preRerank () ) {
@ -1916,6 +1923,7 @@ bool Msg40::gotSummary ( ) {
m_postQueryRerank.rerankFailed();
}
}
#endif
// set m_moreToCome, if true, we print a "Next 10" link
m_moreToCome = (visible > //m_visibleContiguous >

File diff suppressed because it is too large.

@ -33,7 +33,7 @@ public:
//TagRec m_tagRec;
TcpSocket *m_socket;
HttpRequest m_r;
char m_coll[50];
char m_coll[MAX_COLL_LEN+2];
//CollectionRec *m_cr;
bool m_isAdmin;
bool m_isLocal;
@ -136,7 +136,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
uint8_t langId = getLangIdFromAbbr ( langAbbr );
st->m_langId = langId;
}
strncpy ( st->m_coll , coll , 40 );
strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
// store query for query highlighting
st->m_netTestResults = r->getLong ("rnettest", false );
if( st->m_netTestResults ) {
@ -179,14 +179,22 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
sreq.reset();
strcpy(sreq.m_url, url );
sreq.setDataSize();
xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness );
// this returns false if "coll" is invalid
if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
goto hadSetError;
}
// . when getTitleRec() is called it will load the old one
// since XmlDoc::m_setFromTitleRec will be true
// . niceness is 0
else {
// use st->m_coll since XmlDoc just points to it!
xd->set3 ( docId , st->m_coll , 0 );
// . use st->m_coll since XmlDoc just points to it!
// . this returns false if "coll" is invalid
else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
hadSetError:
mdelete ( st , sizeof(State2) , "PageGet1" );
delete ( st );
g_errno = ENOMEM;
log("PageGet: set3: %s", mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
}
// if it blocks while it loads title rec, it will re-call this routine
xd->setCallback ( st , processLoopWrapper );

@ -23,6 +23,8 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// don't allow pages bigger than 128k in cache
char buf [ 64*1024 ];
SafeBuf p(buf, 64*1024);
p.setLabel ( "perfgrph" );
// print standard header
g_pages.printAdminTop ( &p , s , r );

File diff suppressed because it is too large.

@ -43,15 +43,36 @@ bool sendPageRoot ( TcpSocket *s, HttpRequest *r ){
}
bool printNav ( SafeBuf &sb , HttpRequest *r ) {
char *root = "";
char *rootSecure = "";
if ( g_conf.m_isMattWells ) {
root = "http://www.gigablast.com";
rootSecure = "https://www.gigablast.com";
}
sb.safePrintf("<center><b><p class=nav>"
"<a href=\"/about.html\">About</a>"
" &nbsp; &nbsp; <a href=\"/contact.html\">Contact</a>"
" &nbsp; &nbsp;<a href=\"/help.html\">Help</a>"
" &nbsp; &nbsp; <a href=/privacy.html>Privacy Policy</a>"
" &nbsp; &nbsp;<a href=\"/searchfeed.html\">"
"Search API</a>"
" &nbsp; &nbsp; <a href=/seoapi.html>SEO API</a>"
" &nbsp; &nbsp; <a href=/account>My Account</a> "
"<a href=%s/about.html>About</a>"
" &nbsp; &nbsp; "
"<a href=%s/contact.html>Contact</a>"
" &nbsp; &nbsp; "
"<a href=%s/help.html>Help</a>"
" &nbsp; &nbsp; "
"<a href=%s/privacy.html>Privacy Policy</a>"
" &nbsp; &nbsp; "
"<a href=%s/searchfeed.html>Search API</a>"
" &nbsp; &nbsp; "
"<a href=%s/seoapi.html>SEO API</a>"
" &nbsp; &nbsp; "
"<a href=%s/account>My Account</a> "
, root
, root
, root
, root
, root
, root
, rootSecure
//" &nbsp; &nbsp; <a href=/logout>Logout</a>"
);
if ( r->isLocal() )
@ -115,7 +136,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<b>web</b> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<b>web</b> &nbsp;&nbsp;&nbsp;&nbsp; "
"<a href=http://www.gigablast.com/seo>seo</a> "
"&nbsp;&nbsp;&nbsp;&nbsp; "
"<a href=\"/Top\">directory</a> "
"&nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
@ -135,7 +160,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:red;></td>\n");
sb.safePrintf("<td align=center><div style=width:50px;height:50px;display:inline-block;background-color:red;></div></td>\n");
sb.safePrintf("<td><font size=+1><b>Open Source!</b>"
"</font><br>\n");
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=/admin.html#features>Features.</a> Limited support available for free."
@ -144,19 +169,37 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("</td></tr>\n");
char *root = "";
if ( g_conf.m_isMattWells )
root = "http://www.gigablast.com";
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:green;></td>\n");
sb.safePrintf("<td><font size=+1><b>The Green Search Engine</b></font><br>\n");
sb.brify2("Gigablast is the only clean-powered web search engine. 90% of its power usage comes from wind energy. Astoundingly, Gigablast is one of ONLY four search engines in the United States indexing over a billion pages.",80);
// 204x143
sb.safePrintf("<td><img height=52px width=75px "
"src=%s/eventguru.png></td>\n"
, root );
sb.safePrintf("<td><font size=+1><b>Event Guru Returns</b></font><br>\n");
sb.brify2("<a href=http://www.eventguru.com/>Event Guru</a> datamines events from the web. It identifies events on a web page, or even plain text, using the same rules of deduction used by the human mind. It also has Facebook integration and lots of other cool things.",80);
sb.safePrintf("<br><br></td></tr>\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
/*
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td align=center><div style=width:50px;height:50px;display:inline-block;background-color:green;></div></td>\n");
sb.safePrintf("<td><font size=+1><b>The Green Search Engine</b></font><br>\n");
sb.brify2("Gigablast is the only clean-powered web search engine. 90% of its power usage comes from wind energy. Astoundingly, Gigablast is one of ONLY four search engines in the United States indexing over a billion pages.",80);
sb.safePrintf("<br><br></td></tr>\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
*/
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:0040fe;></td>\n");
sb.safePrintf("<td align=center><img src=%s/gears.png "
"height=50 width=50></div></td>\n"
, root );
sb.safePrintf("<td><font size=+1><b>The Transparent Search Engine</b></font><br>\n");
sb.brify2("Gigablast is the first truly transparent search engine. It tells you exactly why the search results are ranked the way they are. There is nothing left to the imagination.",85);
sb.safePrintf("<br><br>");
@ -165,9 +208,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("\n");
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:f2b629;></td>\n");
sb.safePrintf("<td align=center><center><img src=%s/dollargear.png "
"height=50 width=50></center></div></center></td>\n"
, root );
sb.safePrintf("<td><font size=+1><b>The SEO Search Engine</b></font><br>\n");
sb.brify2("When it comes to search-engine based SEO, Gigablast is the place to be. With a frothy set of unique and effective <a href=/seo>SEO tools</a>, you will find all you need to execute a simple yet effective SEO strategy. Stop the guesswork, and let a search engine tell you how to SEO it.",85);
sb.brify2("When it comes to search-engine based SEO, Gigablast is the place to be. With a frothy set of unique and effective <a href=http://www.gigablast.com/seo>SEO tools</a>, you will find all you need to execute a simple yet effective SEO strategy. Stop the guesswork, and let a search engine tell you how to SEO it.",85);
sb.safePrintf("</td></tr>\n");
@ -325,7 +370,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=http://www.gigablast.com/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<b title=\"Instantly add your url to Gigablast's "
@ -368,19 +413,22 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
// . when loaded with the main page for the first time it will
// immediately replace its content...
if ( url ) {
char *root = "";
if ( g_conf.m_isMattWells )
root = "http://www.gigablast.com";
sb.safePrintf("<br>"
"<br>"
"<div id=msgbox>"
//"<b>Injecting your url. Please wait...</b>"
"<center>"
"<img src=/gears.gif width=50 height=50>"
"<img src=%s/gears.gif width=50 height=50>"
"</center>"
"<script type=text/javascript>"
//"alert('shit');"
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
"var url='/addurl?u="
);
, root );
sb.urlEncode ( url );
// propagate "admin" if set
//long admin = hr->getLong("admin",-1);
@ -463,11 +511,17 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <b>directory</b> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=http://www.gigablast.com/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <b>directory</b> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=http://www.gigablast.com/events>events</a>"
" &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
"Gigablast's index\">add url</a>");
char *root = "";
if ( g_conf.m_isMattWells )
root = "http://www.gigablast.com";
sb.safePrintf("<a href=%s/addurl title=\"Instantly add your url to "
"Gigablast's index\">add url</a>"
, root );
sb.safePrintf("\n");
sb.safePrintf("<br><br>\n");
// submit to HTTPS now
@ -1591,7 +1645,7 @@ void doneInjectingWrapper3 ( void *st ) {
rand32);
sb.urlEncode(url);
sb.safePrintf(">Check it</a> or "
"<a href=/seo?u=");
"<a href=http://www.gigablast.com/seo?u=");
sb.urlEncode(url);
sb.safePrintf(">SEO it</a>"
".</b>");
View File
@ -97,7 +97,7 @@ bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
st->m_dateCustom = (bool)r->getLong( "custom", 0 );
// default to 10 hours, i would do 1 day except that there are
// some bugs that mess up the display a lot when i do that
st->m_datePeriod = r->getLong( "date_period" , 36000 );
st->m_datePeriod = r->getLong( "date_period" , 300 );//36000 );
st->m_dateUnits = r->getLong( "date_units" , 1 );//SECS_PER_MIN
st->m_now = (bool)r->getLong( "date_now" , 1 );
st->m_autoUpdate = (bool)r->getLong( "auto_update" , 0 );
@ -152,8 +152,8 @@ void sendReply ( void *state ) {
TcpSocket *s = st->m_socket;
SafeBuf buf( 1024*32 );
SafeBuf tmpBuf( 1024 );
SafeBuf buf( 1024*32 , "tmpbuf0" );
SafeBuf tmpBuf( 1024 , "tmpbuf1" );
//
// take these out until we need them!
View File
@ -361,6 +361,8 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
path = "admin/inject"; pathLen = gbstrlen(path); }
if ( pathLen == 9 && strncmp ( path , "index.php" , 9 ) == 0 ) {
path = "search"; pathLen = gbstrlen(path); }
if ( pathLen == 10 && strncmp ( path , "search.csv" , 10 ) == 0 ) {
path = "search"; pathLen = gbstrlen(path); }
// if it is like /GA/Atlanta then call sendPageResults
// and that should be smart enough to set the m_where in
Parms.cpp
View File
@ -2127,15 +2127,15 @@ bool Parms::printParm ( SafeBuf* sb,
// . if printing on crawlbot page hide these
// . we repeat this logic below when printing parm titles
// for the column headers in the table
char *vt = "";
if ( isCrawlbot &&
m->m_page == PAGE_FILTERS &&
(strcmp(m->m_xml,"spidersEnabled") == 0 ||
//strcmp(m->m_xml,"maxSpidersPerRule")==0||
//strcmp(m->m_xml,"maxSpidersPerIp") == 0||
strcmp(m->m_xml,"spiderIpWait") == 0
) )
vt = " style=display:none;";
//char *vt = "";
//if ( isCrawlbot &&
// m->m_page == PAGE_FILTERS &&
// (strcmp(m->m_xml,"spidersEnabled") == 0 ||
// //strcmp(m->m_xml,"maxSpidersPerRule")==0||
// //strcmp(m->m_xml,"maxSpidersPerIp") == 0||
// strcmp(m->m_xml,"spiderIpWait") == 0
// ) )
// vt = " style=display:none;";
// what type of parameter?
char t = m->m_type;
@ -2210,15 +2210,16 @@ bool Parms::printParm ( SafeBuf* sb,
if ( isJSON ) continue;
// . hide table column headers that are too advanced
// . we repeat this logic above for the actual parms
char *vt = "";
if ( isCrawlbot &&
m->m_page == PAGE_FILTERS &&
(strcmp(mk->m_xml,"spidersEnabled") == 0 ||
//strcmp(mk->m_xml,"maxSpidersPerRule")==0||
//strcmp(mk->m_xml,"maxSpidersPerIp") == 0||
strcmp(mk->m_xml,"spiderIpWait") == 0 ) )
vt = " style=display:none;display:none;";
sb->safePrintf ( "<td%s>" , vt );
//char *vt = "";
//if ( isCrawlbot &&
// m->m_page == PAGE_FILTERS &&
// (strcmp(mk->m_xml,"spidersEnabled") == 0 ||
// //strcmp(mk->m_xml,"maxSpidersPerRule")==0||
// //strcmp(mk->m_xml,"maxSpidersPerIp") == 0||
// strcmp(mk->m_xml,"spiderIpWait") == 0 ) )
// vt = " style=display:none;display:none;";
//sb->safePrintf ( "<td%s>" , vt );
sb->safePrintf ( "<td>" );
// if its of type checkbox in a table make it
// toggle them all on/off
if ( mk->m_type == TYPE_CHECKBOX &&
@ -2310,7 +2311,8 @@ bool Parms::printParm ( SafeBuf* sb,
else if ( firstInRow )
sb->safePrintf ( "<tr><td>" );
else
sb->safePrintf ( "<td%s>" , vt);
//sb->safePrintf ( "<td%s>" , vt);
sb->safePrintf ( "<td>" );
}
long cast = m->m_cast;
@ -4008,7 +4010,7 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) {
// time is stored as long
long ct = *(long *)s;
// get the time struct
struct tm *tp = gmtime ( (time_t *)&ct ) ;
struct tm *tp = localtime ( (time_t *)&ct ) ;
// set the "selected" month for the drop down
strftime ( p , 100 , "%d %b %Y %H:%M UTC" , tp );
}
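The gmtime()-to-localtime() swap above changes which time zone the parm page displays, but note the strftime() format still hard-codes the literal string "UTC", which is now only accurate on a box whose local zone is UTC. A small sketch of the difference:

#include <stdio.h>
#include <time.h>

int main ( ) {
	time_t now = time ( NULL );
	char buf[100];
	// gmtime() breaks the timestamp out in UTC...
	strftime ( buf , 100 , "%d %b %Y %H:%M UTC" , gmtime ( &now ) );
	printf ( "gmtime   : %s\n" , buf );
	// ...localtime() in the server's local zone. the literal "UTC"
	// in the format string does not change with it.
	strftime ( buf , 100 , "%d %b %Y %H:%M UTC" , localtime ( &now ) );
	printf ( "localtime: %s\n" , buf );
	return 0;
}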
@ -8499,6 +8501,30 @@ void Parms::init ( ) {
m->m_units = "seconds";
m++;
m->m_cgi = "dbapi";
m->m_xml = "diffbotApiUrl";
m->m_off = (char *)&cr.m_diffbotApiUrl - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbucp";
m->m_xml = "diffbotUrlCrawlPattern";
m->m_off = (char *)&cr.m_diffbotUrlCrawlPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbupp";
m->m_xml = "diffbotUrlProcessPattern";
m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbppp";
m->m_xml = "diffbotPageProcessPattern";
m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x;
@ -8507,6 +8533,22 @@ void Parms::init ( ) {
m->m_def = "";
m++;
m->m_cgi = "dbucre";
m->m_xml = "diffbotUrlCrawlRegEx";
m->m_off = (char *)&cr.m_diffbotUrlCrawlRegEx - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbupre";
m->m_xml = "diffbotUrlProcessRegEx";
m->m_off = (char *)&cr.m_diffbotUrlProcessRegEx - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbopn";
m->m_xml = "diffbotOnlyProcessIfNew";
m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNew - x;
@ -13027,6 +13069,17 @@ void Parms::init ( ) {
m->m_def = "";
m++;
m->m_title = "harvest links";
m->m_cgi = "hspl";
m->m_xml = "harvestLinks";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_harvestLinks - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_page = PAGE_FILTERS;
m->m_rowid = 1;
m++;
m->m_title = "spidering enabled";
m->m_cgi = "cspe";
m->m_xml = "spidersEnabled";
@ -15116,18 +15169,19 @@ void Parms::init ( ) {
m->m_sprpp = 0;
m++;
/*
m->m_title = "format of the returned search results";
m->m_desc = "X is 0 to get back results in regular html, and 8 to "
"get back results in XML.";
m->m_desc = "X is 0 to get back results in regular html, 1 to "
"get back results in XML, 2 for JSON.";
m->m_def = "0";
m->m_soff = (char *)&si.m_xml - y;
m->m_type = TYPE_LONG;
m->m_soff = (char *)&si.m_formatStr - y;
m->m_type = TYPE_STRING;//CHAR;
m->m_sparm = 1;
m->m_scgi = "xml";
m->m_scgi = "format";
m->m_smin = 0;
m->m_smax = 12;
m++;
*/
m->m_title = "highlight query terms in summaries.";
m->m_desc = "Use to disable or enable "
View File
@ -3043,6 +3043,8 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
ei->m_finalCallback ( ei->m_finalState );
}
bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) ;
// . return false if would block, true otherwise
// . used to send email and get a url when a crawl hits a maxToCrawl
// or maxToProcess limitation.
@ -3103,15 +3105,38 @@ bool sendNotification ( EmailInfo *ei ) {
if ( url && url[0] ) {
log("build: sending url notification to %s for coll \"%s\"",
url,crawl);
Url uu; uu.set ( url );
SafeBuf fullReq;
fullReq.safePrintf("POST %s HTTP/1.0\r\n"
"User-Agent: Crawlbot/2.0\r\n"
"Accept: */*\r\n"
"Host: "
, uu.getPath()
);
fullReq.safeMemcpy ( uu.getHost() , uu.getHostLen() );
// make custom headers
SafeBuf custom;
custom.safePrintf ( "X-Crawl-Name: %s\r\n"
fullReq.safePrintf ("X-Crawl-Name: %s\r\n"
// last \r\n is added in HttpRequest.cpp
"X-Crawl-Status: %s"// \r\n" // hdrs
"X-Crawl-Status: %s\r\n" // hdrs
, cr->m_diffbotCrawlName.getBufStart()
, ei->m_spiderStatusMsg.getBufStart()
);
// also in post body
SafeBuf postContent;
// the collection details
printCrawlDetailsInJson ( postContent , cr );
// content-length of it
fullReq.safePrintf("Content-Length: %li\r\n",
postContent.length());
// type is json
fullReq.safePrintf("Content-Type: application/json\r\n");
fullReq.safePrintf("\r\n");
// then the post content
fullReq.safeMemcpy ( &postContent );
fullReq.nullTerm();
// send the request (doPost below makes it a POST)
if ( ! g_httpServer.getDoc ( url ,
0 , // ip
@ -3129,8 +3154,9 @@ bool sendNotification ( EmailInfo *ei ) {
"HTTP/1.0", // proto
true , // doPost
NULL, // cookie
custom.getBufStart(),
NULL ) ) // fullRequest
NULL , // custom hdrs
fullReq.getBufStart() ,
NULL ) )
ei->m_notifyBlocked++;
}
View File
@ -712,6 +712,9 @@ bool PosdbTable::allocTopTree ( ) {
// return false;
if ( m_r->m_getDocIdScoringInfo ) {
m_scoreInfoBuf.setLabel ("scinfobuf" );
// . for holding the scoring info
// . add 1 for the \0 safeMemcpy() likes to put at the end so
// it will not realloc on us
@ -731,6 +734,10 @@ bool PosdbTable::allocTopTree ( ) {
// compute. so this could easily get into the megabytes, most
// of the time we will not need nearly that much however.
numPairs *= xx;
m_pairScoreBuf.setLabel ( "pairbuf" );
m_singleScoreBuf.setLabel ("snglbuf" );
// but alloc it just in case
if ( ! m_pairScoreBuf.reserve (numPairs * sizeof(PairScore) ) )
return false;
@ -786,7 +793,7 @@ bool PosdbTable::allocTopTree ( ) {
slots = 20000000;
}
// each site hash is 4 bytes
if ( ! m_siteHashList.reserve ( slots ) )
if ( ! m_siteHashList.reserve ( slots ,"shshbuf" ) )
return false;
// quad # of sites to have space in between
if ( ! m_dt.set(4,0,slots,NULL,0,false,0,"pdtdt"))
@ -1005,7 +1012,7 @@ void PosdbTable::evalSlidingWindow ( char **ptrs ,
for ( long i = 0 ; i < maxi ; i++ ) {
// skip if to the left of a pipe operator
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
//if ( ptrs[i] ) wpi = ptrs[i];
// if term does not occur in body, sub-in the best term
@ -1027,7 +1034,7 @@ void PosdbTable::evalSlidingWindow ( char **ptrs ,
for ( ; j < maxj ; j++ ) {
// skip if to the left of a pipe operator
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// TODO: use a cache using wpi/wpj as the key.
//if ( ptrs[j] ) wpj = ptrs[j];
@ -4097,6 +4104,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
long nrg = 0;
// assume not sorting by a numeric termlist
m_sortByTermNum = -1;
//for ( long i = 0 ; i < m_msg2->getNumLists() ; i++ ) {
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
@ -4111,6 +4121,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_qpos = wordNum;
qti->m_wikiPhraseId = qw->m_wikiPhraseId;
qti->m_quotedStartId = qw->m_quoteStart;
// is it gbsortby:?
if ( qt->m_fieldCode == FIELD_GBSORTBY ||
qt->m_fieldCode == FIELD_GBREVSORTBY )
m_sortByTermNum = i;
// count
long nn = 0;
// also add in bigram lists
@ -4226,6 +4240,18 @@ bool PosdbTable::setQueryTermInfo ( ) {
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
// is it a negative term?
if ( qt->m_termSign=='-')qti->m_bigramFlags[nn]|=BF_NEGATIVE;
// numeric posdb termlist flags. instead of word position
// they have a float stored there for sorting etc.
if (qt->m_fieldCode == FIELD_GBSORTBY )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBREVSORTBY )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMIN )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMAX )
qti->m_bigramFlags[nn]|=BF_NUMBER;
// only really add if useful
// no, because when inserting NEW (related) terms that are
// not currently in the document, this list may initially
@ -4912,15 +4938,27 @@ void PosdbTable::intersectLists10_r ( ) {
// sites right now. this hash table must have been pre-allocated
// in Posdb::allocTopTree() above since we might be in a thread.
//
RdbList *whiteLists = m_msg2->m_whiteLists;
long nw = m_msg2->m_w;
RdbList *whiteLists = NULL;
long nw = 0;
if ( m_msg2 ) {
whiteLists = m_msg2->m_whiteLists;
nw = m_msg2->m_w;
}
for ( long i = 0 ; ! m_addedSites && i < nw ; i++ ) {
RdbList *list = &whiteLists[i];
if ( list->isEmpty() ) continue;
// sanity test
long long d1 = g_posdb.getDocId(list->getList());
if ( d1 > m_msg2->m_docIdEnd ) { char *xx=NULL;*xx=0; }
if ( d1 < m_msg2->m_docIdStart ) { char *xx=NULL;*xx=0; }
if ( d1 > m_msg2->m_docIdEnd ) {
log("posdb: d1=%lli > %lli",
d1,m_msg2->m_docIdEnd);
//char *xx=NULL;*xx=0;
}
if ( d1 < m_msg2->m_docIdStart ) {
log("posdb: d1=%lli < %lli",
d1,m_msg2->m_docIdStart);
//char *xx=NULL;*xx=0;
}
// first key is always 18 bytes cuz it has the termid
// scan recs in the list
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
@ -5049,6 +5087,8 @@ void PosdbTable::intersectLists10_r ( ) {
QueryTermInfo *qti = &qip[i];
// skip if negative query term
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
// skip if numeric field like gbsortby:price gbmin.price:1.23
if ( qti->m_bigramFlags[0] & BF_NUMBER ) continue;
// set it
if ( qti->m_wikiPhraseId == 1 ) continue;
// stop
@ -5298,6 +5338,9 @@ void PosdbTable::intersectLists10_r ( ) {
long nnn = m_numQueryTermInfos;
if ( ! m_r->m_doMaxScoreAlgo ) nnn = 0;
// do not do it if we got a gbsortby: field
if ( m_sortByTermNum >= 0 ) nnn = 0;
/*
// skip all this if getting score of just one docid on special
// posdb termlists that are 6-byte only keys
@ -5584,6 +5627,8 @@ void PosdbTable::intersectLists10_r ( ) {
pass0++;
if ( m_sortByTermNum >= 0 ) goto skipScoringFilter;
// test why we are slow
//if ( (s_sss++ % 8) != 0 ) { docIdPtr += 6; fail0++; goto docIdLoop;}
@ -5743,6 +5788,8 @@ void PosdbTable::intersectLists10_r ( ) {
}
}
skipScoringFilter:
pass++;
skipPreAdvance:
@ -5770,7 +5817,12 @@ void PosdbTable::intersectLists10_r ( ) {
// mini merge buf:
mptr = mbuf;
// merge each set of sublists
// . merge each set of sublists
// . like we merge a term's list with its two associated bigram
// lists, if there, the left bigram and right bigram list.
// . and merge all the synonym lists for that term together as well.
// so if the term is 'run' we merge it with the lists for
// 'running' 'ran' etc.
for ( long j = 0 ; j < m_numQueryTermInfos ; j++ ) {
// get the query term info
QueryTermInfo *qti = &qip[j];
@ -6045,12 +6097,12 @@ void PosdbTable::intersectLists10_r ( ) {
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip if not part of score
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// and pair it with each other possible query term
for ( long j = i+1 ; j < m_numQueryTermInfos ; j++ ) {
// skip if not part of score
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// but if they are in the same wikipedia phrase
// then try to keep their positions as in the query.
// so for 'time enough for love' ideally we want
@ -6126,7 +6178,7 @@ void PosdbTable::intersectLists10_r ( ) {
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
float sts;
// skip if to the left of a pipe operator
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// sometimes there is no wordpos subtermlist for this docid
// because it just has the bigram, like "streetlight" and not
// the word "light" by itself for the query 'street light'
@ -6218,7 +6270,7 @@ void PosdbTable::intersectLists10_r ( ) {
//
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip if to the left of a pipe operator
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// skip wordposition until it in the body
while ( xpos[i] &&!s_inBody[g_posdb.getHashGroup(xpos[i])]) {
// advance
@ -6269,7 +6321,9 @@ void PosdbTable::intersectLists10_r ( ) {
minx = -1;
for ( long x = 0 ; x < m_numQueryTermInfos ; x++ ) {
// skip if to the left of a pipe operator
if ( bflags[x] & (BF_PIPED|BF_NEGATIVE) ) continue;
// and numeric posdb termlists do not have word positions,
// they store a float there.
if ( bflags[x] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
if ( ! xpos[x] ) continue;
if ( xpos[x] && minx == -1 ) {
minx = x;
@ -6298,7 +6352,8 @@ void PosdbTable::intersectLists10_r ( ) {
long k;
for ( k = 0 ; k < m_numQueryTermInfos ; k++ ) {
// skip if to the left of a pipe operator
if ( bflags[k] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[k] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) )
continue;
if ( xpos[k] ) break;
}
// all lists are now exhausted
@ -6337,12 +6392,12 @@ void PosdbTable::intersectLists10_r ( ) {
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip if to the left of a pipe operator
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
for ( long j = i+1 ; j < m_numQueryTermInfos ; j++ ) {
// skip if to the left of a pipe operator
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
//
// get score for term pair from non-body occurring terms
@ -6404,6 +6459,12 @@ void PosdbTable::intersectLists10_r ( ) {
m_r->m_language == docLang)
score *= SAMELANGMULT;
//
// if we have a gbsortby:price term then score exclusively on that
//
if ( m_sortByTermNum >= 0 )
score = g_posdb.getFloat ( miniMergedList[m_sortByTermNum] );
// . seoDebug hack so we can set "dcs"
// . we only come here if we actually made it into m_topTree
if ( secondPass || m_r->m_seoDebug ) {
Posdb.h
View File
@ -99,6 +99,7 @@ float getTermFreqWeight ( long long termFreq , long long numDocsInColl );
#define BF_SYNONYM 0x04
#define BF_NEGATIVE 0x08 // query word has a negative sign before it
#define BF_BIGRAM 0x10 // term is a bigram of two adjacent query words
#define BF_NUMBER 0x20 // is it like gbsortby:price? numeric?
void printTermList ( long i, char *list, long listSize ) ;
@ -197,6 +198,23 @@ class Posdb {
if ( langId & 0x20 ) kp->n0 |= 0x08;
}
// set the word position bits et al to this float
void setFloat ( void *vkp , float f ) {
*(float *)(((char *)vkp) + 2) = f; };
// and read the float as well
float getFloat ( void *vkp ) {
return *(float *)(((char *)vkp) + 2); };
void setAlignmentBit ( void *vkp , char val ) {
char *p = (char *)vkp;
if ( val ) p[1] = p[1] | 0x02;
else p[1] = p[1] & 0xfd;
};
bool isAlignmentBitClear ( void *vkp ) {
return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 );
};
void makeStartKey ( void *kp, long long termId ,
long long docId=0LL){
@ -427,7 +445,7 @@ class PosdbList : public RdbList {
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
// max # search results that can be viewed without using TopTree
#define MAX_RESULTS 1000
//#define MAX_RESULTS 1000
class PosdbTable {
@ -575,6 +593,7 @@ class PosdbTable {
class Msg39Request *m_r;
long m_sortByTermNum;
// the new intersection/scoring algo
void intersectLists10_r ( );
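setFloat()/getFloat() above store the sort value where word-position bits normally live in a posdb key: a raw float at byte offset 2. A self-contained round trip (using memcpy instead of the raw pointer cast to stay alignment-safe; the 18-byte buffer is just illustrative):

#include <assert.h>
#include <stdio.h>
#include <string.h>

// mimic Posdb::setFloat()/getFloat(): the float lives at key byte 2
static void setFloat ( void *vkp , float f ) {
	memcpy ( ((char *)vkp) + 2 , &f , sizeof(float) );
}
static float getFloat ( void *vkp ) {
	float f;
	memcpy ( &f , ((char *)vkp) + 2 , sizeof(float) );
	return f;
}

int main ( ) {
	char key[18];               // illustrative posdb key buffer
	memset ( key , 0 , 18 );
	setFloat ( key , 19.99f );  // e.g. a price for gbsortby:
	assert ( getFloat ( key ) == 19.99f );
	printf ( "stored %.2f\n" , getFloat ( key ) );
	return 0;
}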
View File
@ -162,6 +162,11 @@ bool PostQueryRerank::set2 ( long resultsWanted ) {
m_pageUrl = (Url *)mcalloc( sizeof(Url)*m_maxResultsToRerank,
"pqrpageUrls" );
if ( ! m_pageUrl ) {
log("pqr: had out of memory error");
return false;
}
return true;
}
View File
@ -500,6 +500,11 @@ bool Process::isAnyTreeSaving ( ) {
Rdb *rdb = m_rdbs[i];
if ( rdb->m_isCollectionLess ) continue;
if ( rdb->isSavingTree() ) return true;
// we also just disable writing below in Process.cpp
// while saving other files. so hafta check that as well
// since we use isAnyTreeSaving() to determine if we can
// write to the tree or not.
if ( ! rdb->isWritable() ) return true;
}
return false;
}
@ -1064,7 +1069,10 @@ void processSleepWrapper ( int fd , void *state ) {
//if ( ! isClockInSync() && ! g_hostdb.m_myHost->m_isProxy ) return;
// get time the day started
long now = getTimeLocal();//GlobalNoCore();
long now;
if ( g_hostdb.m_myHost->m_isProxy ) now = getTimeLocal();
else now = getTimeGlobal();
// set this for the first time
if ( g_process.m_lastSaveTime == 0 )
g_process.m_lastSaveTime = now;
View File
@ -60,6 +60,7 @@ struct StateControl{
HttpRequest m_hr;
Host *m_forwardHost;
float m_pending;
bool m_isEventGuru;
};
#define UIF_ADMIN 0x01
@ -370,6 +371,8 @@ bool Proxy::handleRequest (TcpSocket *s){
char *host = hr.getHost();
char *hdom = host;
if ( strncasecmp(hdom,"www.",4) == 0 ) hdom += 4;
if ( strncasecmp(hdom,"www2.",5) == 0 ) hdom += 5;
if ( strncasecmp(hdom,"www1.",5) == 0 ) hdom += 5;
// auto redirect eventguru.com to www.eventguru.com so cookies
// are consistent
if ( ! redir &&
@ -387,9 +390,19 @@ bool Proxy::handleRequest (TcpSocket *s){
redirLen = gbstrlen(redir);
}
bool isEventGuru = false;
if ( strcasecmp(hdom,"eventguru.com") == 0 )
isEventGuru = true;
#ifdef MATTWELLS
#define HTTPS_REDIR 1
#endif
if ( redirLen > 0 && redir ) {
//redirect:
#ifdef HTTPS_REDIR
redirect:
#endif
HttpMime m;
m.makeRedirMime (redir,redirLen);
// . move the reply to a send buffer
@ -431,6 +444,10 @@ bool Proxy::handleRequest (TcpSocket *s){
char *path = hr.getPath();
//long pathLen = hr.getPathLen();
// serve events on the gigablast.com domain:
if ( path && strncmp(path,"/events",7) == 0 )
isEventGuru = true;
/*
bool badPage = false;
if ( n < 0 ) badPage = true;
@ -502,6 +519,32 @@ bool Proxy::handleRequest (TcpSocket *s){
if ( ! strncmp(path,"/?id=" ,5 ) ) handleIt = false;
// log the request iff filename does not end in .gif .jpg .
char *f = NULL;
long flen = 0;
if ( isEventGuru ) {
f = hr.getFilename();
flen = hr.getFilenameLen();
}
// proxy will handle eventguru images i guess
bool isGif = ( f && flen >= 4 && strncmp(&f[flen-4],".gif",4) == 0 );
bool isJpg = ( f && flen >= 4 && strncmp(&f[flen-4],".jpg",4) == 0 );
bool isBmp = ( f && flen >= 4 && strncmp(&f[flen-4],".bmp",4) == 0 );
bool isPng = ( f && flen >= 4 && strncmp(&f[flen-4],".png",4) == 0 );
bool isIco = ( f && flen >= 4 && strncmp(&f[flen-4],".ico",4) == 0 );
bool isPic = (isGif || isJpg || isBmp || isPng || isIco);
// use event guru favicon?
//if ( isEventGuru && isIco && strcmp(f,"favicon.ico") == 0 ) {
// f = "eventguru_favicon.ico";
// flen = gbstrlen(f);
//}
// eventguru.com host: in mime?
if ( isEventGuru && ! isPic )
handleIt = false;
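The five one-off extension tests above all reduce to a single suffix check. A small helper sketch (endsWith() is illustrative, not a Gigablast function):

#include <stdio.h>
#include <string.h>

// true if f (of length flen) ends with suf
static bool endsWith ( const char *f , long flen , const char *suf ) {
	long slen = strlen ( suf );
	if ( flen < slen ) return false;
	return strncmp ( f + flen - slen , suf , slen ) == 0;
}

int main ( ) {
	static const char *s_picExts[] = {".gif",".jpg",".bmp",".png",".ico"};
	const char *f = "logo.png";
	long flen = strlen ( f );
	bool isPic = false;
	for ( long i = 0 ; i < 5 ; i++ )
		if ( endsWith ( f , flen , s_picExts[i] ) ) isPic = true;
	printf ( "%s isPic=%d\n" , f , (int)isPic );
	return 0;
}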
// only proxy holds the accounting info
if ( ! strncmp ( path ,"/account", 8 ) ) {
printRequest(s, &hr);
@ -515,12 +558,14 @@ bool Proxy::handleRequest (TcpSocket *s){
if ( tcp == &g_httpServer.m_ssltcp ) max = g_conf.m_httpsMaxSockets;
else max = g_conf.m_httpMaxSockets;
#ifdef _HTTPS_REDIR_
#ifdef HTTPS_REDIR
// if hitting root page then tell them to go to https
// if not autobanned... but if it is an autobanned request on root
// page it should have go the turing test above!
if ( n == PAGE_ROOT &&
! g_isYippy &&
// not event guru homepage
! isEventGuru &&
// if not already on https
tcp != &g_httpServer.m_ssltcp &&
// do not redirect http://www.gigablast.com/?c=dmoz3 (directory)!
@ -1265,6 +1310,8 @@ bool Proxy::forwardRequest ( StateControl *stC ) {
p[5] = '9';
break;
}
// code is invalid if it is not for an old client
//if ( userId32b == 0 ) code = NULL;
}
@ -1665,7 +1712,7 @@ void Proxy::gotReplyPage ( void *state, UdpSlot *slot ) {
// do not print login bars in the xml!! do not print for ixquick
// which gets results in html...
if ( ! stC->m_raw && ! stC->m_ch )
if ( ! stC->m_raw && ! stC->m_ch && ! stC->m_isEventGuru )
newReply = storeLoginBar ( reply ,
size , // transmit size
size , // allocsize
@ -5153,11 +5200,16 @@ void Proxy::printUsers ( SafeBuf *sb ) {
// but if admin we should still have set our cookie
// adminsessid to our current session id so we know we are
// also the admin!
sb->safePrintf("<td><a href=/account?login=%s&password=%s>"
"%s</td>"
sb->safePrintf("<td><nobr>%li. "
"<a href=/account?login=%s&password=%s>"
"%s</a></nobr></td>"
,i
,ui->m_login
,ui->m_password
,ui->m_login);
,ui->m_login
//,ui->m_userId32
);
}
sb->safePrintf("</tr>\n");
sb->safePrintf("</table>\n");
}
View File
@ -2199,6 +2199,10 @@ bool Query::setQWords ( char boolFlag ,
fieldCode == FIELD_IP ||
fieldCode == FIELD_ISCLEAN ||
fieldCode == FIELD_QUOTA ||
fieldCode == FIELD_GBSORTBY ||
fieldCode == FIELD_GBREVSORTBY ||
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ||
fieldCode == FIELD_GBAD ) {
// find first space -- that terminates the field value
char *end =
@ -2210,6 +2214,15 @@ bool Query::setQWords ( char boolFlag ,
ignoreTilSpace = true;
// the hash
unsigned long long wid = hash64 ( w , wlen, 0LL );
// i've decided not to make
// gbsortby:products.offerPrice case sensitive
if ( fieldCode == FIELD_GBSORTBY ||
fieldCode == FIELD_GBREVSORTBY ||
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX )
wid = hash64Lower_utf8 ( w , wlen , 0LL );
// should we have normalized before hashing?
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_LINK ||
@ -3032,6 +3045,12 @@ struct QueryField g_fields[] = {
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
{"gbsortby", FIELD_GBSORTBY, false,""},
{"gbrevsortby", FIELD_GBREVSORTBY, false,""},
{"gbnumbermin", FIELD_GBNUMBERMIN, false,""},
{"gbnumbermax", FIELD_GBNUMBERMAX, false,""},
{"gbcountry",FIELD_GBCOUNTRY,false,""},
{"gbad",FIELD_GBAD,false,""},
View File
@ -103,7 +103,11 @@ typedef unsigned long long qvec_t;
#define FIELD_GBCSENUM 50
#define FIELD_GBSECTIONHASH 51
#define FIELD_GBDOCID 52
#define FIELD_GBCONTENTHASH 53
#define FIELD_GBCONTENTHASH 53 // for deduping at spider time
#define FIELD_GBSORTBY 54 // i.e. gbsortby:price -> numeric termlist
#define FIELD_GBREVSORTBY 55 // i.e. gbrevsortby:price -> low to high
#define FIELD_GBNUMBERMIN 56
#define FIELD_GBNUMBERMAX 57
#define FIELD_GBOTHER 92
View File
@ -1,7 +1,7 @@
open-source-search-engine
=========================
An open source web and enterprise search engine. As can be seen http://www.gigablast.com/
An open source web and enterprise search engine, as can be seen on http://www.gigablast.com/.
RUNNING GIGABLAST
-----------------
Rdb.cpp
View File
@ -132,7 +132,7 @@ bool Rdb::init ( char *dir ,
// sanity
if ( ! dir ) { char *xx=NULL;*xx=0; }
// this is the working dir, all collection repositories are subdirs
m_dir.set ( dir );
//m_dir.set ( dir );
// catdb, statsdb, accessdb, facebookdb, syncdb
m_isCollectionLess = isCollectionLess;
// save the dbname NULL terminated into m_dbname/m_dbnameLen
@ -466,6 +466,11 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
// . if this rdb is collectionless we set m_collectionlessBase in addBase()
bool Rdb::addColl ( char *coll ) {
collnum_t collnum = g_collectiondb.getCollnum ( coll );
return addColl2 ( collnum );
}
bool Rdb::addColl2 ( collnum_t collnum ) {
// catdb,statsbaccessdb,facebookdb,syncdb
if ( m_isCollectionLess )
collnum = (collnum_t)0;
@ -477,6 +482,12 @@ bool Rdb::addColl ( char *coll ) {
"breech maximum number of collections, %lli.",
m_dbname,collnum,maxColls);
}
CollectionRec *cr = g_collectiondb.m_recs[collnum];
char *coll = NULL;
if ( cr ) coll = cr->m_coll;
// . ensure no previous one exists
// . well it will be there but will be uninitialized, m_rdb will be NULL
RdbBase *base = getBase ( collnum );
@ -506,8 +517,9 @@ bool Rdb::addColl ( char *coll ) {
if(m_useTree) tree = &m_tree;
else buckets = &m_buckets;
// init it
if ( ! base->init ( m_dir.getDir() ,
// . init it
// . g_hostdb.m_dir should end in /
if ( ! base->init ( g_hostdb.m_dir, // m_dir.getDir() ,
m_dbname ,
m_dedup ,
m_fixedDataSize ,
@ -527,15 +539,16 @@ bool Rdb::addColl ( char *coll ) {
m_biasDiskPageCache ) ) {
logf(LOG_INFO,"db: %s: Failed to initialize db for "
"collection \"%s\".", m_dbname,coll);
exit(-1);
//exit(-1);
return false;
}
// . set CollectionRec::m_numPos/NegKeysInTree[rdbId]
// . these counts are now stored in the CollectionRec and not
// in RdbTree since the # of collections can be huge!
CollectionRec *cr = g_collectiondb.m_recs[collnum];
m_tree.setNumKeys ( cr );
if ( m_useTree ) {
m_tree.setNumKeys ( cr );
}
//if ( (long)collnum >= m_numBases ) m_numBases = (long)collnum + 1;
// Success
@ -544,7 +557,7 @@ bool Rdb::addColl ( char *coll ) {
bool Rdb::resetColl ( collnum_t collnum , collnum_t newCollnum ) {
char *coll = g_collectiondb.m_recs[collnum]->m_coll;
//char *coll = g_collectiondb.m_recs[collnum]->m_coll;
// remove these collnums from tree
if(m_useTree) m_tree.delColl ( collnum );
@ -552,11 +565,48 @@ bool Rdb::resetColl ( collnum_t collnum , collnum_t newCollnum ) {
// . close all files, set m_numFiles to 0 in RdbBase
// . TODO: what about outstanding merge or dump operations?
RdbBase *base = getBase ( collnum );
base->reset( );
// . it seems like we can't really recycle this too easily
// because reset() is not resetting filenames or the directory name?
// just nuke it and rebuild using addColl2()...
RdbBase *oldBase = getBase ( collnum );
mdelete (oldBase, sizeof(RdbBase), "Rdb Coll");
delete (oldBase);
// update this as well
base->m_collnum = newCollnum;
//base->reset( );
// NULL it out...
CollectionRec *oldcr = g_collectiondb.getRec(collnum);
oldcr->m_bases[(unsigned char)m_rdbId] = NULL;
char *coll = oldcr->m_coll;
char *msg = "deleted";
// if just resetting recycle base
if ( collnum != newCollnum ) {
addColl2 ( newCollnum );
// make a new base now
//RdbBase *newBase = mnew
// new cr
//CollectionRec *newcr = g_collectiondb.getRec(newCollnum);
// update this as well
//base->m_collnum = newCollnum;
// and the array
//newcr->m_bases[(unsigned char)m_rdbId] = base;
msg = "moved";
}
log("rdb: %s base from collrec "
"rdb=%s rdbid=%li coll=%s collnum=%li newcollnum=%li",
msg,m_dbname,(long)m_rdbId,coll,(long)collnum,
(long)newCollnum);
// new dir. otherwise RdbDump will try to dump out the recs to
// the old dir and it will end up coring
//char tmp[1024];
//sprintf(tmp , "%scoll.%s.%li",g_hostdb.m_dir,coll,(long)newCollnum );
//m_dir.set ( tmp );
// move the files into trash
// nuke it on disk
@ -597,19 +647,6 @@ bool Rdb::delColl ( char *coll ) {
// move all files to trash and clear the tree/buckets
resetColl ( collnum , collnum );
mdelete (base, sizeof(RdbBase), "Rdb Coll");
delete (base);
//m_bases[collnum] = NULL;
CollectionRec *cr = g_collectiondb.getRec(collnum);
// NULL it out...
cr->m_bases[(unsigned char)m_rdbId] = NULL;
log("rdb: deleted base from collrec "
"rdb=%s rdbid=%li coll=%s collnum=%li base=0x%lx",
m_dbname,(long)m_rdbId,coll,(long)collnum,(long)base);
// remove these collnums from tree
//if(m_useTree) m_tree.delColl ( collnum );
//else m_buckets.delColl ( collnum );
@ -921,7 +958,8 @@ bool Rdb::saveMaps ( bool useThread ) {
// shut it down
RdbBase *base = getBase(i);
//if ( m_bases[i] ) m_bases[i]->closeMaps ( m_urgent );
if ( base ) base->closeMaps ( m_urgent );
//if ( base ) base->closeMaps ( m_urgent );
if ( base ) base->saveMaps ( useThread );
}
return true;
}
@ -1242,6 +1280,7 @@ bool Rdb::gotTokenForDump ( ) {
m_dumpCollnum = (collnum_t)-1;
// clear this for dumpCollLoop()
g_errno = 0;
m_dumpErrno = 0;
m_fn = -1000;
// this returns false if blocked, which means we're ok, so we ret true
if ( ! dumpCollLoop ( ) ) return true;
@ -1414,9 +1453,16 @@ bool Rdb::dumpCollLoop ( ) {
// error?
if ( g_errno ) {
log("rdb: error dumping = %s",mstrerror(g_errno));
log("rdb: error dumping = %s . coll deleted from under us?",
mstrerror(g_errno));
// shit, what to do here? this is causing our RdbMem
// to get corrupted!
// because if we end up continuing it calls doneDumping()
// and updates RdbMem! maybe set a permanent error then!
// and if that is there do not clear RdbMem!
m_dumpErrno = g_errno;
// for now core out
char *xx=NULL;*xx=0;
//char *xx=NULL;*xx=0;
}
// loop back up since we did not block
@ -1437,11 +1483,12 @@ void Rdb::doneDumping ( ) {
// msg
//log(LOG_INFO,"db: Done dumping %s to %s (#%li): %s.",
// m_dbname,m_files[n]->getFilename(),n,mstrerror(g_errno));
log(LOG_INFO,"db: Done dumping %s: %s.",m_dbname,mstrerror(g_errno));
log(LOG_INFO,"db: Done dumping %s: %s.",m_dbname,
mstrerror(m_dumpErrno));
// give the token back so someone else can dump or merge
//g_msg35.releaseToken();
// free mem in the primary buffer
if ( ! g_errno ) m_mem.freeDumpedMem();
if ( ! m_dumpErrno ) m_mem.freeDumpedMem();
// . tell RdbDump it is done
// . we have to set this here otherwise RdbMem's memory ring buffer
// will think the dumping is no longer going on and use the primary
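The new m_dumpErrno makes the first dump error sticky: g_errno gets cleared by later operations, but doneDumping() still has to know the dump failed so it does not free records out of RdbMem that never actually reached disk. The pattern in isolation (names mirror the diff; the i/o is faked):

#include <stdio.h>

static long g_errno     = 0; // mimics the global error code
static long m_dumpErrno = 0; // sticky: first error of this dump

static void dumpOneChunk ( bool fail ) {
	g_errno = fail ? 5 : 0;  // pretend an i/o error on failure
	if ( g_errno && ! m_dumpErrno ) m_dumpErrno = g_errno;
	g_errno = 0;             // later code may clear the global...
}

static void doneDumping ( ) {
	// ...but the sticky copy still gates the memory release
	if ( ! m_dumpErrno ) printf ( "freeing dumped mem\n" );
	else printf ( "dump failed (%li), keeping mem\n" , m_dumpErrno );
}

int main ( ) {
	m_dumpErrno = 0;         // reset per dump, as in gotTokenForDump()
	dumpOneChunk ( false );
	dumpOneChunk ( true );   // error strikes mid-dump
	doneDumping ( );
	return 0;
}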
@ -2839,6 +2886,12 @@ void Rdb::enableWrites () {
else m_buckets.enableWrites();
}
bool Rdb::isWritable ( ) {
if(m_useTree) return m_tree.m_isWritable;
return m_buckets.m_isWritable;
}
bool Rdb::needsSave() {
if(m_useTree) return m_tree.m_needsSave;
else return m_buckets.needsSave();
Rdb.h
View File
@ -10,7 +10,7 @@
#include "RdbMem.h"
#include "RdbCache.h"
#include "RdbDump.h"
#include "Dir.h"
//#include "Dir.h"
#include "RdbBuckets.h"
// . each Rdb instance has an ID
@ -86,6 +86,7 @@ class Rdb {
~Rdb ( );
bool addColl ( char *coll );
bool addColl2 ( collnum_t collnum );
bool delColl ( char *coll );
bool resetColl ( collnum_t collnum , collnum_t newCollnum ) ;
@ -164,7 +165,8 @@ class Rdb {
bool deleteRecord ( collnum_t collnum , char *key );
// get the directory name where this rdb stores it's files
char *getDir ( ) { return m_dir.getDirname(); };
//char *getDir ( ) { return m_dir.getDirname(); };
char *getDir ( ) { return g_hostdb.m_dir; };
char *getStripeDir ( ) { return g_conf.m_stripeDir; };
long getFixedDataSize ( ) { return m_fixedDataSize; };
@ -185,7 +187,7 @@ class Rdb {
void disableWrites ();
void enableWrites ();
bool isWritable ( ) ;
RdbBase *getBase ( collnum_t collnum ) ;
long getNumBases ( ) { return g_collectiondb.m_numRecs; };
@ -352,7 +354,7 @@ class Rdb {
bool m_dedup;
long m_fixedDataSize;
Dir m_dir;
//Dir m_dir;
char m_dbname [32];
long m_dbnameLen;
@ -394,6 +396,8 @@ class Rdb {
long m_numFilesToMerge ;
long m_mergeStartFileNum ;
long m_dumpErrno;
// a dummy data string for deleting records when m_fixedDataSize > 0
char *m_dummy;
long m_dummySize ; // size of that dummy data
View File
@ -127,8 +127,15 @@ bool RdbBase::init ( char *dir ,
// set all our contained classes
//m_dir.set ( dir );
// set all our contained classes
// . "tmp" is bogus
// . /home/mwells/github/coll.john-test1113.654coll.john-test1113.655
char tmp[1024];
sprintf ( tmp , "%scoll.%s.%li" , dir , coll , (long)collnum );
// debug
log("base: adding new base for dir=%s coll=%s collnum=%li db=%s",
dir,coll,(long)collnum,dbname);
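Each base's directory is now derived from the working dir plus the collection name and collnum, e.g. coll.main.0 under g_hostdb.m_dir. The naming in isolation (using snprintf to bound the write; the real code sprintf's into a tmp[1024]):

#include <stdio.h>

int main ( ) {
	char tmp[1024];
	const char *dir  = "/home/user/gigablast/"; // placeholder; ends in '/'
	const char *coll = "main";
	long collnum = 0;
	// yields "/home/user/gigablast/coll.main.0"
	snprintf ( tmp , sizeof(tmp) , "%scoll.%s.%li" , dir , coll , collnum );
	printf ( "%s\n" , tmp );
	return 0;
}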
// catdb is collection independent
// make a special subdir to store the map and data files in if
@ -261,7 +268,8 @@ bool RdbBase::init ( char *dir ,
// we can't merge more than MAX_RDB_FILES files at a time
if ( minToMergeArg > MAX_RDB_FILES ) minToMergeArg = MAX_RDB_FILES;
m_minToMergeArg = minToMergeArg;
// set our m_files array
// . set our m_files array
// . m_dir is bogus causing this to fail
if ( ! setFiles () ) return false;
//long dataMem;
// if we're in read only mode, don't bother with *ANY* trees
@ -491,9 +499,11 @@ bool RdbBase::removeRebuildFromFilename ( BigFile *f ) {
bool RdbBase::setFiles ( ) {
// set our directory class
if ( ! m_dir.open ( ) )
// we are getting this from a bogus m_dir
return log("db: Had error opening directory %s", getDir());
// note it
logf(LOG_INFO,"db: Loading files for %s.",m_dbname );
logf(LOG_INFO,"db: Loading files for %s coll=%s (%li).",
m_dbname,m_coll,(long)m_collnum );
// . set our m_files array
// . addFile() will return -1 and set g_errno on error
// . the lower the fileId the older the data
@ -600,6 +610,8 @@ bool RdbBase::setFiles ( ) {
return false;
}
m_dir.close();
if ( ! converting ) return true;
// now if we are converting old titledb names to new...
@ -723,7 +735,6 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
sprintf ( name , "%s%04li.map", m_dbname, id );
m->set ( getDir() , name , m_fixedDataSize , m_useHalfKeys , m_ks ,
m_pageSize );
if ( ! isNew ) logf(LOG_INFO,"db: Adding %s.", name );
if ( ! isNew && ! m->readMap ( f ) ) {
// if out of memory, do not try to regen for that
if ( g_errno == ENOMEM ) return -1;
@ -759,6 +770,8 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
g_statsdb.m_disabled = false;
if ( ! status ) return log("db: Save failed.");
}
if ( ! isNew ) logf(LOG_INFO,"db: Added %s for collnum=%li pages=%li",
name ,(long)m_collnum,m->getNumPages());
// open this big data file for reading only
if ( ! isNew ) {
if ( mergeNum < 0 )
@ -1603,7 +1616,8 @@ void RdbBase::gotTokenForMerge ( ) {
return;
}
// make a log note
log(LOG_INFO,"merge: Resuming killed merge for %s.",m_dbname);
log(LOG_INFO,"merge: Resuming killed merge for %s coll=%s.",
m_dbname,m_coll);
// compute the total size of merged file
mint = 0;
long mm = 0;
View File
@ -416,6 +416,9 @@ bool RdbCache::getRecord ( collnum_t collnum ,
if ( maxAge == 0 ) return false;
// bail if no cache
if ( m_numPtrsMax <= 0 ) return false;
// if init() called failed because of oom...
if ( ! m_ptrs )
return log("cache: getRecord: failed because oom");
// time it -- debug
long long t = 0LL ;
if ( g_conf.m_logTimingDb ) t = gettimeofdayInMillisecondsLocal();
View File
@ -32,6 +32,10 @@ RdbTree::RdbTree () {
m_useProtection = false;
m_pickRight = false;
m_gettingList = 0;
// before resetting... we have to set this so clear() won't breach buffers
m_rdbId = -1;
reset();
}
@ -125,10 +129,6 @@ bool RdbTree::set ( long fixedDataSize ,
// sanity
if ( rdbId < -1 ) { char *xx=NULL;*xx=0; }
if ( rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
// is it a valid one
m_isRealTree = true;
if ( m_rdbId <= RDB_NONE ) m_isRealTree = false;
if ( m_rdbId >= RDB_END ) m_isRealTree = false;
// if its doledb, set it
//if ( dbname && strcmp(dbname,"doledb") == 0 ) m_rdbId = RDB_DOLEDB;
// adjust m_maxMem to virtual infinity if it was -1
@ -151,7 +151,7 @@ bool RdbTree::set ( long fixedDataSize ,
// initiate protection
if ( m_useProtection ) protect();
// allocate the nodes
return growTree ( maxNumNodes );
return growTree ( maxNumNodes , 0 );
}
void RdbTree::reset ( ) {
@ -273,11 +273,12 @@ long RdbTree::clear ( ) {
// clear tree counts for all collections!
long nc = g_collectiondb.m_numRecs;
// BUT only if we are an Rdb::m_tree!!!
if ( ! m_isRealTree ) nc = 0;
if ( m_rdbId == -1 ) nc = 0;
// otherwise, we overwrite stuff in CollectionRec we shouldn't
for ( long i = 0 ; i < nc ; i++ ) {
CollectionRec *cr = g_collectiondb.getRec(i);
if ( ! cr ) continue;
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
}
@ -547,7 +548,8 @@ long RdbTree::addNode ( collnum_t collnum ,
// collections using the same Rdb::m_tree!
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numNegKeysInTree[(unsigned char)m_rdbId] =0;
g_collectiondb.m_recs[collnum]->
@ -629,7 +631,8 @@ long RdbTree::addNode ( collnum_t collnum ,
// collections using the same Rdb::m_tree!
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numNegKeysInTree[(unsigned char)m_rdbId]++;
}
@ -639,7 +642,8 @@ long RdbTree::addNode ( collnum_t collnum ,
//m_numPosKeysPerColl[collnum]++;
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numPosKeysInTree[(unsigned char)m_rdbId]++;
}
@ -834,14 +838,14 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
if ( KEYNEG(m_keys,i,m_ks) ) {
m_numNegativeKeys--;
//m_numNegKeysPerColl[m_collnums[i]]--;
if ( m_isRealTree )
if ( m_rdbId >= 0 )
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId]--;
}
else {
m_numPositiveKeys--;
//m_numPosKeysPerColl[m_collnums[i]]--;
if ( m_isRealTree )
if ( m_rdbId >= 0 )
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
}
@ -868,7 +872,8 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
m_numPositiveKeys = 0;
//m_numNegKeysPerColl[m_collnums[i]] = 0;
//m_numPosKeysPerColl[m_collnums[i]] = 0;
if ( m_isRealTree ) {
if ( m_rdbId >= 0 ) {
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
g_collectiondb.m_recs[m_collnums[i]]->
@ -937,16 +942,20 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
if ( KEYNEG(m_keys,i,m_ks) ) {
m_numNegativeKeys--;
//m_numNegKeysPerColl[m_collnums[i]]--;
if ( m_isRealTree )
if ( m_rdbId >= 0 ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId]--;
}
}
else {
m_numPositiveKeys--;
//m_numPosKeysPerColl[m_collnums[i]]--;
if ( m_isRealTree )
if ( m_rdbId >= 0 ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
}
}
// debug step -- check chain from iparent down making sure that
// all kids don't have -2 for their parent... seems to be a rare bug
@ -1310,7 +1319,7 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
// . grow tree to "n" nodes
// . this will now actually grow from a current size to a new one
bool RdbTree::growTree ( long nn ) {
bool RdbTree::growTree ( long nn , long niceness ) {
// if we're that size, bail
if ( m_numNodes == nn ) return true;
@ -1337,27 +1346,35 @@ bool RdbTree::growTree ( long nn ) {
long cs = sizeof(collnum_t);
cp =(collnum_t *)mrealloc (m_collnums, on*cs,nn*cs,m_allocName);
if ( ! cp ) goto error;
QUICKPOLL(niceness);
kp = (char *) mrealloc ( m_keys , on*k , nn*k , m_allocName );
if ( ! kp ) goto error;
QUICKPOLL(niceness);
lp = (long *) mrealloc ( m_left , on*4 , nn*4 , m_allocName );
if ( ! lp ) goto error;
QUICKPOLL(niceness);
rp = (long *) mrealloc ( m_right , on*4 , nn*4 , m_allocName );
if ( ! rp ) goto error;
QUICKPOLL(niceness);
pp = (long *) mrealloc ( m_parents , on*4 , nn*4 , m_allocName );
if ( ! pp ) goto error;
QUICKPOLL(niceness);
// deal with data, sizes and depth arrays on a basis of need
if ( m_fixedDataSize != 0 ) {
dp =(char **)mrealloc (m_data , on*d,nn*d,m_allocName);
if ( ! dp ) goto error;
QUICKPOLL(niceness);
}
if ( m_fixedDataSize == -1 ) {
sp =(long *)mrealloc (m_sizes , on*4,nn*4,m_allocName);
if ( ! sp ) goto error;
QUICKPOLL(niceness);
}
if ( m_doBalancing ) {
tp =(char *)mrealloc (m_depth , on ,nn ,m_allocName);
if ( ! tp ) goto error;
QUICKPOLL(niceness);
}
// re-assign
@ -1385,6 +1402,7 @@ bool RdbTree::growTree ( long nn ) {
// protect it from writes
if ( m_useProtection ) protect ( );
QUICKPOLL(niceness);
return true;
error:
@ -1399,41 +1417,49 @@ bool RdbTree::growTree ( long nn ) {
ss = (collnum_t *)mrealloc ( cp , nn*cs , on*cs , m_allocName);
if ( ! ss ) { char *xx = NULL; *xx = 0; }
m_collnums = ss;
QUICKPOLL(niceness);
}
if ( kp ) {
kk = (char *)mrealloc ( kp, nn*k, on*k, m_allocName );
if ( ! kk ) { char *xx = NULL; *xx = 0; }
m_keys = kk;
QUICKPOLL(niceness);
}
if ( lp ) {
x = (long *)mrealloc ( lp , nn*4 , on*4 , m_allocName );
if ( ! x ) { char *xx = NULL; *xx = 0; }
m_left = x;
QUICKPOLL(niceness);
}
if ( rp ) {
x = (long *)mrealloc ( rp , nn*4 , on*4 , m_allocName );
if ( ! x ) { char *xx = NULL; *xx = 0; }
m_right = x;
QUICKPOLL(niceness);
}
if ( pp ) {
x = (long *)mrealloc ( pp , nn*4 , on*4 , m_allocName );
if ( ! x ) { char *xx = NULL; *xx = 0; }
m_parents = x;
QUICKPOLL(niceness);
}
if ( dp && m_fixedDataSize != 0 ) {
p = (char **)mrealloc ( dp , nn*d , on*d , m_allocName );
if ( ! p ) { char *xx = NULL; *xx = 0; }
m_data = p;
QUICKPOLL(niceness);
}
if ( sp && m_fixedDataSize == -1 ) {
x = (long *)mrealloc ( sp , nn*4 , on*4 , m_allocName );
if ( ! x ) { char *xx = NULL; *xx = 0; }
m_sizes = x;
QUICKPOLL(niceness);
}
if ( tp && m_doBalancing ) {
s = (char *)mrealloc ( tp , nn , on , m_allocName );
if ( ! s ) { char *xx = NULL; *xx = 0; }
m_depth = s;
QUICKPOLL(niceness);
}
return log("db: Failed to grow tree for %s from %li to %li bytes: %s.",
@ -2612,7 +2638,7 @@ bool RdbTree::fastLoad ( BigFile *f , RdbMem *stack ) {
if ( m_numNodes < minUnusedNode ) {
log(LOG_INIT,
"db: Growing tree to make room for %s",f->getFilename());
if ( ! growTree ( minUnusedNode ) ) {
if ( ! growTree ( minUnusedNode , 0 ) ) {
f->close();
m_isLoading = false;
return log("db: Failed to grow tree: %s.",
@ -3050,14 +3076,14 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
}
long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
if ( ! m_isRealTree ) { char *xx=NULL;*xx=0; }
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( ! cr ) return 0;
return cr->m_numNegKeysInTree[(unsigned char)m_rdbId];
}
long RdbTree::getNumPositiveKeys ( collnum_t collnum ) {
if ( ! m_isRealTree ) { char *xx=NULL;*xx=0; }
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( ! cr ) return 0;
return cr->m_numPosKeysInTree[(unsigned char)m_rdbId];
@ -3067,6 +3093,8 @@ void RdbTree::setNumKeys ( CollectionRec *cr ) {
if ( ! cr ) return;
if ( ((unsigned char)m_rdbId) >= RDB_END ) { char *xx=NULL;*xx=0; }
collnum_t collnum = cr->m_collnum;
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
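The growTree() changes above thread a niceness through and drop a QUICKPOLL() after every mrealloc(): growing a large tree means several big copies back to back, and the poll gives the event loop a chance to breathe between them. The shape of the pattern (QUICKPOLL is mocked as a function here; in Gigablast it is a macro that may service pending work for nice callers; error recovery is elided):

#include <stdio.h>
#include <stdlib.h>

// mock of the QUICKPOLL(niceness) macro: a yield point that only
// fires for nice (non-critical) callers
static void QUICKPOLL ( long niceness ) {
	if ( niceness > 0 ) printf ( "yield point\n" );
}

int main ( ) {
	long on = 1000 , nn = 2000 , niceness = 1;
	long *left  = (long *)malloc ( on * 4 );
	long *right = (long *)malloc ( on * 4 );
	if ( ! left || ! right ) return 1;
	// grow one array, breathe, grow the next -- never two large
	// copies back to back without a poll in between
	left  = (long *)realloc ( left  , nn * 4 );
	QUICKPOLL ( niceness );
	right = (long *)realloc ( right , nn * 4 );
	QUICKPOLL ( niceness );
	free ( left );
	free ( right );
	return 0;
}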
View File
@ -360,7 +360,7 @@ class RdbTree {
// need to pass this file to the fastSave() thread
//BigFile *m_saveFile;
char m_rdbId;
char m_isRealTree;
//char m_isRealTree;
char m_dir[128];
char m_dbname[32];
char m_memTag[16];
@ -401,7 +401,7 @@ class RdbTree {
// . returns true if tree doesn't need to grow/shrink
// . re-allocs the m_keys,m_data,m_sizes,m_leftNodes,m_rightNodes
// . used for growing AND shrinking the table
bool growTree ( long newNumNodes );
bool growTree ( long newNumNodes , long niceness );
// are we responsible for freeing nodes' data
bool m_ownData;
View File
@ -22,11 +22,12 @@
// 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
// };
SafeBuf::SafeBuf(long initSize) {
SafeBuf::SafeBuf(long initSize, char *label ) {
if(initSize <= 0) initSize = 1;
m_capacity = initSize;
m_length = 0;
m_buf = (char*)mrealloc(NULL, 0, m_capacity, "SafeBuf");
m_label = label;
m_buf = (char*)mrealloc(NULL, 0, m_capacity, m_label );
if(!m_buf) m_capacity = 0;
m_usingStack = false;
m_encoding = csUTF8;
@ -39,6 +40,11 @@ SafeBuf::SafeBuf() {
m_buf = NULL;
m_usingStack = false;
m_encoding = csUTF8;
m_label = NULL;
}
void SafeBuf::setLabel ( char *label ) {
m_label = label;
}
SafeBuf::SafeBuf(char* stackBuf, long cap) {
@ -47,6 +53,7 @@ SafeBuf::SafeBuf(char* stackBuf, long cap) {
m_buf = stackBuf;
m_length = 0;
m_encoding = csUTF8;
m_label = NULL;
}
SafeBuf::SafeBuf(char *heapBuf, long bufMax, long bytesInUse, bool ownData) {
@ -292,8 +299,14 @@ bool SafeBuf::advance ( long i ) {
return true;
}
bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
if ( ! label ) label = "SafeBuf";
bool SafeBuf::reserve(long i , char *label, bool clearIt ) {
// if we don't already have a label and they provided one, use it
if ( ! m_label ) {
if ( label ) m_label = label;
else m_label = "SafeBuf";
}
if(m_length + i > m_capacity) {
char *tmpBuf = m_buf;
long tmpCap = m_capacity;
@ -301,7 +314,7 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
m_buf = NULL;
m_capacity += i;
//if(m_capacity < 8) m_capacity = 8;
m_buf = (char*)mrealloc(m_buf, 0, m_capacity, label);
m_buf = (char*)mrealloc(m_buf, 0, m_capacity,m_label);
if(!m_buf) {
m_buf = tmpBuf;
m_capacity = tmpCap;
@ -320,7 +333,7 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
}
m_capacity += i;
//if(m_capacity < 8) m_capacity = 8;
m_buf = (char*)mrealloc(m_buf, tmpCap, m_capacity,label);
m_buf = (char*)mrealloc(m_buf, tmpCap, m_capacity,m_label);
if(!m_buf) {
m_buf = tmpBuf;
m_capacity = tmpCap;
@ -344,11 +357,11 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
//reserve this many bytes, if we need to alloc, we double the
//buffer size.
bool SafeBuf::reserve2x(long i) {
bool SafeBuf::reserve2x(long i, char *label) {
//watch out for overflow!
if((m_capacity << 1) + i < 0) return false;
if(i + m_length >= m_capacity)
return reserve(m_capacity + i);
return reserve(m_capacity + i,label);
else return true;
}
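The label threaded through the constructor, setLabel() and reserve() tags each SafeBuf's allocations in the memory tables, so a leak shows up under a name like "scinfobuf" instead of a generic "SafeBuf". The idea in miniature (LabeledBuf is a toy, not the real class):

#include <stdio.h>
#include <stdlib.h>

// toy labeled buffer in the spirit of mrealloc(..., label): every
// allocation is attributed to a name in the memory tables
struct LabeledBuf {
	char       *m_buf;
	long        m_capacity;
	const char *m_label;
};

static bool reserveBuf ( LabeledBuf *b , long need , const char *label ) {
	// keep the first label we are given, like SafeBuf::reserve()
	if ( ! b->m_label ) b->m_label = label ? label : "SafeBuf";
	char *nb = (char *)realloc ( b->m_buf , b->m_capacity + need );
	if ( ! nb ) return false;
	b->m_buf       = nb;
	b->m_capacity += need;
	printf ( "alloc %li bytes for \"%s\"\n" , need , b->m_label );
	return true;
}

int main ( ) {
	LabeledBuf b = { NULL , 0 , NULL };
	reserveBuf ( &b , 1024 , "scinfobuf" );
	reserveBuf ( &b , 2048 , NULL ); // label sticks: still "scinfobuf"
	free ( b.m_buf );
	return 0;
}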
@ -369,8 +382,8 @@ long SafeBuf::dumpToFile(char *filename ) {
filename);
return -1;
}
logf(LOG_DEBUG, "test: safebuf %li bytes written to %s",m_length,
filename);
//logf(LOG_DEBUG, "test: safebuf %li bytes written to %s",m_length,
// filename);
retry23:
long bytes = write(fd, (char*)m_buf, m_length) ;
if ( bytes != m_length ) {
@ -972,7 +985,8 @@ bool SafeBuf::htmlEncode(char *s, long len, bool encodePoundSign ,
// . sanity check
if ( m_encoding == csUTF16 ) { char *xx = NULL; *xx = 0; }
// alloc some space if we need to. add a byte for NULL termination.
if(m_length+len+1>=m_capacity && !reserve(m_capacity+len))return false;
if(m_length+len+1>=m_capacity && !reserve(m_capacity+len+1))
return false;
// tmp vars
char *t = m_buf + m_length;
char *tend = m_buf + m_capacity;
@ -2517,7 +2531,11 @@ bool SafeBuf::decodeJSON ( long niceness ) {
// . this is used by xmldoc.cpp to PARTIALLY decode a json buf so we do not
// index letters in escapes like \n \r \f \t \uxxxx \\ \/
// . SO we do keep \"
bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness) {
// . so when indexing a doc we set decodeAll to FALSE, but if you want to
// decode quotation marks as well then set decodeAll to TRUE!
bool SafeBuf::safeDecodeJSONToUtf8 ( char *json,
long jsonLen,
long niceness ) {
// how much space to reserve for the copy?
long need = jsonLen;
@ -2579,6 +2597,15 @@ bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness) {
src += 2;
continue;
}
// we do not decode quotation marks when indexing
// the doc so we can preserve json names/value pair
// information for indexing purposes. however,
// Title.cpp DOES want to decode quotations.
if ( src[1] == '\"' ) { // && decodeAll ) {
*dst++ = '\"';
src += 2;
continue;
}
// utf8? if not, just skip the slash
if ( src[1] != 'u' ) {
// no, keep the slash so if we have /"
@ -3155,3 +3182,49 @@ bool SafeBuf::htmlDecode ( char *src,
// good to go
return true;
}
void SafeBuf::replaceChar ( char src , char dst ) {
char *px = m_buf;
char *pxEnd = m_buf + m_length;
for ( ; px < pxEnd ; px++ ) if ( *px == src ) *px = dst;
}
// encode a double quote char to two double quote chars
bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
if ( ! s ) return true;
// assume all chars are double quotes and will have to be encoded
long need = len * 2 + 1;
if ( ! reserve ( need ) ) return false;
// tmp vars
char *dst = m_buf + m_length;
//char *dstEnd = m_buf + m_capacity;
// scan through all
char *send = s + len;
for ( ; s < send ; s++ ) {
// breathe
QUICKPOLL ( niceness );
// convert it?
if ( *s == '\"' ) {
*dst++ = '\"';
*dst++ = '\"';
continue;
}
//if ( *s == '\\' ) {
// *dst++ = '\\';
// *dst++ = '\\';
// continue;
//}
*dst++ = *s;
}
m_length += dst - (m_buf + m_length);
nullTerm();
return true;
}
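csvEncode() performs the standard CSV escape: every embedded double quote becomes two double quotes, and the caller then wraps the whole field in quotes when emitting the .csv row (see the new search.csv path alias in Pages.cpp above). What it produces, in sketch form:

#include <stdio.h>

// same escape csvEncode() performs: '"' -> '""'
static void csvEscape ( char *dst , const char *s ) {
	for ( ; *s ; s++ ) {
		if ( *s == '\"' ) *dst++ = '\"';
		*dst++ = *s;
	}
	*dst = '\0';
}

int main ( ) {
	char out[64];
	csvEscape ( out , "say \"hi\" now" );
	// the caller wraps the escaped field in quotes for the .csv row
	printf ( "\"%s\"\n" , out ); // prints: "say ""hi"" now"
	return 0;
}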
View File
@ -8,12 +8,14 @@
struct SafeBuf {
//*TRUCTORS
SafeBuf();
SafeBuf(long initSize);
SafeBuf(long initSize, char *label = NULL);
//be careful with passing in a stackBuf! it could go out
//of scope independently of the safebuf.
SafeBuf(char* stackBuf, long cap);
SafeBuf(char *heapBuf, long bufMax, long bytesInUse, bool ownData);
~SafeBuf();
void setLabel ( char *label );
// CAUTION: BE CAREFUL WHEN USING THE FOLLOWING TWO FUNCTIONS!!
// setBuf() allows you reset the contents of the SafeBuf to either
@ -59,6 +61,7 @@ struct SafeBuf {
bool convertJSONtoXML ( long niceness , long startConvertPos );
bool safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness);
// bool decodeAll = false );
bool decodeJSONToUtf8 ( long niceness );
bool decodeJSON ( long niceness );
@ -96,6 +99,9 @@ struct SafeBuf {
bool safeStrcpy ( char *s ) ;
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;
bool csvEncode ( char *s , long len , long niceness = 0 );
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"
@ -106,10 +112,11 @@ struct SafeBuf {
void reset() { m_length = 0; }
void purge(); // Clear all data and free all allocated memory
bool advance ( long i ) ;
// . if clearIt is true we init the new buffer space to zeroes
// . used by Collectiondb.cpp
bool reserve(long i, char *label=NULL , bool clearIt = false );
bool reserve2x(long i);
bool reserve2x(long i, char *label = NULL );
char *makeSpace ( long size ) {
if ( ! reserve ( size ) ) return NULL;
@ -143,6 +150,7 @@ struct SafeBuf {
char *t , long tlen ,
long niceness ,
long startOff = 0 );
void replaceChar ( char src , char dst );
bool copyToken(char* s);
//output encoding
bool setEncoding(short cs);
@ -326,6 +334,7 @@ struct SafeBuf {
long m_capacity;
long m_length;
char *m_buf;
char *m_label;
bool m_usingStack;
short m_encoding; // output charset

View File

@ -342,7 +342,7 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// we need to get some cgi values in order to correct the defaults
// based on if we're doing an xml feed, have a site: query, etc.
long xml = r->getLong ( "xml" , 0 ); // was "raw"
//long xml = r->getLong ( "xml" , 0 ); // was "raw"
long siteLen = 0; r->getString ("site",&siteLen);
long sitesLen = 0;
char *sites = r->getString ("sites",&sitesLen,NULL);
@ -353,8 +353,11 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
! m_whiteListBuf.nullTerm() ) )
return log("query: unable to strcpy whitelist");
char format = getFormatFromRequest ( r );
// now override automatic defaults for special cases
if ( xml > 0 ) {
if ( format != FORMAT_HTML ) {
m_familyFilter = 0;
// this is causing me a headache when on when i dont know it
m_restrictIndexdbForQuery = false;
@ -365,6 +368,8 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
m_spellCheck = 0;
m_refs_numToGenerate = 0;
m_refs_docsToScan = 0;
// default scoring info to off
m_getDocIdScoringInfo = false;
}
else if ( m_siteLen > 0 ) {
m_restrictIndexdbForQuery = false;
@ -654,18 +659,19 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// use "&dg=1" to debug gigabits
m_debugGigabits = r->getLong("dg",0);
// override
m_format = format;
// . omit scoring info from the xml feed for now
// . we have to roll this out to gk144 net i think
if ( xml > 0 )
m_getDocIdScoringInfo = 0;
//if ( m_format != FORMAT_HTML )
// m_getDocIdScoringInfo = 0;
// turn off by default!
if ( ! r->getLong("gigabits",0) ) {
m_numTopicGroups = 0;
}
//////////////////////////////////////
//
// transform input into classes
@ -709,7 +715,8 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// . returns false and sets g_errno on error
// . sets m_qbuf1 and m_qbuf2
if ( ! setQueryBuffers ( r ) ) return false;
if ( ! setQueryBuffers (r) )
return log("query: setQueryBuffers: %s",mstrerror(g_errno));
/* --- Virtual host language detection --- */
if(r->getHost()) {
@ -1089,10 +1096,11 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
// if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
// if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
//}
// append plus terms
if ( m_plusLen > 0 ) {
char *s = m_plus, *send = m_plus + m_plusLen;
char *s = m_plus;
char *send = m_plus + m_plusLen;
//if ( p > pstart && p < pend ) *p++ = ' ';
//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
@ -1108,7 +1116,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
} else {
while (!isspace(*s2) && s2 < send) s2++;
}
if (s < send) break;
if (s2 < send) break;
//if (p < pend) *p++ = '+';
//if (p2 < pend2) *p2++ = '+';
m_sbuf1.pushChar('+');
@ -1142,7 +1150,8 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
}
// append minus terms
if ( m_minusLen > 0 ) {
char *s = m_minus, *send = m_minus + m_minusLen;
char *s = m_minus;
char *send = m_minus + m_minusLen;
//if ( p > pstart && p < pend ) *p++ = ' ';
//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
@ -1158,7 +1167,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
} else {
while (!isspace(*s2) && s2 < send) s2++;
}
if (s < send) break;
if (s2 < send) break;
//if (p < pend) *p++ = '-';
//if (p2 < pend2) *p2++ = '-';
m_sbuf1.pushChar('-');
@ -1202,9 +1211,9 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
}
// null terms
m_sbuf1.pushChar('\0');
m_sbuf2.pushChar('\0');
m_sbuf3.pushChar('\0');
if ( ! m_sbuf1.pushChar('\0') ) return false;
if ( ! m_sbuf2.pushChar('\0') ) return false;
if ( ! m_sbuf3.pushChar('\0') ) return false;
// the natural query
m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset;
@ -1239,6 +1248,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
long dcatId = -1;
// get the final query
char *q =m_sbuf1.getBufStart();
if ( q ) sscanf(q,"gbpcatid:%li",&pcatId);
if ( q ) sscanf(q,"gbcatid:%li",&dcatId);
// pick the one that is valid
@ -1301,3 +1311,33 @@ uint8_t SearchInput::detectQueryLanguage(void) {
return(lang);
}
char getFormatFromRequest ( HttpRequest *r ) {
char format = FORMAT_HTML;
// what format should search results be in? default is html
char *formatStr = r->getString("format", NULL );
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
// support old api &xml=1 to mean &format=xml
if ( r->getLong("xml",0) ) {
format = FORMAT_XML;
}
// also support &json=1
if ( r->getLong("json",0) ) {
format = FORMAT_JSON;
}
if ( r->getLong("csv",0) ) {
format = FORMAT_CSV;
}
return format;
}
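// a usage sketch (hypothetical request): for
// GET /search?q=test&format=html&json=1 this returns FORMAT_JSON,
// because the legacy &json=1 flag is checked after, and so overrides,
// the &format= string:
//
//   char format = getFormatFromRequest ( r );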

View File

@ -22,6 +22,8 @@
#define MAX_TOPIC_GROUPS 10
char getFormatFromRequest ( class HttpRequest *r ) ;
// . parameters used to generate a set of related topics (gigabits)
// . you can have Msg24 generate multiple sets of related topics in one call
class TopicGroup {
@ -43,6 +45,11 @@ class TopicGroup {
long m_topicMaxPunctLen;
};
#define FORMAT_HTML 0
#define FORMAT_XML 1
#define FORMAT_JSON 2
#define FORMAT_CSV 3
class SearchInput {
public:
@ -211,7 +218,13 @@ class SearchInput {
// tier sizes can change with different "raw" values, therefore,
// so can search results
long m_xml; // msg40
//long m_xml; // msg40
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON
//long m_formatStrLen;
//char *m_formatStr;
// can be 0 = FORMAT_HTML, 1 = FORMAT_XML, 2 = FORMAT_JSON, 3 = FORMAT_CSV
char m_format;
// this should be part of the key because it will affect the results!
char m_queryExpansion;

View File

@ -252,6 +252,8 @@ bool Sections::set ( Words *w ,
// breathe
QUICKPOLL(m_niceness);
m_sectionPtrBuf.setLabel("psectbuf");
// separate buf now for section ptr for each word
if ( ! m_sectionPtrBuf.reserve ( nw *4 ) ) return true;
m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();
@ -260,6 +262,8 @@ bool Sections::set ( Words *w ,
// allocate m_sectionBuf
m_sections = NULL;
m_sectionBuf.setLabel ( "sectbuf" );
if ( ! m_sectionBuf.reserve ( need ) )
return true;
@ -15160,6 +15164,9 @@ bool Sections::print2 ( SafeBuf *sbuf ,
// save ptrs
m_sbuf = sbuf;
m_sbuf->setLabel ("sectprnt");
//m_pt = pt;
//m_et = et;
//m_at = at;

View File

@ -1000,6 +1000,8 @@ bool Speller::loadUnifiedDict() {
bool needRebuild = false;
m_unifiedBuf.setLabel("unibuf");
// this MUST be there
if ( m_unifiedBuf.fillFromFile(g_hostdb.m_dir,
"unifiedDict-buf.txt" ) == 0 )

File diff suppressed because it is too large Load Diff

View File

@ -45,6 +45,9 @@
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
bool testPatterns ( ) ;
bool doesStringContainPattern ( char *content , char *pattern ) ;
bool getSpiderStatusMsg ( class CollectionRec *cx ,
class SafeBuf *msg ,
long *status ) ;
@ -603,6 +606,8 @@ class SpiderRequest {
long m_hasContactInfoValid :1;
long m_isContactyValid :1;
long m_hasAddressValid :1;
//long m_matchesUrlCrawlPattern :1;
//long m_matchesUrlProcessPattern:1;
long m_hasTODValid :1;
long m_hasSiteVenueValid :1;
long m_siteNumInlinksValid :1;
@ -832,8 +837,8 @@ class SpiderReply {
// was the request an injection request
long m_fromInjectionRequest :1;
// did we TRY to send it to the diffbot backend filter? might be err?
long m_sentToDiffbot:1;
long m_reserved2 :1;
long m_sentToDiffbot :1;
long m_hadDiffbotError :1;
long m_reserved3 :1;
long m_reserved4 :1;
@ -1111,6 +1116,7 @@ class SpiderColl {
key_t m_waitingTreeKey;
bool m_waitingTreeKeyValid;
long m_scanningIp;
bool m_gotNewRequestsForScanningIp;
// start key for reading doledb
key_t m_msg5StartKey;
@ -1125,7 +1131,7 @@ class SpiderColl {
// for reading lists from spiderdb
Msg5 m_msg5;
bool m_gettingList;
bool m_gettingList1;
// how many outstanding spiders a priority has
long m_outstandingSpiders[MAX_SPIDER_PRIORITIES];
@ -1276,7 +1282,7 @@ class SpiderLoop {
bool printLockTable ( );
long getNumSpidersOutPerIp ( long firstIp ) ;
long getNumSpidersOutPerIp ( long firstIp , collnum_t collnum ) ;
// free all XmlDocs and m_list
void reset();
@ -1301,7 +1307,7 @@ class SpiderLoop {
// . returns true and sets g_errno on error
bool spiderUrl9 ( class SpiderRequest *sreq ,
key_t *doledbKey ,
char *coll ,
collnum_t collnum,//char *coll ,
long sameIpWaitTime , // in milliseconds
long maxSpidersOutPerIp );
@ -1312,7 +1318,8 @@ class SpiderLoop {
// state memory for calling SpiderUrl2() (maybe also getLocks()!)
SpiderRequest *m_sreq;
char *m_coll;
//char *m_coll;
collnum_t m_collnum;
char *m_content;
long m_contentLen;
char m_contentHasMime;
@ -1354,7 +1361,7 @@ class SpiderLoop {
class SpiderColl *m_sc;
// used to avoid calling getRec() twice!
bool m_gettingList;
//bool m_gettingList0;
long m_outstanding1;
bool m_gettingDoledbList;

View File

@ -499,7 +499,7 @@ void drawLine2 ( SafeBuf &sb ,
sb.safePrintf("<div style=\"position:absolute;"
"left:%li;"
"top:%li;"
"background-color:#%lx;"
"background-color:#%06lx;"
"z-index:-5;"
"min-height:%lipx;"
"min-width:%lipx;\"></div>\n"

View File

@ -25,7 +25,7 @@ class StatPoint {
#define MAX_POINTS 6000
#define MAX_WIDTH 6
#define DY 600 // pixels vertical
#define DY 1000 // pixels vertical
#define DX 1000 // pixels across
#define DT (20*1000) // time window, 20 seconds
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line

View File

@ -526,16 +526,16 @@ bool Statsdb::makeGIF ( long t1Arg ,
#define MAX_POINTS 6000
#define MAX_WIDTH 6
#define DY 600 // pixels vertical
#define DX 1000 // pixels across
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
#define DY2 600 // pixels vertical
#define DX2 1000 // pixels across
#define MAX_LINES2 (DY2 / (MAX_WIDTH+1)) // leave free pixel above each line
long Statsdb::getImgHeight() {
return (long)DY + m_by * 2;
return (long)DY2 + m_by * 2;
}
long Statsdb::getImgWidth() {
return (long)DX + m_bx * 2;
return (long)DX2 + m_bx * 2;
}
// these are used for storing the "events"
@ -599,7 +599,7 @@ bool Statsdb::gifLoop ( ) {
// gif size
//char tmp[64];
// dimensions of the gif
//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
//sprintf ( tmp , "%lix%li", (long)DX2+m_bx*2 , (long)DY2+m_by*2 );
//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
// create one
//GIFPlotter plotter ( NULL , m_fd , NULL );
@ -607,7 +607,7 @@ bool Statsdb::gifLoop ( ) {
//plotter.openpl ( );
// define the space with boundaries 100 unit wide boundaries
//plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );
//plotter.space ( 0 , 0 , DX2 + m_bx * 2 , DY2 + m_by * 2 );
// line thickness in user coordinates (pixels for us)
//plotter.linewidth ( 1 );
@ -628,7 +628,7 @@ bool Statsdb::gifLoop ( ) {
"z-index:-10;"
// the tick marks we print below are based on it
// being a window of the last 20 seconds... and using
// DX pixels
// DX2 pixels
"min-width:%lipx;"
"min-height:%lipx;"
//"width:100%%;"
@ -637,15 +637,15 @@ bool Statsdb::gifLoop ( ) {
"margin-bottom:10px;"
"margin-right:10px;"
"margin-left:10px;\">"
,(long)DX + 2 *m_bx
,(long)DY + 2*m_by);
,(long)DX2 + 2 *m_bx
,(long)DY2 + 2*m_by);
// draw the x-axis
//plotter.line ( m_bx , m_by , DX + m_bx , m_by );
//plotter.line ( m_bx , m_by , DX2 + m_bx , m_by );
// 10 x-axis tick marks
for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
for ( int x = DX2/20 ; x <= DX2 ; x += DX2/20 ) {
// tick mark
//plotter.line ( x , -20 , x , 20 );
m_gw.safePrintf("<div style=\"position:absolute;"
@ -657,7 +657,7 @@ bool Statsdb::gifLoop ( ) {
"min-width:3px;\"></div>\n"
, m_bx + (long)x-1
);
long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
long xv = (long)(dt * (long long)x/(long long)DX2)-(long)dt;
// LABEL
m_gw.safePrintf("<div style=\"position:absolute;"
"left:%li;"
@ -780,8 +780,8 @@ bool Statsdb::gifLoop ( ) {
// ensure at least 3 units wide for visibility
if ( x2 < x1 + 10 ) x2 = x1 + 10;
// . flip the y so we don't have to scroll the browser down
// . DY does not include the axis and tick marks
//long fy1 = DY - y1 + m_by ;
// . DY2 does not include the axis and tick marks
//long fy1 = DY2 - y1 + m_by ;
// plot it
//plotter.line ( x1 , fy1 , x2 , fy1 );
drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );
@ -918,7 +918,7 @@ char *Statsdb::plotGraph ( char *pstart ,
// . the minimum difference between ymax and ymin is minDiff.
// . this prevents us from zooming in too close!
float minDiff = (float)DY * label->m_minRes ;
float minDiff = (float)DY2 * label->m_minRes ;
// we are already scaled!
float ourDiff = (ymax - ymin) ;
@ -976,14 +976,14 @@ char *Statsdb::plotGraph ( char *pstart ,
float y1 = lasty;
// normalize y into pixel space
y2 = ((float)DY * (y2 - ymin)) / (ymax-ymin);
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
// set lasts for next iteration of this loop
lastx = x2;
lasty = y2;
// . flip the y so we don't have to scroll the browser down
// . DY does not include the axis and tick marks
// . DY2 does not include the axis and tick marks
// . do not flip y any more for statsdb graphs
long fy1 = (long)(y1+.5);// + m_by ;
long fy2 = (long)(y2+.5);// + m_by ;
@ -1011,7 +1011,7 @@ char *Statsdb::plotGraph ( char *pstart ,
// plot it
// BUT only iff not more than 5 seconds difference
//float secondsPerPixel = (m_t2-m_t1)/(float)DX;
//float secondsPerPixel = (m_t2-m_t1)/(float)DX2;
// avoid this for now. mdw oct 14 2013.
//float dt = (x2 - x1) * secondsPerPixel;
@ -1068,7 +1068,7 @@ void Statsdb::drawHR ( float z ,
long color ) {
// convert into yspace
float z2 = ((float)DY * (float)(z - ymin)) /(float)(ymax-ymin);
float z2 = ((float)DY2 * (float)(z - ymin)) /(float)(ymax-ymin);
// avoid collisions with other graphs
z2 += zoff;
// border
@ -1076,7 +1076,7 @@ void Statsdb::drawHR ( float z ,
// round off error
z2 += 0.5;
// for adjustment
float ptsPerPixel = (ymax-ymin)/ (float)DY;
float ptsPerPixel = (ymax-ymin)/ (float)DY2;
// make an adjustment to the label then! -- Commented out because it's currently not used.
float zadj = zoff * ptsPerPixel;
@ -1088,9 +1088,9 @@ void Statsdb::drawHR ( float z ,
// ((color >> 0) & 0xff) << 8 );
// horizontal line
//plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 );
//plotter->line ( m_bx, (long)z2 , DX2 + m_bx, (long)z2 );
long width = 1;
drawLine3 ( m_gw, 0, DX , (long)z2,color, width);
drawLine3 ( m_gw, 0, DX2 , (long)z2,color, width);
// make label
@ -1364,7 +1364,7 @@ bool Statsdb::addPoint ( long x ,
class StatState *ss ) {
// convert x into pixel position
float xf = (float)DX * (float)(x - m_t1) / (float)(m_t2 - m_t1);
float xf = (float)DX2 * (float)(x - m_t1) / (float)(m_t2 - m_t1);
// round it to nearest pixel
long x2 = (long)(xf + .5) ;//+ m_bx;
// make this our y pos
@ -1446,12 +1446,12 @@ bool Statsdb::addEventPoint ( long t1 ,
long thickness ) {
// convert t1 into pixel position
float af = (float)DX * (float)(t1 - m_t1) / (float)(m_t2 - m_t1);
float af = (float)DX2 * (float)(t1 - m_t1) / (float)(m_t2 - m_t1);
// round it to nearest pixel
long a = (long)(af + .5) ;//+ m_bx;
// convert t2 into pixel position
//float bf = (float)DX * (float)(t2 - m_t1) / (float)(m_t2 - m_t1);
//float bf = (float)DX2 * (float)(t2 - m_t1) / (float)(m_t2 - m_t1);
// round it to nearest pixel
//long b = (long)(bf + .5) + m_bx;
//if ( a > b ) { char *xx=NULL;*xx=0; }
@ -1468,7 +1468,7 @@ bool Statsdb::addEventPoint ( long t1 ,
}
// go down each line of points
for ( long i = 0 ; i < MAX_LINES ; i++ ) {
for ( long i = 0 ; i < MAX_LINES2 ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// . is there room for us in this line?

View File

@ -429,6 +429,8 @@ char *getSourceString ( char source ) {
if ( source == SOURCE_BIGRAM ) return "bigram";
if ( source == SOURCE_TRIGRAM ) return "trigram";
if ( source == SOURCE_WIKTIONARY_EN ) return "wiktionary-en";
// the thing we are hashing is a "number"
if ( source == SOURCE_NUMBER ) return "number";
return "unknown";
}

View File

@ -15,6 +15,7 @@
#define SOURCE_GENERATED 4
#define SOURCE_BIGRAM 5
#define SOURCE_TRIGRAM 6
#define SOURCE_NUMBER 7
// per word!
#define MAX_SYNS 64

View File

@ -2513,6 +2513,13 @@ bool Msg8a::getTagRec ( Url *url ,
TagRec *tagRec ,
bool doInheritance ,
char rdbId ) {
CollectionRec *cr = g_collectiondb.getRec ( coll );
if ( ! cr ) {
g_errno = ENOCOLLREC;
return true;
}
// reset tag rec
tagRec->reset();//m_numListPtrs = 0;

View File

@ -780,7 +780,10 @@ TcpSocket *TcpServer::getNewSocket ( ) {
log("tcp: using statically linked libc that only supports "
"an fd of up to %li, but got an fd = %li. fd_set is "
"only geared for 1024 bits of file descriptors for "
"doing poll() in Loop.cpp",
"doing poll() in Loop.cpp. Ensure 'ulimit -a' limits "
"open files to 1024. "
"Check open fds using ls /proc/<gb-pid>/fds/ and ensure "
"they are all BELOW 1024.",
(long)MAX_NUM_FDS,(long)sd);
char *xx=NULL;*xx=0;
}
@ -1092,7 +1095,7 @@ bool TcpServer::closeLeastUsed ( long maxIdleTime ) {
// . g_errno will be set by Loop if there was a kinda socket reset error
void readSocketWrapper ( int sd , void *state ) {
// debug msg
// log("........... TcpServer::readSocketWrapper\n");
//log("........... TcpServer::readSocketWrapper\n");
// extract our this ptr
TcpServer *THIS = (TcpServer *)state;
// get a TcpSocket from sd
@ -1239,8 +1242,13 @@ long TcpServer::readSocket ( TcpSocket *s ) {
// do the read
int n;
if (m_useSSL)
n = SSL_read ( s->m_ssl, s->m_readBuf + s->m_readOffset, avail );
if (m_useSSL) {
//long long now1 = gettimeofdayInMilliseconds();
n = SSL_read(s->m_ssl, s->m_readBuf + s->m_readOffset, avail );
//long long now2 = gettimeofdayInMilliseconds();
//long long took = now2 - now1 ;
//if ( took >= 2 ) log("tcp: ssl_read took %llims", took);
}
else
n = ::read ( s->m_sd, s->m_readBuf + s->m_readOffset, avail );
@ -1483,8 +1491,13 @@ long TcpServer::writeSocket ( TcpSocket *s ) {
// send this piece
int n;
retry10:
if (m_useSSL)
if (m_useSSL) {
//long long now1 = gettimeofdayInMilliseconds();
n = SSL_write ( s->m_ssl, msg + s->m_sendOffset, toSend );
//long long now2 = gettimeofdayInMilliseconds();
//long long took = now2 - now1 ;
//if ( took >= 2 ) log("tcp: ssl_write took %llims", took);
}
else
n = ::send ( s->m_sd , msg + s->m_sendOffset , toSend , 0 );
// cancel harmless errors, return -1 on severe ones
@ -1626,8 +1639,12 @@ connected:
int r;
s->m_ssl = SSL_new(m_ctx);
SSL_set_fd(s->m_ssl, s->m_sd);
//long long now1 = gettimeofdayInMilliseconds();
SSL_set_connect_state(s->m_ssl);
r = SSL_connect(s->m_ssl);
//long long now2 = gettimeofdayInMilliseconds();
//long long took = now2 - now1 ;
//if ( took >= 2 ) log("tcp: ssl_connect took %llims", took);
if (!s->m_ssl) {
log("ssl: SSL is NULL after connect.");
char *xx = NULL; *xx = 0;
@ -2092,9 +2109,19 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
}
//log("ssl: SSL_accept %li",newsd);
long long now1 = gettimeofdayInMilliseconds();
retry19:
// javier put this in here, but it was not non-blocking!!!
// . javier put this in here, but it was not non-blocking!!!
// . it is non-blocking now, however, when it does block and
// complete the accept it takes 10ms on sp1, a server from ~2009
// using a custom build of the latest libssl.a from about 2013.
// . this accept needs to be put in a thread then, maybe multiple
// threads
int r = SSL_accept(s->m_ssl);
long long now2 = gettimeofdayInMilliseconds();
long long took = now2 - now1 ;
if ( took >= 2 )
log("tcp: ssl_accept %li took %llims", (long)newsd, took);
// did it block?
if ( r < 0 && errno == EINTR ) goto retry19;
// copy errno to g_errno
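// the timing guard above assumes a millisecond wall clock; a sketch
// of what such a helper typically looks like (the shipped
// implementation may differ):
//
//   #include <sys/time.h>
//   long long gettimeofdayInMilliseconds ( ) {
//           struct timeval tv;
//           gettimeofday ( &tv , NULL );
//           return (long long)tv.tv_sec * 1000 + tv.tv_usec / 1000;
//   }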
@ -2103,7 +2130,7 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
if ( g_errno == SSL_ERROR_WANT_READ ||
g_errno == SSL_ERROR_WANT_WRITE ||
g_errno == EAGAIN ) {
//log("ssl: SSL_accept blocked %li",newsd);
//log("ssl: SSL_accept would block %li",newsd);
return true;
}
// any other?
@ -2117,8 +2144,9 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
}
// log this so we can monitor if we get too many of these per second
// because they take like 10ms each on sp1!!! mdw
log("ssl: SSL_accept (~10ms) completed %li",newsd);
// because they take like 10ms each on sp1!!! (even with non-blocking
// sockets, they'll block for 10ms) - mdw 2013
//log("ssl: SSL_accept (~10ms) completed %li",newsd);
// ok, we got it
s->m_sockState = ST_READING;
return true;

View File

@ -126,7 +126,7 @@ void Test::removeFiles ( ) {
// . kinda like Collectiondb::deleteRec() i guess but we need to
// preserve the parms!!
// . deletetagdb = false
g_collectiondb.resetColl ( "test" , NULL ); // false );
g_collectiondb.resetColl ( "test" , NULL , true );
// reset event count
//g_collectiondb.countEvents();

View File

@ -303,6 +303,10 @@ bool Threads::init ( ) {
// generic multipurpose
if ( ! g_threads.registerType (GENERIC_THREAD,100/*maxThreads*/,100) )
return log("thread: Failed to register thread type." );
// for calling SSL_accept() which blocks for 10ms even when the
// socket is non-blocking...
//if (!g_threads.registerType (SSLACCEPT_THREAD,20/*maxThreads*/,100))
// return log("thread: Failed to register thread type." );
#ifndef PTHREADS
@ -884,20 +888,28 @@ bool ThreadQueue::timedCleanUp ( long maxNiceness ) {
#ifdef PTHREADS
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
long status = pthread_join ( t->m_joinTid , NULL );
if ( status != 0 ) {
log("threads: pthread_join %li = %s (%li)",
(long)t->m_joinTid,mstrerror(status),status);
}
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined1 with t=0x%lx "
"jointid=0x%lx.",
(long)t,(long)t->m_joinTid);
// if pthread_create() failed it returns the errno and we
// leave m_needsJoin false, so do not try to join
// a thread we did not create, lest pthread_join()
// cores
if ( t->m_needsJoin ) {
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
long status = pthread_join ( t->m_joinTid , NULL );
if ( status != 0 ) {
log("threads: pthread_join %li = %s (%li)",
(long)t->m_joinTid,mstrerror(status),
status);
}
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined1 with t=0x%lx "
"jointid=0x%lx.",
(long)t,(long)t->m_joinTid);
}
#else
again:
@ -1211,20 +1223,22 @@ bool ThreadQueue::cleanUp ( ThreadEntry *tt , long maxNiceness ) {
#ifdef PTHREADS
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
long status = pthread_join ( t->m_joinTid , NULL );
if ( status != 0 ) {
log("threads: pthread_join2 %li = %s (%li)",
(long)t->m_joinTid,mstrerror(status),status);
if ( t->m_needsJoin ) {
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
long status = pthread_join ( t->m_joinTid , NULL );
if ( status != 0 ) {
log("threads: pthread_join2 %li = %s (%li)",
(long)t->m_joinTid,mstrerror(status),
status);
}
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined2 with t=0x%lx "
"jointid=0x%lx.",
(long)t,(long)t->m_joinTid);
}
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined2 with t=0x%lx "
"jointid=0x%lx.",
(long)t,(long)t->m_joinTid);
#else
again:
@ -1591,7 +1605,7 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
// return if the max is already launched
if ( active >= m_maxLaunched ) return false;
// do not launch a low priority merge, addlists or filter thread if we
// do not launch a low priority merge, intersect or filter thread if we
// have high priority cpu threads already going on. this way a
// low priority spider thread will not launch if a high priority
// cpu-based thread of any kind (right now just MERGE or INTERSECT)
@ -1642,7 +1656,7 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
// i dunno what the point of this was... so i commented it out
//long max2 = g_conf.m_queryMaxDiskThreads ;
//if ( max2 <= 0 ) max2 = 1;
// only do this check if we're a addlists thread queue
// only do this check if we're an addlists/intersect thread queue
//if (m_threadType == INTERSECT_THREAD&& hiActive >= max2)return false;
// loop through candidates
@ -2008,7 +2022,26 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
//
#else
pthread_create ( &t->m_joinTid , &s_attr, startUp2 , t) ;
// assume it does not go through
t->m_needsJoin = false;
// pthread inherits our sigmask, so don't let it handle sigalrm
// signals in Loop.cpp, it'll screw things up. that handler
// is only meant to be called by the main process. if we end up
// double calling it, this thread may think g_callback is non-null
// then it gets set to NULL, then the thread cores! seen it...
sigset_t sigs;
sigemptyset ( &sigs );
sigaddset ( &sigs , SIGALRM );
if ( sigprocmask ( SIG_BLOCK , &sigs , NULL ) < 0 )
log("threads: failed to block sig");
// this returns 0 on success, or the errno otherwise
g_errno = pthread_create ( &t->m_joinTid , &s_attr, startUp2 , t) ;
if ( sigprocmask ( SIG_UNBLOCK , &sigs , NULL ) < 0 )
log("threads: failed to unblock sig");
#endif
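// the mask-inherit trick in isolation (hypothetical helper, not
// shipped code): block SIGALRM in the creator, create the thread so
// it inherits the blocked mask, then restore the creator's mask.
// pthread_sigmask() is the per-thread equivalent of sigprocmask():
//
//   int create_without_sigalrm ( pthread_t *tid ,
//                                void *(*fn)(void *) , void *arg ) {
//           sigset_t sigs , old;
//           sigemptyset ( &sigs );
//           sigaddset ( &sigs , SIGALRM );
//           pthread_sigmask ( SIG_BLOCK , &sigs , &old );
//           int rc = pthread_create ( tid , NULL , fn , arg );
//           pthread_sigmask ( SIG_SETMASK , &old , NULL );
//           return rc; // 0 on success, else an errno value
//   }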
@ -2020,6 +2053,8 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
// return true on successful creation of the thread
if ( g_errno == 0 ) {
// good stuff, the thread needs a join now
t->m_needsJoin = true;
if ( count > 0 )
log("thread: Call to clone looped %li times.",count);
return true;
@ -2047,6 +2082,11 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
#ifndef PTHREADS
hadError:
#endif
if ( g_errno )
log("thread: pthread_create had error = %s",
mstrerror(g_errno));
// it didn't launch, did it? dec the count.
m_launched--;
// priority-based LOCAL & GLOBAL launch counts
@ -2326,7 +2366,7 @@ const char *ThreadQueue::getThreadType ( ) {
const char *s = "unknown";
if ( m_threadType == DISK_THREAD ) s = "disk";
if ( m_threadType == MERGE_THREAD ) s = "merge";
if ( m_threadType == INTERSECT_THREAD ) s = "addlists";
if ( m_threadType == INTERSECT_THREAD ) s = "intersectlists";
if ( m_threadType == FILTER_THREAD ) s = "filter";
if ( m_threadType == SAVETREE_THREAD ) s = "savetree";
if ( m_threadType == UNLINK_THREAD ) s = "unlink";

View File

@ -21,6 +21,7 @@ pid_t getpidtid();
#define SAVETREE_THREAD 4
#define UNLINK_THREAD 5
#define GENERIC_THREAD 6
//#define SSLACCEPT_THREAD 7
#define GB_SIGRTMIN (SIGRTMIN+4)
#define MAX_NICENESS 2
// . a ThreadQueue has a list of thread entries
@ -54,6 +55,7 @@ class ThreadEntry {
long m_stackSize ;
long m_si ; // s_stackPtrs[i] = m_stack
bool m_needsJoin;
pthread_t m_joinTid;
};

View File

@ -113,8 +113,17 @@ bool Title::setTitle ( XmlDoc *xd ,
char *val = NULL;
long vlen;
// look for the "title:" field in json then use that
if ( xd->m_contentType == CT_JSON )
val = getJSONFieldValue ( xd->ptr_utf8Content,"title",&vlen);
SafeBuf jsonTitle;
if ( xd->m_contentType == CT_JSON ) {
char *jt;
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
if ( jt && vlen > 0 ) {
jsonTitle.safeDecodeJSONToUtf8 (jt, vlen, m_niceness);
//true ); // decodeAll?
jsonTitle.nullTerm();
val = jsonTitle.getBufStart();
}
}
// if we had a title: field in the json...
if ( val ) {
char *dst = NULL;
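// a decode sketch (hypothetical raw json value): \uXXXX and \"
// escapes are resolved before the title is used:
//
//   SafeBuf jsonTitle;
//   char *jt = "Caf\\u00e9 \\\"Central\\\"";  // bytes as in the doc
//   jsonTitle.safeDecodeJSONToUtf8 ( jt , strlen(jt) , 0 );
//   jsonTitle.nullTerm ();   // jsonTitle now holds: Café "Central"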

View File

@ -22,6 +22,8 @@ Wiktionary::Wiktionary () {
// . now m_langTable just maps to langId, no POS bits...
//m_langTable.set ( 6 , 1,0,NULL,0,false,0 ,"wkt-lang");
m_synTable.set ( 6 , 4,0,NULL,0,true,0 ,"wkt-synt");
m_synBuf.setLabel("synbuf");
}
void Wiktionary::reset() {
@ -47,6 +49,11 @@ Wiktionary::~Wiktionary () {
bool Wiktionary::test ( ) {
// test words parsing here
//Words w;
//w.set9 ("get $4,500.00 now",0);
// test it out!
char *str = "love";//pie"; //forsake";
//long long wid = hash64Lower_utf8(str);

View File

@ -416,7 +416,38 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
}
// . c#, j#, ...
if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;
// comma is ok in a number like 12,345: ,ddd followed by a non-digit
if ( s[i]==',' &&
i-j <= 3 &&
is_digit(s[i-1]) ) {
// if word so far is 2 or 3 chars, make sure they are all digits
if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
// scan forward
subloop:
if ( s[i] == ',' &&
is_digit(s[i+1]) &&
is_digit(s[i+2]) &&
is_digit(s[i+3]) &&
! is_digit(s[i+4]) ) {
i += 4;
goto subloop;
}
}
// decimal point?
if ( s[i] == '.' &&
is_digit(s[i-1]) &&
is_digit(s[i+1]) ) {
// allow the decimal point
i++;
// skip over string of digits
while ( is_digit(s[i]) ) i++;
}
nogo:
// allow for words like we're dave's and i'm
if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
i++;
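// a standalone sketch of the acceptance rule above (simplified,
// assumed semantics): commas are kept only between groups of three
// digits and a '.' only between digits, so "12,345.67" stays one
// token:
//
//   #include <ctype.h>
//   int numberTokenLen ( const char *s ) {
//           int i = 0;
//           while ( isdigit(s[i]) ) i++;
//           while ( s[i]==',' && isdigit(s[i+1]) && isdigit(s[i+2]) &&
//                   isdigit(s[i+3]) && ! isdigit(s[i+4]) )
//                   i += 4;
//           if ( s[i]=='.' && i > 0 && isdigit(s[i+1]) ) {
//                   i++;
//                   while ( isdigit(s[i]) ) i++;
//           }
//           return i; // "12,345.67" -> 9 ; "1,23" -> 1
//   }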

File diff suppressed because it is too large Load Diff

View File

@ -630,6 +630,8 @@ class XmlDoc {
long *getIp ( ) ;
long *gotIp ( bool save ) ;
bool *getIsAllowed ( ) ;
long *getFinalCrawlDelay();
long m_finalCrawlDelay;
//long getTryAgainTimeDelta() {
// if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
// return m_tryAgainTimeDelta;
@ -752,6 +754,7 @@ class XmlDoc {
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashDateNumbers ( class HashTableX *tt ) ;
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
@ -849,6 +852,15 @@ class XmlDoc {
long niceness );
bool hashNumber ( char *beginBuf ,
char *buf ,
long bufLen ,
class HashInfo *hi ) ;
bool hashNumber2 ( float f ,
class HashInfo *hi ,
char *gbsortByStr ) ;
// print out for PageTitledb.cpp and PageParser.cpp
bool printDoc ( class SafeBuf *pbuf );
bool printMenu ( class SafeBuf *pbuf );
@ -1159,6 +1171,7 @@ class XmlDoc {
*/
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_adVectorValid;
bool m_wikiDocIdsValid;
@ -1279,6 +1292,7 @@ class XmlDoc {
bool m_replyValid;
bool m_recycleDiffbotReplyValid;
bool m_diffbotReplyValid;
bool m_tokenizedDiffbotReplyValid;
//bool m_diffbotUrlCrawlPatternMatchValid;
//bool m_diffbotUrlProcessPatternMatchValid;
//bool m_diffbotPageProcessPatternMatchValid;
@ -1480,6 +1494,7 @@ class XmlDoc {
char m_isWWWDup;
char m_calledMsg0b;
Url m_tmpUrl;
SafeBuf m_tmpsb1;
SafeBuf m_tmpsb2;
SafeBuf m_turkBuf;
@ -1548,9 +1563,9 @@ class XmlDoc {
//
XmlDoc *m_dx;
char *m_diffbotObj;
char *m_diffbotObjEnd;
char m_diffbotSavedChar;
SafeBuf m_diffbotReply;
SafeBuf *m_tokenizedDiffbotReplyPtr;
SafeBuf m_tokenizedDiffbotReply;
long m_diffbotReplyError;
bool m_recycleDiffbotReply;
//bool m_diffbotUrlCrawlPatternMatch;
@ -1562,6 +1577,7 @@ class XmlDoc {
SafeBuf m_diffbotApiUrl;
bool *getRecycleDiffbotReply ( ) ;
SafeBuf *getTokenizedDiffbotReply ( ) ;
SafeBuf *getDiffbotReply ( ) ;
//bool doesUrlMatchDiffbotCrawlPattern() ;
//bool doesUrlMatchDiffbotProcessPattern() ;

View File

@ -382,9 +382,9 @@ void gotDocWrapper ( void *state , TcpSocket *s ) {
// parse status message out of response
// HTTP/1.0
while ( p < pend && !is_space(*p) ) p++;
while ( p < pend && !isspace(*p) ) p++;
// skip space
while ( p < pend && is_space(*p) ) p++;
while ( p < pend && isspace(*p) ) p++;
// copy to end of line
while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){
message[mlen++] = *p;

View File

@ -4,7 +4,12 @@
<notifyUrl><![CDATA[ccc]]></>
<collectiveRespiderFrequency>0.000000</>
<collectiveCrawlDelay>0.250000</>
<diffbotApiUrl><![CDATA[]]></>
<diffbotUrlCrawlPattern><![CDATA[]]></>
<diffbotUrlProcessPattern><![CDATA[]]></>
<diffbotPageProcessPattern><![CDATA[]]></>
<diffbotUrlCrawlRegEx><![CDATA[]]></>
<diffbotUrlProcessRegEx><![CDATA[]]></>
<diffbotOnlyProcessIfNew>1</>
<diffbotSeeds><![CDATA[]]></>
<isCustomCrawl>0</>
@ -79,6 +84,9 @@
# The spider round number.
<spiderRoundNum>0</>
# The spider status number.
<spiderStatus>0</>
# Do searches for queries in this hosts part of the query log.
<scrapingEnabledProcog>0</>
@ -326,12 +334,12 @@
<maxRobotstxtCacheAge>86400</>
# Only spider URLs scheduled to be spidered at this time or after. In UTC.
<spiderStartTime>17 Jan 1970 20:00 UTC</>
<spiderStartTime>24 Jan 1970 20:00 UTC</>
# Only spider URLs scheduled to be spidered at this time or before. If "use
# current time" is true then the current local time is used for this value
# instead. in UTC.
<spiderEndTime>01 Jan 1970 08:00 UTC</>
<spiderEndTime>08 Jan 1970 08:00 UTC</>
# Use the current time as the spider end time?
<useCurrentTime>1</>
@ -812,7 +820,7 @@
<highlightQueryTermsInRelatedPagesSummary>0</>
# Truncates a related page title after this many charaters and adds ...
<numberOfCharactersToDisplayInTitleBeforeTruncating>50</>
<numberOfCharactersToDisplayInTitleBeforeTruncating>0</>
# Use the search results' links in order to generate related pages.
<useResultsPagesAsReferences>0</>
@ -1017,173 +1025,23 @@
<filterExpression><![CDATA[hopcount&gt;=3]]></>
<filterExpression><![CDATA[isnew]]></>
<filterExpression><![CDATA[default]]></>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>0.000000</>
<filterFrequency>7.000000</>
<filterFrequency>0.000000</>
<filterFrequency>10.000000</>
<filterFrequency>0.000000</>
<filterFrequency>20.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>30.000000</>
<filterFrequency>30.000000</>
# Use <harvestLinks> tag.
# Use <spidersEnabled> tag.
# Use <filterFrequency> tag.
# Do not allow more than this many outstanding spiders for all urls in this
# priority.
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>4</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
# Use <maxSpidersPerRule> tag.
# Allow this many spiders per IP.
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
# Use <maxSpidersPerIp> tag.
# Wait at least this long before downloading urls from the same IP address.
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<filterPriority>80</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>0</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>3</>
<filterPriority>45</>
<filterPriority>85</>
<filterPriority>50</>
<filterPriority>48</>
<filterPriority>49</>
<filterPriority>47</>
<filterPriority>40</>
<filterPriority>39</>
<filterPriority>30</>
<filterPriority>29</>
<filterPriority>20</>
<filterPriority>19</>
<filterPriority>1</>
<filterPriority>0</>
<diffbotAPI><![CDATA[]]></>
# Use <spiderIpWait> tag.
# Use <filterPriority> tag.
# Use <diffbotAPI> tag.

66
errnotest.cpp Normal file
View File

@ -0,0 +1,66 @@
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <sched.h>
#include <unistd.h>
#include <assert.h>
static int s_called = 0;
#define MAX_PID 32767
static int s_errno ;
static int s_errnos [ MAX_PID + 1 ];
static long s_bad = 0;
static long s_badPid = -1;
// WARNING: you MUST compile with -DREENTRANT for this to work
int *__errno_location (void) {
long pid = (long) getpid();
s_called++;
if ( pid <= (long)MAX_PID ) return &s_errnos[pid];
s_bad++;
s_badPid = pid;
return &s_errno;
}
//extern __thread int errno;
int g_errno = 0;
int startup ( void *state ) {
char buf[5];
// this sets errno, but does not seem to call our __errno_location
// override, BUT does not seem to affect "errno" in main() either!
// maybe this is the TLS support?
int bytes = read(-9,buf,5);
//errno = 7; // E2BIG;
//assert ( errno && bytes == -1 );
g_errno = errno;
return 0;
}
int main() {
errno = 10; // EINVAL;
g_errno = 10;
char stack[10000];
pid_t pid = clone( startup ,
stack + 10000 ,
//CLONE_SETTLS |
CLONE_VM | SIGCHLD,
NULL );
int status;
waitpid ( pid , &status, 0 );
if ( s_called ) fprintf(stderr,"__errno_location() was called %i "
"times\n",s_called);
if ( errno != 10 ) fprintf(stderr,"errno=%i (failed)\n",errno);
else fprintf(stderr,"errno=%i (success)\n",errno);
if ( g_errno == 10 || g_errno == 0 )
fprintf(stderr,"gerrno=%i (failed)\n",g_errno);
else
fprintf(stderr,"gerrno=%i (success)\n",g_errno);
}

View File

@ -999,14 +999,27 @@ long long atoll2 ( const char *s, long len ) {
double atof2 ( const char *s, long len ) {
// skip over spaces
const char *end = s + len;
while ( s < end && is_wspace_a ( *s ) ) s++;
while ( s < end && is_wspace_a ( *s ) ) { s++; len--; }
// return 0 if all spaces
if ( s == end ) return 0;
char buf[128];
char tmpBuf[128];
if ( len >= 128 ) len = 127;
strncpy ( buf , s , len );
buf[len] = '\0';
return atof ( buf );
//strncpy ( dst , s , len );
const char *p = s;
const char *srcEnd = s + len;
char *dst = tmpBuf;
// remove commas
for ( ; p < srcEnd ; p++ ) {
// skip commas
if ( *p == ',' ) continue;
// otherwise store it
*dst++ = *p;
}
// null term
*dst = '\0';
//buf[len] = '\0';
return atof ( tmpBuf );
}
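// a usage sketch (values assumed): the comma-stripping pass lets
// price-style strings parse whole:
//
//   double d = atof2 ( "4,500.00" , 8 );   // -> 4500.0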
double atod2 ( char *s, long len ) {

View File

@ -57,7 +57,7 @@
<doNarrowSearch>0</>
# Overrides all spidering for all collections on just this host.
<localSpideringEnabled>1</>
<localSpideringEnabled>0</>
# Overrides all add urls for all collections on just this host.
<localAddUrlEnabled>1</>
@ -73,10 +73,10 @@
<qaSearchTestEnabled>1</>
# Enable spidering on all hosts
<allSpidersOn>1</>
<allSpidersOn>0</>
# Disable spidering on all hosts
<allSpidersOff>1</>
<allSpidersOff>0</>
# Serves ads unless pure=1 is in cgi parms.
<adFeedEnabled>0</>
@ -385,7 +385,7 @@
# Maximum number of threads to use per Gigablast process for intersecting
# docid lists. Generally, set this to the number of CPUs on the machine.
<maxCpuThreads>1</>
<maxCpuThreads>10</>
# Maximum number of pages to index or delete from index per second for all
# hosts combined.

View File

@ -78,21 +78,19 @@ You will need the following packages installed<br>
2. Edit hosts.conf so the working directory is not /home/mwells/github/ but
rather your current working directory, where the 'gb' binary resides.
<br><br>
3. Run './gb 0' to start a single gigablast node.
3. Run './gb 0' to start a single gigablast node which listens on port 8000.
<br><br>
4. Access the server with your browser on port 8000 (default port). You can change this default port in the gb.conf file.
4. The first time you run it you will have to wait for it to build some binary data files from the wiktionary- and wikipedia-based txt files that it uses to do synonyms and phrasing.
<br><br>
5. The first time you run it you will have to wait for it to build some binary data files from the txt files it uses that are based on wiktionary and wikipedia that it uses to do synonyms and phrasing.
5. Re-run it after it builds those binaries.
<br><br>
6. Re-run it after it builds those binaries.
6. Check out the <a href=http://127.0.0.1:8000/master>Master Controls</a>. As part of Gigablast's security, you need to connect to port 8000 from a local IP address or from an IP address on the same C-Class. Consider using an ssh tunnel if your browser's IP is not on the same C-Class as the server's, i.e. from your browser machine, ssh to the machine running the gb server: <i>ssh someservername.com -L 8000:127.0.0.1:8000</i>. Then in your browser go to the <a href=http://127.0.0.1:8000/master>Master Controls</a>.
<br><br>
7. Check out the <a href=http://127.0.0.1:8000/master>Master Controls</a>. You need to connect to port 8000 from a local IP address or from an IP address on the same C-Class as part of Gigablast's security. Consider using an ssh tunnel if your browser's IP is not on the same C-Class. i.e. From your browser machine, ssh to the machine running the gb server: <i>ssh someservername.com -L 8000:127.0.0.1:8000</i> . Then on your browser go to the <a href=http://127.0.0.1:8000/master>Master Controls</a>.
7. Click on the <a href=http://127.0.0.1:8000/admin/inject?c=main>inject</a> menu and inject a URL into the index. It might be slow because it uses Google's public DNS servers, 8.8.8.8 and 8.8.4.4, as specified in the Master Controls. You should change those to your own local bind9 server for speed.
<br><br>
8. Click on the <a href=http://127.0.0.1:8000/admin/inject?c=main>inject</a> menu and inject a URL into the index. It might be slow because it uses Google's public DNSes as specified in the Master Controls as 8.8.8.8 and 8.8.4.4. You should change those to your own local bind9 server for speed.
8. When the injection completes, try a <a href=http://127.0.0.1:8000/>search</a> for the document you injected.
<br><br>
9. When the injection completes, try a <a href=http://127.0.0.1:8000/>search</a> for the document you injected.
<br><br>
10. Turn on spiders on the <a href=http://127.0.0.1:8000>Master Controls</a> page so that it will begin spidering the outlinks of the page you injected.
9. <a href=http://127.0.0.1:8000/master?se=1>Turn on spiders</a> on the <a href=http://127.0.0.1:8000/master>Master Controls</a> page so that it will begin spidering the outlinks of the page you injected.
<br>

115
html/adv.html Normal file
View File

@ -0,0 +1,115 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Gigablast Advanced Search</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<meta name="MSSmartTagsPreventParsing" content="true" />
<meta http-equiv="imagetoolbar" content="no" />
<link href="stylesmain.css" rel="stylesheet" type="text/css" />
<script type="text/javascript">
<!--
function x(){document.f.q.focus();}
// -->
</script>
</head>
<body onload="x()">
<a href="/" target="_top"><img src="logo-small.png" alt="Gigablast Logo" title="Return to Basic Search" border="0" style="margin-bottom:15px;" /></a>
<h2>Advanced Search</h2>
<form method="get" action="/search">
<table width="605" border="0" align="center" cellpadding="5" cellspacing="3">
<tbody>
<tr align="left" valign="middle">
<th colspan="3">Search for...</th>
</tr>
<tr align="left" valign="middle">
<td><strong>all</strong> of these words</td>
<td><input type="text" name="plus" size="40" /></td>
<td><input type="submit" value="Search" /></td>
</tr>
<tr align="left" valign="middle">
<td>this <strong>exact phrase</strong></td>
<td colspan="2"><input type="text" name="quote1" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td>and this <strong>exact phrase</strong></td>
<td colspan="2"><input type="text" name="quote2" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td><strong>any</strong> of these words</td>
<td colspan="2"><input type="text" name="q" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td><strong>none</strong> of these words</td>
<td colspan="2"><input type="text" name="minus" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td>In this language:
</td>
<td colspan="2">
<select name=gblang>
<option value=0>Any</option>
<option value=1>English</option>
<option value=2>French</option>
<option value=3>Spanish</option>
<option value=4>Russian</option>
<option value=5>Turkish</option>
<option value=6>Japanese</option>
<option value=7>ChineseTrad</option>
<option value=8>ChineseSimp</option>
<option value=9>Korean</option>
<option value=10>German</option>
<option value=11>Dutch</option>
<option value=12>Italian</option>
<option value=13>Finnish</option>
<option value=14>Swedish</option>
<option value=15>Norwegian</option>
<option value=16>Portuguese</option>
<option value=17>Vietnamese</option>
<option value=18>Arabic</option>
<option value=19>Hebrew</option>
<option value=20>Indonesian</option>
<option value=21>Greek</option>
<option value=22>Thai</option>
<option value=23>Hindi</option>
<option value=24>Bengala</option>
<option value=25>Polish</option>
<option value=26>Tagalog</option>
</select>
</td>
</tr>
<tr align="left" valign="middle">
<td>Restrict to this URL</td>
<td colspan="2"><input type="text" name="url" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td>Pages that link to this URL</td>
<td colspan="2"><input type="text" name="link" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td>Site Clustering</td>
<td colspan="2"><input type="radio" name="sc" value="1" checked="checked" />yes&nbsp;&nbsp;&nbsp;<input type="radio" name="sc" value="0" />no</td>
</tr>
<tr align="left" valign="middle">
<td>Number of summary excerpts</td>
<td colspan="2"><input type="radio" name="ns" value="0" />0&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="1" />1&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="2" />2&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="3" checked="checked" />3&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="4" />4&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="5" />5</td>
</tr>
<tr align="left" valign="middle">
<td>Results per Page</td>
<td colspan="2"><input type="radio" name="n" value="10" checked="checked" />10&nbsp;&nbsp;<input type="radio" name="n" value="20" />20&nbsp;&nbsp;<input type="radio" name="n" value="30" />30&nbsp;&nbsp;<input type="radio" name="n" value="40" />40&nbsp;&nbsp;<input type="radio" name="n" value="50" />50&nbsp;&nbsp;<input type="radio" name="n" value="100" />100</td>
</tr>
<tr align="left" valign="middle">
<td>Restrict to these Sites</td>
<td colspan="2"><textarea rows="10" cols="40" name="sites"></textarea></td>
</tr>
</tbody>
</table>
</form>
<div id="footer">Copyright &copy; 2010-2020 <a href="http://www.gigablast.com" target="_top">Gigablast,
Inc.</a> All rights reserved.</div>
</body>
</html>

BIN
html/dollargear.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.9 KiB

BIN
html/eventguru.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

BIN
html/gears.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.6 KiB

View File

@ -34,7 +34,7 @@ Gigablast - The Private Search Engine</font>
Gigablast does not give your IP address to any third parties, nor allow any third party to deduce what queries might be coming from your IP address. Read the text below to understand what we mean by <i>deduce</i>.
<br><br>
In the summer of 2013 <a href=https://en.wikipedia.org/wiki/Edward_Snowden>Edward Snowden</a>, an ex-NSA contractor, described a secret NSA project known as <a href=https://en.wikipedia.org/wiki/PRISM_%28surveillance_program%29>project PRISM</a>. This project by the NSA wire taps not just live data traversing the internet, but also has automated access to large data repositories controlled by major internet companies. The data repositories consist of anything from search engine query logs and private emails to chat histories, among others. With today's fairly accurate audio-to-text transcription software, even services like <a href=https://en.wikipedia.org/wiki/Skype>Skype</a> audio and video calls are being tapped.
In the summer of 2013 <a href=https://en.wikipedia.org/wiki/Edward_Snowden>Edward Snowden</a>, an ex-NSA contractor, described a secret NSA project known as <a href=https://en.wikipedia.org/wiki/PRISM_%28surveillance_program%29>project PRISM</a>, and more recently <a href="http://www.washingtonpost.com/world/national-security/nsa-infiltrates-links-to-yahoo-google-data-centers-worldwide-snowden-documents-say/2013/10/30/e51d661e-4166-11e3-8b74-d89d714ca4dd_story.html">Muscular</a>. These NSA projects wiretap not just live data traversing the internet, but also have automated access to large data repositories controlled by major internet companies. The data repositories consist of anything from search engine query logs and private emails to chat histories, among others. With today's fairly accurate audio-to-text transcription software, even services like <a href=https://en.wikipedia.org/wiki/Skype>Skype</a> audio and video calls are being tapped.
<br><br>
Such data access makes it very easy for government agencies like the NSA to set up large search engines that index these data streams and execute a list of queries on such search engines in order to profile and flag individuals for further examination.
<br><br>

View File

@ -1,18 +1,18 @@
User-Agent: googlebot
Disallow: /search
Disallow: /search?
User-Agent: bingbot
Disallow: /search
Disallow: /search?
User-Agent: msnbot
Disallow: /search
Disallow: /search?
User-Agent: slurp
Disallow: /search
Disallow: /search?
User-Agent: gigabot
Disallow: /search
Disallow: /search?
User-Agent: *
Disallow: /search
Disallow: /search?

View File

@ -41,8 +41,9 @@ counts as a single query.
<!--<li>Gigablast has many powerful <a href="/features.html">features</a>.
<br><br>-->
<li><a href=https://www.gigablast.com/account.html>Sign up now</a> to start accessing the feed.
</ul>
<br><br>
<li>You can use the search results however you want. You can rearrange them, embed ads, etc.
</ul>
</td>
</tr>
</table>

View File

@ -2454,9 +2454,21 @@ int main ( int argc , char *argv[] ) {
if ( setrlimit(RLIMIT_CORE,&lim) )
log("db: setrlimit: %s.", mstrerror(errno) );
// limit fds
//lim.rlim_cur = lim.rlim_max = 511;
//if ( setrlimit(RLIMIT_NOFILE,&lim))
// log("db: setrlimit2: %s.", mstrerror(errno) );
// try to prevent a core on systems where the default is above 1024
// because our FD_ISSET() libc function will core! (it's older)
long NOFILE = 1024;
lim.rlim_cur = lim.rlim_max = NOFILE;
if ( setrlimit(RLIMIT_NOFILE,&lim))
log("db: setrlimit RLIMIT_NOFILE %li: %s.",
NOFILE,mstrerror(errno) );
struct rlimit rlim;
getrlimit ( RLIMIT_NOFILE,&rlim);
if ( (long)rlim.rlim_max > NOFILE || (long)rlim.rlim_cur > NOFILE ) {
log("db: setrlimit RLIMIT_NOFILE failed!");
char *xx=NULL;*xx=0;
}
log("db: RLIMIT_NOFILE = %li",(long)rlim.rlim_max);
//exit(0);
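// the guard this limit buys us (illustrative, not shipped code):
// glibc's fd_set is a fixed 1024-bit bitmap, so FD_SET/FD_ISSET on a
// descriptor >= FD_SETSIZE reads or writes past it:
//
//   #include <sys/select.h>
//   int safeFdSet ( int sd , fd_set *set ) {
//           if ( sd < 0 || sd >= FD_SETSIZE ) return -1;
//           FD_SET ( sd , set );
//           return 0;
//   }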
// . disable o/s's and hard drive's read ahead
// . set multcount to 16 --> 1 interrupt for every 16 sectors read
// . multcount of 16 reduces OS overhead by 30%-50% (more throughput)

View File

@ -154,6 +154,13 @@ int main ( int argc , char *argv[] ) {
printf("%s\n", out );
}
// encoded
char dst[MAX_URL_LEN+200];
urlEncode ( dst,MAX_URL_LEN+100,
u.getUrl(), u.getUrlLen(),
false ); // are we encoding a request path?
printf("encoded: %s\n",dst);
// the probable docid
long long pd = g_titledb.getProbableDocId(&u);
printf("pdocid: %llu\n", pd );