Merge branch 'testing' of github.com:gigablast/open-source-search-engine into testing

mwells 2014-04-06 14:03:13 -07:00
commit c20c30c53f
37 changed files with 1222 additions and 806 deletions

View File

@ -138,6 +138,19 @@ bool Collectiondb::loadAllCollRecs ( ) {
if ( ! addExistingColl ( coll , collnum ) )
return false;
}
// if no existing recs added... add coll.main.0 always at startup
if ( m_numRecs == 0 ) {
log("admin: adding main collection.");
addNewColl ( "main",
0 , // customCrawl ,
NULL,
0 ,
true , // bool saveIt ,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
0 );
}
// note it
//log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
// "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
@ -838,8 +851,8 @@ bool Collectiondb::resetColl ( char *coll , bool purgeSeeds) {
return true;
}
// get the CollectionRec for "test"
CollectionRec *cr = getRec ( coll ); // "test" );
// get the CollectionRec for "qatest123"
CollectionRec *cr = getRec ( coll ); // "qatest123" );
// must be there. if not, we create test i guess
if ( ! cr ) {
@ -972,6 +985,39 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
return true;
}
// moves a file by first trying rename, then copying since cross device renaming doesn't work
// returns 0 on success
int mv(char* src, char* dest) {
int status = rename( src , dest );
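// rename() fails (typically with errno EXDEV) when src and dest are on
// different filesystems; in that case fall back to the copy + remove below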
if (status == 0)
return 0;
FILE *fsrc, *fdest;
fsrc = fopen(src, "r");
if (fsrc == NULL)
return -1;
fdest = fopen(dest, "w");
if (fdest == NULL) {
fclose(fsrc);
return -1;
}
const int BUF_SIZE = 1024;
char buf[BUF_SIZE];
while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
size_t nread = fread(buf, 1, BUF_SIZE, fsrc);
fwrite(buf, 1, nread, fdest);
}
// capture stream error state before fclose(); calling ferror() on a
// closed FILE* is undefined behavior
int hadError = ( ferror(fdest) || ferror(fsrc) );
fclose(fsrc);
fclose(fdest);
if (hadError)
return -1;
remove(src);
return 0;
}
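// note: mv() is used below to stash a bulk job's bulkurls.txt in /tmp
// across a collection reset and then restore it into the new
// coll.<name>.<collnum> directory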
// . returns false if we need a re-call, true if we completed
// . returns true with g_errno set on error
bool Collectiondb::resetColl2( collnum_t oldCollnum,
@ -982,8 +1028,8 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// save parms in case we block
//we->m_purgeSeeds = purgeSeeds;
// now must be "test" only for now
//if ( strcmp(coll,"test") ) { char *xx=NULL;*xx=0; }
// now must be "qatest123" only for now
//if ( strcmp(coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// no spiders can be out. they may be referencing the CollectionRec
// in XmlDoc.cpp... quite likely.
//if ( g_conf.m_spideringEnabled ||
@ -1018,6 +1064,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
//collnum_t oldCollnum = cr->m_collnum;
//collnum_t newCollnum = m_numRecs;
// in case of bulk job, be sure to save list of spots
// copy existing list to a /tmp, where they will later be transferred back to the new folder
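// e.g. this moves "<g_hostdb.m_dir>coll.main.0/bulkurls.txt" to
// "/tmp/coll.main.0.bulkurls.txt" (assuming a collection named "main"
// with collnum 0)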
char oldbulkurlsname[1036];
snprintf(oldbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)oldCollnum);
char newbulkurlsname[1036];
snprintf(newbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)newCollnum);
char tmpbulkurlsname[1036];
snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);
if (cr->m_isCustomCrawl == 2)
mv( oldbulkurlsname , tmpbulkurlsname );
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
@ -1127,6 +1185,9 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// save coll.conf to new directory
cr->save();
// be sure to copy back the bulk urls for bulk jobs
if (cr->m_isCustomCrawl == 2)
mv( tmpbulkurlsname, newbulkurlsname );
// and clear the robots.txt cache in case we recently spidered a
// robots.txt, we don't want to use it, we want to use the one we
@ -1792,31 +1853,193 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
long n = 0;
//strcpy(m_regExs [n],"default");
/*
m_regExs[n].set("default");
m_regExs[n].nullTerm();
m_numRegExs++;
m_spiderFreqs [n] = 30; // 30 days default
m_numRegExs2++;
m_spiderPriorities[n] = 0;
m_numRegExs3++;
m_maxSpidersPerRule[n] = 99;
m_numRegExs10++;
m_spiderIpWaits[n] = 1000;
m_numRegExs5++;
m_spiderIpMaxSpiders[n] = 7;
m_numRegExs6++;
//m_spidersEnabled[n] = 1;
//m_numRegExs7++;
m_harvestLinks[n] = 1;
m_numRegExs8++;
*/
m_regExs[n].set("isdocidbased");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // spider frequency in days
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // spider frequency in days
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
// if not in the site list then nuke it
m_regExs[n].set("!insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // spider frequency in days
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // spider frequency in days
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // spider frequency in days
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // spider frequency in days
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60; // spider frequency in days
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;
m_regExs[n].set("isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 30; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 2;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 30; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
// more rules
//m_spiderDiffbotApiNum[n] = 1;
//m_numRegExs11++;
@ -2064,7 +2287,7 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , long encapIp ) {
}
bool expandRegExShortcuts ( SafeBuf *sb ) ;
bool updateSiteList ( collnum_t collnum , bool addSeeds );
bool updateSiteListTables ( collnum_t collnum,bool addSeeds,char *siteListArg);
void nukeDoledb ( collnum_t collnum );
// . anytime the url filters are updated, this function is called
@ -2127,10 +2350,14 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// maybe this is good enough
//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
CollectionRec *cr = sc->m_cr;
// . rebuild sitetable? in PageBasic.cpp.
// . re-adds seed spdierrequests using msg4
// . true = addSeeds
updateSiteList ( m_collnum , true );
updateSiteListTables ( m_collnum ,
true ,
cr->m_siteListBuf.getBufStart() );
}

View File

@ -1318,7 +1318,7 @@ sections. -- todo -- might be an alignment issue... check out later
// . make a whole new set of urls for pub date detection
// . grab that sample set from buzz wiki page
// . record the correct pub date for urls in the "test" coll and make sure
// . record the correct pub date for urls in the "qatest123" coll and make sure
// we get them each time, otherwise core dump!!
// . check the date we extract with the rss feed. that is a good test too!
// report on that accuracy in the logs and on the stats page.
@ -2428,7 +2428,7 @@ bool Dates::setPart1 ( //char *u ,
//if ( m_nw != words->m_numWords ) { char *xx=NULL; *xx=0; }
// . get the current time in utc
// . NO! to ensure the "test" collection re-injects docs exactly
// . NO! to ensure the "qatest123" collection re-injects docs exactly
// the same, use the spideredTime from the doc
// . we make sure to save this in the test subdir somehow..
//m_now = nd->m_spideredTime; // getTimeSynced();
@ -3283,7 +3283,7 @@ bool Dates::setPart1 ( //char *u ,
// DF_NOTCLOCK flags from this.
// . current time. sync'd with host #0 who uses ntp supposedly...! :(
// . to ensure that the "test" subdir re-injects docs exactly the
// . to ensure that the "qatest123" subdir re-injects docs exactly the
// same, we need to use this date now
long now = nd->m_spideredTime;
// how long has elapsed since we downloaded it last approx.?
@ -3294,7 +3294,8 @@ bool Dates::setPart1 ( //char *u ,
// might have been different than ours... actually i think our
// spiderdate.txt file had an older date in it from a previous round!
// so disable this when test spidering.
if ( elapsed<0 && g_conf.m_testSpiderEnabled && !strcmp(m_coll,"test"))
if ( elapsed<0 && g_conf.m_testSpiderEnabled && !strcmp(m_coll,
"qatest123"))
elapsed = 0;
// is true.
if ( elapsed < 0 ) {

View File

@ -167,7 +167,7 @@ case EFAKEFIRSTIP: return "Fake firstIp";
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Crawl type mismatch";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

View File

@ -28,10 +28,14 @@
#include "TcpSocket.h"
// values for HttpRequest::m_replyFormat
#define FORMAT_HTML 0
#define FORMAT_XML 1
#define FORMAT_JSON 2
#define FORMAT_CSV 3
#define FORMAT_HTML 1
#define FORMAT_XML 2
#define FORMAT_JSON 3
#define FORMAT_CSV 4
#define FORMAT_TXT 5
#define FORMAT_PROCOG 6
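// note: these values appear to mirror the old FMT_* constants that were
// defined locally in PageCrawlBot.cpp (FMT_HTML 1 ... FMT_TXT 5) and are
// commented out below, so both files can share one FORMAT_* set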
class HttpRequest {

View File

@ -445,7 +445,7 @@ bool Images::downloadImages () {
r->reset();
r->m_maxTextDocLen = 200000;
r->m_maxOtherDocLen = 500000;
if ( ! strcmp(cr->m_coll,"test")) {
if ( ! strcmp(cr->m_coll,"qatest123")) {
r->m_useTestCache = 1;
r->m_addToTestCache = 1;
}

View File

@ -433,7 +433,7 @@ char *JsonItem::getValueAsString ( long *valueLen ) {
// numbers...
static char s_numBuf[64];
if ( m_valueLong == (long)m_valueDouble ) {
if ( (float)m_valueLong == m_valueDouble ) {
*valueLen = sprintf ( s_numBuf,"%li", m_valueLong );
return s_numBuf;
}

View File

@ -3935,7 +3935,7 @@ LinkInfo *makeLinkInfo ( char *coll ,
// . how many unique ips link to us?
// . this count includes internal IPs as well
info->m_numUniqueIps = msg25->m_uniqueIps;
// keep things consistent for the "test" coll
// keep things consistent for the "qatest123" coll
info->m_reserved1 = 0;
info->m_reserved2 = 0;
// how many total GOOD inlinks we got. does not include internal cblock

View File

@ -57,7 +57,7 @@ OBJS = UdpSlot.o Rebalance.o \
PostQueryRerank.o Msge0.o Msge1.o \
CountryCode.o DailyMerge.o CatRec.o Tagdb.o \
Users.o Images.o Wiki.o Wiktionary.o Scraper.o \
Dates.o Sections.o SiteGetter.o Syncdb.o \
Dates.o Sections.o SiteGetter.o Syncdb.o qa.o \
Placedb.o Address.o Test.o GeoIP.o GeoIPCity.o Synonyms.o \
Cachedb.o Monitordb.o dlstubs.o PageCrawlBot.o Json.o PageBasic.o

View File

@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// for bulk jobs avoid actual downloads of the page for efficiency
if ( r->m_isCustomCrawl == 2 ) {
char *s =
"HTTP/1.0 200 (OK)\r\n"
"Content-Length: 0\r\n"
"Connection: Close\r\n"
"Content-Type: text/html\r\n\r\n";
long slen = gbstrlen(s);
long fakeBufSize = slen + 1;
char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
gotHttpReply2 ( r ,
fakeBuf,
fakeBufSize, // include \0
fakeBufSize, // allocsize
NULL ); // tcpsock
return;
}
// download it
if ( ! g_httpServer.getDoc ( r->m_url ,
r->m_urlIp ,
@ -1390,7 +1409,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
//
//
// . UTILITY FUNCTIONS for injecting into the "test" collection
// . UTILITY FUNCTIONS for injecting into the "qatest123" collection
// . we need to ensure that the web pages remain constant so we store them
//
//
@ -1400,7 +1419,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
// . now that we are lower level in Msg13.cpp, set "ts" not "slot"
bool getTestDoc ( char *u , TcpSocket *ts , Msg13Request *r ) {
// sanity check
//if ( strcmp(m_coll,"test") ) { char *xx=NULL;*xx=0; }
//if ( strcmp(m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// hash the url into 64 bits
long long h = hash64 ( u , gbstrlen(u) );
// read the spider date file first
@ -1547,7 +1566,7 @@ bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) {
return true;
}
// add it to our "test" subdir
// add it to our "qatest123" subdir
bool addTestDoc ( long long urlHash64 , char *httpReply , long httpReplySize ,
long err , Msg13Request *r ) {

View File

@ -32,6 +32,8 @@ public:
// if doing spider compression, compute contentHash32 of document
// downloaded, and if it matches this then send back EDOCUNCHANGED
long m_contentHash32;
// copy of CollectionRec::m_customCrawl: 0 or 1 for crawls, 2 for bulk jobs
char m_isCustomCrawl;
// send back error ENOGOODDATE if it does not have one. but if
// harvestLinks is true, just send back a filtered list of links
long m_requireGoodDate:1;

View File

@ -159,7 +159,7 @@ public:
};
// . injecting into the "test" coll flushes after each inject
// . injecting into the "qatest123" coll flushes after each inject
// . returns false if blocked and callback will be called
bool flushMsg4Buffers ( void *state , void (* callback) (void *) ) {
// if all empty, return true now

View File

@ -859,9 +859,9 @@ bool Msg5::needsRecall ( ) {
if ( m_round == 0 ) logIt = false;
if ( logIt )
logf(LOG_DEBUG,"db: Reading %li again from %s (need %li total "
"got %li) this=0x%lx round=%li.",
"got %li) cn=%li this=0x%lx round=%li.",
m_newMinRecSizes , base->m_dbname , m_minRecSizes,
m_list->m_listSize, (long)this , m_round );
m_list->m_listSize, (long)m_collnum,(long)this, m_round );
m_round++;
// record how many screw ups we had so we know if it hurts performance
base->m_rdb->didReSeek ( );

View File

@ -116,7 +116,7 @@ bool Msge1::getFirstIps ( TagRec **grv ,
if ( ! launchRequests ( 0 ) ) return false;
// save it? might be a page parser
//if ( ! strcmp(m_coll,"test") ) saveTestBuf();
//if ( ! strcmp(m_coll,"qatest123") ) saveTestBuf();
// none blocked, we are done
return true;
@ -219,7 +219,7 @@ bool Msge1::launchRequests ( long starti ) {
/*
// look up in our m_testBuf.
if ( m_coll && ! strcmp(m_coll,"test") ) {
if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
bool found = false;
// do we got it?
long quickIp ; bool status = getTestIp ( p , &quickIp, &found);
@ -300,7 +300,7 @@ bool Msge1::sendMsgC ( long i , char *host , long hlen ) {
// look up in our m_testBuf.
if ( m_coll && ! strcmp(m_coll,"test") ) {
if ( m_coll && ! strcmp(m_coll,"qatest123") ) {
bool found = false;
// shortcut
//char *p = m_urlPtrs[n];
@ -340,7 +340,7 @@ void gotMsgCWrapper ( void *state , long ip ) {
if ( ! THIS->launchRequests(i) ) return;
// . save it if we should. might be a page parser
// . mdw i uncommented this when we cored all the time
//if ( ! strcmp(THIS->m_coll,"test")) saveTestBuf();
//if ( ! strcmp(THIS->m_coll,"qatest123")) saveTestBuf();
// must be all done, call the callback
THIS->m_callback ( THIS->m_state );
}
@ -364,7 +364,7 @@ bool Msge1::doneSending ( long i ) {
// n, i, m_urls[i].getUrl() ,iptoa(ip));
// store it?
if ( ! strcmp(m_coll,"test") ) {
if ( ! strcmp(m_coll,"qatest123") ) {
// get host
long hlen = 0;
char *host = getHostFast ( m_urlPtrs[n] , &hlen );
@ -511,9 +511,9 @@ static char *s_last = NULL ;
static long s_lastLen = 0 ;
static HashTableX s_ht;
// . only call this if the collection is "test"
// . only call this if the collection is "qatest123"
// . we try to get the ip by accessing the "./test/ips.txt" file
// . we also add ips we lookup to that file if the collection is "test"
// . we also add ips we lookup to that file if the collection is "qatest123"
// . returns false and sets g_errno on error, true on success
bool getTestIp ( char *url , long *retIp , bool *found , long niceness ,
char *testDir ) {
@ -533,8 +533,8 @@ bool getTestIp ( char *url , long *retIp , bool *found , long niceness ,
// assume not found
*found = false;
// . if we are the "test" collection, check for "./test/ips.txt" file
// that gives us the ips of the given urls.
// . if we are the "qatestq123" collection, check for "./test/ips.txt"
// file that gives us the ips of the given urls.
// . if we end up doing some lookups we should append to that file
if ( ! s_testBuf || s_needsReload ) {
// assume needs reload now

View File

@ -73,7 +73,9 @@ public:
// . uses msg4 to add seeds to spiderdb if necessary
// . only adds seeds for the shard we are on iff we are responsible for
// the fake firstip!!!
bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
bool updateSiteListTables ( collnum_t collnum ,
bool addSeeds ,
char *siteListArg ) {
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) return true;
@ -113,6 +115,8 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
}
// get the old sitelist Domain Hash to PatternData mapping table
// which tells us what domains, subdomains or paths we can or
// can not spider...
HashTableX *dt = &sc->m_siteListDomTable;
// reset it
@ -142,10 +146,10 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
// use this so it will be free automatically when msg4 completes!
SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;
char *siteList = cr->m_siteListBuf.getBufStart();
//char *siteList = cr->m_siteListBuf.getBufStart();
// scan the list
char *pn = siteList;
char *pn = siteListArg;
// completely empty?
if ( ! pn ) return true;
@ -156,7 +160,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
Url u;
for ( ; *pn ; pn++ , lineNum++ ) {
for ( ; *pn ; lineNum++ ) {
// get end
char *s = pn;
@ -169,6 +173,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
char *pe = pn;
for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );
// advance over '\n' for next line
if ( *pn && *pn == '\n' ) pn++;
// make hash of the line
long h32 = hash32 ( s , pe - s );
@ -287,7 +294,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
if ( ! isFilter ) continue;
// make the data node
// make the data node used for filtering urls during spidering
PatternData pd;
// hash of the subdomain or domain for this line in sitelist
pd.m_thingHash32 = u.getHostHash32();
@ -388,10 +395,15 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// check domain specific tables
HashTableX *dt = &sc->m_siteListDomTable;
// get this
CollectionRec *cr = sc->m_cr;
// need to build dom table for pattern matching?
if ( dt->getNumSlotsUsed() == 0 ) {
if ( dt->getNumSlotsUsed() == 0 && cr ) {
// do not add seeds, just make siteListDomTable, etc.
updateSiteList ( sc->m_collnum , false );
updateSiteListTables ( sc->m_collnum ,
false , // add seeds?
cr->m_siteListBuf.getBufStart() );
}
if ( dt->getNumSlotsUsed() == 0 ) {
@ -728,6 +740,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
char buf [ 128000 ];
SafeBuf sb(buf,128000);
sb.reset();
char *fs = hr->getString("format",NULL,NULL);
char fmt = FORMAT_HTML;
@ -761,7 +774,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//
// show stats
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
@ -773,45 +786,23 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
sb.safePrintf(
//sb.safePrintf(
// "<form method=get action=/crawlbot>"
// "%s"
// , sb.getBufStart() // hidden input token/name/..
// );
char *hurts = "No";
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
hurts = "Yes";
"<form method=get action=/crawlbot>"
"%s"
, sb.getBufStart() // hidden input token/name/..
);
sb.safePrintf("<TABLE border=0>"
"<TR><TD valign=top>"
"<table border=0 cellpadding=5>"
//
"<tr>"
"<td><b>Crawl Name:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Type:</td>"
"<td>%li</td>"
"</tr>"
//"<tr>"
//"<td><b>Collection Alias:</td>"
//"<td>%s%s</td>"
//"</tr>"
"<tr>"
"<td><b>Token:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Seeds:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Status:</td>"
"<td><b>Crawl Status Code:</td>"
"<td>%li</td>"
"</tr>"
@ -820,14 +811,14 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Rounds Completed:</td>"
"<td>%li</td>"
"</tr>"
//"<tr>"
//"<td><b>Rounds Completed:</td>"
//"<td>%li</td>"
//"</tr>"
"<tr>"
"<td><b>Has Urls Ready to Spider:</td>"
"<td>%li</td>"
"<td>%s</td>"
"</tr>"
@ -838,12 +829,8 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//"</tr>"
"<tr>"
"<td><b>Objects Found</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>URLs Harvested</b> (inc. dups)</td>"
"<td><b>URLs Harvested</b> "
"(may include dups)</td>"
"<td>%lli</td>"
"</tr>"
@ -862,60 +849,24 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"<td><b>Page Crawl Successes</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Crawl Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Attempts</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
, (long)cr->m_isCustomCrawl
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
//, cr->m_spiderRoundNum
//, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
, hurts
, cr->m_globalCrawlInfo.m_objectsAdded -
cr->m_globalCrawlInfo.m_objectsDeleted
, cr->m_globalCrawlInfo.m_urlsHarvested
//, cr->m_globalCrawlInfo.m_urlsConsidered
, cr->m_globalCrawlInfo.m_pageDownloadAttempts
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cr->m_globalCrawlInfo.m_pageProcessAttempts
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
, cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
if ( fmt != FORMAT_JSON )
// wrap up the form, print a submit button
g_pages.printAdminBottom ( &sb );
//if ( fmt != FORMAT_JSON )
// // wrap up the form, print a submit button
// g_pages.printAdminBottom ( &sb );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),

View File

@ -25,11 +25,11 @@
#include "Parms.h"
// so user can specify the format of the reply/output
#define FMT_HTML 1
#define FMT_XML 2
#define FMT_JSON 3
#define FMT_CSV 4
#define FMT_TXT 5
//#define FMT_HTML 1
//#define FMT_XML 2
//#define FMT_JSON 3
//#define FMT_CSV 4
//#define FMT_TXT 5
void doneSendingWrapper ( void *state , TcpSocket *sock ) ;
bool sendBackDump ( TcpSocket *s,HttpRequest *hr );
@ -158,25 +158,25 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
if ( ( xx = strstr ( path , "_data.json" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
fmt = FORMAT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_CSV;
fmt = FORMAT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
fmt = FORMAT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_TXT;
fmt = FORMAT_TXT;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
fmt = FORMAT_TXT;
}
// sanity, must be one of 3 download calls
@ -213,7 +213,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// . if doing download of csv, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) {
if ( fmt == FORMAT_CSV && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
long dr = 1;
@ -247,7 +247,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// . if doing download of json, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_JSON && rdbId == RDB_TITLEDB ) {
if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
long dr = 1;
@ -514,13 +514,13 @@ bool StateCD::sendList ( ) {
//sb.setLabel("dbotdmp");
char *ct = "text/csv";
if ( m_fmt == FMT_JSON )
if ( m_fmt == FORMAT_JSON )
ct = "application/json";
if ( m_fmt == FMT_XML )
if ( m_fmt == FORMAT_XML )
ct = "text/xml";
if ( m_fmt == FMT_TXT )
if ( m_fmt == FORMAT_TXT )
ct = "text/plain";
if ( m_fmt == FMT_CSV )
if ( m_fmt == FORMAT_CSV )
ct = "text/csv";
// . if we haven't yet sent an http mime back to the user
@ -545,13 +545,13 @@ bool StateCD::sendList ( ) {
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! m_printedFirstBracket && m_fmt == FMT_JSON ) {
if ( ! m_printedFirstBracket && m_fmt == FORMAT_JSON ) {
sb.safePrintf("[\n");
m_printedFirstBracket = true;
}
// these are csv files not xls
//if ( ! m_printedFirstBracket && m_fmt == FMT_CSV ) {
//if ( ! m_printedFirstBracket && m_fmt == FORMAT_CSV ) {
// sb.safePrintf("sep=,\n");
// m_printedFirstBracket = true;
//}
@ -638,7 +638,7 @@ bool StateCD::sendList ( ) {
// use this for printing out urls.csv as well...
m_printedEndingBracket = true;
// end array of json objects. might be empty!
if ( m_rdbId == RDB_TITLEDB && m_fmt == FMT_JSON )
if ( m_rdbId == RDB_TITLEDB && m_fmt == FORMAT_JSON )
sb.safePrintf("\n]\n");
//log("adding ]. len=%li",sb.length());
// i'd like to exit streaming mode here. i fixed tcpserver.cpp
@ -853,7 +853,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
}
// "csv" is default if json not specified
if ( m_fmt == FMT_JSON )
if ( m_fmt == FORMAT_JSON )
sb->safePrintf("[{"
"{\"url\":"
"\"%s\"},"
@ -875,6 +875,19 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
);
// but default to csv
else {
if (cr && cr->m_isCustomCrawl == 1 && sreq && !sreq->m_isAddUrl && !sreq->m_isInjecting) {
if (cr->m_diffbotUrlCrawlPattern.m_length == 0
&& cr->m_diffbotUrlProcessPattern.m_length == 0) {
// If a crawl and there are no urlCrawlPattern or urlCrawlRegEx values, only return URLs from seed domain
if (sreq && !sreq->m_sameDom)
continue;
} else {
// TODO: if we get here, we have a crawl with a custom urlCrawlPattern and/or custom
// urlProcessPattern. We have to check if the current url matches the pattern
}
}
sb->safePrintf("\"%s\",\"%s\","
, sreq->m_url
, as
@ -984,7 +997,7 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// if not json, just print the json item out in csv
// moved into PageResults.cpp...
//if ( m_fmt == FMT_CSV ) {
//if ( m_fmt == FORMAT_CSV ) {
// printJsonItemInCsv ( json , sb );
// continue;
//}
@ -1324,7 +1337,7 @@ bool sendReply2 (TcpSocket *socket , long fmt , char *msg ) {
// send this back to browser
SafeBuf sb;
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
sb.safePrintf("{\n\"response\":\"success\",\n"
"\"message\":\"%s\"\n}\n"
, msg );
@ -1355,7 +1368,7 @@ bool sendErrorReply2 ( TcpSocket *socket , long fmt , char *msg ) {
// send this back to browser
SafeBuf sb;
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
sb.safePrintf("{\"error\":\"%s\"}\n"
, msg );
ct = "application/json";
@ -1463,7 +1476,7 @@ void injectedUrlWrapper ( void *state ) {
// send back the html or json response?
SafeBuf *response = &sb;
if ( st->m_fmt == FMT_JSON ) response = &js;
if ( st->m_fmt == FORMAT_JSON ) response = &js;
// . this will call g_httpServer.sendReply()
// . pass it in the injection response, "sb"
@ -1660,7 +1673,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
char fmt = FMT_JSON;
char fmt = FORMAT_JSON;
// token is always required. get from json or html form input
//char *token = getInputString ( "token" );
@ -1680,21 +1693,21 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
name++;
}
// change default formatting to html
fmt = FMT_HTML;
fmt = FORMAT_HTML;
}
char *fs = hr->getString("format",NULL,NULL);
// give john a json api
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
// if we got json as input, give it as output
//if ( JS.getFirstItem() ) fmt = FMT_JSON;
//if ( JS.getFirstItem() ) fmt = FORMAT_JSON;
if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
if ( ! token && fmt == FORMAT_JSON ) { // (cast==0|| fmt == FORMAT_JSON ) ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
}
@ -1759,7 +1772,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
//}
// just send back a list of all the collections after the delete
//if ( delColl && cast && fmt == FMT_JSON ) {
//if ( delColl && cast && fmt == FORMAT_JSON ) {
// char *msg = "Collection deleted.";
// return sendReply2 (socket,fmt,msg);
//}
@ -2108,7 +2121,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
char bulkurlsfile[1024];
snprintf(bulkurlsfile, 1024, "%scoll.%s.%li/bulkurls.txt", g_hostdb.m_dir , coll , (long)st->m_collnum );
if ( spots ) {
if ( spots && cr && cr->m_isCustomCrawl == 2 ) {
log("crawlbot: got spots (len=%li) to add coll=%s (%li)",
(long)gbstrlen(spots),coll,(long)st->m_collnum);
FILE *f = fopen(bulkurlsfile, "w");
@ -2120,7 +2133,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
}
// if restart flag is on and the file with bulk urls exists, get spots from there
if ( !spots && restartColl ) {
if ( !spots && restartColl && cr && cr->m_isCustomCrawl ) {
FILE *f = fopen(bulkurlsfile, "r");
if (f != NULL) {
fseek(f, 0, SEEK_END);
@ -2250,7 +2263,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
/*
bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("\"urlFilters\":[");
// skip first filters that are:
@ -2290,7 +2303,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
// urls higher spider priority, so skip it
if ( strncmp(expression,"ismanualadd && ",15) == 0 )
continue;
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<tr>"
"<td>Expression "
"<input type=text "
@ -2315,7 +2328,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
sb.pushChar('\n');
}
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
// remove trailing comma
sb.removeLastChar('\n');
sb.removeLastChar(',');
@ -2506,7 +2519,7 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
true // isJSON?
);
*/
//printUrlFilters ( sb , cx , FMT_JSON );
//printUrlFilters ( sb , cx , FORMAT_JSON );
// end that collection rec
sb->safePrintf("}\n");
@ -2524,7 +2537,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// store output into here
SafeBuf sb;
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"<html>"
"<title>Crawlbot - "
@ -2560,7 +2573,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
lb.urlEncode(name);
lb.safePrintf ("&token=");
lb.urlEncode(token);
if ( fmt == FMT_HTML ) lb.safePrintf("&format=html");
if ( fmt == FORMAT_HTML ) lb.safePrintf("&format=html");
lb.nullTerm();
@ -2577,7 +2590,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//}
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<table border=0>"
"<tr><td>"
"<b><font size=+2>"
@ -2632,7 +2645,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// print list of collections controlled by this token
//
for ( long i = 0 ; fmt == FMT_HTML && i<g_collectiondb.m_numRecs;i++ ){
for ( long i = 0 ; fmt == FORMAT_HTML && i<g_collectiondb.m_numRecs;i++ ){
CollectionRec *cx = g_collectiondb.m_recs[i];
if ( ! cx ) continue;
// get its token if any
@ -2664,19 +2677,19 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
sb.safePrintf("</font></b>");
}
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf ( "</center><br/>" );
// the ROOT JSON [
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("{\n");
// injection is currently not in use, so this is an artifact:
if ( fmt == FMT_JSON && injectionResponse )
if ( fmt == FORMAT_JSON && injectionResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, injectionResponse->getBufStart() );
if ( fmt == FMT_JSON && urlUploadResponse )
if ( fmt == FORMAT_JSON && urlUploadResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, urlUploadResponse->getBufStart() );
@ -2689,14 +2702,14 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// the items in the array now have type:bulk or type:crawl
// so call them 'jobs'
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("\"jobs\":[");//\"collections\":");
long summary = hr->getLong("summary",0);
// enter summary mode for json
if ( fmt != FMT_HTML ) summary = 1;
if ( fmt != FORMAT_HTML ) summary = 1;
// start the table
if ( summary && fmt == FMT_HTML ) {
if ( summary && fmt == FORMAT_HTML ) {
sb.safePrintf("<table border=1 cellpadding=5>"
"<tr>"
"<td><b>Collection</b></td>"
@ -2727,11 +2740,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// just print out single crawl info for json
if ( fmt != FMT_HTML && cx != cr && name3 )
if ( fmt != FORMAT_HTML && cx != cr && name3 )
continue;
// if json, print each collectionrec
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
if ( ! firstOne )
sb.safePrintf(",\n\t");
firstOne = false;
@ -2773,7 +2786,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
if ( summary && fmt == FMT_HTML ) {
if ( summary && fmt == FORMAT_HTML ) {
sb.safePrintf("</table></html>" );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
@ -2781,7 +2794,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
0); // cachetime
}
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
// end the array of collection objects
sb.safePrintf("\n]\n");
@ -2795,7 +2808,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show urls being crawled (ajax) (from Spider.cpp)
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf ( "<table width=100%% cellpadding=5 "
"style=border-width:1px;border-style:solid;"
"border-color:black;>"
@ -2866,7 +2879,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
rand64 |= r2;
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<br>"
"<table border=0 cellpadding=5>"
@ -2939,12 +2952,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
);
}
if ( injectionResponse && fmt == FMT_HTML )
if ( injectionResponse && fmt == FORMAT_HTML )
sb.safePrintf("<br><font size=-1>%s</font>\n"
,injectionResponse->getBufStart()
);
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(//"<input type=hidden name=c value=\"%s\">"
//"<input type=hidden name=crawlbotapi value=1>"
"</td>"
@ -2983,7 +2996,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show stats
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
@ -3641,7 +3654,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// xml or json does not show the input boxes
//if ( format != FMT_HTML )
//if ( format != FORMAT_HTML )
// return g_httpServer.sendDynamicPage ( s,
// sb.getBufStart(),
// sb.length(),
@ -3664,7 +3677,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
s2 = "";
}
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"<a onclick="
@ -3708,7 +3721,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// print url filters. HACKy...
//
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
g_parms.sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
@ -3719,7 +3732,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// end HACKy hack
//
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"</form>"
"</div>"
@ -3747,7 +3760,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show simpler url filters table
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
/*
sb.safePrintf ( "<table>"
"<tr><td colspan=2>"
@ -3783,7 +3796,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show reset and delete crawl buttons
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf(
"<table cellpadding=5>"
"<tr>"
@ -3846,13 +3859,13 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// the ROOT JSON }
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("}\n");
char *ct = "text/html";
if ( fmt == FMT_JSON ) ct = "application/json";
if ( fmt == FMT_XML ) ct = "text/xml";
if ( fmt == FMT_CSV ) ct = "text/csv";
if ( fmt == FORMAT_JSON ) ct = "application/json";
if ( fmt == FORMAT_XML ) ct = "text/xml";
if ( fmt == FORMAT_CSV ) ct = "text/csv";
// this could be in html json or xml
return g_httpServer.sendDynamicPage ( socket,
@ -3946,7 +3959,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// . do not add dups into m_diffbotSeeds safebuf
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
long isInSeedBuf ( CollectionRec *cr , Url *url ) {
long isInSeedBuf ( CollectionRec *cr , char *url, int len ) {
HashTableX *ht = &cr->m_seedHashTable;
@ -3973,7 +3986,7 @@ long isInSeedBuf ( CollectionRec *cr , Url *url ) {
}
// is this url in the hash table?
long long u64 = hash64 ( url->getUrl() , url->getUrlLen() );
long long u64 = hash64 ( url, len );
if ( ht->isInTable ( &u64 ) ) return 1;
@ -4072,7 +4085,7 @@ bool getSpiderRequestMetaList ( char *doc ,
if ( ! cr ) continue;
// do not add dups into m_diffbotSeeds safebuf
long status = isInSeedBuf ( cr , &url );
long status = isInSeedBuf ( cr , saved , end - saved );
// error?
if ( status == -1 ) {
@ -4129,7 +4142,7 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
char *json = hr->getString("json");
if ( ! json )
return sendReply2 ( socket,
FMT_JSON,
FORMAT_JSON,
"No &json= provided in request.");
@ -4138,12 +4151,12 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
// wtf?
if ( ! status )
return sendReply2 ( socket, FMT_JSON,
return sendReply2 ( socket, FORMAT_JSON,
"Error with JSON parser.");
// error adding it?
if ( ! cr )
return sendReply2 ( socket,FMT_JSON,
return sendReply2 ( socket,FORMAT_JSON,
"Failed to create new collection.");
ji = JP.getFirstItem();

View File

@ -561,7 +561,7 @@ bool processLoop ( void *state ) {
// . save the ips.txt file if we are the test coll
// . saveTestBuf() is a function in Msge1.cpp
CollectionRec *cr = xd->getCollRec();
if ( xd && cr && cr->m_coll && ! strcmp ( cr->m_coll,"test") )
if ( xd && cr && cr->m_coll && !strcmp(cr->m_coll,"qatest123"))
// use same dir that XmlDoc::getTestDir() would use
saveTestBuf ( "test-page-parser" );
// now get the meta list, in the process it will print out a
@ -855,7 +855,7 @@ bool gotXmlDoc ( void *state ) {
// . save the ips.txt file if we are the test coll
// . saveTestBuf() is a function in Msge1.cpp
//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "test"))
//if ( xd && xd->m_coll && ! strcmp ( xd->m_coll , "qatest123"))
// // use same dir that XmlDoc::getTestDir() would use
// saveTestBuf ( "test-page-parser" );

View File

@ -985,7 +985,7 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("\"currentTimeUTC\":%lu,\n", (long)(globalNowMS/1000));
}
// show response time
// show response time if not doing Quality Assurance
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<responseTimeMS>%lli</responseTimeMS>\n",
st->m_took);
@ -2148,8 +2148,13 @@ bool printResult ( State0 *st, long ix ) {
// so fix that shit here...
//float f = mr->m_lastSpidered;
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
sb->safePrintf(",\"lastCrawlTimeUTC\":%li}\n",
sb->safePrintf(",\"lastCrawlTimeUTC\":%li\n",
mr->m_lastSpidered);
// also include a timestamp field with an RFC 1123 formatted date
char timestamp[50];
struct tm *ptm = gmtime ( &mr->m_lastSpidered );
strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm);
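// e.g. yields "Sun, 06 Apr 2014 21:03:13 GMT" (the exact rendering of
// %X/%Z depends on the C locale and platform)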
sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp);
}
//mr->size_content );

View File

@ -169,6 +169,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("</form>\n");
sb.safePrintf("<br>\n");
sb.safePrintf("\n");
// print any red boxes we might need to
if ( printRedBox2 ( &sb , true ) )
sb.safePrintf("<br>\n");
sb.safePrintf("<table cellpadding=3>\n");
sb.safePrintf("\n");

Pages.cpp (152 changed lines)
View File

@ -50,6 +50,9 @@ static WebPage s_pages[] = {
"dummy page - if set in the users row then user will have master=0 and "
" collection links will be highlighted in red",
NULL, 0 },
//{ PAGE_QUALITY , "quality", 0, "quality", 0, 0,
// "dummy page - if set in the users row then \"Quality Control\""
// " will be printed besides the logo for certain pages",
@ -102,12 +105,66 @@ static WebPage s_pages[] = {
// "Basic diffbot page.", sendPageBasicDiffbot , 0 } ,
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 ,
"Basic security page.", sendPageGeneric , 0 } ,
{ PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
"Basic search page.", sendPageRoot , 0 } ,
{ PAGE_MASTER , "admin/master" , 0 , "master controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"master controls page",
sendPageGeneric , 0 } ,
{ PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 } ,
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
//USER_ADMIN | USER_MASTER | USER_PROXY ,
"spider controls page",
sendPageGeneric , 0 } ,
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY,
"log page",
sendPageGeneric , 0 } ,
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"advanced security page",
sendPageGeneric , 0 } ,
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
//USER_MASTER ,
"add a new collection using this page",
sendPageAddColl , 0 } ,
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
//USER_MASTER ,
"delete a collection using this page",
sendPageDelColl , 0 } ,
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
//USER_MASTER ,
"repair page",
sendPageGeneric , 0 },
{ PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
"what sites can be spidered",
sendPageGeneric , 0 } , // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 } ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
sendPageInject , 2 } ,
// this is the addurl page for the admin!
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
"add url page for admin",
sendPageAddUrl2 , 0 } ,
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
//USER_ADMIN | USER_MASTER,
"reindex url page",
sendPageReindex , 0 } ,
{ PAGE_HOSTS , "admin/hosts" , 0 , "hosts" , 0 , 0 ,
//USER_MASTER | USER_PROXY,
@ -134,10 +191,7 @@ static WebPage s_pages[] = {
//USER_MASTER | USER_PROXY,
"sockets page",
sendPageSockets , 0 } ,
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY,
"log page",
sendPageGeneric , 0 } ,
{ PAGE_LOGVIEW , "admin/logview" , 0 , "log view" , 0 , 0 ,
//USER_MASTER ,
"logview page",
@ -147,18 +201,6 @@ static WebPage s_pages[] = {
// "sync page",
// sendPageGeneric , 0 } ,
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"advanced security page",
sendPageGeneric , 0 } ,
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
//USER_MASTER ,
"add a new collection using this page",
sendPageAddColl , 0 } ,
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
//USER_MASTER ,
"delete a collection using this page",
sendPageDelColl , 0 } ,
{ PAGE_AUTOBAN ,"admin/autoban" , 0 , "autoban" , 1 , 1 ,
//USER_MASTER | USER_PROXY ,
"autobanned ips",
@ -175,10 +217,6 @@ static WebPage s_pages[] = {
//USER_MASTER ,
"threads page",
sendPageThreads , 0 },
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
//USER_MASTER ,
"repair page",
sendPageGeneric , 0 },
//{ PAGE_THESAURUS, "admin/thesaurus", 0 , "thesaurus", 0 , 0 ,
// //USER_MASTER ,
// "thesaurus page",
@ -207,14 +245,6 @@ static WebPage s_pages[] = {
"titledb page",
sendPageTitledb , 2 } ,
// 1 = usePost
{ PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 } ,
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
//USER_ADMIN | USER_MASTER | USER_PROXY ,
"spider controls page",
sendPageGeneric , 0 } ,
{ PAGE_CRAWLBOT , "crawlbot" , 0 , "crawlbot" , 1 , 0,
"simplified spider controls page",
@ -229,30 +259,6 @@ static WebPage s_pages[] = {
// "spider priorities page",
// sendPageGeneric , 0 } ,
{ PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
"what sites can be spidered",
sendPageGeneric , 0 } , // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 } ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
sendPageInject , 2 } ,
// this is the addurl page for the admin!
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
"add url page for admin",
sendPageAddUrl2 , 0 } ,
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
//USER_ADMIN | USER_MASTER,
"reindex url page",
sendPageReindex , 0 } ,
//{ PAGE_KEYWORDS, "admin/queries",0,"queries" , 0 , 1 ,
// "get queries a url matches",
// sendPageMatchingQueries , 2 } ,
@ -893,8 +899,6 @@ bool Pages::getNiceness ( long page ) {
return s_pages[page].m_niceness;
}
bool printRedBox ( SafeBuf *mb ) ;
///////////////////////////////////////////////////////////
//
// Convenient html printing routines
@ -1056,6 +1060,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//if ( page == PAGE_BASIC_DIFFBOT ) isBasic = true;
//if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
if ( page == PAGE_BASIC_SECURITY ) isBasic = true;
if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
//
// print breadcrumb. main > Basic > Settings
@ -1791,7 +1796,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
// is this page basic?
bool pageBasic = false;
if ( i >= PAGE_BASIC_SETTINGS &&
i <= PAGE_BASIC_SECURITY )
i <= PAGE_BASIC_SEARCH )
pageBasic = true;
// print basic pages under the basic menu, advanced pages
@ -2627,9 +2632,18 @@ bool sendPageLogin ( TcpSocket *socket , HttpRequest *hr ) {
NULL);// cookie
}
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage ) {
SafeBuf mb;
// return false if no red box
if ( ! printRedBox ( &mb , isRootWebPage ) ) return false;
// otherwise, print it
sb->safeStrcpy ( mb.getBufStart() );
// return true since we printed one
return true;
}
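// note: printRedBox2() is the call made above in printWebHomePage() to
// decide whether the red notice box appears on the root search page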
// emergency message box
bool printRedBox ( SafeBuf *mb ) {
bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) {
PingServer *ps = &g_pingServer;
@ -2649,11 +2663,33 @@ bool printRedBox ( SafeBuf *mb ) {
char *boxEnd =
"</td></tr></table>";
bool adds = false;
long adds = 0;
mb->safePrintf("<div style=max-width:500px;>");
// are we just starting off? give them a little help.
CollectionRec *cr = g_collectiondb.getRec("main");
if ( g_collectiondb.m_numRecs == 1 &&
cr &&
isRootWebPage &&
cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
mb->safePrintf("%s",box);
mb->safePrintf("Welcome to Gigablast. The most powerful "
"search engine you can legally download. "
"Please add the websites you want to spider "
"<a href=/admin/settings?c=main>here</a>."
);
mb->safePrintf("%s",boxEnd);
}
if ( isRootWebPage ) {
mb->safePrintf("</div>");
return (bool)adds;
}
if ( g_conf.m_numConnectIps == 0 && g_conf.m_numMasterPwds == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
@ -2738,5 +2774,5 @@ bool printRedBox ( SafeBuf *mb ) {
mb->safePrintf("</div>");
return adds;
return (bool)adds;
}

Pages.h (31 changed lines)
View File

@ -5,6 +5,9 @@
#ifndef _PAGES_H_
#define _PAGES_H_
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage = false ) ;
bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ;
// for PageEvents.cpp and Accessdb.cpp
//#define RESULTSWIDTHSTR "550px"
@ -304,25 +307,36 @@ enum {
//PAGE_BASIC_SEARCH , // TODO
//PAGE_BASIC_DIFFBOT , // TODO
PAGE_BASIC_SECURITY ,
PAGE_BASIC_SEARCH ,
// master admin pages
PAGE_MASTER ,
PAGE_SEARCH ,
PAGE_SPIDER ,
PAGE_LOG ,
PAGE_SECURITY ,
PAGE_ADDCOLL ,
PAGE_DELCOLL ,
PAGE_REPAIR ,
PAGE_SITES , // site filters
PAGE_FILTERS ,
PAGE_INJECT ,
PAGE_ADDURL2 ,
PAGE_REINDEX ,
PAGE_HOSTS ,
PAGE_STATS , // 10
PAGE_STATSDB ,
PAGE_PERF ,
PAGE_SOCKETS ,
PAGE_LOG ,
PAGE_LOGVIEW ,
// PAGE_SYNC ,
PAGE_SECURITY ,
PAGE_ADDCOLL ,
PAGE_DELCOLL ,
PAGE_AUTOBAN , // 20
//PAGE_SPIDERLOCKS ,
PAGE_PROFILER ,
PAGE_THREADS ,
PAGE_REPAIR ,
// PAGE_THESAURUS ,
// . non master-admin pages (collection controls)
@ -335,16 +349,9 @@ enum {
PAGE_TITLEDB ,
//PAGE_STATSDB ,
PAGE_SEARCH ,
PAGE_SPIDER ,
PAGE_CRAWLBOT , // 35
PAGE_SPIDERDB ,
//PAGE_PRIORITIES , // priority queue controls
PAGE_SITES , // site filters
PAGE_FILTERS ,
PAGE_INJECT ,
PAGE_ADDURL2 ,
PAGE_REINDEX ,
//PAGE_KEYWORDS ,
PAGE_SEO ,
PAGE_ACCESS , //40

Parms.cpp (121 changed lines)
View File

@ -122,6 +122,40 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) ;
//
////////
// from PageBasic.cpp:
bool updateSiteListTables(collnum_t collnum,bool addSeeds,char *siteListArg);
bool CommandUpdateSiteList ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
if ( collnum < 0 ) {
log("parms: bad collnum for update site list");
g_errno = ENOCOLLREC;
return true;
}
// sanity
long dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize < 0 ) {
log("parms: bad site list size = %li bad!",dataSize);
g_errno = EBADENGINEER;
return true;
}
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
// get the sitelist
char *data = getDataFromParmRec ( rec );
// update it
updateSiteListTables ( collnum ,
true , // add NEW seeds?
data // entire sitelist
);
// now that we deduped the old site list with the new one for
// purposes of adding NEW seeds, we can do the final copy
cr->m_siteListBuf.set ( data );
return true;
}
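// note: this command is wired up below as the m_func handler for the
// "site list" parms (m->m_func = CommandUpdateSiteList)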
// . require user manually execute this to prevent us fucking up the data
// at first initially because of a bad hosts.conf file!!!
// . maybe put a red 'A' in the hosts table on the web page to indicate
@ -450,7 +484,7 @@ bool CommandParserTestInit ( char *rec ) {
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("test");
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// if we are not host 0, turn on spiders for testing
@ -470,7 +504,7 @@ bool CommandSpiderTestInit ( char *rec ) {
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("test");
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// if we are not host 0, turn on spiders for testing
@ -488,7 +522,7 @@ bool CommandSpiderTestCont ( char *rec ) {
g_conf.m_spideringEnabled = 1;
//g_conf.m_webSpideringEnabled = 1;
// turn on for test coll too
CollectionRec *cr = g_collectiondb.getRec("test");
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// turn on spiders
if ( cr ) cr->m_spideringEnabled = 1;
// done
@ -1888,7 +1922,7 @@ bool Parms::printParm ( SafeBuf* sb,
"value=\"%f\" "
// 3 was ok on firefox but need 6
// on chrome
"size=6>",cgi,*(float *)s);
"size=7>",cgi,*(float *)s);
}
else if ( t == TYPE_IP ) {
if ( m->m_max > 0 && j == jend )
@ -1896,7 +1930,7 @@ bool Parms::printParm ( SafeBuf* sb,
"size=12>",cgi);
else
sb->safePrintf ("<input type=text name=%s value=\"%s\" "
"size=6>",cgi,iptoa(*(long *)s));
"size=12>",cgi,iptoa(*(long *)s));
}
else if ( t == TYPE_LONG ) {
// just show the parm name and value if printing in json
@ -5080,6 +5114,27 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "init QA tests";
m->m_desc = "If initiated gb performs some integrity tests "
"to ensure injecting, spidering and searching works "
"properly. Uses ./test/ subdirectory. Injects "
"urls in ./test/inject.txt. Spiders urls "
"in ./test/spider.txt. "
"Each of those two files is essentially a simple format of "
"a url followed by the http reply received from the server "
"for that url. "
// TODO: generate these files
;
m->m_cgi = "qasptei";
m->m_type = TYPE_CMD;
m->m_func = CommandSpiderTestInit;
m->m_def = "1";
m->m_cast = 1;
m->m_group = 0;
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++;
m->m_title = "init parser test run";
m->m_desc = "If enabled gb injects the urls in the "
"./test-parser/urls.txt "
@ -7513,6 +7568,7 @@ void Parms::init ( ) {
m->m_flags = PF_TEXTAREA;
m++;
/*
// the new upload post submit button
m->m_title = "upload urls";
m->m_desc = "Upload your file of urls.";
@ -7521,6 +7577,7 @@ void Parms::init ( ) {
m->m_obj = OBJ_NONE;
m->m_type = TYPE_FILEUPLOADBUTTON;
m++;
*/
m->m_title = "strip sessionids";
m->m_desc = "Strip added urls of their session ids.";
@ -7570,6 +7627,7 @@ void Parms::init ( ) {
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"See <a href=#examples>example site list</a> below. "
"Gigablast uses the "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
@ -7578,8 +7636,7 @@ void Parms::init ( ) {
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"See <a href=#examples>example site list</a> below. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
m->m_cgi = "sitelist";
@ -7587,6 +7644,7 @@ void Parms::init ( ) {
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_SAFEBUF;
m->m_func = CommandUpdateSiteList;
m->m_def = "";
// rebuild urlfilters now will nuke doledb and call updateSiteList()
m->m_flags = PF_TEXTAREA | PF_DUP | PF_REBUILDURLFILTERS;
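For illustration, the site list entered here is just one site pattern per line, e.g.:

	example.com
	www.example.org
	http://blog.example.net/

These hosts are placeholders; the full pattern syntax is whatever the insitelist directive on the url filters page accepts, which this hunk does not show.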
@ -7608,6 +7666,7 @@ void Parms::init ( ) {
m++;
*/
/*
// the new upload post submit button
m->m_title = "upload site list";
m->m_desc = "Upload your file of site patterns. Completely replaces "
@ -7619,12 +7678,13 @@ void Parms::init ( ) {
m->m_type = TYPE_FILEUPLOADBUTTON;
m->m_flags = PF_NOSAVE | PF_DUP;
m++;
*/
m->m_title = "restart collection";
m->m_desc = "Remove all documents from this collection and starts "
"spidering over again. If you do this accidentally there "
"is a <a href=/admin.html#recover>recovery procedure</a> to "
"get back the trashed data.";
m->m_desc = "Remove all documents from this collection and restart "
"spidering.";// If you do this accidentally there "
//"is a <a href=/admin.html#recover>recovery procedure</a> to "
// "get back the trashed data.";
m->m_cgi = "restart";
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
@ -7638,6 +7698,7 @@ void Parms::init ( ) {
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"See <a href=#examples>example site list</a> below. "
"Gigablast uses the "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
@ -7646,8 +7707,7 @@ void Parms::init ( ) {
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"See <a href=#examples>example site list</a> below. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>addurl"
"</a> interface.";
m->m_cgi = "sitelist";
@ -7655,6 +7715,7 @@ void Parms::init ( ) {
m->m_page = PAGE_SITES;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_SAFEBUF;
m->m_func = CommandUpdateSiteList;
m->m_def = "";
// rebuild urlfilters now will nuke doledb and call updateSiteList()
m->m_flags = PF_TEXTAREA | PF_REBUILDURLFILTERS;
@ -8741,11 +8802,11 @@ void Parms::init ( ) {
m++;
m->m_title = "max robots.txt cache age";
m->m_desc = "How many second to cache a robots.txt file for. "
m->m_desc = "How many seconds to cache a robots.txt file for. "
"86400 is 1 day. 0 means Gigablast will not read from the "
"cache at all and will download the robots.txt before every "
"page if robots.txt use is enabled above. However, if this is "
"0 then Gigablast will still store robots.txt files into the "
"0 then Gigablast will still store robots.txt files in the "
"cache.";
m->m_cgi = "mrca";
m->m_off = (char *)&cr.m_maxRobotsCacheAge - x;
@ -10618,8 +10679,9 @@ void Parms::init ( ) {
m++;
m->m_title = "do query expansion";
m->m_desc = "Query expansion will include word stems and synonyms in "
"its search results.";
m->m_desc = "If enabled, query expansion will expand your query "
"to include word stems and "
"synonyms of the query terms.";
m->m_def = "1";
m->m_off = (char *)&cr.m_queryExpansion - x;
m->m_soff = (char *)&si.m_queryExpansion - y;
@ -10632,7 +10694,7 @@ void Parms::init ( ) {
// more general parameters
m->m_title = "max search results";
m->m_desc = "What is the limit to the total number "
m->m_desc = "What is the maximum total number "
"of returned search results.";
m->m_cgi = "msr";
m->m_off = (char *)&cr.m_maxSearchResults - x;
@ -12436,7 +12498,7 @@ void Parms::init ( ) {
m++;
m->m_title = "max summary line width";
m->m_desc = "<br> tags are inserted to keep the number "
m->m_desc = "&lt;br&gt; tags are inserted to keep the number "
"of chars in the summary per line at or below this width. "
"Strings without spaces that exceed this "
"width are not split.";
@ -15299,6 +15361,18 @@ void Parms::init ( ) {
m->m_smin = 0;
m++;
// when we do &qa=1 we do not show things like responseTime in
// search results so we can verify serp checksum consistency for QA
// in qa.cpp
m->m_title = "quality assurance";
m->m_desc = "This is 1 if doing a QA test in qa.cpp";
m->m_def = "0";
m->m_soff = (char *)&si.m_qa - y;
m->m_type = TYPE_CHAR;
m->m_sparm = 1;
m->m_scgi = "qa";
m++;
//m->m_title = "show turk forms";
//m->m_desc = "If enabled summaries in search results will be "
// "turkable input forms.";
@ -16744,7 +16818,6 @@ bool Parms::addCurrentParmToList2 ( SafeBuf *parmList ,
return true;
}
// returns false and sets g_errno on error
bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
long page ){
@ -18019,7 +18092,11 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
}
// cmd to execute?
if ( parm->m_type == TYPE_CMD ) {
if ( parm->m_type == TYPE_CMD ||
// sitelist is a safebuf but it requires special deduping
// logic to update it so it uses CommandUpdateSiteList() to
// do the updating
parm->m_func ) {
// all parm rec data for TYPE_CMD should be ascii/utf8 chars
// and should be \0 terminated
char *data = getDataFromParmRec ( rec );
@ -18268,7 +18345,7 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
CollectionRec *cr = (CollectionRec *)THIS;
// if testUrl is provided, find in the table
char testUrl [ 1025 ];
char *tt = r->getString ( "test" , NULL );
char *tt = r->getString ( "qatest123" , NULL );
testUrl[0]='\0';
if ( tt ) strncpy ( testUrl , tt , 1024 );
char *tu = testUrl;

View File

@ -5158,7 +5158,7 @@ char *Proxy::storeLoginBar ( char *reply ,
}
// point to first digit in there
mp += 16;
// store our new content length as ascii into "test" buf
// store our new content length as ascii into test buf
char test[64];
long len = sprintf(test,"%li",(long)(newReplySize-mimeLen));
// find end

View File

@ -60,6 +60,7 @@ struct SafeBuf {
long fillFromFile(char *filename);
long fillFromFile(char *dir,char *filename);
long load(char *dir,char *fname) { return fillFromFile(dir,fname);};
long load(char *fname) { return fillFromFile(fname);};
void filterTags();
void filterQuotes();
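The new one-argument load() overload is what qa.cpp (added later in this commit) uses to slurp ./injectme3 into memory. A minimal usage sketch; treating a negative return as failure is an assumption here, since fillFromFile()'s return convention is not shown in this hunk.

	// sketch: read a whole file into a SafeBuf with the new overload
	SafeBuf sb;
	if ( sb.load ( "./injectme3" ) < 0 )
		log("qa: could not load ./injectme3");
	// start of the loaded contents
	char *p = sb.getBufStart();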

View File

@ -179,6 +179,9 @@ class SearchInput {
long m_queryMatchOffsets;
long m_summaryMode;
// are we doing a QA query for quality assurance consistency
char m_qa;
float m_pqr_demFactSubPhrase;
float m_pqr_demFactCommonInlinks;
float m_pqr_demFactLocTitle;

View File

@ -1288,7 +1288,7 @@ bool Sections::set ( Words *w ,
}
m_isTestColl = ! strcmp(m_coll,"test") ;
m_isTestColl = ! strcmp(m_coll,"qatest123") ;
//
//
@ -15163,7 +15163,7 @@ bool Sections::printVotingInfoInJSON ( SafeBuf *sb ) {
// breathe
QUICKPOLL ( m_niceness );
// print this section
printSectionDiv ( sk , FMT_JSON ); // forProCog );
printSectionDiv ( sk , FORMAT_JSON ); // forProCog );
// advance
long b = sk->m_b;
// stop if last
@ -15190,7 +15190,8 @@ bool Sections::print2 ( SafeBuf *sbuf ,
HashTableX *st2 ,
HashTableX *tt ,
Addresses *aa ,
char format ) { // bool forProCog ){//FMT_PROCOG FMT_JSON HTML
char format ) { // bool forProCog ){
//FORMAT_PROCOG FORMAT_JSON HTML
//sbuf->safePrintf("<b>Sections in Document</b>\n");
@ -15244,7 +15245,7 @@ bool Sections::print2 ( SafeBuf *sbuf ,
sk = m_sectionPtrs[b];
}
if ( format != FMT_HTML ) return true; // forProCog
if ( format != FORMAT_HTML ) return true; // forProCog
// print header
char *hdr =
@ -15553,7 +15554,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
// m_sbuf->safePrintf("A=%li ",sk->m_a);
if ( format == FMT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
if ( format == FORMAT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
// do not count our own site!
m_sbuf->safePrintf("<i>"
"<font size=-1>"
@ -15573,7 +15574,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
m_sbuf->safePrintf("<i>");
if ( format == FMT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
if ( format == FORMAT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
sec_t f = sk->m_flags;
//if ( f & SEC_SENTENCE )
// m_sbuf->safePrintf("sentence " );
@ -15598,7 +15599,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
// m_sbuf->safePrintf("notdupvotes=%li ",
// sk->m_votesForNotDup);
if ( format != FMT_PROCOG ) {
if ( format != FORMAT_PROCOG ) {
// print the flags
m_sbuf->safePrintf("A=%li ",sk->m_a);

View File

@ -146,8 +146,8 @@ bool SiteGetter::getSite ( char *url ,
long age = -1;
//long now = getTimeGlobal();
//if ( tag ) age = now - tag->m_timestamp;
// to parse conssitently for the qa test "test" coll use "timestamp"
// as the "current time"
// to parse consistently for the qa test "qatest123" coll use
// "timestamp" as the "current time"
if ( tag ) age = timestamp - tag->m_timestamp;
// if there, at least get it (might be -1)
if ( tag ) m_oldSitePathDepth = atol ( tag->getTagData() );
@ -534,7 +534,7 @@ bool SiteGetter::setSite ( ) {
//TagRec gr;
m_addedTag.addTag ( "sitepathdepth" ,
// now XmlDoc must provide it to ensure that are
// injects into the "test" coll are consistent
// injects into the "qatest123" coll are consistent
m_timestamp ,//getTime()// use now as timestamp
"sitegit" , // username
0 , // ip

View File

@ -1082,7 +1082,7 @@ SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
// save this
strcpy ( sc->m_coll , cr->m_coll );
// set this
if ( ! strcmp ( cr->m_coll,"test" ) ) sc->m_isTestColl = true;
if ( ! strcmp ( cr->m_coll,"qatest123" ) ) sc->m_isTestColl = true;
else sc->m_isTestColl = false;
// set first doledb scan key
@ -6761,12 +6761,12 @@ bool SpiderLoop::spiderUrl2 ( ) {
char *coll = "collnumwasinvalid";
if ( cr ) coll = cr->m_coll;
// . pass in a pbuf if this is the "test" collection
// . pass in a pbuf if this is the "qatest123" collection
// . we will dump the SafeBuf output into a file in the
// test subdir for comparison with previous versions of gb
// in order to see what changed
SafeBuf *pbuf = NULL;
if ( !strcmp( coll,"test") && g_conf.m_testParserEnabled )
if ( !strcmp( coll,"qatest123") && g_conf.m_testParserEnabled )
pbuf = &xd->m_sbuf;
//
@ -6969,10 +6969,10 @@ bool SpiderLoop::indexedDoc ( XmlDoc *xd ) {
bool respider = false;
if ( xd->m_oldDocValid && xd->m_oldDoc ) respider = true;
// . dump it out to a file in the "test" subdir
// . dump it out to a file in the "qatest123" subdir
// . but only the first time we spider it...
/*
if ( ! strcmp(xd->m_coll,"test") && ! respider &&
if ( ! strcmp(xd->m_coll,"qatest123") && ! respider &&
// no longer need this when qa testing spider, not parser
g_conf.m_testParserEnabled ) {
// save the buffers
@ -12414,7 +12414,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( cx->m_isCustomCrawl )
return msg->safePrintf("Job is in progress.");
else
return true;
return msg->safePrintf("Spider is in progress.");
}
// pattern is a ||-separted list of substrings

View File

@ -3103,7 +3103,7 @@ void TagRec::gotAllReplies ( ) {
// site getter sometimes adds recs to tagdb to add in a new subsite
// it finds... i'd imagine this will create a parsing inconsistency
// when injecting docs into the "test" coll... but oh well!
// when injecting docs into the "qatest123" coll... but oh well!
long timestamp = getTimeGlobal();
// . begin the "inheritance loop"
@ -3288,7 +3288,7 @@ bool Msg9a::addTags ( char *sites ,
// when we add the "site" tag to it use the timestamp from one
// of the tags we are adding... therefore we must require there be
// some tags! we do this to insure injection consistency into the
// "test" collection.
// "qatest123" collection.
if ( ! tagRec || tagRec->getNumTags() <= 0 ) { char *xx=NULL;*xx=0; }
// use the first timestamp

View File

@ -55,7 +55,7 @@ bool Test::init ( ) {
}
void Test::reset ( ) {
if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test");
if ( m_urlBuf ) mfree ( m_urlBuf , m_urlEnd - m_urlBuf , "test999");
//m_spiderLinks = true;//false;
m_bypassMenuElimination = false;
}
@ -122,7 +122,7 @@ void Test::removeFiles ( ) {
long saved = g_conf.m_useQuickpoll;
g_conf.m_useQuickpoll = false;
CollectionRec *cr = g_collectiondb.getRec("test");
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// . reset the qatest collection to zero docs
// . TODO: implement this. only allow it for qatest coll.
@ -172,8 +172,8 @@ void Test::initTestRun ( ) {
//if ( m_testSpiderEnabledSaved ) return;
//if ( m_testParserEnabledSaved ) return;
// you must have the "test" coll already setup!
CollectionRec *cr = g_collectiondb.getRec("test");
// you must have the "qatest123" coll already setup!
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) {
// note it
log("test: please add a collection named \"test\" first.");
@ -233,7 +233,7 @@ void Test::initTestRun ( ) {
// save it
m_runId = i;
cr = g_collectiondb.getRec ( "test" );
cr = g_collectiondb.getRec ( "qatest123" );
if ( ! cr ) {
// and no more of this
g_conf.m_testParserEnabled = false;

View File

@ -71,6 +71,7 @@ void Title::reset() {
mfree ( m_title , m_titleAllocSize , "Title" );
m_title = NULL;
m_titleBytes = 0;
m_titleAllocSize = 0;
m_query = NULL;
m_titleTagStart = -1;
m_titleTagEnd = -1;
@ -113,7 +114,7 @@ bool Title::setTitle ( XmlDoc *xd ,
char *val = NULL;
// look for the "title:" field in json then use that
SafeBuf jsonTitle;
long vlen;
long vlen = 0;
if ( xd->m_contentType == CT_JSON ) {
char *jt;
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
@ -124,7 +125,6 @@ bool Title::setTitle ( XmlDoc *xd ,
val = jsonTitle.getBufStart();
vlen = jsonTitle.length();
}
}
// if we had a title: field in the json...
if ( val && vlen > 0 ) {
@ -135,6 +135,7 @@ bool Title::setTitle ( XmlDoc *xd ,
else {
dst = (char *)mmalloc ( m_titleBytes+1,"titdst" );
if ( ! dst ) return false;
m_titleAllocSize = m_titleBytes+1;
}
m_title = dst;
memcpy ( dst , val , m_titleBytes );
@ -142,6 +143,13 @@ bool Title::setTitle ( XmlDoc *xd ,
return true;
}
// json content, if has no explicit title field, has no title then
if ( xd->m_contentType == CT_JSON ) {
m_localBuf[0] = '\0';
m_title = m_localBuf;
m_titleBytes = 0;
return true;
}
bool status = setTitle4 ( xd ,
xml ,

View File

@ -879,8 +879,8 @@ bool XmlDoc::set1 ( char *url ,
char *XmlDoc::getTestDir ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// return NULL if we are not the "test" collection
if ( strcmp(cr->m_coll,"test") ) return NULL;
// return NULL if we are not the "qatest123" collection
if ( strcmp(cr->m_coll,"qatest123") ) return NULL;
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
// then return "test-spider" otherwise...
if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
@ -914,7 +914,7 @@ long XmlDoc::getSpideredTime ( ) {
if ( ! cr ) return 0;
// if not test collection keep it simple
if ( strcmp(cr->m_coll,"test") ) {
if ( strcmp(cr->m_coll,"qatest123") ) {
// . set spider time to current time
// . this might already be valid if we set it in
// getTestSpideredDate()
@ -3295,13 +3295,13 @@ char *XmlDoc::prepareToMakeTitleRec ( ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// if we are injecting into the "test" coll, then we need to have
// if we are injecting into the "qatest123" coll, then we need to have
// m_spideredTimeValid be true before calling getIsSpam() which calls
// getSiteNumInlinks() which adds tags to tagdb using that date, but
// only for the "test" coll! that keeps our parser output consistent
// across runs!
// only for the "qatest123" coll!
// that keeps our parser output consistent across runs!
char **content = NULL;
if ( ! strcmp ( cr->m_coll,"test") ) {
if ( ! strcmp ( cr->m_coll,"qatest123") ) {
content = getContent ( );
if ( ! content || content == (void *)-1 )
return (char *)content;
@ -11842,7 +11842,7 @@ long *XmlDoc::getSiteNumInlinks ( ) {
// current time
long now = getTimeGlobal();
// use the spidered time for the test collection for consistency
if ( !strcmp(cr->m_coll,"test") ) {
if ( !strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
now = getSpideredTime();//m_spideredTime;
}
@ -12061,8 +12061,8 @@ LinkInfo *XmlDoc::getSiteLinkInfo() {
// get from spider request if there
//bool injected = false;
//if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true;
// but be consistent if doing the "test" collection
if ( ! strcmp(cr->m_coll,"test") ) {
// but be consistent if doing the "qatest123" collection
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
lastUpdateTime = getSpideredTime();//m_spideredTime;
}
@ -12164,14 +12164,14 @@ long *XmlDoc::getIp ( ) {
if ( ! cr ) return NULL;
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// when building the "test" collection try to get the ip from
// when building the "qatest123" collection try to get the ip from
// "./test/ips.txt" so our injections are consistent every time
// Test.cpp runs its injection loop into the "test" collection
// Test.cpp runs its injection loop into the "qatest123" collection
if ( useTestCache ) { // && m_useIpsTxtFile ) {
// stolen from msgc.cpp:
// if url is already in a.b.c.d format return that
@ -12204,7 +12204,7 @@ long *XmlDoc::getIp ( ) {
// this basically slows the spider down.
long delay = cr->m_spiderDelayInMilliseconds;
// ignore for testing
if ( ! strcmp(cr->m_coll,"test") ) delay = 0;
if ( ! strcmp(cr->m_coll,"qatest123") ) delay = 0;
// injected?
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
@ -12281,14 +12281,14 @@ long *XmlDoc::gotIp ( bool save ) {
if ( ! cr ) return NULL;
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
// when building the "test" collection try to get the ip from
// when building the "qatest123" collection try to get the ip from
// "./test/ips.txt" so our injections are consistent every time
// Test.cpp runs its injection loop into the "test" collection
// Test.cpp runs its injection loop into the "qatest123" collection
if ( save && useTestCache ) {
// ip of 0 means NXDOMAIN i think (-1 means error)
//if ( m_ip == 0 ) {
@ -12592,8 +12592,8 @@ bool *XmlDoc::getIsAllowed ( ) {
return &m_isAllowed;
}
// or if using the "test" collection, assume yes!
//if ( ! strcmp ( m_coll , "test" ) ) {
// or if using the "qatest123" collection, assume yes!
//if ( ! strcmp ( m_coll , "qatest123" ) ) {
// m_isAllowed = true;
// m_isAllowedValid = true;
// return &m_isAllowed;
@ -12939,8 +12939,8 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
if ( ! m_calledMsg25 ) {
// get this
long lastUpdateTime = getTimeGlobal();
// but be consistent if doing the "test" collection
if ( ! strcmp(cr->m_coll,"test") ) {
// but be consistent if doing the "qatest123" collection
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;}
lastUpdateTime = getSpideredTime();//m_spideredTime;
}
@ -14184,7 +14184,7 @@ char **XmlDoc::getHttpReply ( ) {
// come back up here if a redirect invalidates it
loop:
// sanity test -- only if not the test collection (NO, might be EBADIP)
//if ( m_indexCode && strcmp(m_coll,"test") ) { char *xx=NULL;*xx=0; }
//if ( m_indexCode && strcmp(m_coll,"qatest123")){char*xx=NULL;*xx=0;}
// get the http reply
char **replyPtr = getHttpReply2();
if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr;
@ -14382,7 +14382,7 @@ char **XmlDoc::getHttpReply2 ( ) {
// return gotHttpReply ( );
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"test") ) useTestCache = true;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
//if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false;
@ -14474,11 +14474,12 @@ char **XmlDoc::getHttpReply2 ( ) {
// turn off
r->m_useCompressionProxy = false;
r->m_compressReply = false;
r->m_isCustomCrawl = cr->m_isCustomCrawl;
// set it for this too
if ( g_conf.m_useCompressionProxy &&
// do not use for the test collection ever, that is qa'ing
strcmp(cr->m_coll,"test") ) {
strcmp(cr->m_coll,"qatest123") ) {
r->m_useCompressionProxy = true;
r->m_compressReply = true;
}
@ -14539,7 +14540,7 @@ char **XmlDoc::getHttpReply2 ( ) {
// . msg13 uses XmlDoc::getHttpReply() function to handle
// redirects, etc.? no...
bool isTestColl = false;
if ( ! strcmp(cr->m_coll,"test") ) isTestColl = true;
if ( ! strcmp(cr->m_coll,"qatest123") ) isTestColl = true;
// sanity check. keep injections fast. no downloading!
if ( m_wasInjected ) {
@ -14613,7 +14614,7 @@ char **XmlDoc::gotHttpReply ( ) {
// . i.e. what are you doing downloading the page if there was
// a problem with the page we already know about
if ( m_indexCode && m_indexCodeValid &&
strcmp(cr->m_coll,"test") ) { char *xx=NULL;*xx=0; }
strcmp(cr->m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
// fix this
if ( saved == EDOCUNCHANGED ) {
@ -17207,6 +17208,8 @@ long *XmlDoc::getContentHashJson32 ( ) {
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
char *topName = NULL;
// what name level are we?
long numNames = 1;
JsonItem *pi = ji->m_parent;
@ -17214,6 +17217,7 @@ long *XmlDoc::getContentHashJson32 ( ) {
// empty name?
if ( ! pi->m_name ) continue;
if ( ! pi->m_name[0] ) continue;
topName = pi->m_name;
numNames++;
}
@ -17232,6 +17236,22 @@ long *XmlDoc::getContentHashJson32 ( ) {
strcmp(ji->m_name,"resolved_url") == 0 )
continue;
if ( topName && strcmp(topName,"stats") == 0 )
continue;
if ( topName && strcmp(topName,"queryString") == 0 )
continue;
if ( topName && strcmp(topName,"nextPages") == 0 )
continue;
if ( topName && strcmp(topName,"textAnalysis") == 0 )
continue;
if ( topName && strcmp(topName,"links") == 0 )
continue;
// hash the fully compound name
long nameHash32 = 0;
JsonItem *p = ji;
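The skips above amount to a small predicate over the top-level json name, so that two diffbot replies differing only in these volatile objects produce the same 32-bit content hash. A standalone sketch of that idea (illustrative only, not code from XmlDoc.cpp):

	// names of top-level json objects whose subfields are excluded
	// from the content hash because they vary between otherwise
	// identical crawls
	static bool isVolatileTopName ( char *topName ) {
		if ( ! topName ) return false;
		static char *s_skip[] = { "stats", "queryString",
					  "nextPages", "textAnalysis",
					  "links" };
		long n = sizeof(s_skip)/sizeof(char *);
		for ( long i = 0 ; i < n ; i++ )
			if ( strcmp ( topName , s_skip[i] ) == 0 )
				return true;
		return false;
	}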
@ -17607,7 +17627,7 @@ long **XmlDoc::getOutlinkFirstIpVector () {
if ( ! cr ) return NULL;
// . go get it
// . if coll is "test" then try to use the file ./test/ips.txt to
// . if coll is "qatest123" then try to use the file ./test/ips.txt to
// see if the ip is in there for the given url hostname
// . this will now update Tagdb with the "firstip" tags if it should!!
// . this just dns looks up the DOMAINS of each outlink because these
@ -17747,7 +17767,7 @@ long *XmlDoc::getUrlFilterNum ( ) {
// . look it up
// . use the old spidered date for "nowGlobal" so we can be consistent
// for injecting into the "test" coll
// for injecting into the "qatest123" coll
long ufn = ::getUrlFilterNum ( oldsr,&fakeReply,spideredTime,false,
m_niceness,cr,
false, // isOutlink?
@ -18754,7 +18774,7 @@ bool XmlDoc::doConsistencyTest ( bool forceTest ) {
return true;
// if not test coll skip this
//if ( strcmp(cr->m_coll,"test") ) return true;
//if ( strcmp(cr->m_coll,"qatest123") ) return true;
// title rec is null if we are reindexing an old doc
// and "unchanged" was true.
@ -19200,7 +19220,7 @@ void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) {
else if ( rdbId == RDB_TITLEDB ) {
//XmlDoc tr;
//SafeBuf tmp;
//tr.set2 ( rec,recSize ,"test",&tmp,m_niceness);
//tr.set2 ( rec,recSize ,"qatest123",&tmp,m_niceness);
// print each offset and size for the variable crap
sb->safePrintf("<td><nobr>titlerec datasize=%li "
//"sizeofxmldoc=%li "
@ -19273,7 +19293,7 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
if ( ! cr ) return true;
// do not do this if not test collection for now
if ( strcmp(cr->m_coll,"test") ) return true;
if ( strcmp(cr->m_coll,"qatest123") ) return true;
// store each record in the list into the send buffers
for ( ; p < pend ; ) {
@ -22437,7 +22457,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
// . set other fields besides key
// . crap! if we are the "test" collection then m_spideredTime
// . crap! if we are the "qatest123" collection then m_spideredTime
// was read from disk usually and is way in the past! watch out!!
m_srep.m_spideredTime = getSpideredTime();//m_spideredTime;
@ -22447,7 +22467,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// crap, for the test coll this is often a very old time and it
// causes the spider request to be repeatedly executed, so let's
// fix that
if ( ! strcmp(cr->m_coll,"test") )
if ( ! strcmp(cr->m_coll,"qatest123") )
m_srep.m_spideredTime = getTimeGlobal();
@ -23031,7 +23051,7 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
if ( ! cr ) return NULL;
// do not do this if not test collection for now
bool isTestColl = (! strcmp(cr->m_coll,"test") );
bool isTestColl = (! strcmp(cr->m_coll,"qatest123") );
// turn off for now
isTestColl = false;
@ -30297,6 +30317,9 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
// . this now allows for commas in numbers like "1,500.62"
float f = atof2 ( p , bufEnd - p );
// debug
//log("build: hashing %s %f",hi->m_prefix,f);
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
return false;
@ -33687,7 +33710,7 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
long now = getTimeGlobal();
// actually, use spider download time if we can. that way
// Test.cpp's injection runs will be more consistent!
if ( ! strcmp(cr->m_coll,"test") ) {
if ( ! strcmp(cr->m_coll,"qatest123") ) {
//if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; }
now = getSpideredTime();//m_spideredTime;
}

View File

@ -1,416 +0,0 @@
# List of sites to spider, one per line. Gigablast uses the <a
# href=/admin/filters#insitelist>insitelist</a> directive on the <a
# href=/admin/filters>url filters</a> page to make sure that the spider only
# indexes urls that match the site patterns you specify here, other than urls
# you add individually via the add urls or inject url tools. See <a
# href=#examples>example site list</a> below. Limit list to 300MB. If you have
# a lot of INDIVIDUAL URLS to add then consider using the <a
# href=/admin/addurl>addurl</a> interface.
<siteList><![CDATA[]]></>
# All <, >, " and # characters that are values for a field contained herein
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
# Controls just the spiders for this collection.
<spideringEnabled>1</>
# What is the maximum number of web pages the spider is allowed to download
# simultaneously PER HOST for THIS collection?
<maxSpiders>100</>
# make each spider wait this many milliseconds before getting the ip and
# downloading the page.
<spiderDelayInMilliseconds>0</>
# If this is true Gigablast will respect the robots.txt convention.
<useRobotstxt>1</>
# How many second to cache a robots.txt file for. 86400 is 1 day. 0 means
# Gigablast will not read from the cache at all and will download the
# robots.txt before every page if robots.txt use is enabled above. However, if
# this is 0 then Gigablast will still store robots.txt files into the cache.
<maxRobotstxtCacheAge>86400</>
# Do a tight merge on posdb and titledb at this time every day. This is
# expressed in MINUTES past midnight UTC. UTC is 5 hours ahead of EST and 7
# hours ahead of MST. Leave this as -1 to NOT perform a daily merge. To merge
# at midnight EST use 60*5=300 and midnight MST use 60*7=420.
<dailyMergeTime>-1</>
# Comma separated list of days to merge on. Use 0 for Sunday, 1 for Monday,
# ... 6 for Saturday. Leaving this parmaeter empty or without any numbers will
# make the daily merge happen every day
<dailyMergeDays><![CDATA[0]]></>
# When the daily merge was last kicked off. Expressed in UTC in seconds since
# the epoch.
<dailyMergeLastStarted>-1</>
# If this is true, users will have to pass a simple Turing test to add a url.
# This prevents automated url submission.
<turingTestEnabled>0</>
# Maximum number of urls that can be submitted via the addurl interface, per
# IP domain, per 24 hour period. A value less than or equal to zero implies no
# limit.
<maxAddUrls>0</>
# When the spider round started
<spiderRoundStartTime>0</>
# The spider round number.
<spiderRoundNum>0</>
# When enabled, the spider will discard web pages which are identical to other
# web pages that are already in the index. However, root urls, urls that have
# no path, are never discarded. It most likely has to hit disk to do these
# checks so it does cause some slow down. Only use it if you need it.
<dedupingEnabled>0</>
# When enabled, the spider will discard web pages which, when a www is
# prepended to the page's url, result in a url already in the index.
<dedupingEnabledForWww>1</>
# Detect and do not index pages which have a 200 status code, but are likely
# to be error pages.
<detectCustomErrorPages>1</>
# Should pages be removed from the index if they are no longer accessible on
# the web?
<delete404s>1</>
# If this is true, the spider, when a url redirects to a "simpler" url, will
# add that simpler url into the spider queue and abandon the spidering of the
# current url.
<useSimplifiedRedirects>1</>
# If this is true, the spider, when updating a web page that is already in the
# index, will not even download the whole page if it hasn't been updated since
# the last time Gigablast spidered it. This is primarily a bandwidth saving
# feature. It relies on the remote webserver's returned Last-Modified-Since
# field being accurate.
<useIfModifiedSince>0</>
# If this is true, do not allow spammy inlinks to vote. This check is too
# aggressive for some collections, i.e. it does not allow pages with cgi in
# their urls to vote.
<doLinkSpamChecking>1</>
# If this is true Gigablast will only allow one vote per the top 2 significant
# bytes of the IP address. Otherwise, multiple pages from the same top IP can
# contribute to the link text and link-based quality ratings of a particular
# URL. Furthermore, no votes will be accepted from IPs that have the same top
# 2 significant bytes as the IP of the page being indexed.
<restrictLinkVotingByIp>1</>
# How often should Gigablast recompute the link info for a url. Also applies
# to getting the quality of a site or root url, which is based on the link
# info. In days. Can use decimals. 0 means to update the link info every time
# the url's content is re-indexed. If the content is not reindexed because it
# is unchanged then the link info will not be updated. When getting the link
# info or quality of the root url from an external cluster, Gigablast will
# tell the external cluster to recompute it if its age is this or higher.
<updateLinkInfoFrequency>60.000000</>
# If this is eabled the spider will not allow any docs which are determined to
# be serps.
<doSerpDetection>1</>
# If this is false then the filter will not be used on html or text pages.
<applyFilterToTextPages>0</>
# Program to spawn to filter all HTTP replies the spider receives. Leave blank
# for none.
<filterName><![CDATA[]]></>
# Kill filter shell after this many seconds. Assume it stalled permanently.
<filterTimeout>40</>
# Retrieve pages from the proxy at this IP address.
<proxyIp>0.0.0.0</>
# Retrieve pages from the proxy on this port.
<proxyPort>0</>
# Index the body of the documents so you can search it. Required for searching
# that. You wil pretty much always want to keep this enabled.
<indexBody>1</>
# Send every spidered url to this diffbot.com by appending a &url=<url> to it
# before trinyg to downloading it. We expect get get back a JSON reply which
# we index. You will need to supply your token to this as well.
<diffbotApiUrl><![CDATA[]]></>
# Get scoring information for each result so you can see how each result is
# scored? You must explicitly request this using &scores=1 for the XML feed
# because it is not included by default.
<getDocidScoringInfo>1</>
# Query expansion will include word stems and synonyms in its search results.
<doQueryExpansion>1</>
# What is the limit to the total number of returned search results.
<maxSearchResults>1000</>
# What is the limit to the total number of returned search results per query?
<maxSearchResultsPerQuery>100</>
# What is the maximum number of characters allowed in titles displayed in the
# search results?
<maxTitleLen>80</>
# Should search results be site clustered by default?
<siteClusterByDefault>1</>
# Hide all clustered results instead of displaying two results from each site.
<hideAllClusteredResults>0</>
# Should duplicate search results be removed by default?
<dedupResultsByDefault>1</>
# Should we dedup URLs with case insensitivity? This is mainly to correct
# duplicate wiki pages.
<dedupURLs>0</>
# If document summary is this percent similar to a document summary above it,
# then remove it from the search results. 100 means only to remove if exactly
# the same. 0 means no summary deduping.
<percentSimilarDedupSummary>90</>
# Sets the number of lines to generate for summary deduping. This is to help
# the deduping process not thorw out valid summaries when normally displayed
# summaries are smaller values. Requires percent similar dedup summary to be
# enabled.
<numberOfLinesToUseInSummaryToDedup>4</>
# Default language to use for ranking results. Value should be any language
# abbreviation, for example "en" for English.
<sortLanguagePreference><![CDATA[en]]></>
# Default country to use for ranking results. Value should be any country code
# abbreviation, for example "us" for United States.
<sortCountryPreference><![CDATA[us]]></>
# What is the maximum number of characters displayed in a summary for a search
# result?
<maxSummaryLen>512</>
# What is the maximum number of excerpts displayed in the summary of a search
# result?
<maxSummaryExcerpts>4</>
# What is the maximum number of characters allowed per summary excerpt?
<maxSummaryExcerptLength>300</>
# What is the default number of summary excerpts displayed per search result?
<defaultNumberOfSummaryExcerpts>3</>
# <br> tags are inserted to keep the number of chars in the summary per line
# at or below this width. Strings without spaces that exceed this width are
# not split.
<maxSummaryLineWidth>80</>
# Truncating this will miss out on good summaries, but performance will
# increase.
<bytesOfDocToScanForSummaryGeneration>70000</>
# Front html tag used for highlightig query terms in the summaries displated
# in the search results.
<frontHighlightTag><![CDATA[&lt;b style=&#34;color:black;background-color:&#035;ffff66&#34;&gt;]]></>
# Front html tag used for highlightig query terms in the summaries displated
# in the search results.
<backHighlightTag><![CDATA[&lt;/b&gt;]]></>
# How many search results should we scan for related topics (gigabits) per
# query?
<docsToScanForTopics>300</>
# Should Gigablast only get one document per IP domain and per domain for
# topic (gigabit) generation?
<ipRestrictionForTopics>0</>
# Should Gigablast remove overlapping topics (gigabits)?
<removeOverlappingTopics>1</>
# What is the number of related topics (gigabits) displayed per query? Set to
# 0 to save CPU time.
<numberOfRelatedTopics>11</>
# Related topics (gigabits) with scores below this will be excluded. Scores
# range from 0% to over 100%.
<minTopicsScore>5</>
# How many documents must contain the topic (gigabit) for it to be displayed.
<minTopicDocCount>2</>
# If a document is this percent similar to another document with a higher
# score, then it will not contribute to the topic (gigabit) generation.
<dedupDocPercentForTopics>80</>
# Maximum number of words a topic (gigabit) can have. Affects raw feeds, too.
<maxWordsPerTopic>6</>
# Max chars to sample from each doc for topics (gigabits).
<topicMaxSampleSize>4096</>
# If enabled, results in dmoz will display their categories on the results
# page.
<displayDmozCategoriesInResults>1</>
# If enabled, results in dmoz will display their indirect categories on the
# results page.
<displayIndirectDmozCategoriesInResults>0</>
# If enabled, a link will appear next to each category on each result allowing
# the user to perform their query on that entire category.
<displaySearchCategoryLinkToQueryCategoryOfResult>0</>
# Yes to use DMOZ given title when a page is untitled but is in DMOZ.
<useDmozForUntitled>1</>
# Yes to always show DMOZ summaries with search results that are in DMOZ.
<showDmozSummaries>1</>
# Yes to display the Adult category in the Top category
<showAdultCategoryOnTop>0</>
# Before downloading the contents of a URL, Gigablast first chains down this
# list of expressions</a>, starting with expression #0. The first expression
# it matches is the ONE AND ONLY matching row for that url. It then uses the
# respider frequency, spider priority, etc. on the MATCHING ROW when spidering
# that URL. If you specify the <i>expression</i> as <i><b>default</b></i> then
# that MATCHES ALL URLs. URLs with high spider priorities take spidering
# precedence over URLs with lower spider priorities. The respider frequency
# dictates how often a URL will be respidered. See the help table below for
# examples of all the supported expressions. Use the <i>&&</i> operator to
# string multiple expressions together in the same expression text box. A
# <i>spider priority</i> of <i>DELETE</i> will cause the URL to not be
# spidered, or if it has already been indexed, it will be deleted when it is
# respidered.<br><br>
<filterExpression><![CDATA[isdocidbased]]></>
<filterExpression><![CDATA[ismedia]]></>
<filterExpression><![CDATA[errorcount&gt;=3 &amp;&amp; hastmperror]]></>
<filterExpression><![CDATA[errorcount&gt;=1 &amp;&amp; hastmperror]]></>
<filterExpression><![CDATA[isaddurl]]></>
<filterExpression><![CDATA[hopcount==0 &amp;&amp; iswww &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount==0 &amp;&amp; iswww]]></>
<filterExpression><![CDATA[hopcount==0 &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount==0]]></>
<filterExpression><![CDATA[hopcount==1 &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount==1]]></>
<filterExpression><![CDATA[hopcount==2 &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount==2]]></>
<filterExpression><![CDATA[hopcount&gt;=3 &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount&gt;=3]]></>
<filterExpression><![CDATA[isnew]]></>
<filterExpression><![CDATA[default]]></>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>7.000000</>
<filterFrequency>7.000000</>
<filterFrequency>7.000000</>
<filterFrequency>10.000000</>
<filterFrequency>20.000000</>
<filterFrequency>20.000000</>
<filterFrequency>40.000000</>
<filterFrequency>40.000000</>
<filterFrequency>60.000000</>
<filterFrequency>60.000000</>
<filterFrequency>30.000000</>
<filterFrequency>30.000000</>
# Do not allow more than this many outstanding spiders for all urls in this
# priority.
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>4</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
# Allow this many spiders per IP.
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
# Wait at least this long before downloading urls from the same IP address.
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<filterPriority>80</>
<filterPriority>-3</>
<filterPriority>3</>
<filterPriority>45</>
<filterPriority>85</>
<filterPriority>50</>
<filterPriority>48</>
<filterPriority>49</>
<filterPriority>47</>
<filterPriority>40</>
<filterPriority>39</>
<filterPriority>30</>
<filterPriority>29</>
<filterPriority>20</>
<filterPriority>19</>
<filterPriority>1</>
<filterPriority>0</>

View File

@ -104,7 +104,7 @@ void timeWrapper ( int fd , void *state ) {
// bail if too many launched
if ( s_count >= s_max ) return;
// new state
StateT *st = (StateT *)mmalloc ( sizeof(StateT) , "test" );
StateT *st = (StateT *)mmalloc ( sizeof(StateT) , "dnstest" );
// get url from stdin into buf
char *p = st->m_buf;
if ( ! fgets ( p , 1023 , stdin ) ) exit ( 0 );
@ -147,6 +147,6 @@ void dnsWrapper ( void *state , long ip ) {
st->m_buf , iptoa(ip) , mstrerror(g_errno));
//if ( g_errno == ETRYAGAIN )
// log("hey");
mfree ( st , sizeof(StateT), "test" );
mfree ( st , sizeof(StateT), "dnstest" );
s_count--;
}

View File

@ -127,11 +127,14 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
<td style="padding-bottom:12px">&nbsp;</td>
<td style="padding-bottom:12px">&nbsp;</td>
</tr>
<!--
<tr bgcolor="#006699">
<th><a name="boolean" id="boolean"></a><font color="#FFFFFF">Boolean Search</font></th>
<th><font color="#FFFFFF">Description</font></th>
<tr bgcolor="#0340fd">
<th><font color=33dcff>Boolean Search</font></th>
<th><font color=33dcff>Description</font></th>
</tr>
<tr>
<td colspan="2" bgcolor="#FFFFCC"><center>
Note: boolean operators must be in UPPER CASE.
@ -214,16 +217,17 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
expressions and can be optionally enclosed in parentheses. A NOT
operator can optionally preceed the left or the right operand.</td>
</tr>
-->
</table>
</td></tr>
</table>
<br>
<center>
Copyright &copy; 2013. All rights reserved.
Copyright &copy; 2014. All rights reserved.
</center>
</body>
</html>

View File

@ -5680,7 +5680,7 @@ void zlibtest() {
// malloc 1,000 bufs of size about 100-64k each
for ( long i = 0 ; i < 100 ; i++ ) {
long bufSize = 1000 + (rand() % 65000);
ptrs[i] = (char *)mmalloc ( bufSize , "test" );
ptrs[i] = (char *)mmalloc ( bufSize , "ztest" );
if ( ! ptrs[i] ) {
log("no mem!"); exit(-1); }
lens[i] = bufSize;
@ -5690,7 +5690,7 @@ void zlibtest() {
}
// now free them
for ( long i = 0 ; i < 100 ; i++ )
mfree (ptrs[i] , lens[i] , "test" );
mfree (ptrs[i] , lens[i] , "ztest" );
}
}
*/
@ -11555,8 +11555,8 @@ bool parseTest ( char *coll , long long docId , char *query ) {
// speed test
t = gettimeofdayInMilliseconds();
for ( long k = 0 ; k < 100 ; k++ ) {
char *mm = (char *)mmalloc ( 300*1024 , "test");
mfree ( mm , 300*1024 ,"test");
char *mm = (char *)mmalloc ( 300*1024 , "ztest");
mfree ( mm , 300*1024 ,"ztest");
}
e = gettimeofdayInMilliseconds();
logf(LOG_DEBUG,"build: Took %.3f ms to do mallocs.",
@ -14833,7 +14833,7 @@ bool cacheTest() {
false , // support lists of recs?
maxCacheNodes ,
false , // use half keys?
"test" , // dbname
"cachetest" , // dbname
false )) // save cache to disk?
return log("test: Cache init failed.");
@ -14906,7 +14906,7 @@ bool cacheTest() {
false , // support lists of recs?
maxCacheNodes ,
false , // use half keys?
"test" , // dbname
"cachetest" , // dbname
false )) // save cache to disk?
return log("test: Cache init failed.");

View File

@ -233,7 +233,6 @@ long g_qn = 0;
char *g_queries[] = {
//"buzzlogic",
//"test",
"broncos",
"ibm",
"yahoo",

446
qa.cpp Normal file
View File

@ -0,0 +1,446 @@
#include <string.h>
#include "SafeBuf.h"
#include "HttpServer.h"
static long s_failures = 0;
bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
SafeBuf sb;
sb.safePrintf ( "http://%s:%li%s"
, iptoa(g_hostdb.m_myHost->m_ip)
, (long)g_hostdb.m_myHost->m_port
, path
);
Url u;
u.set ( sb.getBufStart() );
if ( ! g_httpServer.getDoc ( u.getUrl() ,
0 , // ip
0 , // offset
-1 , // size
0 , // ifmodsince
NULL ,
callback ,
60*1000, // timeout
0, // proxyip
0, // proxyport
-1, // maxtextdoclen
-1, // maxotherdoclen
NULL ) ) // useragent
return false;
// error?
log("qa: getUrl error: %s",mstrerror(g_errno));
return true;
}
bool qatest ( ) ;
void qatestWrapper ( void *state , TcpSocket *sock ) { qatest(); }
// return false if blocked, true otherwise
bool addColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addcoll?c=qatest123" , qatestWrapper );
}
// first inject a set list of urls
static char **s_urlPtrs = NULL;
static long s_numUrls = 0;
static SafeBuf s_ubuf1;
static SafeBuf s_ubuf2;
bool loadUrls ( ) {
static bool s_loaded = false;
if ( s_loaded ) return true;
// only load and parse the url file once
s_loaded = true;
// use injectme3 file
s_ubuf1.load("./injectme3");
// scan for +++URL: xxxxx
char *s = s_ubuf1.getBufStart();
for ( ; *s ; s++ ) {
if ( strncmp(s,"+++URL: ",8) ) continue;
// got one
// find end of it
s += 8;
char *e = s;
for ( ; *e && ! is_wspace_a(*e); e++ );
// null term it
if ( *e ) *e = '\0';
// store ptr
s_ubuf2.pushLong((long)s);
// skip past that
s = e;
}
// make array of url ptrs
s_urlPtrs = (char **)s_ubuf2.getBufStart();
return true;
}
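// assumed layout of ./injectme3, inferred from the parser above (the
// file itself is not part of this diff): each record is a "+++URL: "
// line naming the url, followed by that url's archived page content,
// e.g.
//
//   +++URL: http://www.example.com/
//   <html><head><title>example</title></head></html>
//   +++URL: http://www.example.org/page.html
//   ...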
bool injectUrls ( ) {
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_numUrls ; ) {
// pre-inc it
s_ii++;
// inject using html api
SafeBuf sb;
sb.safePrintf("/admin/inject?c=qatest123&delete=0&u=");
// s_ii was pre-incremented above, so the url to inject is at s_ii-1
sb.urlEncode ( s_urlPtrs[s_ii-1] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
static char *s_queries[] = {
"the",
"+the",
"cats",
"+cats dog",
"+cats +dog",
"cat OR dog",
"cat AND dog",
"cat AND NOT dog",
"NOT cat AND NOT dog",
"cat -dog",
"site:wisc.edu"
};
static long s_checksums[] = {
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
};
static long s_qi1 = 0;
void doneSearching1 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi1 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
// a stored checksum of 0 means one has not been recorded yet for
// this query, so log the computed value so it can be filled in
if ( s_checksums[ii] == 0 )
log("qatest: query '%s' new checksum %lu",
s_queries[ii],
crc);
else if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}
// ensure search results are consistent
bool searchTest1 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi1 < nq ; ) {
// pre-inc it
s_qi1++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
// s_qi1 was pre-incremented above; doneSearching1() uses s_qi1-1 too
sb.urlEncode ( s_queries[s_qi1-1] );
return getUrl ( sb.getBufStart() , doneSearching1 );
}
return true;
}
static long s_qi2 = 0;
void doneSearching2 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi2 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
// a stored checksum of 0 means one has not been recorded yet for
// this query, so log the computed value so it can be filled in
if ( s_checksums[ii] == 0 )
log("qatest: query '%s' new checksum %lu",
s_queries[ii],
crc);
else if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}
// ensure search results are consistent
bool searchTest2 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi2 < nq ; ) {
// pre-inc it
s_qi2++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
// s_qi2 was pre-incremented above; doneSearching2() uses s_qi2-1 too
sb.urlEncode ( s_queries[s_qi2-1] );
return getUrl ( sb.getBufStart() , doneSearching2 );
}
return true;
}
bool deleteUrls ( ) {
static long s_ii2 = 0;
for ( ; s_ii2 < s_numUrls ; ) {
// pre-inc it
s_ii2++;
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
// s_ii2 was pre-incremented above, so the url to delete is at s_ii2-1
sb.urlEncode ( s_urlPtrs[s_ii2-1] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
#include "Msg0.h"
static Msg0 s_msg0;
static RdbList s_list;
void gotList33 ( void *state ) {
long *rdbId = (long *)state;
if ( ! s_list.isEmpty() ) {
log("qa: delete failed. list is not empty rdbid=%li.",*rdbId);
s_failures++;
}
// resume main loop
qatest();
}
// scan all Rdb databases and ensure no recs (it was a clean delete)
bool checkRdbLists ( long *rdbId ) {
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) return true;
collnum_t cn = cr->m_collnum;
for ( ; *rdbId < RDB_END ; ) {
// pre-inc it
*rdbId = *rdbId + 1;
char minKey[MAX_KEY_BYTES];
char maxKey[MAX_KEY_BYTES];
KEYMIN(minKey,MAX_KEY_BYTES);
KEYMAX(maxKey,MAX_KEY_BYTES);
if ( ! s_msg0.getList ( 0 , // hostid
0 , // ip
0 , // port
0 , // cacheage
false, // addtocache
*rdbId , // rdbid
cn , // collnum
&s_list ,
minKey ,
maxKey ,
1000 , // minrecsizes
rdbId , // state
gotList33,
0 // niceness
) )
return false;
}
return true;
}
// once we have triggered the dump this will cause all rdbs to tightmerge
void doneDumping ( void *state , TcpSocket *sock ) {
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return; }
// tight merge the rdb that was dumped
for ( long i = 0 ; i < RDB_END ; i++ ) {
Rdb *rdb = getRdbFromId ( i );
if ( ! rdb ) continue;
RdbBase *base = rdb->getBase ( cr->m_collnum );
if ( ! base ) continue;
// . force a tight merge as soon as dump completes
// . the dump should already be going
base->m_nextMergeForced = true;
}
// wait for tight merges to complete now
qatest();
}
bool dumpTreesToDisk () {
static bool s_done = false;
if ( s_done ) return true;
s_done = true;
// force dump data to disk. dumps all rdbs.
return getUrl("/admin/master?dump=1",doneDumping );
}
void doneAddingUrls ( void *state ) {
qatest();
}
void sleepCallback ( int fd , void *state ) {
qatest();
}
// check every second to see if merges are done
bool waitForMergeToFinish ( ) {
// if registered
static bool s_registered = false;
if ( s_registered ) {
g_loop.unregisterSleepCallback ( NULL , sleepCallback );
s_registered = false;
}
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return true; }
// tight merge the rdb that was dumped
long i; for ( i = 0 ; i < RDB_END ; i++ ) {
Rdb *rdb = getRdbFromId ( i );
if ( ! rdb ) continue;
RdbBase *base = rdb->getBase ( cr->m_collnum );
if ( ! base ) continue;
// . doneDumping() forced a tight merge on this base
// . if that flag is still set we are still waiting on the merge,
//   so stop scanning; we will sleep below and check again
if ( base->m_nextMergeForced ) break;
}
// if not still waiting return true
if ( i >= RDB_END ) return true;
// sleep for 1 second
g_loop.registerSleepCallback ( 1000 , // 1000 ms
NULL , // state
sleepCallback ,
0 ); // niceness
s_registered = true;
return false;
}
bool resetColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
// also turn spiders on
return getUrl("/admin/master?reset=qatest123&se=1", qatestWrapper );
}
bool addUrlTest ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addurl"
"?c=qatest123&u=www.dmoz.org+www.ibm.com+"
"www.diffbot.com"
, qatestWrapper );
}
// check every second to see if spidering phase is completed
bool checkSpidersDone ( ) {
// if registered
static bool s_registered = false;
if ( s_registered ) {
g_loop.unregisterSleepCallback ( NULL , sleepCallback );
s_registered = false;
}
// we have to adjust this once we know how many pages we'll archive
CollectionRec *cr = g_collectiondb.getRec("qatest123");
if ( ! cr ) { qatest(); return true; }
// return true if all done
if ( cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound >= 200 )
return true;
// sleep for 1 second
g_loop.registerSleepCallback ( 1000 , // 1000 ms
NULL , // state
sleepCallback ,
0 ); // niceness
s_registered = true;
return false;
}
bool delColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/delcoll?c=qatest123" , qatestWrapper );
}
static long s_rdbId1 = 0;
static long s_rdbId2 = 0;
//static long s_rdbId3 = 0;
// . run a series of tests to ensure that gb is functioning properly
// . use s_urls[] array of urls for injecting and spider seeding
// . contain an archive copy of all webpages in the injectme3 file and
// in pagearchive1.txt file
// . while initially spidering store pages in pagearchive1.txt so we can
// replay later. store up to 100,000 pages in there.
bool qatest ( ) {
// add the 'qatest123' collection
if ( ! addColl () ) return false;
// inject urls, return false if not done yet
if ( ! injectUrls ( ) ) return false;
// test search results
if ( ! searchTest1 () ) return false;
// delete all urls cleanly now
if ( ! deleteUrls ( ) ) return false;
// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId1 ) ) return false;
// dump, tight merge and ensure no data in our rdbs for this coll
if ( ! dumpTreesToDisk() ) return false;
// wait for tight merge to complete
if ( ! waitForMergeToFinish() ) return false;
// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId2 ) ) return false;
// reset the collection so we can test spidering
if ( ! resetColl ( ) ) return false;
// add urls to seed spider with. make msg13.cpp recognize qatest123
// collection and return 404 on urls not in our official list so
// we can ensure search result consistency. msg13.cpp will initially
// store the pages in a file, like the first 1,000 or so pages.
if ( ! addUrlTest () ) return false;
// wait for spidering to complete. sleep callback. # of spidered urls
// will be x, so we know when to stop
if ( ! checkSpidersDone() ) return false;
// . now search again on the large collection most likely
// . store search queries and checksum into queries2.txt
// . a 0 (or no) checksum means we should fill it in
if ( ! searchTest2 () ) return false;
// try a query delete
//if ( ! queryDeleteTest() ) return false;
// ensure empty
//if ( ! checkRdbLists ( &s_rdbId3 ) ) return false;
// delete the collection
if ( ! delColl() ) return false;
return true;
}
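Each step above follows the same resume pattern: a static flag makes the step run once, getUrl() returns false while its request is outstanding, and qatestWrapper() re-enters qatest() when the reply arrives so the next step can proceed. A sketch of what one more step would look like under that pattern; the path queried here is purely hypothetical and is not an endpoint this commit adds.

	// hypothetical extra step, shown only to illustrate the pattern
	bool exampleStep ( ) {
		static bool s_flag = false;
		if ( s_flag ) return true;
		s_flag = true;
		// getUrl() returns false while the request is outstanding;
		// qatestWrapper() will re-enter qatest() on completion
		return getUrl ( "/admin/status?c=qatest123" , qatestWrapper );
	}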