Merge branch 'testing' into diffbot-matt

Conflicts:
	Errno.cpp
	Errno.h
	Parms.h
mwells 2014-07-07 09:49:59 -07:00
commit 6434e5cc04
54 changed files with 2718 additions and 825 deletions


@ -1962,7 +1962,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_harvestLinks[n] = 1;
*/
m_regExs[n].set("isdocidbased");
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
@ -2198,7 +2198,7 @@ bool CollectionRec::rebuildChineseRules ( ) {
long n = 0;
m_regExs[n].set("isdocidbased");
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
@ -3029,7 +3029,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
long i = 0;
// 1st one! for query reindex/ query delete
m_regExs[i].set("isdocidbased");
m_regExs[i].set("isreindex");
m_spiderIpMaxSpiders [i] = 10;
m_spiderPriorities [i] = 70;
i++;


@ -426,6 +426,9 @@ class CollectionRec {
long m_spiderRoundNum;
char m_makeImageThumbnails;
long m_thumbnailMaxWidthHeight ;
char m_indexSpiderReplies;
char m_indexBody;
@ -616,7 +619,6 @@ class CollectionRec {
long m_summaryMaxLen;
long m_summaryMaxNumLines;
long m_summaryMaxNumCharsPerLine;
long m_summaryDefaultNumLines;
char m_useNewSummaries;
char m_getDocIdScoringInfo;


@ -17781,6 +17781,65 @@ TimeZone tzs[] = {
// hash table of timezone information
static HashTableX s_tzt;
static long long h_mountain;
static long long h_eastern;
static long long h_central;
static long long h_pacific;
static long long h_time2;
static long long h_mdt;
static long long h_at2;
bool initTimeZoneTable ( ) {
// if already initialized return true
if ( s_tzt.m_numSlotsUsed ) return true;
// init static wids
h_mountain = hash64n("mountain");
h_eastern = hash64n("eastern");
h_central = hash64n("central");
h_pacific = hash64n("pacific");
h_time2 = hash64n("time");
h_mdt = hash64n("mdt");
h_at2 = hash64n("at");
// set up the time zone hashtable
if ( ! s_tzt.set( 8,4, 300,NULL,0,false,0,"tzts"))
return false;
// load time zone names and their modifiers into hashtable
for ( long i = 0 ; *tzs[i].m_name ; i++ ) {
char *t = tzs[i].m_name;
long tlen = gbstrlen(t);
// hash like Words.cpp computeWordIds
uint64_t h = hash64Lower_utf8( t , tlen );
// use the ptr as the value
if ( ! s_tzt.addKey ( &h, &tzs[i] ) )
return false;
}
return true;
}
// return what we have to add to UTC to get time in locale specified by "s"
// where "s" is like "PDT" "MST" "EST" etc. if unknown return 999999
long getTimeZone ( char *s ) {
if ( ! s ) return BADTIMEZONE;
char *send = s;
// point to end of the potential timezone
for ( ; *send && isalnum(*send) ; send++ );
// hash it
uint64_t h = hash64Lower_utf8( s , send -s );
// make sure table is ready
initTimeZoneTable();
// look it up
long slot = s_tzt.getSlot( &h );
if ( slot < 0 ) return 999999;
// did we find it in the table?
TimeZone *tzptr = (TimeZone *)s_tzt.getValueFromSlot ( slot );
// found it, compute the offset in seconds
long secs = tzptr->m_hourMod * 3600;
secs += tzptr->m_minMod * 60;
return secs;
}
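
For context, a minimal sketch of how the new getTimeZone() helper can be called; the token and log messages here are illustrative, not from this commit:

// look up a timezone suffix such as "PST" or "GMT"
long secs = getTimeZone ( "PST" );
if ( secs == BADTIMEZONE )
	log("date: unknown timezone name");
else
	// secs is what we add to UTC to get the time in that zone, per the
	// comment above, so for US zones it is negative
	log("date: timezone offset is %li seconds",secs);
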
// . returns how many words starting at i are in the time zone
// . 0 means not a timezone
long getTimeZoneWord ( long i ,
@ -17793,40 +17852,14 @@ long getTimeZoneWord ( long i ,
*tzptr = NULL;
// only init table once
bool s_init16 = false;
static long long h_mountain;
static long long h_eastern;
static long long h_central;
static long long h_pacific;
static long long h_time;
static long long h_mdt;
static long long h_at;
// init the hash table of month names
if ( ! s_init16 ) {
// init static wids
h_mountain = hash64n("mountain");
h_eastern = hash64n("eastern");
h_central = hash64n("central");
h_pacific = hash64n("pacific");
h_time = hash64n("time");
h_mdt = hash64n("mdt");
h_at = hash64n("at");
// set up the time zone hashtable
if ( ! s_tzt.set( 8,4, 300,NULL,0,false,niceness,"tzts"))
return -1;
// load time zone names and their modifiers into hashtable
for ( long i = 0 ; *tzs[i].m_name ; i++ ) {
char *t = tzs[i].m_name;
long tlen = gbstrlen(t);
// hash like Words.cpp computeWordIds
uint64_t h = hash64Lower_utf8( t , tlen );
// use the ptr as the value
if ( ! s_tzt.addKey ( &h, &tzs[i] ) )
return -1;
}
// on error we return -1 from here
if ( ! initTimeZoneTable() ) return -1;
s_init16 = true;
}
// this is too common of a word!
if ( wids[i] == h_at ) return 0;
if ( wids[i] == h_at2 ) return 0;
long slot = s_tzt.getSlot( &wids[i] );
// return this, assume just one word
@ -17834,7 +17867,7 @@ long getTimeZoneWord ( long i ,
// . "mountain time"
// . this removes the event title "M-F 8:30 AM-5:30 PM Mountain Time"
// from the event (horus) on http://www.sfreporter.com/contact_us/
if ( slot<0 && i+2<nw && wids[i+2] == h_time ) {
if ( slot<0 && i+2<nw && wids[i+2] == h_time2 ) {
if ( wids[i] == h_mountain ) {
slot = s_tzt.getSlot (&h_mdt);
tznw = 3;


@ -794,6 +794,7 @@ public:
bool m_isSiteRoot ;
};
// now time zones
struct TimeZone {
char m_name[16];
@ -803,6 +804,13 @@ struct TimeZone {
long m_modType;
};
#define BADTIMEZONE 999999
// "s" is the timezone, like "EDT" and we return # of secs to add to UTC
// to get the current time in that time zone.
// returns BADTIMEZONE if "s" is unknown timezone
long getTimeZone ( char *s ) ;
// . returns how many words starting at i are in the time zone
// . 0 means not a timezone
long getTimeZoneWord ( long i , long long *wids , long nw ,


@ -170,8 +170,21 @@ case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
case ENOTOKEN: return "Missing token";
case EBADIMG: return "Bad image";
case EREINDEXREDIR: return "Not parent url to reindex";
case EREINDEXREDIR: return "Not a reindexable doc";
case ETOOMANYPARENS: return "Too many nested parentheses in boolean query";
case EDIFFBOTUNABLETOAPPLYRULES: return "Diffbot unable to apply rules";
case EDIFFBOTCOULDNOTPARSE: return "Diffbot could not parse page";
case EDIFFBOTCOULDNOTDOWNLOAD: return "Diffbot could not download page";
case EDIFFBOTINVALIDAPI: return "Diffbot invalid API";
case EDIFFBOTVERSIONREQ: return "Diffbot version required";
case EDIFFBOTEMPTYCONTENT: return "Diffbot empty content";
case EDIFFBOTREQUESTTIMEDOUT: return "Diffbot request timed out";
case EDIFFBOTURLPROCESSERROR: return "Diffbot error processing url";
case EDIFFBOTTOKENEXPIRED: return "Diffbot token expired";
case EDIFFBOTUNKNOWNERROR: return "Diffbot unknown error";
case EMISSINGINPUT: return "Missing required input parms";
case EDMOZNOTREADY: return "Dmoz is not setup, follow instructions in "
"admin.html to setup";
case EPROXYSSLCONNECTFAILED: return "SSL tunnel through HTTP proxy failed";
}
// if the remote error bit is clear it must be a regular errno

Errno.h

@ -176,6 +176,20 @@ enum {
EBADIMG,
EREINDEXREDIR,
ETOOMANYPARENS,
EDIFFBOTUNABLETOAPPLYRULES,
EDIFFBOTCOULDNOTPARSE,
EDIFFBOTCOULDNOTDOWNLOAD,
EDIFFBOTINVALIDAPI,
EDIFFBOTVERSIONREQ,
EDIFFBOTEMPTYCONTENT,
EDIFFBOTREQUESTTIMEDOUT,
EDIFFBOTURLPROCESSERROR,
EDIFFBOTTOKENEXPIRED,
EDIFFBOTUNKNOWNERROR,
EMISSINGINPUT,
EDMOZNOTREADY,
EPROXYSSLCONNECTFAILED
};
#endif


@ -237,6 +237,8 @@ time_t atotime ( char *s ) {
return atotime3 ( s );
}
#include "Dates.h" // for getTimeZone()
// #1: Sun, 06 Nov 1994 08:49:37 GMT ;RFC 822, updated by RFC 1123
time_t atotime1 ( char *s ) {
// this time structure, once filled, will help yield a time_t
@ -258,8 +260,20 @@ time_t atotime1 ( char *s ) {
getTime ( s , &t.tm_sec , &t.tm_min , &t.tm_hour );
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
time_t local = mktime ( &t );
time_t global = timegm ( &t );
// skip HH:MM:SS
while ( ! isspace (*s) ) s++;
// skip spaces
while ( isspace (*s) ) s++;
// convert local time to "utc" or whatever timezone "s" points to,
// which is usually gmt or utc
long tzoff = getTimeZone ( s ) ;
if ( tzoff != BADTIMEZONE ) global += tzoff;
return global;
// now, convert to utc
//time_t utc = time(NULL);
// get time here locally
@ -268,7 +282,6 @@ time_t atotime1 ( char *s ) {
//long delta = here - utc;
// modify our time to make it into utc
//return local - delta;
return local;
}
// #2: Sunday, 06-Nov-94 08:49:37 GMT ;RFC 850,obsoleted by RFC1036
@ -293,7 +306,17 @@ time_t atotime2 ( char *s ) {
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
return mktime ( &t );
time_t global = timegm ( &t );
// skip HH:MM:SS
while ( ! isspace (*s) ) s++;
// skip spaces
while ( isspace (*s) ) s++;
// convert local time to "utc" or whatever timezone "s" points to,
// which is usually gmt or utc
long tzoff = getTimeZone ( s ) ;
if ( tzoff != BADTIMEZONE ) global += tzoff;
return global;
}
// #3: Sun Nov 6 08:49:37 1994 ;ANSI C's asctime() format
@ -319,7 +342,7 @@ time_t atotime3 ( char *s ) {
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
time_t tt = mktime ( &t );
time_t tt = timegm ( &t );
return tt;
}
@ -346,7 +369,17 @@ time_t atotime4 ( char *s ) {
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
return mktime ( &t );
time_t global = timegm ( &t );
// skip HH:MM:SS
while ( ! isspace (*s) ) s++;
// skip spaces
while ( isspace (*s) ) s++;
// convert local time to "utc" or whatever timezone "s" points to,
// which is usually gmt or utc
long tzoff = getTimeZone ( s ) ;
if ( tzoff != BADTIMEZONE ) global += tzoff;
return global;
}
// 2007-12-31
@ -387,7 +420,7 @@ time_t atotime5 ( char *s ) {
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
return mktime ( &t );
return timegm ( &t );
}
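
The mktime()-to-timegm() change above means the parsed struct tm is now interpreted as UTC regardless of the server's local timezone, and any trailing timezone name is applied afterwards via getTimeZone(). A minimal sketch of the shared pattern (the literal date values are illustrative):

struct tm t;
memset ( &t , 0 , sizeof(t) );
// pretend these fields were parsed from "Sun, 06 Nov 1994 08:49:37 PST"
t.tm_year = 94; t.tm_mon = 10; t.tm_mday = 6;
t.tm_hour = 8 ; t.tm_min = 49; t.tm_sec = 37;
// timegm() ignores the server's TZ and treats the fields as UTC
time_t global = timegm ( &t );
// then shift by the named timezone if it is in the table
long tzoff = getTimeZone ( "PST" );
if ( tzoff != BADTIMEZONE ) global += tzoff;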


@ -6,18 +6,63 @@
HttpRequest::HttpRequest () { m_cgiBuf = NULL; m_cgiBuf2 = NULL; reset(); }
HttpRequest::~HttpRequest() { reset(); }
char HttpRequest::getReplyFormat() {
if ( m_replyFormatValid ) return m_replyFormat;
char *fs = getString("format",NULL,NULL);
char fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
m_replyFormat = fmt;
char *formatStr = getString("format");
char format = -1;//FORMAT_HTML;
// what format should search results be in? default is html
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
if ( formatStr && strcmp(formatStr,"iframe")==0)
format=FORMAT_WIDGET_IFRAME;
if ( formatStr && strcmp(formatStr,"ajax")==0)
format=FORMAT_WIDGET_AJAX;
if ( formatStr && strcmp(formatStr,"append")==0)
format=FORMAT_WIDGET_APPEND;
// support old api &xml=1 to mean &format=1
if ( getLong("xml",0) ) {
format = FORMAT_XML;
}
// also support &json=1
if ( getLong("json",0) ) {
format = FORMAT_JSON;
}
if ( getLong("csv",0) ) {
format = FORMAT_CSV;
}
if ( getLong("iframe",0) ) {
format = FORMAT_WIDGET_IFRAME;
}
if ( getLong("ajax",0) ) {
format = FORMAT_WIDGET_AJAX;
}
if ( getLong("append",0) ) {
format = FORMAT_WIDGET_APPEND;
}
// default to html
if ( format == -1 )
format = FORMAT_HTML;
m_replyFormat = format;
m_replyFormatValid = true;
return m_replyFormat;
return format;
}
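
So getReplyFormat() now accepts the newer &format= parm as well as the legacy boolean parms, caches the answer in m_replyFormat, and falls back to HTML. Roughly (the query strings are illustrative):

// &format=json  -> FORMAT_JSON
// &json=1       -> FORMAT_JSON  (legacy api)
// &format=csv   -> FORMAT_CSV
// &iframe=1     -> FORMAT_WIDGET_IFRAME
// no format parm at all -> FORMAT_HTML
char format = hr.getReplyFormat();
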
void HttpRequest::reset() {
m_numFields = 0;
m_replyFormatValid = false;


@ -1562,6 +1562,77 @@ void cleanUp ( void *state , TcpSocket *s ) {
if ( s && s->m_state == f ) s->m_state = NULL;
}
bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
// get time in secs since epoch
time_t now ;
if ( isClockInSync() ) now = getTimeGlobal();
else now = getTimeLocal();
// . buffer for the MIME request and brief html err msg
// . NOTE: ctime appends a \n to the time, so we don't need to
char msg[1024];
SafeBuf sb(msg,1024,0,false);
char *tt = asctime(gmtime ( &now ));
tt [ gbstrlen(tt) - 1 ] = '\0';
char *ct = "text/html";
if ( format == FORMAT_XML ) ct = "text/xml";
if ( format == FORMAT_JSON ) ct = "application/json";
char cbuf[1024];
SafeBuf cb(cbuf,1024,0,false);
if ( format != FORMAT_XML && format != FORMAT_JSON )
cb.safePrintf("<html><b>Success</b></html>");
if ( format == FORMAT_XML ) {
cb.safePrintf("<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg><![CDATA[Success]]>"
"</statusMsg>\n");
}
if ( format == FORMAT_JSON ) {
cb.safePrintf("{\"response\":{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\",\n" );
}
if ( addMsg )
cb.safeStrcpy(addMsg);
if ( format == FORMAT_XML ) {
cb.safePrintf("</response>\n");
}
if ( format == FORMAT_JSON ) {
// erase trailing ,\n
cb.m_length -= 2;
cb.safePrintf("\n"
"}\n"
"}\n");
}
sb.safePrintf(
"HTTP/1.0 200 (OK)\r\n"
"Content-Length: %li\r\n"
"Connection: Close\r\n"
"Content-Type: %s\r\n"
"Date: %s UTC\r\n\r\n"
, cb.length()
, ct
, tt );
sb.safeMemcpy ( &cb );
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , sb.length() , NULL , 0 , s );
}
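
As a hedged illustration of what the code above emits: called with FORMAT_JSON and no addMsg, the reply body is a bare success object (whitespace approximate, socket name illustrative).

// e.g. after an api request has been handled successfully:
return g_httpServer.sendSuccessReply ( sock , FORMAT_JSON );
// body sent:
// {"response":{
//      "statusCode":0,
//      "statusMsg":"Success"
// }
// }
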
// . send an error reply, like "HTTP/1.1 404 Not Found"
// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -1578,9 +1649,16 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
time_t now ;//= getTimeGlobal();
if ( isClockInSync() ) now = getTimeGlobal();
else now = getTimeLocal();
// this kinda sucks that we have to do it twice...
HttpRequest hr;
hr.set ( s->m_readBuf , s->m_readOffset , s ) ;
char format = hr.getReplyFormat();
// . buffer for the MIME request and brief html err msg
// . NOTE: ctime appends a \n to the time, so we don't need to
char msg[1024];
SafeBuf sb(msg,1024,0,false);
// if it's a 404, redirect to home page
/*
if ( error == 404 )
@ -1595,26 +1673,61 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
*/
char *tt = asctime(gmtime ( &now ));
tt [ gbstrlen(tt) - 1 ] = '\0';
sprintf ( msg ,
"HTTP/1.0 %li (%s)\r\n"
"Content-Length: %li\r\n"
"Connection: Close\r\n"
"Date: %s UTC\r\n\r\n"
"<html><b>Error = %s</b></html>",
error ,
errmsg ,
(long)(gbstrlen("<html><b>Error = </b></html>")+
gbstrlen(errmsg)),
tt , // ctime ( &now ) ,
errmsg );
char *ct = "text/html";
if ( format == FORMAT_XML ) ct = "text/xml";
if ( format == FORMAT_JSON ) ct = "application/json";
SafeBuf xb;
if ( format != FORMAT_XML && format != FORMAT_JSON )
xb.safePrintf("<html><b>Error = %s</b></html>",errmsg );
if ( format == FORMAT_XML ) {
xb.safePrintf("<response>\n"
"\t<statusCode>%li</statusCode>\n"
"\t<statusMsg><![CDATA[", error );
xb.cdataEncode(errmsg );
xb.safePrintf("]]></statusMsg>\n"
"</response>\n");
}
if ( format == FORMAT_JSON ) {
xb.safePrintf("{\"response\":{\n"
"\t\"statusCode\":%li,\n"
"\t\"statusMsg\":\"", error );
xb.jsonEncode(errmsg );
xb.safePrintf("\"\n"
"}\n"
"}\n");
}
sb.safePrintf(
"HTTP/1.0 %li (%s)\r\n"
"Content-Length: %li\r\n"
"Connection: Close\r\n"
"Content-Type: %s\r\n"
"Date: %s UTC\r\n\r\n"
,
error ,
errmsg ,
xb.length(),
ct ,
tt ); // ctime ( &now ) ,
sb.safeMemcpy ( &xb );
// . move the reply to a send buffer
// . don't make sendBuf bigger than g_conf.m_httpMaxSendBufSize
long msgSize = gbstrlen ( msg );
//long msgSize = gbstrlen ( msg );
// record it
if ( bytesSent ) *bytesSent = msgSize;//sendBufSize;
if ( bytesSent ) *bytesSent = sb.length();//sendBufSize;
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , msgSize , NULL , 0 , s );
return sendReply2 ( msg , sb.length() , NULL , 0 , s );
/*
// . this returns false if blocked, true otherwise
@ -1640,6 +1753,11 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
//long rawFormat,
char format ,
int errnum, char *content) {
// just use this for now. it detects the format already...
return sendErrorReply ( s,error,errmsg,NULL);
/*
// clear g_errno so the send goes through
g_errno = 0;
// get time in secs since epoch
@ -1707,6 +1825,7 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
long msgSize = gbstrlen ( msg );
return sendReply2 ( msg , msgSize , NULL , 0 , s );
*/
/*
long sendBufSize = msgSize;


@ -135,6 +135,8 @@ class HttpServer {
// send an error reply, like "HTTP/1.1 404 Not Found"
bool sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
long *bytesSent = NULL );
// xml and json uses this
bool sendSuccessReply (TcpSocket *s , char format , char *addMsg=NULL);
// send a "prettier" error reply, formatted in XML if necessary
bool sendQueryErrorReply ( TcpSocket *s , long error , char *errmsg,
// FORMAT_HTML=0,FORMAT_XML,FORMAT_JSON


@ -91,6 +91,58 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
// best candidate, and just use that
if ( xd->m_isDiffbotJSONObject ) return;
//
// first add any open graph candidate.
// basically the page telling us the best image straight up.
//
long node2 = -1;
long startNode = 0;
// . field can be stuff like "summary","description","keywords",...
// . if "convertHtmlEntites" is true we change < to &lt; and > to &gt;
// . <meta property="og:image" content="http://example.com/rock2.jpg"/>
// . <meta property="og:image" content="http://example.com/rock3.jpg"/>
ogimgloop:
char ubuf[2000];
long ulen = xml->getMetaContent ( ubuf , // store the val here
1999 ,
"og:image",
8,
"property",
false, // convertHtmlEntities
startNode ,
&node2 ); // matchedNode
// update this in case goto ogimgloop is called
startNode = node2 + 1;
// see section below for explanation of what we are storing here...
if ( node2 >= 0 ) {
// save it
m_imageNodes[m_numImages] = node2;
Query q;
if ( ulen > MAX_URL_LEN ) goto ogimgloop;
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
iu.set ( pageUrl , ubuf , ulen );
// skip if invalid domain or TLD
if ( iu.getDomainLen() <= 0 ) goto ogimgloop;
// for looking it up on disk to see if unique or not
char buf[2000];
snprintf ( buf , 1999, "gbimage:%s",iu.getUrl());
// TODO: make sure this is a no-split termid storage thingy
// in Msg14.cpp
if ( ! q.set2 ( buf , langUnknown , false ) ) return;
// store the termid
m_termIds[m_numImages] = q.getTermId(0);
// advance the counter
m_numImages++;
// try to get more graph images if we have some room
if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop;
}
//m_pageSite = pageSite;
// scan the words
long nw = words->getNumWords();
@ -530,7 +582,7 @@ bool Images::downloadImages () {
// get img tag node
node = m_imageNodes[m_j];
// get the url of the image
src = m_xml->getString(node,"src",&srcLen);
src = getImageUrl ( m_j , &srcLen );
// use "pageUrl" as the baseUrl
m_imageUrl.set ( m_pageUrl , src , srcLen );
}
@ -755,8 +807,7 @@ bool Images::makeThumb ( ) {
srcLen = gbstrlen(src);
}
else {
long node = m_imageNodes[m_j];
src = m_xml->getString(node,"src",&srcLen);
src = getImageUrl ( m_j , &srcLen );
}
// set it to the full url
Url iu;
@ -848,6 +899,16 @@ bool Images::makeThumb ( ) {
}
}
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// save how big of thumbnails we should make. user can change
// this in the 'spider controls'
m_xysize = cr->m_thumbnailMaxWidthHeight ;
// make it 250 pixels if no decent value provided
if ( m_xysize <= 0 ) m_xysize = 250;
// and keep it sane
if ( m_xysize > 2048 ) m_xysize = 2048;
// update status
if ( m_xd ) m_xd->setStatus ( "making thumbnail" );
@ -897,16 +958,18 @@ void Images::thumbStart_r ( bool amThread ) {
long id = getpidtid();
// pass the input to the program through this file
// rather than a pipe, since popen() seems broken
// rather than a pipe, since popen() seems broken.
// m_dir ends in / so this should work.
char in[364];
snprintf ( in , 363,"%strash/in.%li", g_hostdb.m_dir, id );
snprintf ( in , 363,"%strashin.%li", g_hostdb.m_dir, id );
unlink ( in );
log( LOG_DEBUG, "image: thumbStart_r create in file." );
// collect the output from the filter from this file
// m_dir ends in / so this should work.
char out[364];
snprintf ( out , 363,"%strash/out.%li", g_hostdb.m_dir, id );
snprintf ( out , 363,"%strashout.%li", g_hostdb.m_dir, id );
unlink ( out );
log( LOG_DEBUG, "image: thumbStart_r create out file." );
@ -964,23 +1027,48 @@ void Images::thumbStart_r ( bool amThread ) {
break;
}
long xysize = 250;//100;
//long xysize = 250;//100;
// make thumbnail a little bigger for diffbot for widget
if ( m_xd->m_isDiffbotJSONObject ) xysize = 250;
//if ( m_xd->m_isDiffbotJSONObject ) xysize = 250;
// i hope 2500 is big enough!
char cmd[2501];
//sprintf( cmd, scmd, ext, in, out);
char *wdir = g_hostdb.m_dir;
// can be /dev/stderr or like /var/gigablast/data/log000 etc.
char *logFile = g_log.getFilename();
// wdir ends in / so this should work.
snprintf( cmd, 2500 ,
"LD_LIBRARY_PATH=%s %s/%stopnm %s | "
"LD_LIBRARY_PATH=%s %s/pnmscale -xysize %li %li - | "
"LD_LIBRARY_PATH=%s %s/ppmtojpeg - > %s"
, wdir , wdir , ext , in
, wdir , wdir , xysize , xysize
, wdir , wdir , out
"LD_LIBRARY_PATH=%s %s%stopnm %s 2>> %s | "
"LD_LIBRARY_PATH=%s %spnmscale -xysize %li %li - 2>> %s | "
// append all its stderr msgs to the log file
// so "jpegtopnm: WRITING PPM FILE" doesn't clog console
"LD_LIBRARY_PATH=%s %sppmtojpeg - > %s 2>> %s"
, wdir , wdir , ext , in , logFile
, wdir , wdir , m_xysize , m_xysize , logFile
, wdir , wdir , out , logFile
);
// if they already have netpbm package installed use that then
static bool s_checked = false;
static bool s_hasNetpbm = false;
if ( ! s_checked ) {
s_checked = true;
File f;
f.set("/usr/bin/pnmscale");
s_hasNetpbm = f.doesExist() ;
}
if ( s_hasNetpbm )
snprintf( cmd, 2500 ,
"%stopnm %s 2>> %s | "
"pnmscale -xysize %li %li - 2>> %s | "
"ppmtojpeg - > %s 2>> %s"
, ext , in , logFile
, m_xysize , m_xysize , logFile
, out , logFile
);
// Call clone function for the shell to execute command
// This call WILL BLOCK . timeout is 30 seconds.
@ -1211,10 +1299,11 @@ bool ThumbnailInfo::printThumbnailInHtml ( SafeBuf *sb ,
long newdx = (long)((float)m_dx * min);
long newdy = (long)((float)m_dy * min);
if ( printLink && format==FORMAT_HTML )
// might be FORMAT_AJAX!
if ( printLink && format !=FORMAT_XML && format != FORMAT_JSON )
sb->safePrintf("<a href=%s>", getUrl() );
if ( format == FORMAT_HTML )
if ( format !=FORMAT_XML && format != FORMAT_JSON )
sb->safePrintf("<img width=%li height=%li align=left "
"%s"
"src=\"data:image/"
@ -1225,20 +1314,44 @@ bool ThumbnailInfo::printThumbnailInHtml ( SafeBuf *sb ,
);
if ( format == FORMAT_XML )
sb->safePrintf("<imageBase64>");
sb->safePrintf("\t<imageBase64>");
if ( format == FORMAT_JSON )
sb->safePrintf("\t\"imageBase64\":\"");
// encode image in base 64
sb->base64Encode ( getData(), m_dataSize , 0 ); // 0 niceness
if ( format == FORMAT_HTML ) {
if ( format !=FORMAT_XML && format != FORMAT_JSON ) {
sb->safePrintf("\">");
if ( printLink ) sb->safePrintf ("</a>");
}
if ( format == FORMAT_XML )
sb->safePrintf("</imageBase64>");
sb->safePrintf("</imageBase64>\n");
if ( format == FORMAT_JSON )
sb->safePrintf("\",\n");
// widget needs to know the width of the thumb for formatting
// the text either on top of the thumb or to the right of it
if ( retNewdx ) *retNewdx = newdx;
return true;
}
char *Images::getImageUrl ( long j , long *urlLen ) {
long node = m_imageNodes[j];
long srcLen = 0;
char *src = m_xml->getString(node,"src",&srcLen);
// maybe it was an og:image meta tag
if ( ! src )
src = m_xml->getString(node,"content",&srcLen);
// wtf?
if ( ! src )
log("image: image bad/null src");
*urlLen = srcLen;
return src;
}


@ -119,6 +119,8 @@ class Images {
bool downloadImage();
bool makeThumb();
char *getImageUrl ( long j , long *urlLen ) ;
//bool gotImage ( );
void thumbStart_r ( bool amThread );
@ -131,6 +133,8 @@ class Images {
void *m_state ;
void (* m_callback)(void *state );
long m_xysize;
bool m_setCalled;
long m_errno;
long m_hadError;

Log.h

@ -143,6 +143,8 @@ class Log {
bool m_logTimestamps;
char *getFilename() { return m_filename; };
private:
bool dumpLog ( ); // make room for the new ones


@ -551,6 +551,7 @@ master-rpm:
# DEBIAN PACKAGE SECTION BEGIN
# need to do 'apt-get install dh-make'
# deb-master
master-deb:
git archive --format=tar --prefix=gb-1.0/ master > ../gb_1.0.orig.tar
rm -rf debian
@ -569,6 +570,7 @@ master-deb:
cp control.deb debian/control
# try to use our own rules so we can override dh_shlibdeps and others
cp gb.deb.rules debian/rules
cp changelog debian/changelog
# fix dh_shlibdeps from bitching about dependencies on shared libs
# YOU HAVE TO RUN THIS before you run 'make'
# export LD_LIBRARY_PATH=./debian/gb/var/gigablast/data0
@ -583,12 +585,12 @@ master-deb:
# upload rpm
scp gb*.rpm gk268:/w/html/
#deb-testing
testing-deb:
git archive --format=tar --prefix=gb-1.0/ testing > ../gb_1.0.orig.tar
git archive --format=tar --prefix=gb-1.1/ testing > ../gb_1.1.orig.tar
rm -rf debian
# change "-p gb_1.0" to "-p gb_1.1" to update version for example
dh_make -e gigablast@mail.com -p gb_1.0 -f ../gb_1.0.orig.tar
dh_make -e gigablast@mail.com -p gb_1.1 -f ../gb_1.1.orig.tar
# zero this out, it is just filled with the .txt files erroneously and it'll
# try to automatically install in /usr/docs/
rm debian/docs
@ -602,16 +604,24 @@ testing-deb:
cp control.deb debian/control
# try to use our own rules so we can override dh_shlibdeps and others
cp gb.deb.rules debian/rules
cp changelog debian/changelog
# make the pkg dependencies file ourselves since we overrode dh_shlibdeps
# with our own debian/rules file. see that file for more info.
# echo "shlibs:Depends=libc6 (>= 2.3)" > debian/gb.substvars
# echo "shlibs:Depends=netpbm (>= 0.0)" > debian/gb.substvars
# echo "misc:Depends=netpbm (>= 0.0)" > debian/gb.substvars
# fix dh_shlibdeps from bitching about dependencies on shared libs
# YOU HAVE TO RUN THIS before you run 'make'
# export LD_LIBRARY_PATH=./debian/gb/var/gigablast/data0
# build the package now
# build the package now. if we don't specify -ai386 -ti386 then some users
# get a wrong architecture msg and 'dpkg -i' fails
dpkg-buildpackage -nc -ai386 -ti386 -b -uc -rfakeroot
# dpkg-buildpackage -nc -b -uc -rfakeroot
# move to current dir
mv ../gb_*.deb .
install-pkgs-local:
sudo alien --to-rpm gb_1.0-1_i386.deb
sudo alien --to-rpm gb_1.1-1_i386.deb
# upload
scp gb*.deb gb*.rpm gk268:/w/html/


@ -1931,7 +1931,7 @@ bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) {
// hash the url into 64 bits
long long uh64 = hash64(u->getUrl(),u->getUrlLen());
// read the spider date file first
char fn[300];
char fn[2000];
File f;
// get the spider date then
sprintf(fn,"%s/%s/doc.%llu.spiderdate.txt",
@ -1964,6 +1964,10 @@ bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) {
}
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) {
// ensure dir exists
::mkdir(testDir,S_IRWXU);
// set this
long long uh64 = hash64(u->getUrl(),u->getUrlLen());
// make that into a filename


@ -51,8 +51,11 @@ void handleRequest ( UdpSlot *slot , long netnice ) {
char *filename = g_hostdb.m_logFilename;
// running just ./gb will log to stderr...
if ( strcmp(filename ,"/dev/stderr") == 0 )
if ( strcmp(filename ,"/dev/stderr") == 0 ) {
g_errno = EBADFILE;
g_udpServer.sendErrorReply ( slot, g_errno );
return;
}
long fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );


@ -75,6 +75,7 @@ class Msg20Request {
char m_numSummaryLines ; // non-zero default
char m_expected ; // non-zero default
char m_allowPunctInPhrase ; // non-zero default
bool m_getHeaderTag ;
void *m_state ;
void *m_state2 ; // used by Msg25.cpp
long m_j ; // used by Msg25.cpp
@ -416,9 +417,11 @@ public:
// methods must be changed
// . also, all ptr_* should be char* and all size_* should be in bytes
char *ptr_tbuf ; // title buffer
char *ptr_htag ; // h1 tag buf
char *ptr_ubuf ; // url buffer
char *ptr_rubuf ; // redirect url buffer
char *ptr_sum ; // summary
char *ptr_displaySum ; // summary for displaying
char *ptr_dedupSum ; // summary for deduping
char *ptr_dbuf ; // display metas \0 separated
//char *ptr_sbuf ; // big sample buf for gigabits
char *ptr_gigabitSample ;
@ -512,9 +515,11 @@ public:
// . string sizes of the strings we store into m_buf[]
// . wordCountBuf is an exact word count 1-1 with each "range"
long size_tbuf ;
long size_htag ;
long size_ubuf ;
long size_rubuf ;
long size_sum ;
long size_displaySum ;
long size_dedupSum ;
long size_dbuf ;
//long size_sbuf ;
long size_gigabitSample ; // includes \0


@ -1330,31 +1330,52 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// m_printi < m_msg3a.m_numDocIds checks that kinda expect
// us to get all summaries for every docid. but when we
// do federated search we can get a ton of docids.
if ( m_printi >= m_docsToGetVisible ) {
logf(LOG_DEBUG,"query: got %li >= %li "
"summaries. done. "
"waiting on remaining "
"%li to return."
, m_printi
, m_docsToGetVisible
, m_numRequests-m_numReplies);
// wait for all msg20 replies to come in
if ( m_numRequests != m_numReplies ) break;
// then let's hack fix this then so we can call
// printSearchResultsTail()
m_printi = m_msg3a.m_numDocIds;
// set these to max so they do not launch another
// summary request, just in case, below
m_numRequests = m_msg3a.m_numDocIds;
m_numReplies = m_msg3a.m_numDocIds;
break;
}
// if ( m_printi >= m_docsToGetVisible ) {
// logf(LOG_DEBUG,"query: got %li >= %li "
// "summaries. done. "
// "waiting on remaining "
// "%li to return."
// , m_printi
// , m_docsToGetVisible
// , m_numRequests-m_numReplies);
// // wait for all msg20 replies to come in
// if ( m_numRequests != m_numReplies ) break;
// // then let's hack fix this then so we can call
// // printSearchResultsTail()
// m_printi = m_msg3a.m_numDocIds;
// // set these to max so they do not launch another
// // summary request, just in case, below
// m_numRequests = m_msg3a.m_numDocIds;
// m_numReplies = m_msg3a.m_numDocIds;
// break;
// }
// do not double count!
//if ( i <= m_lastProcessedi ) continue;
// do not repeat for this i
m_lastProcessedi = i;
// if we have printed enough summaries then do not launch
// any more, wait for them to come back in.
/// this is causing problems because we have a bunch of
// m_printi < m_msg3a.m_numDocIds checks that kinda expect
// us to get all summaries for every docid. but when we
// do federated search we can get a ton of docids.
// if ( m_printi >= m_docsToGetVisible ) {
// logf(LOG_DEBUG,"query: got %li >= %li "
// "summaries. done. "
// "waiting on remaining "
// "%li to return."
// , m_printi
// , m_docsToGetVisible
// , m_numRequests-m_numReplies);
// m_numRequests++;
// m_numReplies++;
// continue;
// }
// start up a Msg20 to get the summary
Msg20 *m = NULL;
if ( m_si->m_streamResults ) {
@ -1492,6 +1513,12 @@ bool Msg40::launchMsg20s ( bool recalled ) {
req.m_bigSampleMaxLen = bigSampleMaxLen;
req.m_titleMaxLen = 256;
req.m_titleMaxLen = cr->m_titleMaxLen;
req.m_summaryMaxLen = cr->m_summaryMaxLen;
// a special undocumented thing for getting <h1> tag
req.m_getHeaderTag = m_si->m_hr.getLong("geth1tag",0);
//req.m_numSummaryLines = cr->m_summaryMaxNumLines;
// let "ns" parm override
req.m_numSummaryLines = m_si->m_numLinesInSummary;
if(m_si->m_isAdmin && m_si->m_format == FORMAT_HTML )
req.m_getGigabitVector = true;
else req.m_getGigabitVector = false;
@ -1909,7 +1936,9 @@ bool Msg40::gotSummary ( ) {
// . wrap it up with Next 10 etc.
// . this is in PageResults.cpp
if ( m_si && m_si->m_streamResults && ! m_printedTail &&
if ( m_si &&
m_si->m_streamResults &&
! m_printedTail &&
m_printi >= m_msg3a.m_numDocIds ) {
m_printedTail = true;
printSearchResultsTail ( st );
@ -1960,10 +1989,19 @@ bool Msg40::gotSummary ( ) {
if ( ! launchMsg20s ( true ) ) return false;
// it won't launch now if we are bottlenecked waiting for
// m_printi's summary to come in
if ( m_si->m_streamResults )
if ( m_si->m_streamResults ) {
// it won't launch any if we printed out enough as well
// and it printed "waiting on remaining 0 to return"
// and it printed "waiting on remaining 0 to return".
// we shouldn't be waiting for more to come in b/c
// we are in gotSummary() so one just came in
// freeing up a msg20 to launch another, so assume
// this means we are basically done. and it
// set m_numRequests=m_msg3a.m_numDocIds etc.
//if ( m_numRequests == m_msg3a.m_numDocIds )
// goto printTail;
// otherwise, keep chugging
goto complete;
}
// maybe some were cached?
//goto refilter;
// it returned true, so m_numRequests == m_numReplies and


@ -204,11 +204,13 @@ bool Msg5::getList ( char rdbId ,
m_rdbId = rdbId;
m_collnum = collnum;
CollectionRec *ttt = g_collectiondb.getRec ( m_collnum );
if ( ! ttt ) {
g_errno = ENOCOLLREC;
return true;
}
// why was this here? it was messing up the statsdb ("graph") link
// in the admin panel.
//CollectionRec *ttt = g_collectiondb.getRec ( m_collnum );
//if ( ! ttt ) {
// g_errno = ENOCOLLREC;
// return true;
//}
m_list = list;
//m_startKey = startKey;


@ -53,6 +53,29 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
if ( ! add && ! cast ) g_collectiondb.deleteRecs ( r ) ;
*/
char format = r->getReplyFormat();
if ( format == FORMAT_XML || format == FORMAT_JSON ) {
// no addcoll given?
long page = g_pages.getDynamicPageNumber ( r );
char *addcoll = r->getString("addcoll",NULL);
char *delcoll = r->getString("delcoll",NULL);
if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
if ( page == PAGE_ADDCOLL && ! addcoll ) {
g_errno = EBADENGINEER;
char *msg = "no addcoll parm provided";
return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
}
if ( page == PAGE_DELCOLL && ! delcoll ) {
g_errno = EBADENGINEER;
char *msg = "no delcoll parm provided";
return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
}
return g_httpServer.sendSuccessReply(s,format);
}
char buf [ 64*1024 ];
SafeBuf p(buf, 64*1024);
// print standard header
@ -93,7 +116,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
p.safePrintf (
"<tr bgcolor=#%s>"
"<td><b>name of new collection to add</td>\n"
"<td><input type=text name=addColl size=30>"
"<td><input type=text name=addcoll size=30>"
"</td></tr>\n"
, LIGHT_BLUE
);
@ -142,7 +165,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
if ( ! cr ) continue;
p.safePrintf (
"<tr bgcolor=#%s><td>"
"<input type=checkbox name=delColl value=\"%s\"> "
"<input type=checkbox name=delcoll value=\"%s\"> "
"%s</td></tr>\n",
DARK_BLUE,
cr->m_coll,cr->m_coll);
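
With the block above, api clients hitting the addcoll/delcoll pages with &format=xml or &format=json get a machine-readable acknowledgment instead of the HTML form. A hedged example (the page path is an assumption):

// GET /admin/addcoll?addcoll=mynewcoll&format=json
//   -> the success reply from sendSuccessReply()
// GET /admin/addcoll?format=json            (addcoll parm missing)
//   -> error reply built by sendErrorReply(): "no addcoll parm provided"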


@ -117,6 +117,9 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
// . use to manually update spider times for a url
// . however, will not remove old scheduled spider times
// . mdw: made force on the default
// . mdw: don't use this anymore, use url filters, it has
// a "isaddurl" directive you can use where you can set the
// respider frequency to basically 0 to simulate this parm.
//st1->m_forceRespider = r->getLong("force",1); // 0);
// if no url given, just print a blank page
@ -135,7 +138,10 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
return sendReply ( st1 , true );
}
if ( spiderLinks )
sreq->m_avoidSpiderLinks = 0;
else
sreq->m_avoidSpiderLinks = 1;
// shortcut
Msg4 *m = &st1->m_msg4;


@ -63,7 +63,7 @@ public:
// hash of the subdomain or domain for this line in sitelist
long m_thingHash32;
// ptr to the line in CollectionRec::m_siteListBuf
char *m_patternStr;
long m_patternStrOff;
// offset of the url path in the pattern, 0 means none
short m_pathOff;
short m_pathLen;
@ -315,7 +315,10 @@ bool updateSiteListBuf ( collnum_t collnum ,
pd.m_thingHash32 = u.getHostHash32();
// . ptr to the line in CollectionRec::m_siteListBuf.
// . includes pointing to "exact:" too i guess and tag: later.
pd.m_patternStr = start;
// . store offset since CommandUpdateSiteList() passes us
// a temp buf that will be freed before copying the buf
// over to its permanent place at cr->m_siteListBuf
pd.m_patternStrOff = start - siteListArg;
// offset of the url path in the pattern, 0 means none
pd.m_pathOff = 0;
// scan url pattern, it should start at "s"
@ -432,30 +435,66 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// we handle.
long slot = dt->getSlot ( &sreq->m_domHash32 );
char *buf = cr->m_siteListBuf.getBufStart();
// loop over all the patterns that contain this domain and see
// the first one we match, and if we match a negative one.
for ( ; slot >= 0 ; slot = dt->getNextSlot(slot,&sreq->m_domHash32)) {
// get pattern
PatternData *pd = (PatternData *)dt->getValueFromSlot ( slot );
// point to string
char *patternStr = buf + pd->m_patternStrOff;
// is it negative? return NULL if so so url will be ignored
//if ( pd->m_patternStr[0] == '-' )
//if ( patternStr[0] == '-' )
// return NULL;
// otherwise, it has a path. skip if we don't match path ptrn
if ( pd->m_pathOff ) {
if ( ! myPath ) myPath = sreq->getUrlPath();
if ( strncmp (myPath,
pd->m_patternStr + pd->m_pathOff,
patternStr + pd->m_pathOff,
pd->m_pathLen ) )
continue;
}
// for entries like http://domain.com/ we have to match
// protocol and url can NOT be like www.domain.com to match.
// this is really like a regex like ^http://xyz.com/poo/boo/
if ( (patternStr[0]=='h' ||
patternStr[0]=='H') &&
( patternStr[1]=='t' ||
patternStr[1]=='T' ) &&
( patternStr[2]=='t' ||
patternStr[2]=='T' ) &&
( patternStr[3]=='p' ||
patternStr[3]=='P' ) ) {
char *x = patternStr+4;
// is it https:// ?
if ( *x == 's' || *x == 'S' ) x++;
// watch out for subdomains like http.foo.com
if ( *x != ':' ) goto nomatch;
// ok, we have to substring match exactly. like
// ^http://xyssds.com/foobar/
char *a = patternStr;
char *b = sreq->m_url;
for ( ; ; a++, b++ ) {
// stop matching when pattern is exhausted
if ( is_wspace_a(*a) || ! *a )
return patternStr;
if ( *a != *b ) break;
}
// we failed to match "pd" so try next line
continue;
}
nomatch:
// was the line just a domain and not a subdomain?
if ( pd->m_thingHash32 == sreq->m_domHash32 )
// this will be false if negative pattern i guess
return pd->m_patternStr;
return patternStr;
// was it just a subdomain?
if ( pd->m_thingHash32 == sreq->m_hostHash32 )
// this will be false if negative pattern i guess
return pd->m_patternStr;
return patternStr;
}
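
In other words, a site-list line that begins with a protocol is now matched as an anchored prefix of the full url, so subdomain variants no longer match it. A hedged sketch of the behavior of getMatchingUrlPattern() (the urls are illustrative, mirroring the help text below):

// site-list line:  http://justdomain.com/foo/
//   http://justdomain.com/foo/bar.html  -> matches (anchored prefix)
//   http://www.justdomain.com/foo/bar   -> no match (host differs)
// lines without a protocol still match by domain/subdomain hash;
// a NULL return means no line matched
char *pattern = getMatchingUrlPattern ( sc , sreq );
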
@ -573,7 +612,25 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
"Spider the url "
"<i>http://www.goodstuff.com/</i> and spider "
"any links we harvest that start with "
"<i>http://www.goodstuff.com/</i>"
"<i>http://www.goodstuff.com/</i>. NOTE: if the url "
"www.goodstuff.com redirects to foo.goodstuff.com then "
"foo.goodstuff.com still gets spidered "
"because it is considered to be manually added, but "
"no other urls from foo.goodstuff.com will be spidered."
"</td>"
"</tr>"
// protocol and subdomain match
"<tr>"
"<td>http://justdomain.com/foo/</td>"
"<td>"
"Spider the url "
"<i>http://justdomain.com/foo/</i> and spider "
"any links we harvest that start with "
"<i>http://justdomain.com/foo/</i>. "
"Urls that start with "
"<i>http://<b>www.</b>justdomain.com/</i>, for example, "
"will NOT match this."
"</td>"
"</tr>"


@ -804,6 +804,14 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
if ( srep && srep->m_hadDiffbotError )
msg = "Diffbot processing error";
// indicate specific diffbot error if we have it
if ( srep &&
srep->m_hadDiffbotError &&
srep->m_errCode &&
// stick with "diffbot processing error" for these...
srep->m_errCode != EDIFFBOTINTERNALERROR )
msg = mstrerror(srep->m_errCode);
// matching url filter, print out the expression
long ufn ;
ufn = ::getUrlFilterNum(sreq,
@ -1868,6 +1876,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// i guess bail if not there?
if ( ! cr ) {
log("crawlbot: missing coll rec for coll %s",collName);
char *msg = "invalid or missing collection rec";
return sendErrorReply2 (socket,fmt,msg);
}


@ -50,7 +50,11 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
// if /Top print the directory homepage
if ( catId == 1 || catId <= 0 ) {
// this is in PageRoot.cpp
printDirHomePage(sb,r);
if ( ! printDirHomePage(sb,r) )
// this will be an error if dmoz is not set up and
// an xml or json reply format was requested
return g_httpServer.sendErrorReply(s,500,
mstrerror(g_errno));
}
//
// try printing this shit out not as search results right now


@ -22,6 +22,7 @@ static bool processLoop ( void *state ) ;
class State2 {
public:
Msg22 m_msg22;
char m_format;
//TitleRec m_tr;
long m_niceness;
XmlDoc m_xd;
@ -76,7 +77,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
// get the collection rec
CollectionRec *cr = g_collectiondb.getRec ( coll );
if ( ! cr ) {
g_errno = ENOTFOUND;
g_errno = ENOCOLLREC;
log("query: Archived copy retrieval failed. "
"No collection record found for "
"collection \"%s\".",coll);
@ -103,6 +104,13 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
long long docId = r->getLongLong ( "d" , 0LL /*default*/ );
// get url
char *url = r->getString ( "u",NULL);
if ( docId == 0 && ! url ) {
g_errno = EMISSINGINPUT;
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
}
// . should we do a sequential lookup?
// . we need to match summary here so we need to know this
//bool seq = r->getLong ( "seq" , false );
@ -153,6 +161,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
st->m_isBanned = false;
st->m_noArchive = false;
st->m_socket = s;
st->m_format = r->getReplyFormat();
// default to 0 niceness
st->m_niceness = 0;
st->m_r.copy ( r );
@ -212,7 +221,7 @@ bool sendErrorReply ( void *state , long err ) {
TcpSocket *s = st->m_socket;
char tmp [ 1024*32 ] ;
sprintf ( tmp , "<b>had server-side error: %s</b><br>",
sprintf ( tmp , "%s",
mstrerror(g_errno));
// nuke state2
mdelete ( st , sizeof(State2) , "PageGet1" );
@ -358,6 +367,9 @@ bool processLoop ( void *state ) {
//p += gbstrlen ( p );
}
char format = st->m_format;
if ( format == FORMAT_XML ) sb.reset();
if ( format == FORMAT_JSON ) sb.reset();
// for undoing the stuff below
long startLen2 = sb.length();//p;
@ -383,6 +395,19 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
printDisclaimer = false;
if ( format == FORMAT_XML ) printDisclaimer = false;
if ( format == FORMAT_JSON ) printDisclaimer = false;
char tbuf[100];
tbuf[0] = 0;
time_t lastSpiderDate = xd->m_spideredTime;
if ( printDisclaimer ||
format == FORMAT_XML ||
format == FORMAT_JSON ) {
struct tm *timeStruct = gmtime ( &lastSpiderDate );
strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
}
// We should always be displaying this disclaimer.
// - May eventually want to display this at a different location
@ -425,10 +450,10 @@ bool processLoop ( void *state ) {
//p += gbstrlen ( p );
// then the spider date in GMT
time_t lastSpiderDate = xd->m_spideredTime;
struct tm *timeStruct = gmtime ( &lastSpiderDate );
char tbuf[100];
strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
// time_t lastSpiderDate = xd->m_spideredTime;
// struct tm *timeStruct = gmtime ( &lastSpiderDate );
// char tbuf[100];
// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
//p += gbstrlen ( p );
sb.safeStrcpy(tbuf);
@ -562,6 +587,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
includeHeader = false;
if ( format == FORMAT_XML ) includeHeader = false;
if ( format == FORMAT_JSON ) includeHeader = false;
//mfree(uq, uqCapacity, "PageGet");
// undo the header writes if we should
if ( ! includeHeader ) {
@ -571,6 +599,35 @@ bool processLoop ( void *state ) {
else sb.m_length=startLen1;//p=start1;
}
//sb.safeStrcpy(tbuf);
if ( format == FORMAT_XML ) {
sb.safePrintf("<response>\n");
sb.safePrintf("<statusCode>0</statusCode>\n");
sb.safePrintf("<statusMsg>Success</statusMsg>\n");
sb.safePrintf("<url><![CDATA[");
sb.cdataEncode(xd->m_firstUrl.m_url);
sb.safePrintf("]]></url>\n");
sb.safePrintf("<docId>%llu</docId>\n",xd->m_docId);
sb.safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
lastSpiderDate);
sb.safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
}
if ( format == FORMAT_JSON ) {
sb.safePrintf("{\"response\":{\n");
sb.safePrintf("\t\"statusCode\":0,\n");
sb.safePrintf("\t\"statusMsg\":\"Success\",\n");
sb.safePrintf("\t\"url\":\"");
sb.jsonEncode(xd->m_firstUrl.m_url);
sb.safePrintf("\",\n");
sb.safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
sb.safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
sb.safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
}
// identify start of <title> tag we wrote out
char *sbstart = sb.getBufStart();
char *sbend = sb.getBufEnd();
@ -681,6 +738,10 @@ bool processLoop ( void *state ) {
if ( ctype == CT_TEXT ) pre = true ; // text/plain
if ( ctype == CT_DOC ) pre = true ; // filtered msword
if ( ctype == CT_PS ) pre = true ; // filtered postscript
if ( format == FORMAT_XML ) pre = false;
if ( format == FORMAT_JSON ) pre = false;
// if it is content-type text, add a <pre>
if ( pre ) {//p + 5 < bufEnd && pre ) {
sb.safePrintf("<pre>");
@ -706,10 +767,15 @@ bool processLoop ( void *state ) {
// do not do term highlighting if json
if ( xd->m_contentType == CT_JSON )
queryHighlighting = false;
SafeBuf tmp;
SafeBuf *xb = &sb;
if ( format == FORMAT_XML ) xb = &tmp;
if ( format == FORMAT_JSON ) xb = &tmp;
if ( ! queryHighlighting ) {
sb.safeMemcpy ( content , contentLen );
xb->safeMemcpy ( content , contentLen );
//p += contentLen ;
}
else {
@ -733,7 +799,7 @@ bool processLoop ( void *state ) {
Matches m;
m.setQuery ( &qq );
m.addMatches ( &ww );
hilen = hi.set ( &sb , // p , avail ,
hilen = hi.set ( xb , // p , avail ,
&ww , &m ,
false /*doStemming?*/ ,
st->m_clickAndScroll ,
@ -742,6 +808,21 @@ bool processLoop ( void *state ) {
log(LOG_DEBUG, "query: Done highlighting cached page content");
}
if ( format == FORMAT_XML ) {
sb.safePrintf("\t<content><![CDATA[");
sb.cdataEncode ( xb->getBufStart() );
sb.safePrintf("]]></content>\n");
sb.safePrintf("</response>\n");
}
if ( format == FORMAT_JSON ) {
sb.safePrintf("\t\"content\":\"\n");
sb.jsonEncode ( xb->getBufStart() );
sb.safePrintf("\"\n}\n}\n");
}
// if it is content-type text, add a </pre>
if ( pre ) { // p + 6 < bufEnd && pre ) {
sb.safeMemcpy ( "</pre>" , 6 );
@ -784,6 +865,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
contentType = "application/json";
if ( format == FORMAT_XML ) contentType = "text/xml";
if ( format == FORMAT_JSON ) contentType = "application/json";
// nuke state2
mdelete ( st , sizeof(State2) , "PageGet1" );
delete (st);
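
For reference, a hedged sketch of the cached-page reply this builds when &format=json is requested; the field values are illustrative and whitespace is approximate:

// {"response":{
//      "statusCode":0,
//      "statusMsg":"Success",
//      "url":"http://example.com/",
//      "docId":123456,
//      "cachedTimeUTC":784111777,
//      "cachedTimeStr":"Nov 06, 1994 UTC",
//      "content":"...the page content, json-encoded..."
// }
// }
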


@ -44,6 +44,15 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
mnew ( msg7, sizeof(Msg7) , "PageInject" );
char format = hr->getReplyFormat();
// no collection parm?
if ( format != FORMAT_HTML && ! hr->getString("c",NULL) ) {
g_errno = ENOCOLLREC;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(sock,g_errno,msg,NULL);
}
// set this. also sets gr->m_hr
GigablastRequest *gr = &msg7->m_gr;
// this will fill in GigablastRequest so all the parms we need are set
@ -78,6 +87,9 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
return sendReply ( msg7 );
}
// a scrape request?
if ( gr->m_queryToScrape && gr->m_queryToScrape[0] ) {
//char *uf="http://www.google.com/search?num=50&"
@ -117,7 +129,45 @@ bool sendReply ( void *state ) {
//long hostId = msg7->m_msg7.m_hostId;
long long docId = xd->m_docId;
long hostId = 0;//msg7->m_msg7.m_hostId;
// set g_errno to index code
if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno )
g_errno = xd->m_indexCode;
char format = gr->m_hr.getReplyFormat();
// no url parm?
if ( ! g_errno && ! gr->m_url && format != FORMAT_HTML )
g_errno = EMISSINGINPUT;
if ( g_errno ) {
long save = g_errno;
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
g_errno = save;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(sock,save,msg,NULL);
}
char abuf[32];
SafeBuf am(abuf,32,0,false);
// a success reply, include docid and url i guess
if ( format == FORMAT_XML ) {
am.safePrintf("\t<docId>%lli</docId>\n",xd->m_docId);
char *addMsg = am.getBufStart();
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
return g_httpServer.sendSuccessReply(sock,format,addMsg);
}
if ( format == FORMAT_JSON ) {
am.safePrintf("\t\"docId\":%lli,\n",xd->m_docId);
char *addMsg = am.getBufStart();
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
return g_httpServer.sendSuccessReply(sock,format,addMsg);
}
//
// debug
@ -159,11 +209,6 @@ bool sendReply ( void *state ) {
if ( url && gr->m_shortReply ) {
char buf[1024*32];
char *p = buf;
// set g_errno to index code
if ( xd->m_indexCodeValid &&
xd->m_indexCode &&
! g_errno )
g_errno = xd->m_indexCode;
// return docid and hostid
if ( ! g_errno ) p += sprintf ( p ,
"0,docId=%lli,hostId=%li," ,
@ -275,6 +320,12 @@ bool Msg7::inject ( void *state ,
return true;
}
if ( ! gr->m_url ) {
log("inject: no url provied to inject");
g_errno = EBADURL;
return true;
}
//char *coll = cr->m_coll;
m_state = state;


@ -257,6 +257,10 @@ bool Msg1c::reindexQuery ( char *query ,
//CollectionRec *cr = g_collectiondb.getRec ( collnum );
// sanity fix
if ( endNum - startNum > MAXDOCIDSTOCOMPUTE )
endNum = startNum + MAXDOCIDSTOCOMPUTE;
//CollectionRec *cr = g_collectiondb.getRec ( coll );
// reset again just in case
m_req.reset();


@ -149,6 +149,7 @@ bool sendReply ( State0 *st , char *reply ) {
mdelete(st, sizeof(State0), "PageResults2");
delete st;
/*
if ( format == FORMAT_XML ) {
SafeBuf sb;
sb.safePrintf("<?xml version=\"1.0\" "
@ -174,6 +175,7 @@ bool sendReply ( State0 *st , char *reply ) {
charset );
return true;
}
*/
long status = 500;
if (savedErr == ETOOMANYOPERANDS ||
@ -244,7 +246,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
//long xml = hr->getLong("xml",0);
// what format should search results be in? default is html
char format = getFormatFromRequest ( hr );
char format = hr->getReplyFormat();//getFormatFromRequest ( hr );
// get the dmoz catid if given
//long searchingDmoz = hr->getLong("dmoz",0);
@ -543,6 +545,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
if ( cr ) st->m_collnum = cr->m_collnum;
else st->m_collnum = -1;
// turn this on for json output, unless diffbot collection
if ( format == FORMAT_JSON && ! cr->m_isCustomCrawl )
st->m_header = 1;
// take this out here as well!
// limit here
// long maxpp = cr->m_maxSearchResultsPerQuery ;
@ -1009,6 +1015,11 @@ bool gotResults ( void *state ) {
// if already printed from Msg40.cpp, bail out now
if ( si->m_streamResults ) {
// this will be our final send
if ( st->m_socket->m_streamingMode ) {
log("res: socket still in streaming mode. wtf?");
st->m_socket->m_streamingMode = false;
}
log("msg40: done streaming. nuking state.");
mdelete(st, sizeof(State0), "PageResults2");
delete st;
@ -1019,12 +1030,12 @@ bool gotResults ( void *state ) {
//char *coll = si->m_coll2;
//long collLen = si->m_collLen2;
collnum_t collnum = si->m_firstCollnum;
//collnum_t collnum = si->m_firstCollnum;
// collection rec must still be there since SearchInput references
// into it, and it must be the SAME ptr too!
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr || cr != si->m_cr ) {
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
if ( ! cr ) { // || cr != si->m_cr ) {
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
}
@ -1705,12 +1716,6 @@ bool printSearchResultsHeader ( State0 *st ) {
(long)moreFollow);
}
if ( st->m_header && si->m_format == FORMAT_JSON ) {
sb->safePrintf("\"objects\":[\n");
return true;
}
// . did he get a spelling recommendation?
// . do not use htmlEncode() on this anymore since receiver
// of the XML feed usually does not want that.
@ -1720,6 +1725,27 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf ("]]></spell>\n");
}
if ( si->m_format == FORMAT_JSON && st->m_spell[0] ) {
sb->safePrintf ("\t\"spell\":\"");
sb->jsonEncode(st->m_spell);
sb->safePrintf ("\"\n,");
}
// for diffbot collections only...
if ( st->m_header &&
si->m_format == FORMAT_JSON &&
cr->m_isCustomCrawl ) {
sb->safePrintf("\"objects\":[\n");
return true;
}
if ( si->m_format == FORMAT_JSON &&
! cr->m_isCustomCrawl ) {
sb->safePrintf("\"results\":[\n");
return true;
}
// debug
if ( si->m_debug )
logf(LOG_DEBUG,"query: Displaying up to %li results.",
@ -2821,6 +2847,40 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
long catid ,
State0 *st ) {
char format = si->m_format;
if ( format == FORMAT_XML ) {
sb->safePrintf("\t\t<dmozCat>\n"
"\t\t\t<dmozCatId>%li</dmozCatId>\n"
"\t\t\t<dmozCatStr><![CDATA["
,catid);
// print the name of the dmoz category
char xbuf[256];
SafeBuf xb(xbuf,256,0,false);
g_categories->printPathFromId(&xb, catid, false,si->m_isRTL);
sb->cdataEncode(xb.getBufStart());
sb->safePrintf("]]></dmozCatStr>\n"
"\t\t</dmozCat>\n");
return true;
}
if ( format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"dmozCat\":{\n"
"\t\t\t\"dmozCatId\":%li,\n"
"\t\t\t\"dmozCatStr\":\""
,catid);
// print the name of the dmoz category
char xbuf[256];
SafeBuf xb(xbuf,256,0,false);
g_categories->printPathFromId(&xb, catid, false,si->m_isRTL);
sb->jsonEncode(xb.getBufStart());
sb->safePrintf("\"\n"
"\t\t},\n");
return true;
}
//uint8_t queryLanguage = langUnknown;
uint8_t queryLanguage = si->m_queryLangId;
// Don't print category if not in native language category
@ -3011,7 +3071,13 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
}
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t<result>\n" );
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<result>\n" );
if ( si->m_format == FORMAT_JSON ) {
if ( *numPrintedSoFar != 0 ) sb->safePrintf(",\n");
sb->safePrintf("\t{\n" );
}
Highlight hi;
@ -3112,7 +3178,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// if we have a thumbnail show it next to the search result,
// base64 encoded
if ( (si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) &&
if ( //(si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) &&
//! mr->ptr_imgUrl &&
mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
@ -3128,9 +3194,25 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
si->m_format );
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<imageHeight>%li</imageHeight>\n",
ti->m_dx);
sb->safePrintf("\t\t<imageWidth>%li</imageWidth>\n",
ti->m_dy);
sb->safePrintf("\t\t<imageWidth>%li</imageWidth>\n",
ti->m_dx);
sb->safePrintf("\t\t<origImageHeight>%li"
"</origImageHeight>\n",
ti->m_origDY);
sb->safePrintf("\t\t<origImageWidth>%li"
"</origImageWidth>\n",
ti->m_origDX);
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"imageHeight\":%li,\n",
ti->m_dy);
sb->safePrintf("\t\t\"imageWidth\":%li,\n",
ti->m_dx);
sb->safePrintf("\t\t\"origImageHeight\":%li,\n",
ti->m_origDY);
sb->safePrintf("\t\t\"origImageWidth\":%li,\n",
ti->m_origDX);
}
}
@ -3357,7 +3439,6 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
}
long hlen;
//copy all summary and title excerpts for this result into here
//char tt[1024*32];
@ -3375,8 +3456,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
frontTag = "<font style=\"background-color:yellow\">" ;
}
long cols = 80;
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t\t<title><![CDATA[");
SafeBuf hb;
if ( str && strLen && si->m_doQueryHighlighting ) {
hlen = hi.set ( &hb,
@ -3393,29 +3473,55 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
backTag,
0,
0 ); // niceness
// reassign!
str = hb.getBufStart();
strLen = hb.getLength();
//if (!sb->utf8Encode2(tt, hlen)) return false;
if ( ! sb->brify ( hb.getBufStart(),
hb.getLength(),
0,
cols) ) return false;
// if ( si->m_format != FORMAT_JSON )
// if ( ! sb->brify ( hb.getBufStart(),
// hb.getLength(),
// 0,
// cols) ) return false;
}
else if ( str && strLen ) {
// . use "UNTITLED" if no title
// . msg20 should supply the dmoz title if it can
if ( strLen == 0 &&
si->m_format != FORMAT_XML &&
si->m_format != FORMAT_JSON ) {
str = "<i>UNTITLED</i>";
strLen = gbstrlen(str);
}
if ( str &&
strLen &&
( si->m_format == FORMAT_HTML ||
si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
) {
// determine if TiTle wraps, if it does add a <br> count for
// each wrap
//if (!sb->utf8Encode2(str , strLen )) return false;
if ( ! sb->brify ( str,strLen,0,cols) ) return false;
}
// . use "UNTITLED" if no title
// . msg20 should supply the dmoz title if it can
if ( strLen == 0 ) {
if(!sb->safePrintf("<i>UNTITLED</i>"))
return false;
}
// close up the title tag
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></title>\n");
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<title><![CDATA[");
if ( str ) sb->cdataEncode(str);
sb->safePrintf("]]></title>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"title\":\"");
if ( str ) sb->jsonEncode(str);
sb->safePrintf("\",\n");
}
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("</a><br>\n" ) ;
if ( si->m_format == FORMAT_HTML )
sb->safePrintf ("</a><br>\n" ) ;
// close the title tag stuff
@ -3424,6 +3530,22 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("</b></a>\n");
//
// print <h1> tag contents. hack for client.
//
if ( mr->ptr_htag && mr->size_htag > 1 ) {
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<h1Tag><![CDATA[");
sb->cdataEncode(mr->ptr_htag);
sb->safePrintf("]]></h1Tag>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"h1Tag\":\"");
sb->jsonEncode(mr->ptr_htag);
sb->safePrintf("\",\n");
}
}
/////
//
@ -3440,6 +3562,8 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
"]]>"
"</contentType>\n",
cs);
else if ( si->m_format == FORMAT_JSON )
sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs);
else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
sb->safePrintf(" <b><font style=color:white;"
"background-color:maroon;>");
@ -3460,13 +3584,18 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// . then the summary
// . "s" is a string of null terminated strings
char *send;
//char *send;
// do the normal summary
str = mr->ptr_sum;
strLen = mr->size_sum-1;
str = mr->ptr_displaySum;
// sometimes the summary is longer than requested because for
// summary deduping purposes (see "pss" parm in Parms.cpp) we do not
// get it as short as requested. so use mr->size_displaySum here
// not mr->size_sum
strLen = mr->size_displaySum - 1;//-1;
// this includes the terminating \0 or \0\0 so back up
if ( strLen < 0 ) strLen = 0;
send = str + strLen;
//send = str + strLen;
// dmoz summary might override if we are showing a dmoz topic page
if ( dmozSummary ) {
@ -3474,8 +3603,6 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
strLen = gbstrlen(dmozSummary);
}
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<sum><![CDATA[");
bool printSummary = true;
// do not print summaries for widgets by default unless overridden
// with &summaries=1
@ -3485,13 +3612,25 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
hr->getLong("summaries",0) == 0 )
printSummary = false;
if ( printSummary )
if ( printSummary && si->m_format == FORMAT_HTML )
sb->brify ( str , strLen, 0 , cols ); // niceness = 0
// close xml tag
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></sum>\n");
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<sum><![CDATA[");
sb->cdataEncode(str);
sb->safePrintf("]]></sum>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"sum\":\"");
sb->jsonEncode(str);
sb->safePrintf("\",\n");
}
// new line if not xml
else if ( strLen ) sb->safePrintf("<br>\n");
if ( si->m_format == FORMAT_HTML && strLen )
sb->safePrintf("<br>\n");
////////////
//
@ -3557,6 +3696,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
sb->safeMemcpy ( url , urlLen );
sb->safePrintf("]]></url>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"url\":\"");
sb->jsonEncode ( url , urlLen );
sb->safePrintf("\",\n");
}
// now the last spidered date of the document
@ -3617,6 +3761,49 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
datedbDate);
}
if ( si->m_format == FORMAT_JSON ) {
// doc size in Kilobytes
sb->safePrintf ( "\t\t\"size\":\"%4.0fk\",\n",
(float)mr->m_contentLen/1024.0);
// . docId for possible cached link
// . might have merged a bunch together
sb->safePrintf("\t\t\"docId\":%lli,\n",mr->m_docId );
// . show the site root
// . for homepages.com/users/fred/mypage.html this will be
// homepages.com/users/fred/
// . for www.xyz.edu/~foo/burp/ this will be
// www.xyz.edu/~foo/ etc.
long siteLen = 0;
char *site = NULL;
// seems like this isn't the way to do it, cuz Tagdb.cpp
// adds the "site" tag itself and we do not always have it
// in the XmlDoc::ptr_tagRec... so do it this way:
site = mr->ptr_site;
siteLen = mr->size_site-1;
//char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec);
sb->safePrintf("\t\t\"site\":\"");
if ( site && siteLen > 0 ) sb->safeMemcpy ( site , siteLen );
sb->safePrintf("\",\n");
//long sh = hash32 ( site , siteLen );
//sb->safePrintf ("\t\t<siteHash32>%lu</siteHash32>\n",sh);
//long dh = uu.getDomainHash32 ();
//sb->safePrintf ("\t\t<domainHash32>%lu</domainHash32>\n",dh);
// spider date
sb->safePrintf ( "\t\t\"spidered\":%lu,\n",
mr->m_lastSpidered);
// backwards compatibility for buzz
sb->safePrintf ( "\t\t\"firstIndexedDateUTC\":%lu,\n"
, mr->m_firstIndexedDate);
sb->safePrintf( "\t\t\"contentHash32\":%lu,\n"
, mr->m_contentHash32);
// pub date
long datedbDate = mr->m_datedbDate;
// show the datedb date as "<pubDate>" for now
if ( datedbDate != -1 )
sb->safePrintf ( "\t\t\"pubdate\":%lu,\n",
datedbDate);
}
// . we also store the outlinks in a linkInfo structure
@ -3642,6 +3829,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
k->m_ip, // hostHash, but use ip for now
(long)k->m_firstIndexedDate ,
(long)k->m_datedbDate );
if ( si->m_format == FORMAT_XML ) {
// result
sb->safePrintf("\t\t<language><![CDATA[%s]]>"
@ -3654,6 +3842,16 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
"</charset>\n", charset);
}
if ( si->m_format == FORMAT_JSON ) {
// result
sb->safePrintf("\t\t\"language\":\"%s\",\n",
getLanguageString(mr->m_language));
char *charset = get_charset_str(mr->m_charset);
if(charset)
sb->safePrintf("\t\t\"charset\":\"%s\",\n",charset);
}
//
// end more xml stuff
//
@ -3797,10 +3995,10 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
,ix
);
// reindex
sb->safePrintf(" - <a style=color:red; href=\"/addurl?u=");
sb->safePrintf(" - <a style=color:red; href=\"/addurl?urls=");
sb->urlEncode ( url , gbstrlen(url) , false );
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
sb->safePrintf("&rand64=%llu&force=1\">respider</a>",rand64);
sb->safePrintf("&rand64=%llu\">respider</a>",rand64);
}
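For illustration only (the host path and parms are from the hunk above, but the target URL and timestamp below are made up), the respider link now emitted by this block looks like:

 - <a style=color:red; href="/addurl?urls=http%3A%2F%2Fwww.example.com%2F&rand64=1404512549123">respider</a>

i.e. the parm changed from u= to urls= and the old &force=1 is no longer appended.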
@ -4041,6 +4239,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( ! dp ) {
if ( si->m_format == FORMAT_XML )
sb->safePrintf ("\t</result>\n\n");
if ( si->m_format == FORMAT_JSON ) {
// remove last ,\n
sb->m_length -= 2;
sb->safePrintf ("\n\t}\n\n");
}
// wtf?
//char *xx=NULL;*xx=0;
// at least close up the table
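The sb->m_length -= 2 above is the usual trailing-comma fix for streamed JSON: every member is printed with a trailing ",\n" and the last one is trimmed before the object is closed. A minimal stand-alone sketch of the same idea, using std::string in place of SafeBuf (names here are illustrative, not from this commit):

#include <cstdio>
#include <string>

// drop the final ",\n" that the last member left behind, then close the object
static void closeJsonObject(std::string &buf) {
    if (buf.size() >= 2 && buf.compare(buf.size() - 2, 2, ",\n") == 0)
        buf.erase(buf.size() - 2);
    buf += "\n\t}\n";
}

int main() {
    std::string buf = "\t{\n";
    buf += "\t\t\"docId\":34111603247,\n";
    buf += "\t\t\"site\":\"www.doi.gov\",\n";
    closeJsonObject(buf);
    printf("%s", buf.c_str());
    return 0;
}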
@ -4126,7 +4329,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( minScore < 0.0 || totalPairScore < minScore )
minScore = totalPairScore;
// we need to set "ft" for xml stuff below
if ( si->m_format == FORMAT_XML ) continue;
if ( si->m_format != FORMAT_HTML ) continue;
//sb->safePrintf("<table border=1><tr><td><center><b>");
// print pair text
//long qtn1 = fps->m_qtermNum1;
@ -4209,7 +4412,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( minScore < 0.0 || totalSingleScore < minScore )
minScore = totalSingleScore;
// we need to set "ft" for xml stuff below
if ( si->m_format == FORMAT_XML ) continue;
if ( si->m_format != FORMAT_HTML ) continue;
//sb->safePrintf("<table border=1><tr><td><center><b>");
// print pair text
//long qtn = fss->m_qtermNum;

View File

@ -22,7 +22,7 @@
//char *printNumResultsDropDown ( char *p, long n, bool *printedDropDown);
bool printNumResultsDropDown ( SafeBuf& sb, long n, bool *printedDropDown);
//static char *printTopDirectory ( char *p, char *pend );
static bool printTopDirectory ( SafeBuf& sb );
static bool printTopDirectory ( SafeBuf& sb , char format );
// this prints the last five queries
//static long printLastQueries ( char *p , char *pend ) ;
@ -586,7 +586,7 @@ bool expandHtml ( SafeBuf& sb,
if ( head[i+1] == 't' ) {
i += 1;
//p = printTopDirectory ( p, pend );
printTopDirectory ( sb );
printTopDirectory ( sb , FORMAT_HTML );
continue;
}
@ -963,7 +963,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
"onLoad=\""
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
"var url='/addurl?u="
"var url='/addurl?urls="
);
sb.urlEncode ( url );
// propagate "admin" if set
@ -1042,7 +1042,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
if ( ! coll )
coll = "";
sb.safePrintf("<input name=u type=text size=60 value=\"");
sb.safePrintf("<input name=urls type=text size=60 value=\"");
if ( url ) {
SafeBuf tmp;
tmp.safePrintf("%s",url);
@ -1092,7 +1092,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
//"alert('shit');"
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
"var url='/addurl?u="
"var url='/addurl?urls="
, root );
sb.urlEncode ( url );
// propagate "admin" if set
@ -1128,6 +1128,11 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
char format = r->getReplyFormat();
if ( format != FORMAT_HTML )
return printTopDirectory ( sb , format );
sb.safePrintf("<html>\n");
sb.safePrintf("<head>\n");
//sb.safePrintf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
@ -1216,7 +1221,7 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("\n");
printTopDirectory ( sb );
printTopDirectory ( sb , FORMAT_HTML );
sb.safePrintf("<br><br>\n");
@ -1395,10 +1400,12 @@ long printLastQueries ( char *p , char *pend ) {
//char *printTopDirectory ( char *p, char *pend ) {
bool printTopDirectory ( SafeBuf& sb ) {
bool printTopDirectory ( SafeBuf& sb , char format ) {
long nr = g_catdb.getRdb()->getNumTotalRecs();
// if no recs in catdb, print instructions
if ( g_catdb.getRdb()->getNumTotalRecs() == 0 )
if ( nr == 0 && format == FORMAT_HTML)
return sb.safePrintf("<center>"
"<b>DMOZ functionality is not set up.</b>"
"<br>"
@ -1411,6 +1418,12 @@ bool printTopDirectory ( SafeBuf& sb ) {
"</b>"
"</center>");
// send back an xml/json error reply
if ( nr == 0 && format != FORMAT_HTML ) {
g_errno = EDMOZNOTREADY;
return false;
}
//char topList[4096];
//sprintf(topList,
return sb.safePrintf (
@ -1619,26 +1632,26 @@ static bool s_inprogress = false;
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
// . get fields from cgi field of the requested url
// . get the search query
long urlLen = 0;
char *url = r->getString ( "u" , &urlLen , NULL /*default*/);
char *url = hr->getString ( "urls" , &urlLen , NULL /*default*/);
// see if they provided a url of a file of urls if they did not
// provide a url to add directly
bool isAdmin = g_conf.isCollAdmin ( s , r );
bool isAdmin = g_conf.isCollAdmin ( sock , hr );
long ufuLen = 0;
char *ufu = NULL;
if ( isAdmin )
// get the url of a file of urls (ufu)
ufu = r->getString ( "ufu" , &ufuLen , NULL );
//if ( isAdmin )
// // get the url of a file of urls (ufu)
// ufu = hr->getString ( "ufu" , &ufuLen , NULL );
// can't be too long, that's obnoxious
if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
g_errno = EBUFTOOSMALL;
g_msg = " (error: url too long)";
return g_httpServer.sendErrorReply(s,500,"url too long");
return g_httpServer.sendErrorReply(sock,500,"url too long");
}
// get the collection
//long collLen = 0;
@ -1650,20 +1663,20 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//}
// get collection rec
CollectionRec *cr = g_collectiondb.getRec ( r );
CollectionRec *cr = g_collectiondb.getRec ( hr );
// bitch if no collection rec found
if ( ! cr ) {
g_errno = ENOCOLLREC;
g_msg = " (error: no collection)";
return g_httpServer.sendErrorReply(s,500,"no coll rec");
return g_httpServer.sendErrorReply(sock,500,"no coll rec");
}
// . make sure the ip is not banned
// . we may also have an exclusive list of IPs for private collections
if ( ! cr->hasSearchPermission ( s ) ) {
if ( ! cr->hasSearchPermission ( sock ) ) {
g_errno = ENOPERM;
g_msg = " (error: permission denied)";
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
}
@ -1672,8 +1685,8 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//
if ( ! url ) {
SafeBuf sb;
printAddUrlHomePage ( sb , NULL , r );
return g_httpServer.sendDynamicPage(s,
printAddUrlHomePage ( sb , NULL , hr );
return g_httpServer.sendDynamicPage(sock,
sb.getBufStart(),
sb.length(),
// 120 secs cachetime
@ -1686,19 +1699,19 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
200,
NULL, // cookie
"UTF-8",
r);
hr);
}
//
// run the ajax script on load to submit the url now
//
long id = r->getLong("id",0);
long id = hr->getLong("id",0);
// if we are not being called by the ajax loader, the put the
// ajax loader script into the html now
if ( id == 0 ) {
SafeBuf sb;
printAddUrlHomePage ( sb , url , r );
return g_httpServer.sendDynamicPage ( s,
printAddUrlHomePage ( sb , url , hr );
return g_httpServer.sendDynamicPage ( sock,
sb.getBufStart(),
sb.length(),
// don't cache any more
@ -1711,7 +1724,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
200,
NULL, // cookie
"UTF-8",
r);
hr);
}
//
@ -1742,7 +1755,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
if ( msg ) {
SafeBuf sb;
sb.safePrintf("%s",msg);
g_httpServer.sendDynamicPage (s,
g_httpServer.sendDynamicPage (sock,
sb.getBufStart(),
sb.length(),
3600,//-1, // cachetime
@ -1764,10 +1777,10 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
g_errno = ENOMEM;
log("PageAddUrl: new(%i): %s",
sizeof(State1i),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno)); }
mnew ( st1 , sizeof(State1i) , "PageAddUrl" );
// save socket and isAdmin
st1->m_socket = s;
st1->m_socket = sock;
st1->m_isAdmin = isAdmin;
/*
@ -1809,12 +1822,12 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//unsigned long h = ipdom ( s->m_ip );
// . use top 2 bytes now, some isps have large blocks
// . if this causes problems, then they can do pay for inclusion
unsigned long h = iptop ( s->m_ip );
unsigned long h = iptop ( sock->m_ip );
long codeLen;
char* code = r->getString("code", &codeLen);
if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
char* code = hr->getString("code", &codeLen);
if(g_autoBan.hasCode(code, codeLen, sock->m_ip)) {
long uipLen = 0;
char* uip = r->getString("uip",&uipLen);
char* uip = hr->getString("uip",&uipLen);
long hip = 0;
//use the uip when we have a raw query to test if
//we can submit
@ -1824,18 +1837,18 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
}
}
st1->m_strip = r->getLong("strip",0);
st1->m_strip = hr->getLong("strip",0);
// . Remember, for cgi, if the box is not checked, then it is not
// reported in the request, so set default return value to 0
// . support both camel case and all lower-cases
st1->m_spiderLinks = r->getLong("spiderLinks",0);
st1->m_spiderLinks = r->getLong("spiderlinks",st1->m_spiderLinks);
st1->m_spiderLinks = hr->getLong("spiderLinks",0);
st1->m_spiderLinks = hr->getLong("spiderlinks",st1->m_spiderLinks);
// . should we force it into spiderdb even if already in there
// . use to manually update spider times for a url
// . however, will not remove old scheduled spider times
// . mdw: made force on by default
st1->m_forceRespider = r->getLong("force",1); // 0);
st1->m_forceRespider = hr->getLong("force",1); // 0);
long now = getTimeGlobal();
// . allow 1 submit every 1 hour
@ -1850,7 +1863,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
delete (st1);
// use cachetime of 3600 so it does not re-inject if you hit
// the back button!
g_httpServer.sendDynamicPage (s,
g_httpServer.sendDynamicPage (sock,
sb.getBufStart(),
sb.length(),
3600,//-1, // cachetime
@ -1878,6 +1891,17 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
*/
// set this. also sets gr->m_hr
GigablastRequest *gr = &st1->m_msg7.m_gr;
// this will fill in GigablastRequest so all the parms we need are set
g_parms.setGigablastRequest ( sock , hr , gr );
// this is really an injection, not add url, so make
// GigablastRequest::m_url point to Gigablast::m_urlsBuf because
// the PAGE_ADDURLS2 parms in Parms.cpp fill in the m_urlsBuf.
// HACK!
gr->m_url = gr->m_urlsBuf;
//
// inject using msg7
//

View File

@ -51,7 +51,7 @@ static void sendReply ( void *st ) ;
// . returns false if blocked, otherwise true
// . sets g_errno on error
bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
char *cgi;
long cgiLen;
@ -201,6 +201,13 @@ void sendReply ( void *state ) {
strncpy( startTimeStr, ctime( &st->m_startDate ), 30 );
strncpy( endTimeStr, ctime( &st->m_endDate ), 30 );
buf.safePrintf(
"<b>Graph of various query performance statistics.</b>"
"<br>"
"<br>"
);
buf.safePrintf("<center>\n");
if ( ! g_conf.m_useStatsdb )
@ -208,6 +215,7 @@ void sendReply ( void *state ) {
"Turn on in the master controls.</b>"
"</font>\n" );
buf.safePrintf("<table %s>\n",TABLE_STYLE);
buf.safePrintf("<tr><td bgcolor=#%s>"

Pages.cpp (326 changed lines)
View File

@ -72,13 +72,16 @@ static WebPage s_pages[] = {
//{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 ,
// "widget page",
// sendPageWidget, 0 ,NULL,NULL,PG_NOAPI},
// this is the public addurl, /addurl, if you are using the
// api use PAGE_ADDURL2 which is /admin/addurl. so we set PG_NOAPI here
{ PAGE_ADDURL , "addurl" , 0 , "add url" , 0 , 0 ,
"Page where you can add url for spidering",
sendPageAddUrl, 0 ,NULL,NULL,0},
sendPageAddUrl, 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_GET , "get" , 0 , "get" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_CLIENT,
"gets cached url",
"gets cached web page",
sendPageGet , 0 ,NULL,NULL,0},
{ PAGE_LOGIN , "login" , 0 , "login" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_SPAM | USER_CLIENT,
@ -99,15 +102,15 @@ static WebPage s_pages[] = {
// use post now for the "site list" which can be big
{ PAGE_BASIC_SETTINGS, "admin/settings", 0 , "settings",1, M_POST ,
"Basic settings page.", sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
"basic settings page", sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_BASIC_STATUS, "admin/status", 0 , "status",1, 0 ,
"Basic status page.", sendPageBasicStatus , 0 ,NULL,NULL,0},
"basic status page", sendPageBasicStatus , 0 ,NULL,NULL,0},
//{ PAGE_BASIC_DIFFBOT, "admin/diffbot", 0 , "diffbot",1, 0 ,
// "Basic diffbot page.", sendPageBasicDiffbot , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 ,
"Basic security page.", sendPageGeneric , 0 ,NULL,NULL,0},
"basic security page", sendPageGeneric , 0 ,NULL,NULL,0},
{ PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
"Basic search page.", sendPageRoot , 0 ,NULL,NULL,PG_NOAPI},
"basic search page", sendPageRoot , 0 ,NULL,NULL,PG_NOAPI},
@ -115,7 +118,8 @@ static WebPage s_pages[] = {
//USER_MASTER | USER_PROXY ,
"master controls page",
sendPageGeneric , 0 ,NULL,NULL,0},
{ PAGE_SEARCH , "admin/search" , 0 , "search controls" , 1 , 1,
// use POST for html head/tail and page root html. might be large.
{ PAGE_SEARCH , "admin/search" , 0 , "search controls" ,1,M_POST,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 ,NULL,NULL,0},
@ -151,10 +155,11 @@ static WebPage s_pages[] = {
// { PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
// "what sites can be spidered",
// sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 ,M_POST,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 ,NULL,NULL,0},
// until we get this working, set PG_NOAPI
sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0,M_MULTI ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
@ -180,17 +185,17 @@ static WebPage s_pages[] = {
// master admin pages
{ PAGE_STATS , "admin/stats" , 0 , "stats" , 0 , 0 ,
//USER_MASTER | USER_PROXY ,
"statistics page",
"general statistics",
sendPageStats , 0 ,NULL,NULL,0},
{ PAGE_STATSDB , "admin/statsdb" , 0 , "graph" , 0 , 0 ,
{ PAGE_GRAPH , "admin/graph" , 0 , "graph" , 0 , 0 ,
//USER_MASTER ,
"statistics page",
sendPageStatsdb , 2 /*niceness*/ ,NULL,NULL,0},
"query stats graph page",
sendPageGraph , 2 /*niceness*/ ,NULL,NULL,0},
{ PAGE_PERF , "admin/perf" , 0 , "performance" , 0 , 0 ,
//USER_MASTER | USER_PROXY ,
"master performance page",
"function performance graph",
sendPagePerf , 0 ,NULL,NULL,0},
{ PAGE_SOCKETS , "admin/sockets" , 0 , "sockets" , 0 , 0 ,
@ -237,7 +242,7 @@ static WebPage s_pages[] = {
{ PAGE_API , "admin/api" , 0 , "api" , 0 , 0 ,
//USER_MASTER | USER_ADMIN ,
"api page",
sendPageAPI , 0 ,NULL,NULL,0},
sendPageAPI , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_RULES , "admin/siterules", 0 , "site rules", 1, M_POST,
//USER_ADMIN | USER_MASTER ,
"site rules page",
@ -258,7 +263,7 @@ static WebPage s_pages[] = {
{ PAGE_SPIDERDB , "admin/spiderdb" , 0 , "spider queue" , 0 , 0 ,
//USER_ADMIN | USER_MASTER ,
"spiderdb page",
"spider queue",
sendPageSpiderdb , 0 ,NULL,NULL,0},
//{ PAGE_PRIORITIES, "admin/priorities" , 0 , "priority controls",1,1,
// //USER_ADMIN | USER_MASTER ,
@ -293,7 +298,7 @@ static WebPage s_pages[] = {
sendPageParser , 2 ,NULL,NULL,PG_NOAPI},
{ PAGE_SITEDB , "admin/tagdb" , 0 , "tagdb" , 0 , M_POST,
//USER_MASTER | USER_ADMIN,
"tagdb page to add/remove/get tags",
"add/remove/get tags for sites/urls",
sendPageTagdb , 0 ,NULL,NULL,0},
{ PAGE_CATDB , "admin/catdb" , 0 , "catdb" , 0,M_POST,
//USER_MASTER | USER_ADMIN,
@ -518,6 +523,9 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
if ( ! publicPage && ! isAdmin )
return sendPageLogin ( s , r );
if ( page == PAGE_CRAWLBOT && ! isAdmin )
log("pages: accessing a crawlbot page without admin privs. "
"no parms can be changed.");
/*
// is request coming from a local ip?
@ -1088,9 +1096,17 @@ bool Pages::printAdminTop (SafeBuf *sb ,
if ( isBasic ) menu = "basic";
sb->safePrintf("<br>");
sb->safePrintf("<b><font color=gray size=+2>"
"%s &gt; %s &gt; %s</font></b>"
"%s &gt; %s &gt; %s "
"&nbsp; "
"</font>"
"</b>"
//"<a href=/%s?c=%s&showparms=1&format=xml>xml</a> "
//"<a href=/%s?c=%s&showparms=1&format=json>json</a> "
"<br><br>\n",
coll, menu, s_pages[page].m_name);
coll, menu, s_pages[page].m_name
//,s_pages[page].m_filename , coll
//,s_pages[page].m_filename , coll
);
@ -2479,7 +2495,10 @@ bool sendPageAPI ( TcpSocket *s , HttpRequest *r ) {
g_pages.printLogo ( &p , coll );
p.safePrintf("</td></tr></table><br><br>");
p.safePrintf("NOTE: All APIs support both the GET and POST methods. "
"If the size of your request is more than 2K you "
"should use POST.");
p.safePrintf("<br><br>");
p.safePrintf("<div style=padding-left:10%%>"
"<font size=+2><b>API by pages</b></font>"
@ -2592,8 +2611,17 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf("</a>");
// description of page
sb->safePrintf("<font size=-0> - %s</font><br>",
s_pages[PAGENUM].m_desc);
sb->safePrintf("<font size=-0> - %s "
" &nbsp; "
"[ <b>output response in</b> "
"<a href=/%s?showparms=1&format=xml>xml</a> "
"or <a href=/%s?showparms=1&format=json>json</a> "
"or <a href=/%s>html</a> ] "
"</font><br>",
s_pages[PAGENUM].m_desc,
pageStr,
pageStr,
pageStr);
sb->safePrintf("</div><br>");
// begin new list of centered tables
@ -2603,7 +2631,7 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>Parms</b></tr></tr>"
"<center><b>Input</b></tr></tr>"
"<tr bgcolor=#%s>"
"<td><b>#</b></td>"
"<td><b>parm</b></td>"
@ -2615,9 +2643,75 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
, TABLE_STYLE
, DARK_BLUE );
const char *blue = LIGHT_BLUE;
const char *blues[] = {DARK_BLUE,LIGHT_BLUE};
long count = 1;
//
// every page supports the:
// 1) &format=xml|html|json
// 2) &showparms=0|1
// 3) &c=<collectionName>
// parms. we support them in sendPageGeneric() for pages like
// /admin/master /admin/search /admin/spider so you can see
// the settings.
// put these in Parms.cpp, but use PF_DISPLAY flag so we ignore them
// in convertHttpRequestToParmList() and we do not show them on the
// page itself.
//
// page display/output parms
sb->safePrintf("<tr bgcolor=%s>"
"<td>%li</td>\n"
"<td><b>format</b></td>"
"<td>STRING</td>"
"<td>output format</td>"
"<td>html</td>"
"<td>Display output in this format.</td>"
"</tr>"
, blues[count%2]
, count
);
count++;
// for pages that have settings...
if ( PAGENUM == PAGE_MASTER ||
PAGENUM == PAGE_SEARCH ||
PAGENUM == PAGE_SPIDER ) {
sb->safePrintf("<tr bgcolor=%s>"
"<td>%li</td>\n"
"<td><b>showparms</b></td>"
"<td>BOOL (0 or 1)</td>"
"<td>show parms</td>"
"<td></td>"
"<td>Display the values of all settings.</td>"
"</tr>"
, blues[count%2]
, count
);
count++;
}
// . master controls are for all collections so no need for this
// . we already have this in the parms list for some pages so only
// show for selected pages here
// if ( PAGENUM != PAGE_MASTER ) {
// sb->safePrintf("<tr bgcolor=%s>"
// "<td>%li</td>\n"
// "<td><b>c</b></td>"
// "<td>STRING</td>"
// "<td>Collection</td>"
// "<td></td>"
// "<td>The name of the collection. "
// "<font color=green><b>REQUIRED</b></font>"
// "</td>"
// "</tr>"
// , blues[count%2]
// , count
// );
// count++;
// }
//char *lastPage = NULL;
//Parm *lastParm = NULL;
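As a usage note, the display parms documented above ride on any admin request; for example (host, port and collection name are made up, not from this commit):

http://127.0.0.1:8000/admin/master?format=json
http://127.0.0.1:8000/admin/search?c=main&showparms=1&format=xml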
@ -2643,10 +2737,6 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
if ( pageNum != PAGENUM ) continue;
if ( blue == (const char *)LIGHT_BLUE ) blue = DARK_BLUE;
else if(blue==(const char *)DARK_BLUE ) blue = LIGHT_BLUE;
SafeBuf tmp;
char diff = 0;
bool printVal = false;
@ -2664,7 +2754,7 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
if ( diff == 1 )
sb->safePrintf ( "<tr bgcolor=orange>");
else
sb->safePrintf ( "<tr bgcolor=#%s>",blue);
sb->safePrintf ( "<tr bgcolor=#%s>",blues[count%2]);
sb->safePrintf("<td>%li</td>",count++);
@ -2721,6 +2811,17 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
// end input parm table we started below
sb->safePrintf("</table><br>\n\n");
// do not print the tables below now,
// we provide output links for xml, json and html
sb->safePrintf("</center>");
if ( PAGENUM != PAGE_GET &&
PAGENUM != PAGE_RESULTS )
return true;
sb->safePrintf("<center>");
//
// done printing parm table
//
@ -2731,22 +2832,82 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>XML Output</b></tr></tr>"
"<tr><td>"
"<center><b>Example XML Output</b> "
"(&format=xml)</tr></tr>"
"<tr><td bgcolor=%s>"
, TABLE_STYLE
, LIGHT_BLUE
);
sb->safePrintf("<pre>\n");
char *desc = s_pages[PAGENUM].m_xmlOutputDesc;
if ( ! desc )
desc = "<response>\n"
"\t<status>N</status> "
"# 0 on success, otherwise an "
"error code\n"
"\t<statusMsg>S</statusMsg> "
"# \"Success\" on success, "
"otherwise the error message."
"</response>";
sb->htmlEncode ( desc);
// bool showParms = false;
// if ( PAGENUM == PAGE_MASTER ||
// PAGENUM == PAGE_SPIDER ||
// PAGENUM == PAGE_SEARCH
// )
// showParms = true;
sb->safePrintf("<pre style=max-width:500px;>\n");
char *get = "<html><title>Some web page title</title>"
"<head>My first web page</head></html>";
// example output in xml
if ( PAGENUM == PAGE_GET ) {
SafeBuf xb;
xb.safePrintf("<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg>Success</statusMsg>\n"
"\t<url><![CDATA[http://www.doi.gov/]]></url>\n"
"\t<docId>34111603247</docId>\n"
"\t<cachedTimeUTC>1404512549</cachedTimeUTC>\n"
"\t<cachedTimeStr>Jul 04, 2014 UTC"
"</cachedTimeStr>\n"
"\t<content><![CDATA[");
xb.cdataEncode(get);
xb.safePrintf("]]></content>\n");
xb.safePrintf("</response>\n");
sb->htmlEncode ( xb.getBufStart() );
}
if ( PAGENUM == PAGE_RESULTS ) {
SafeBuf xb;
xb.safePrintf("<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg>Success</statusMsg>\n"
"\t<currentTimeUTC>1404513734</currentTimeUTC>\n"
"\t<responseTimeMS>284</responseTimeMS>\n"
"\t<docsInCollection>226</docsInCollection>\n"
"\t<hits>193</hits>\n"
"\t<moreResultsFollow>1</moreResultsFollow>\n"
"\t<result>\n"
"\t\t<imageBase64>/9j/4AAQSkZJRgABAQAAAQABA..."
"</imageBase64>\n"
"\t\t<imageHeight>350</imageHeight>\n"
"\t\t<imageWidth>223</imageWidth>\n"
"\t\t<origImageHeight>470</origImageHeight>\n"
"\t\t<origImageWidth>300</origImageWidth>\n"
"\t\t<title><![CDATA[U.S....]]></title>\n"
"\t\t<sum>Department of the Interior protects "
"America's natural resources and</sum>\n"
"\t\t<url><![CDATA[www.doi.gov]]></url>\n"
"\t\t<size> 64k</size>\n"
"\t\t<docId>34111603247</docId>\n"
"\t\t<site>www.doi.gov</site>\n"
"\t\t<spidered>1404512549</spidered>\n"
"\t\t<firstIndexedDateUTC>1404512549"
"</firstIndexedDateUTC>\n"
"\t\t<contentHash32>2680492249</contentHash32>\n"
"\t\t<language>English</language>\n"
"\t</result>\n"
"</response>\n");
sb->htmlEncode ( xb.getBufStart() );
}
sb->safePrintf("</pre>");
sb->safePrintf ( "</td></tr></table><br>\n\n" );
@ -2756,23 +2917,74 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>JSON Output</b></tr></tr>"
"<tr><td>"
"<center><b>Example JSON Output</b> "
"(&format=json)</tr></tr>"
"<tr><td bgcolor=%s>"
, TABLE_STYLE
, LIGHT_BLUE
);
sb->safePrintf("<pre>\n");
desc = s_pages[PAGENUM].m_jsonOutputDesc;
if ( ! desc )
desc = "{ \"response:\"{\n"
"\t\"status\":N, "
"# 0 on success, otherwise an "
"error code\n"
"\t\"statusMsg\":\"xxx\" "
"# xxx is \"Success\" on success, "
"otherwise the error message.\n"
"\t}\n"
"}";
sb->htmlEncode ( desc);
// example output in xml
if ( PAGENUM == PAGE_GET ) {
sb->safePrintf(
"{ \"response\":{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\",\n"
"\t\"url\":\"http://www.doi.gov/\",\n"
"\t\"docId\":34111603247,\n"
"\t\"cachedTimeUTC\":1404512549,\n"
"\t\"cachedTimeStr\":\"Jul 04, 2014 UTC\",\n"
"\t\"content\":\"");
SafeBuf js;
js.jsonEncode(get);
sb->htmlEncode(js.getBufStart());
sb->safePrintf("\"\n"
"}\n"
"}\n");
}
if ( PAGENUM == PAGE_RESULTS ) {
sb->safePrintf(
"{ \"response\":{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\",\n"
"\t\"currentTimeUTC\":1404588231,\n"
"\t\"responseTimeMS\":312,\n"
"\t\"docsInCollection\":226,\n"
"\t\"hits\":193,\n"
"\t\"moreResultsFollow\":1,\n"
"\t\"results\":[\n"
"\t{\n"
"\t\t\"imageBase64\":\"/9j/4AAQSkZJR...\",\n"
"\t\t\"imageHeight\":350,\n"
"\t\t\"imageWidth\":223,\n"
"\t\t\"origImageHeight\":470,\n"
"\t\t\"origImageWidth\":300,\n"
"\t\t\"title\":\"U.S....\",\n"
"\t\t\"sum\":\"Department of the Interior "
"protects America's natural resources.\",\n"
"\t\t\"url\":\"www.doi.gov\",\n"
"\t\t\"size\":\" 64k\",\n"
"\t\t\"docId\":34111603247,\n"
"\t\t\"site\":\"www.doi.gov\",\n"
"\t\t\"spidered\":1404512549,\n"
"\t\t\"firstIndexedDateUTC\":1404512549,\n"
"\t\t\"contentHash32\":2680492249,\n"
"\t\t\"language\":\"English\"\n"
"\t}\n"
"\t,\n"
"\t...\n"
"]\n"
"}\n"
);
}
sb->safePrintf("</pre>");
sb->safePrintf ( "</td></tr></table><br>\n\n" );

View File

@ -85,7 +85,7 @@ bool sendPageAPI ( TcpSocket *s , HttpRequest *r );
bool sendPageWordVec ( TcpSocket *s , HttpRequest *r );
bool sendPageQualityAgent ( TcpSocket *s , HttpRequest *r );
bool sendPageThesaurus ( TcpSocket *s , HttpRequest *r );
bool sendPageStatsdb ( TcpSocket *s , HttpRequest *r );
bool sendPageGraph ( TcpSocket *s , HttpRequest *r );
// values for m_usePost:
#define M_GET 0x00
@ -110,8 +110,8 @@ class WebPage {
char *m_desc; // page description
bool (* m_function)(TcpSocket *s , HttpRequest *r);
long m_niceness;
char *m_xmlOutputDesc;
char *m_jsonOutputDesc;
char *m_reserved1;
char *m_reserved2;
char m_pgflags;
};
@ -340,7 +340,7 @@ enum {
PAGE_HOSTS ,
PAGE_STATS , // 10
PAGE_STATSDB ,
PAGE_GRAPH , // PAGE_STATSDB ,
PAGE_PERF ,
PAGE_SOCKETS ,

Parms.cpp (760 changed lines)

File diff suppressed because it is too large

View File

@ -152,7 +152,6 @@ class GigablastRequest {
char *m_urlsBuf;
char m_stripBox;
char m_harvestLinksBox;
char m_forceRespiderBox;
/////////////
//
@ -200,6 +199,9 @@ class GigablastRequest {
#define PF_REQUIRED 0x4000
#define PF_REBUILDPROXYTABLE 0x8000
#define PF_NOHTML 0x10000
class Parm {
public:
char *m_title; // displayed above m_desc on admin gui page
@ -317,7 +319,7 @@ class Parms {
long nc ,
long pd ,
bool isCrawlbot ,
bool isJSON,
char format, //bool isJSON,
TcpSocket *sock
);
@ -353,7 +355,7 @@ class Parms {
long pd ,
bool lastRow ,
bool isCrawlbot = false,
bool isJSON = false ) ;
char format = FORMAT_HTML);//bool isJSON = false ) ;
char *getTHIS ( HttpRequest *r , long page );

View File

@ -58,6 +58,13 @@ bool RdbDump::set ( //char *coll ,
// use 0 for collectionless
if ( rdb && rdb->m_isCollectionLess ) m_collnum = 0;
// are we like catdb/statsdb etc.?
m_doCollCheck = true;
if ( rdb && rdb->m_isCollectionLess ) m_doCollCheck = false;
// RdbMerge also calls us but rdb is always set to NULL and it was
// causing a merge on catdb (collectionless) to screw up
if ( ! rdb ) m_doCollCheck = false;
/*
if ( ! coll && g_catdb.getRdb() == rdb )
strcpy(m_coll, "catdb");
@ -1023,14 +1030,18 @@ void RdbDump::continueDumping() {
// if someone reset/deleted the collection we were dumping...
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) {
// . do not do this for statsdb/catdb which always use collnum of 0
// . RdbMerge also calls us but gives a NULL m_rdb so we can't
// set m_isCollectionless to false
if ( ! cr && m_doCollCheck ) {
g_errno = ENOCOLLREC;
// m_file is invalid if collrec got nuked because so did
// the Rdbbase which has the files
log("db: continue dumping lost collection");
}
// bitch about errors
else if (g_errno)log("db: Dump to %s had error writing: %s.",
if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));
// go back now if we were NOT dumping a tree

View File

@ -183,6 +183,8 @@ class RdbDump {
//char m_coll [ MAX_COLL_LEN + 1 ];
collnum_t m_collnum;
bool m_doCollCheck;
bool m_tried;
bool m_isSuspended;

View File

@ -1212,6 +1212,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
long hkp = 0;
char useHalfKeys = false;
// these guys always use a collnum of 0
bool doCollRecCheck = true;
if ( !strcmp(m_dbname,"catdb") ) doCollRecCheck = false;
if ( !strcmp(m_dbname,"statsdb") ) doCollRecCheck = false;
if ( !strcmp(m_dbname,"indexdb") ) useHalfKeys = true;
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
@ -1232,12 +1238,17 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
// for posdb
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }
// bad collnum?
collnum_t cn = m_collnums[i];
if ( m_rdbId>=0 && (cn >= g_collectiondb.m_numRecs || cn < 0) )
return log("db: bad collnum in tree");
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
return log("db: collnum is obsolete in tree");
if ( doCollRecCheck ) {
collnum_t cn = m_collnums[i];
if ( m_rdbId>=0 &&
(cn >= g_collectiondb.m_numRecs || cn < 0) )
return log("db: bad collnum in tree");
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
return log("db: collnum is obsolete in tree");
}
// if no left/right kid it MUST be -1
if ( m_left[i] < -1 )
return log(

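A condensed, stand-alone sketch of the per-record collnum check above, using a stand-in for the g_collectiondb lookup (catdb and statsdb always store collnum 0, so they skip the check entirely):

#include <cstdio>
#include <cstring>

// stand-in for the lookup done against g_collectiondb in checkTree2()
static bool collnumOk(int cn, const char *dbname,
                      int numRecs, const bool *recExists) {
    // collectionless rdbs keep everything under collnum 0
    if (!strcmp(dbname, "catdb") || !strcmp(dbname, "statsdb"))
        return true;
    if (cn < 0 || cn >= numRecs) return false;  // bad collnum in tree
    return recExists[cn];                       // false means obsolete collnum
}

int main() {
    bool recs[3] = { true, false, true };
    printf("%d\n", collnumOk(1, "posdb",   3, recs)); // 0: obsolete collnum
    printf("%d\n", collnumOk(5, "posdb",   3, recs)); // 0: out of range
    printf("%d\n", collnumOk(7, "statsdb", 3, recs)); // 1: check skipped
    return 0;
}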
View File

@ -392,6 +392,10 @@ long SafeBuf::saveToFile ( char *dir , char *filename ) {
return dumpToFile ( buf );
}
long SafeBuf::save ( char *fullFilename ) {
return dumpToFile ( fullFilename );
}
long SafeBuf::dumpToFile(char *filename ) {
retry22:
long fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC,
@ -2785,6 +2789,15 @@ bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
}
*/
bool SafeBuf::jsonEncode ( char *src , long srcLen ) {
char c = src[srcLen];
src[srcLen] = 0;
bool status = jsonEncode ( src );
src[srcLen] = c;
return status;
}
// encode into json
bool SafeBuf::safeUtf8ToJSON ( char *utf8 ) {
if ( ! utf8 ) return true;

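A small stand-alone sketch (not the commit's code) of the length-bounded jsonEncode overload added above; the point to note is that the source buffer must be writable, because the overload briefly NUL-terminates it in place:

#include <cstdio>
#include <string>

// stand-in for SafeBuf::safeUtf8ToJSON(): escape quotes, backslashes, newlines
static void jsonEncode(std::string &out, const char *src) {
    for (; *src; src++) {
        switch (*src) {
        case '"':  out += "\\\""; break;
        case '\\': out += "\\\\"; break;
        case '\n': out += "\\n";  break;
        default:   out += *src;   break;
        }
    }
}

// mirrors the new overload: bound the input by temporarily NUL-terminating it
static void jsonEncode(std::string &out, char *src, long srcLen) {
    char saved = src[srcLen];
    src[srcLen] = '\0';
    jsonEncode(out, src);
    src[srcLen] = saved;
}

int main() {
    char buf[] = "say \"hi\"\nignored tail";
    std::string out;
    jsonEncode(out, buf, 9);        // encode only the first 9 bytes
    printf("%s\n", out.c_str());    // prints: say \"hi\"\n
    return 0;
}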
View File

@ -56,6 +56,7 @@ struct SafeBuf {
long saveToFile ( char *dir , char *filename ) ;
long dumpToFile(char *filename);
long save ( char *dir, char *fname){return saveToFile(dir,fname); };
long save ( char *fullFilename ) ;
long fillFromFile(char *filename);
long fillFromFile(char *dir,char *filename);
@ -107,6 +108,8 @@ struct SafeBuf {
bool safeStrcpy ( char *s ) ;
//bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;
bool jsonEncode ( char *utf8 ) { return safeUtf8ToJSON(utf8); };
bool jsonEncode ( char *utf8 , long utf8Len );
bool csvEncode ( char *s , long len , long niceness = 0 );

View File

@ -12,7 +12,7 @@
#include "Timedb.h"
#include "PageResults.h"
char getFormatFromRequest ( class HttpRequest *hr ) ;
//char getFormatFromRequest ( class HttpRequest *hr ) ;
SearchInput::SearchInput() {
reset();
@ -257,7 +257,8 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
return false;
}
// add to our list
if (!m_collnumBuf.safeMemcpy(&cr->m_collnum,sizeof(collnum_t)))
if (!m_collnumBuf.safeMemcpy(&tmpcr->m_collnum,
sizeof(collnum_t)))
return false;
// restore the \0 character we wrote in there
*end = c;
@ -272,10 +273,10 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// use default collection if none provided
if ( ! p && ! token && m_collnumBuf.length() <= 0 ) {
// get default collection rec
CollectionRec *dr = g_collectiondb.getRec (coll);
cr = g_collectiondb.getRec (coll);
// add to our list
if ( dr &&
!m_collnumBuf.safeMemcpy(&dr->m_collnum,
if ( cr &&
!m_collnumBuf.safeMemcpy(&cr->m_collnum,
sizeof(collnum_t)))
return false;
}
@ -294,9 +295,9 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// must have had one
if ( ! cr ) {
log("si: collection does not exist");
g_errno = ENOCOLLREC;
return false;
log("si: si. collection does not exist");
//g_errno = ENOCOLLREC;
//return false;
}
// and set from the http request. will set m_coll, etc.
@ -310,7 +311,7 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
//////
// get the format. "xml" "html" "json" --> FORMAT_HTML, FORMAT_CSV ...
char tmpFormat = getFormatFromRequest ( &m_hr );
char tmpFormat = m_hr.getReplyFormat();//getFormatFromRequest ( &m_hr);
// now override automatic defaults for special cases
if ( tmpFormat != FORMAT_HTML ) {
m_familyFilter = 0;
@ -960,51 +961,6 @@ uint8_t SearchInput::detectQueryLanguage(void) {
}
*/
char getFormatFromRequest ( HttpRequest *r ) {
char *formatStr = r->getString("format");
//if ( ! formatStr ) return FORMAT_HTML;
char format = FORMAT_HTML;
// what format should search results be in? default is html
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
if ( formatStr && strcmp(formatStr,"iframe")==0)
format=FORMAT_WIDGET_IFRAME;
if ( formatStr && strcmp(formatStr,"ajax")==0)
format=FORMAT_WIDGET_AJAX;
if ( formatStr && strcmp(formatStr,"append")==0)
format=FORMAT_WIDGET_APPEND;
// support old api &xml=1 to mean &format=1
if ( r->getLong("xml",0) ) {
format = FORMAT_XML;
}
// also support &json=1
if ( r->getLong("json",0) ) {
format = FORMAT_JSON;
}
if ( r->getLong("csv",0) ) {
format = FORMAT_CSV;
}
if ( r->getLong("iframe",0) ) {
format = FORMAT_WIDGET_IFRAME;
}
if ( r->getLong("ajax",0) ) {
format = FORMAT_WIDGET_AJAX;
}
if ( r->getLong("append",0) ) {
format = FORMAT_WIDGET_APPEND;
}
return format;
}
//char getFormatFromRequest ( HttpRequest *r ) {
//
//}

View File

@ -5207,6 +5207,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// if spidering disabled then do not do this crap
if ( ! g_conf.m_spideringEnabled ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// wait for clock to sync with host #0
if ( ! isClockInSync() ) {
@ -5517,6 +5519,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
// one host is probably down and was never up to begin with
if ( ! s_countsAreValid ) return;
@ -6617,7 +6621,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
return true;
}
// turned off?
if ( ( (! g_conf.m_spideringEnabled
if ( ( (! g_conf.m_spideringEnabled ||
// or if trying to exit
g_process.m_mode == EXIT_MODE
) && // ! g_conf.m_webSpideringEnabled ) &&
! sreq->m_isInjecting ) ||
// repairing the collection's rdbs?
@ -8584,7 +8590,16 @@ bool sendPage ( State11 *st ) {
g_stats.m_allErrorsOld[i] == 0 &&
bucketsNew[i] == 0 && bucketsOld[i] == 0 ) continue;
sb.safePrintf (
"<tr bgcolor=#%s><td><b>%s</b></td>"
"<tr bgcolor=#%s>"
"<td><b><a href=/search?c=%s&q=gbstatusmsg%%3A"
"%%22"
,
LIGHT_BLUE , cr->m_coll );
sb.urlEncode(mstrerror(i));
sb.safePrintf ("%%22>"
"%s"
"</a>"
"</b></td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
@ -8592,7 +8607,6 @@ bool sendPage ( State11 *st ) {
"<td>%li</td>"
"<td>%li</td>"
"</tr>\n" ,
LIGHT_BLUE,
mstrerror(i),
g_stats.m_allErrorsNew[i] +
g_stats.m_allErrorsOld[i],
@ -10259,6 +10273,14 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
errCode != EDNSDEAD &&
// assume diffbot is temporarily experiencing errs
errCode != EDIFFBOTINTERNALERROR &&
// if diffbot received empty content when d'lding
errCode != EDIFFBOTEMPTYCONTENT &&
// or diffbot tcp timed out when d'lding the url
errCode != EDIFFBOTREQUESTTIMEDOUT &&
// if diffbot closed the socket on us...
errCode != EDIFFBOTMIMEERROR &&
// or the diffbot reply itself was not 200 (OK)
errCode != EDIFFBOTBADHTTPSTATUS &&
// out of memory while crawling?
errCode != ENOMEM &&
errCode != ENETUNREACH &&
@ -10332,6 +10354,22 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( strncmp(p,"isreindex",9) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
//if ( (bool)sreq->m_urlIsDocId==val ) continue;
if ( (bool)sreq->m_isPageReindex==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"iscontacty",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;

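The new isreindex token follows the same walk as the other url-filter keywords: test the token against the SpiderRequest, then jump past the next "&&" or accept the rule. A greatly simplified, single-rule sketch of that walk (the token names come from this file, everything else is illustrative):

#include <cstdio>
#include <cstring>

// returns true if every "&&"-joined token in "rule" matches the request
static bool ruleMatches(const char *rule, bool isPageReindex) {
    const char *p = rule;
    while (p && *p) {
        while (*p == ' ') p++;
        if (strncmp(p, "isreindex", 9) == 0 && !isPageReindex)
            return false;            // token failed, caller tries the next rule
        // other tokens ("iscontacty", "isdocidbased", ...) would be tested here
        p = strstr(p, "&&");         // skip to the next constraint
        if (p) p += 2;
    }
    return true;                     // every constraint held
}

int main() {
    printf("%d\n", ruleMatches("isreindex", true));   // 1
    printf("%d\n", ruleMatches("isreindex", false));  // 0
    return 0;
}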
View File

@ -58,6 +58,7 @@ bool Summary::set2 ( Xml *xml ,
bool doStemming ,
long maxSummaryLen ,
long maxNumLines ,
long numDisplayLines ,
long maxNumCharsPerLine ,
//long bigSampleRadius ,
//long bigSampleMaxLen ,
@ -81,6 +82,9 @@ bool Summary::set2 ( Xml *xml ,
// to see if it has all the query terms...
//if ( maxNumLines <= 0 ) return true;
m_numDisplayLines = numDisplayLines;
m_displayLen = 0;
//m_useDateLists = useDateLists;
//m_exclDateList = exclDateList;
//m_begPubDateList = begPubDateList;
@ -232,7 +236,12 @@ bool Summary::set2 ( Xml *xml ,
// highest scoring window around each term. And then find the highest
// of those over all the matching terms.
//
for ( long numFinal = 0; numFinal < maxNumLines; numFinal++ ){
long numFinal;
for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ){
if ( numFinal == m_numDisplayLines )
m_displayLen = p - m_summary;
// reset these at the top of each loop
Match *maxm;
long long maxScore = 0;
@ -508,6 +517,9 @@ bool Summary::set2 ( Xml *xml ,
bb[j] |= D_USED;
}
if ( numFinal <= m_numDisplayLines )
m_displayLen = p - m_summary;
/*end = gettimeofdayInMilliseconds();
if ( end - start > 10 )
log ( LOG_WARN,"summary: took %llims to finish doing summary "
@ -530,18 +542,25 @@ bool Summary::set2 ( Xml *xml ,
m_summaryExcerptLen[0] = p - m_summary;
m_numExcerpts = 1;
}
// in this case we only have one summary line
if ( m_numDisplayLines > 0 )
m_displayLen = p - m_summary;
}
// If we still didn't find a summary, get the default summary
if ( p == m_summary )
if ( p == m_summary ) {
// then return the default summary
return getDefaultSummary ( xml,
words,
sections,
pos,
//bigSampleRadius,
maxSummaryLen );
bool status = getDefaultSummary ( xml,
words,
sections,
pos,
//bigSampleRadius,
maxSummaryLen );
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
return status;
}
// if we don't find a summary, there's no need to NULL terminate
if ( p != m_summary ) *p++ = '\0';
@ -954,6 +973,10 @@ bool Summary::getDefaultSummary ( Xml *xml,
m_summaryLen = xml->getMetaContent(p,maxSummaryLen,
"description",11);
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
if ( m_summaryLen > 0 ) {
m_summaryExcerptLen[0] = m_summaryLen;
m_numExcerpts = 1;
@ -1056,6 +1079,10 @@ bool Summary::getDefaultSummary ( Xml *xml,
*p++ = '\0';
// set length
m_summaryLen = p - m_summary;
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
if ( m_summaryLen > 50000 ) { char*xx=NULL;*xx=0; }
return true;
}

View File

@ -78,6 +78,7 @@ class Summary {
//long collLen ,
bool doStemming ,
long maxSummaryLen ,
long numDisplayLines ,
long maxNumLines ,
long maxNumCharsPerLine ,
//long bigSampleRadius ,
@ -237,6 +238,12 @@ class Summary {
//bool m_freeBuf;
//char m_localBuf[10032];
// if getting more lines for deduping than we need for displaying,
// how big is that part of the summary to display?
long m_numDisplayLines;
long m_displayLen;
long getSummaryDisplayLen() { return m_displayLen; }
long m_maxNumCharsPerLine;
long m_titleVersion;

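A stand-alone sketch of the display-length bookkeeping added above: the summary builder may generate extra excerpts purely for deduping (the "pss" parm), so it records how many bytes cover the first numDisplayLines excerpts and the UI shows only that prefix. Everything below is illustrative, not the project's Summary class:

#include <cstdio>
#include <string>
#include <vector>

struct Sum {
    std::string text;     // full summary, including dedup-only excerpts
    long displayLen = 0;  // how much of it to actually display
};

static Sum buildSummary(const std::vector<std::string> &excerpts,
                        long maxNumLines, long numDisplayLines) {
    Sum s;
    long n = 0;
    for (const std::string &e : excerpts) {
        if (n >= maxNumLines) break;
        if (n == numDisplayLines) s.displayLen = (long)s.text.size();
        s.text += e;
        n++;
    }
    // fewer excerpts than the display budget: show everything we built
    if (n <= numDisplayLines) s.displayLen = (long)s.text.size();
    return s;
}

int main() {
    Sum s = buildSummary({"one. ", "two. ", "three."}, 3, 2);
    printf("display: %.*s\n", (int)s.displayLen, s.text.c_str()); // one. two.
    printf("dedup:   %s\n", s.text.c_str());                      // all three
    return 0;
}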
View File

@ -136,7 +136,9 @@ bool TcpServer::init ( void (* requestHandler)(TcpSocket *s) ,
struct sockaddr_in name;
// parm
int options;
// if port is -1 don't set up a listening socket
// if port is -1 don't set up a listening socket, this is used
// for things like blaster that are clients only. or the qatest()
// function.
if ( m_port == -1 || m_port == 0 ) goto skipServer;
// . set up our connection listening socket
// . sets g_errno and returns -1 on error
@ -756,7 +758,7 @@ static long s_lastTime = 0;
TcpSocket *TcpServer::getNewSocket ( ) {
// . if outta sd's we close least used socket first
// . if they're all in use set g_errno and return NULL
if ( m_numIncomingUsed >= *m_maxSocketsPtr )
if ( m_maxSocketsPtr && m_numIncomingUsed >= *m_maxSocketsPtr )
if ( ! closeLeastUsed () ){
// note it in the log
long now = getTimeLocal();

View File

@ -1878,15 +1878,18 @@ bool Title::copyTitle ( Words *w , Pos *pos ,
// size of character in bytes, usually 1
char cs ;
// point to last punct char
char *lastp = NULL;
char *lastp = dst;//NULL;
// convert them always for now
bool convertHtmlEntities = true;
long charCount = 0;
// copy the node @p into "dst"
for ( ; src < srcEnd ; src += cs , dst += cs ) {
// get src size
cs = getUtf8CharSize ( src );
// break if we are full!
if ( dst + cs >= dstEnd ) break;
// or hit our max char limit
if ( charCount++ >= m_maxTitleChars ) break;
// remember last punct for cutting purposes
if ( ! is_alnum_utf8 ( src ) ) lastp = dst;
// encode it as an html entity if asked to

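A stand-alone sketch of the copy loop above: step through the source one whole UTF-8 character at a time and stop once maxTitleChars characters have been copied (the real loop also remembers the last punctuation byte, lastp, so a truncated title can be cut at a word boundary). The helper below is illustrative, not the project's getUtf8CharSize():

#include <cstdio>
#include <cstring>

// size in bytes of the UTF-8 character starting at *s
static int utf8CharSize(const char *s) {
    unsigned char c = (unsigned char)*s;
    if (c < 0x80)           return 1;
    if ((c & 0xE0) == 0xC0) return 2;
    if ((c & 0xF0) == 0xE0) return 3;
    return 4;
}

// copy at most maxChars characters of src into dst (capacity dstSize bytes)
static long copyTitle(char *dst, long dstSize, const char *src, long maxChars) {
    char *d = dst, *dend = dst + dstSize - 1;
    long charCount = 0;
    while (*src) {
        int cs = utf8CharSize(src);
        if (d + cs > dend) break;            // out of room in dst
        if (charCount++ >= maxChars) break;  // hit the character budget
        memcpy(d, src, cs);
        d += cs;
        src += cs;
    }
    *d = '\0';
    return d - dst;
}

int main() {
    char out[64];
    copyTitle(out, sizeof(out), "a long page title", 6);
    printf("%s\n", out);   // prints: a long
    return 0;
}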
View File

@ -890,9 +890,10 @@ char *XmlDoc::getTestDir ( ) {
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
// then return "test-spider" otherwise...
if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
return "test-spider";
return "qa";//"test-spider";
// ... default to "test-parser"
return "test-parser";
//return "test-parser";
return "qa";
/*
if ( getIsPageParser() )
return "test-page-parser";
@ -1969,6 +1970,8 @@ bool XmlDoc::injectDoc ( char *url ,
SpiderRequest sreq;
sreq.setFromInject ( cleanUrl );
if ( deleteUrl )
sreq.m_forceDelete = 1;
//static char s_dummy[3];
// sometimes the content is indeed NULL...
@ -2282,6 +2285,9 @@ bool XmlDoc::indexDoc ( ) {
//
////
SpiderReply *nsr = getFakeSpiderReply ( );
// this can be NULL and g_errno set to ENOCOLLREC or something
if ( ! nsr )
return true;
//SafeBuf metaList;
if ( ! m_metaList2.pushChar(RDB_SPIDERDB) )
@ -3229,6 +3235,10 @@ long *XmlDoc::getIndexCode2 ( ) {
if ( gr->getLong("deep",0) ) spamCheck = false;
// not for crawlbot
if ( cr->m_isCustomCrawl ) spamCheck = false;
// only html for now
if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false;
// turn this off for now
spamCheck = false;
// otherwise, check the weights
if ( spamCheck ) {
char *ws = getWordSpamVec();
@ -3272,17 +3282,23 @@ long *XmlDoc::getIndexCode2 ( ) {
return &m_indexCode;
}
// if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
// put this above getSpiderPriority() call otherwise we end up in
// a recursive loop with getIndexCode() and getNewSpiderReply()
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (long *)dbr;
if ( m_diffbotReplyValid && m_diffbotReplyError ) {
m_indexCode= m_diffbotReplyError;
m_indexCodeValid = true;
return &m_indexCode;
}
// . if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
// put this above getSpiderPriority() call otherwise we end up in
// a recursive loop with getIndexCode() and getNewSpiderReply()
// . NO, don't do this anymore, however, if there is a diffbot
// reply error then record it in the spider reply BUT only if it is
// a diffbot reply error that warrants a retry. for instance,
// EDIFFBOTCOULDNOTDOWNLOAD happens when diffbot got a 404 or 500
// error trying to download the page so it probably should not
// retry. but EDIFFBOTREQUESTTIMEDOUT should retry.
// SafeBuf *dbr = getDiffbotReply();
// if ( ! dbr || dbr == (void *)-1 ) return (long *)dbr;
// if ( m_diffbotReplyValid && m_diffbotReplyError ) {
// m_indexCode= m_diffbotReplyError;
// m_indexCodeValid = true;
// return &m_indexCode;
// }
// no error otherwise
m_indexCode = 0;
@ -9639,8 +9655,10 @@ Url **XmlDoc::getRedirUrl() {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
// if we followed too many then bail
if ( ++m_numRedirects >= 4 ) {
// . if we followed too many then bail
// . www.motorolamobility.com www.outlook.com ... failed when we
// had >= 4 here
if ( ++m_numRedirects >= 5 ) {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
@ -10702,6 +10720,8 @@ char *XmlDoc::getIsIndexed ( ) {
// note it
if ( ! m_calledMsg22e )
setStatus ( "checking titledb for old title rec");
else
setStatus ( "back from msg22e call");
// . consult the title rec tree!
// . "justCheckTfndb" is set to true here!
@ -13621,7 +13641,35 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
THIS->m_diffbotUrl.getBufStart(),
page
);
THIS->m_diffbotReplyError = EDIFFBOTINTERNALERROR;
// try to get the right error code
char *err = strstr(page,"\"error\":\"");
if ( err ) err += 9;
long code = EDIFFBOTUNKNOWNERROR;
if ( err && !strncmp(err,"Unable to apply rules",21))
code = EDIFFBOTUNABLETOAPPLYRULES;
// like .pdf pages get this error
if ( err && !strncmp(err,"Could not parse page",20))
code = EDIFFBOTCOULDNOTPARSE;
// if it is 404... 502, etc. any http status code
if ( err && !strncmp(err,"Could not download page",23))
code = EDIFFBOTCOULDNOTDOWNLOAD;
// custom api does not apply to the url
if ( err && !strncmp(err,"Invalid API",11))
code = EDIFFBOTINVALIDAPI;
if ( err && !strncmp(err,"Version required",16))
code = EDIFFBOTVERSIONREQ;
if ( err && !strncmp(err,"Empty content",13))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"No content received",19))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"Request timed",13))
code = EDIFFBOTREQUESTTIMEDOUT;
// error processing url
if ( err && !strncmp(err,"Error processing",16))
code = EDIFFBOTURLPROCESSERROR;
if ( err && !strncmp(err,"Your token has exp",18))
code = EDIFFBOTTOKENEXPIRED;
THIS->m_diffbotReplyError = code;
}
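The chain of prefix checks above could equally be table driven; a hedged sketch follows (the EDIFFBOT* names and message prefixes come from the code above, but the numeric values here are placeholders, not the project's real error codes):

#include <cstdio>
#include <cstring>

// placeholder values; the real codes live with the other EDIFFBOT* errors
enum { EDIFFBOTUNKNOWNERROR = 1, EDIFFBOTUNABLETOAPPLYRULES,
       EDIFFBOTCOULDNOTPARSE, EDIFFBOTCOULDNOTDOWNLOAD,
       EDIFFBOTREQUESTTIMEDOUT };

struct ErrMap { const char *prefix; long code; };
static const ErrMap s_diffbotErrs[] = {
    { "Unable to apply rules",   EDIFFBOTUNABLETOAPPLYRULES },
    { "Could not parse page",    EDIFFBOTCOULDNOTPARSE      },
    { "Could not download page", EDIFFBOTCOULDNOTDOWNLOAD   },
    { "Request timed",           EDIFFBOTREQUESTTIMEDOUT    },
};

// map the text after "error":" in a diffbot reply to an error code
static long diffbotErrCode(const char *err) {
    for (size_t i = 0; i < sizeof(s_diffbotErrs)/sizeof(s_diffbotErrs[0]); i++)
        if (!strncmp(err, s_diffbotErrs[i].prefix,
                     strlen(s_diffbotErrs[i].prefix)))
            return s_diffbotErrs[i].code;
    return EDIFFBOTUNKNOWNERROR;
}

int main() {
    printf("%ld\n", diffbotErrCode("Could not parse page (pdf)")); // 3
    return 0;
}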
// a hack for detecting if token is expired
if ( ! ttt && cr && strstr ( page , ":429}" ) ) {
@ -15183,6 +15231,7 @@ long long *XmlDoc::getDownloadEndTime ( ) {
if ( m_deleteFromIndex ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
return &m_downloadEndTime;
}
// if recycling content use its download end time
@ -15199,7 +15248,7 @@ long long *XmlDoc::getDownloadEndTime ( ) {
return &m_downloadEndTime;
}
}
// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (long long *)reply;
@ -17021,7 +17070,8 @@ char **XmlDoc::getUtf8Content ( ) {
// it should be there if trying to delete as well!
m_deleteFromIndex ) {
log("xmldoc: null utf8 content for docid-based "
"titlerec lookup which was not found");
"titlerec (d=%lli) lookup which was not found",
m_docId);
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
@ -19804,7 +19854,9 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
if ( *p & 0x01 ) del = false;
else del = true;
// must always be negative if deleteing
if ( m_deleteFromIndex && ! del ) {
// spiderdb is exempt because we add a spiderreply that is
// positive and a spiderdoc
if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
char *xx=NULL;*xx=0; }
// get the key size. a table lookup in Rdb.cpp.
long ks ;
@ -20485,7 +20537,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if we are indexing a subdoc piece of a multidoc url
// then parentUrl should return non-NULL
char *parentUrl = getDiffbotParentUrl(od->m_firstUrl.m_url);
if ( ! parentUrl ) goto skip9;
if ( ! parentUrl && od->m_contentType != CT_STATUS )
goto skip9;
// in that case we need to reindex the parent url not the
// subdoc url, so make the spider reply gen quick
//SpiderReply *newsr = od->getFakeSpiderReply();
@ -20537,12 +20590,23 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// been fulfilled!
if( ! m_zbuf.safeMemcpy (&srep, srep.getRecSize()))
return NULL;
// complain
if ( ! cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
log("build: doing query reindex but diffbot api "
"url is not set in spider controls");
// but also store a new spider request for the parent url
SpiderRequest ksr;
long long pd;
// skip if doc is a spider status "document". their docids
// often get added during a query reindex but we should ignore
// them completely.
if ( od->m_contentType == CT_STATUS )
goto returnList;
//goto returnList;
// complain
if ( cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
log("build: doing query reindex but diffbot api "
"url is not set in spider controls");
// just copy original request
memcpy ( &ksr , &m_sreq , m_sreq.getRecSize() );
// do not spider links, it's a page reindex of a multidoc url
@ -20551,6 +20615,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
ksr.m_ignoreDocUnchangedError = 1;
// no longer docid based we set it to parentUrl
ksr.m_urlIsDocId = 0;
// but consider it a manual add. this should already be set.
ksr.m_isPageReindex = 1;
// but it is not docid based, so overwrite the docid
// in ksr.m_url with the parent multidoc url. it \0 terms it.
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
@ -20558,7 +20624,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
//if ( ! od->m_firstIpValid ) { char *xx=NULL;*xx=0; }
// set the key, ksr.m_key. isDel = false
// fake docid
long long pd = g_titledb.getProbableDocId(parentUrl);
pd = g_titledb.getProbableDocId(parentUrl);
ksr.setKey ( m_sreq.m_firstIp, pd , false );
// store this
if ( ! m_zbuf.pushChar(RDB_SPIDERDB) )
@ -20566,6 +20632,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// then the request
if ( ! m_zbuf.safeMemcpy(&ksr,ksr.getRecSize() ) )
return NULL;
returnList:
// prevent cores in indexDoc()
m_indexCode = EREINDEXREDIR;
m_indexCodeValid = true;
@ -20960,7 +21027,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if recycling json objects, leave them there!
if ( *recycle ) nukeJson = false;
// you have to be a diffbot crawl to do this
if ( ! cr->m_isCustomCrawl ) nukeJson = false;
// no, not if you have the diffbot api url set... so take this out
//if ( ! cr->m_isCustomCrawl ) nukeJson = false;
// do not remove old json objects if pageparser.cpp test
// because that can not change the index, etc.
if ( getIsPageParser() ) nukeJson = false;
@ -21818,7 +21886,12 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// but don't do this if it is pagereindex. why is pagereindex
// setting the injecting flag anyway?
long needSpiderdb3 = 0;
if ( m_sreqValid && m_sreq.m_isInjecting )//&&!m_sreq.m_isPageReindex)
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_fakeFirstIp &&
! m_sreq.m_forceDelete &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject )
needSpiderdb3 = m_sreq.getRecSize() + 1;
need += needSpiderdb3;
@ -22325,11 +22398,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if we are injecting we must add the spider request
// we are injecting from so the url can be scheduled to be
// spidered again
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_fakeFirstIp &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject ) {
if ( needSpiderdb3 ) {
// note it
setStatus("adding spider request");
// checkpoint
@ -23308,6 +23377,10 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
else
m_srep.m_hadDiffbotError = false;
// if we only had an error code in the diffbot reply, record that
if ( ! m_indexCode && m_diffbotReplyError )
m_srep.m_errCode = m_diffbotReplyError;
// sanity. if being called directly from indexDoc() because of
// an error like out of memory, then we do not know if it is
// indexed or not or was indexed...
@ -25112,11 +25185,11 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// hash gbimage: for permalinks only for Images.cpp
for ( long i = 0 ; i < m_images.m_numImages ; i++ ) {
// get the node number
long nn = m_images.m_imageNodes[i];
//long nn = m_images.m_imageNodes[i];
// get the url of the image
XmlNode *xn = m_xml.getNodePtr(nn);
//XmlNode *xn = m_xml.getNodePtr(nn);
long srcLen;
char *src = xn->getFieldValue("src",&srcLen);
char *src = m_images.getImageUrl(i,&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
@ -25488,6 +25561,17 @@ SafeBuf *XmlDoc::getSpiderReplyMetaList ( SpiderReply *reply ) {
return &m_spiderReplyMetaList;
}
// we double add regular html urls in a query reindex because the
// json url adds the parent, so the parent gets added twice sometimes,
// and for some reason it is adding a spider status doc the 2nd time
// so cut that out. this is kind of a hack because i'm not sure what's
// going on. but you can set a break point here and see what's up if
// you want.
if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) {
m_spiderReplyMetaListValid = true;
return &m_spiderReplyMetaList;
}
// . fake this out so we do not core
// . hashWords3() uses it i guess
bool forcedLangId = false;
@ -28586,28 +28670,37 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
*/
// does they want a summary?
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_sum ) {
char *sum = getHighlightedSummary();
if ( ! sum || sum == (void *)-1 ) return (Msg20Reply *)sum;
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_displaySum ) {
char *hsum = getHighlightedSummary();
if ( ! hsum || hsum == (void *)-1 ) return (Msg20Reply *)hsum;
//Summary *s = getSummary();
//if ( ! s || s == (void *)-1 ) return (Msg20Reply *)s;
//long sumLen = m_finalSummaryBuf.length();
// is it size and not length?
long sumLen = 0;
long hsumLen = 0;
// seems like it can return 0x01 if none...
//if ( sum == (char *)0x01 ) sum = NULL;
// get len
if ( sum ) sumLen = gbstrlen(sum);
// must be \0 terminated
if ( sumLen > 0 && sum[sumLen] ) { char *xx=NULL;*xx=0; }
if ( hsum == (char *)0x01 ) hsum = NULL;
// get len. this is the HIGHLIGHTED summary so it is ok.
if ( hsum ) hsumLen = gbstrlen(hsum);
// must be \0 terminated. not any more, it can be a subset
// of a larger summary used for deduping
if ( hsumLen > 0 && hsum[hsumLen] ) { char *xx=NULL;*xx=0; }
// assume size is 0
long sumSize = 0;
//long sumSize = 0;
// include the \0 in size
if ( sum ) sumSize = sumLen + 1;
//if ( sum ) sumSize = sumLen + 1;
// do not get any more than "me" lines/excerpts of summary
//long max = m_req->m_numSummaryLines;
// grab stuff from it!
//reply->m_proximityScore = s->getProximityScore();
reply-> ptr_sum = sum;//s->getSummary();
reply->size_sum = sumSize;//s->getSummaryLen(max)+1;
reply-> ptr_displaySum = hsum;//s->getSummary();
reply->size_displaySum = hsumLen+1;//sumSize;//s->getSummaryLen
// this is unhighlighted for deduping, and it might be longer
// . seems like we are not using this for deduping but using
// the gigabit vector in Msg40.cpp, so take out for now
//reply-> ptr_dedupSum = s->m_summary;
//reply->size_dedupSum = s->m_summaryLen+1;
//if ( s->m_summaryLen == 0 ) reply->size_dedupSum = 0;
//reply->m_diversity = s->getDiversity();
}
@ -28675,6 +28768,15 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
}
}
// this is not documented because i don't think it will be popular
if ( m_req->m_getHeaderTag ) {
SafeBuf *htb = getHeaderTagBuf();
if ( ! htb || htb == (SafeBuf *)-1 ) return (Msg20Reply *)htb;
// it should be null terminated
reply->ptr_htag = htb->getBufStart();
reply->size_htag = htb->getLength() + 1;
}
// breathe
QUICKPOLL ( m_niceness );
@ -29674,6 +29776,38 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
return m_dbuf;
}
SafeBuf *XmlDoc::getHeaderTagBuf() {
if ( m_htbValid ) return &m_htb;
Sections *ss = getSections();
if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
// scan sections
Section *si = ss->m_rootSection;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
if ( si->m_tagId == TAG_H1 ) break;
}
// if no h1 tag then make buf empty
if ( ! si ) {
m_htb.nullTerm();
m_htbValid = true;
return &m_htb;
}
// otherwise, set it
char *a = m_words.m_words[si->m_firstWordPos];
char *b = m_words.m_words[si->m_lastWordPos] ;
b += m_words.m_wordLens[si->m_lastWordPos];
// copy it
m_htb.safeMemcpy ( a , b - a );
m_htb.nullTerm();
m_htbValid = true;
return &m_htb;
}
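For illustration only (not part of this commit), a caller-side sketch of how the new header tag could be consumed from a Msg20 reply; the function name is hypothetical and it assumes the request had m_req->m_getHeaderTag set so ptr_htag/size_htag were filled in above.
// hypothetical consumer-side sketch: print the first <h1> text
// returned in a Msg20 reply. ptr_htag is NULL-terminated when
// size_htag > 0 (see getMsg20Reply() above).
void printHeaderTag ( Msg20Reply *reply ) {
if ( ! reply->ptr_htag || reply->size_htag <= 1 ) return;
log("query: first h1 = %s", reply->ptr_htag );
}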
Title *XmlDoc::getTitle ( ) {
if ( m_titleValid ) return &m_title;
// need a buncha crap
@ -29775,6 +29909,10 @@ Summary *XmlDoc::getSummary () {
false , // doStemming
m_req->m_summaryMaxLen ,
numLines ,
// . displayLines, # lines we are displaying
// . Summary::getDisplayLen() will return the
// length of the summary to display
m_req->m_numSummaryLines ,
cr->m_summaryMaxNumCharsPerLine,
m_req->m_ratInSummary ,
getFirstUrl() ,
@ -29807,11 +29945,15 @@ char *XmlDoc::getHighlightedSummary ( ) {
// get the summary
char *sum = s->getSummary();
long sumLen = s->getSummaryLen();
//long sumLen = s->getSummaryLen();
long sumLen = s->getSummaryDisplayLen();
//sum[sumLen] = 0;
// assume no highlighting?
if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) {
m_finalSummaryBuf.safeMemcpy ( sum , sumLen + 1 );
m_finalSummaryBuf.safeMemcpy ( sum , sumLen );
m_finalSummaryBuf.nullTerm();
m_finalSummaryBufValid = true;
return m_finalSummaryBuf.getBufStart();
//char *fsum = m_finalSummaryBuf.getBufStart();

View File

@ -821,6 +821,7 @@ class XmlDoc {
Query *getQuery() ;
Matches *getMatches () ;
char *getDescriptionBuf ( char *displayMetas , long *dlen ) ;
SafeBuf *getHeaderTagBuf();
class Title *getTitle ();
class Summary *getSummary () ;
char *getHighlightedSummary ();
@ -1377,6 +1378,7 @@ class XmlDoc {
bool m_matchesValid;
bool m_dbufValid;
bool m_titleValid;
bool m_htbValid;
bool m_collnumValid;
//bool m_twidsValid;
bool m_termId32BufValid;
@ -2010,6 +2012,7 @@ class XmlDoc {
// meta description buf
long m_dbufLen;
char m_dbuf[1024];
SafeBuf m_htb;
Title m_title;
Summary m_summary;
char m_isCompromised;

View File

@ -257,6 +257,8 @@ long XmlNode::setCommentNode2 ( char *node ) {
// look for ending of ]> like for <![if gt IE 6]>
if ( node[i] !='>' ) continue;
if ( node[i-1] ==']' ) break;
// look for ending of --> like for <![endif]-->
if ( node[i-1] == '-' && node[i-2] == '-' ) break;
}
// skip i over the >, if any (could be end of doc)

6
changelog Normal file
View File

@ -0,0 +1,6 @@
gb (1.1-1) unstable; urgency=low
* Lots of bug fixes
* API updates.
-- mwells <gigablast@mail.com> Sat, 05 Jul 2014 18:38:35 -0700

View File

@ -26,6 +26,8 @@ bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
//long g_qbufNeedSave = false;
//SafeBuf g_qbuf;
bool g_recoveryMode;
#define RDFBUFFER_SIZE (1024*1024*10)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"

View File

@ -32,6 +32,11 @@ override_dh_strip:
# debian/gb.substvars and makes dpkg -i bitch about dependencies not being met
override_dh_shlibdeps:
echo "skipping dh_shlibdeps call! MDW"
# adding the line below here does not seem to make dpkg prompt to
# install netpbm, rather just bitch about it and make it harder to install
# echo "building our own gb.substvars"
# echo "misc:Depends=netpbm (>= 0.0)" > debian/gb.substvars
# echo "misc:Depends=netpbm" > debian/gb.substvars
# override_dh_shlibdeps-indep:
# echo "shit"

View File

@ -835,7 +835,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<table cellpadding=1 border=0 width=100% bgcolor=#0079ba>
<tr><td><center><b><font color=#ffffff size=+1>Building a DMOZ Based Directory</td></tr></table>
<br>
&lt;<i>Last Updated October 2013</i>&gt;
&lt;<i>Last Updated July 2014</i>&gt;
<br>
<br>
@ -849,9 +849,9 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<br> $ wget http://rdf.dmoz.org/rdf/structure.rdf.u8.gz
<br> $ gunzip structure.rdf.u8.gz</b>
<br>
<li>Execute <i>dmozparse</i> in its directory with the <i>new</i> option to generate the catdb dat files.<br> <b>$ dmozparse new</b><br>
<li>Execute <i>dmozparse</i> in its directory with the <i>new</i> option to generate the catdb dat files.<br> <b>$ ./dmozparse new</b><br>
<li>Execute the installcat script command on host 0 to distribute the catdb files to all the hosts.<br>This just does an scp/rcp from host 0 to the other hosts listed in <a href=#hosts>hosts.conf</a>.<br> <b>$ gb installcat</b><br>
<li>Execute the installcat script command on host 0 to distribute the catdb files to all the hosts.<br>This just does an scp/rcp from host 0 to the other hosts listed in <a href=#hosts>hosts.conf</a>.<br> <b>$ ./gb installcat</b><br>
<li>Make sure all spiders are stopped and inactive.<br>
@ -865,7 +865,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<li>Gigablast provides the unique ability to search the content of the pages in the DMOZ directory. But in order to search the pages in DMOZ we have to index them.
So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files, which contain all the URLs in DMOZ (excluding URLs that contain a '#' fragment).
<br><b>$ dmozparse urldump -s</b>
<br><b>$ ./dmozparse urldump -s</b>
<br><li>Now tell Gigablast to index each URL listed in each gbdmoz.urls.txt.* file. Make sure you specify the collection you are using for DMOZ; the example below uses <i>main</i>. You can use the <a href=/addurl>add url</a> page to add the gbdmoz.urls.txt.* files, or you can use curl (or wget) like:
<br>

View File

@ -129,6 +129,9 @@
//#include "Facebook.h"
//#include "Accessdb.h"
// from qa.cpp
bool qatest ( ) ;
// call this to shut everything down
bool mainShutdown ( bool urgent ) ;
//bool mainShutdown2 ( bool urgent ) ;
@ -1453,6 +1456,70 @@ int main2 ( int argc , char *argv[] ) {
g_conf.m_save = false;
//
// run our smoketests
//
if ( strcmp ( cmd, "qa" ) == 0 ) {
// let's ensure our core file can dump
struct rlimit lim;
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
if ( setrlimit(RLIMIT_CORE,&lim) )
log("qa::setrlimit: %s", mstrerror(errno) );
// 50MB
g_conf.m_maxMem = 50000000;
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("qa::hashinit failed" ); return 0; }
// init memory class after conf since it gets maxMem from Conf
if ( ! g_mem.init ( 200000000 ) ) {
log("qa::Mem init failed" ); return 0; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
g_conf.m_askRootNameservers = true;
//g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 );
//g_conf.m_dnsClientPort = 9909;
g_conf.m_dnsMaxCacheMem = 1024*10;
// hack http server port to -1 (none)
//g_conf.m_httpPort = 0;
g_conf.m_httpMaxSockets = 200;
//g_conf.m_httpMaxReadBufSize = 102*1024*1024;
g_conf.m_httpMaxSendBufSize = 16*1024;
// init the loop
if ( ! g_loop.init() ) {
log("qa::Loop init failed" ); return 0; }
// . then dns client
// . server should listen to a socket and register with g_loop
if ( ! g_dns.init(14834) ) {
log("qa::Dns client init failed" ); return 0; }
// . then webserver
// . server should listen to a socket and register with g_loop
// . use -1 for both http and https ports to mean do not
// listen on any ports. we are a client only.
if ( ! g_httpServer.init( -1 , -1 ) ) {
log("qa::HttpServer init failed" ); return 0; }
// set our new pid
g_mem.setPid();
g_threads.setPid();
g_log.setPid();
//
// begin the qa loop
//
qatest();
//
// wait for some i/o signals
//
if ( ! g_loop.runLoop() ) {
log("db: runLoop failed." );
return 1;
}
// no error, return 0
return 0;
}
// log the version
//log(LOG_INIT,"conf: Gigablast Server %s",GBVersion);
@ -5044,7 +5111,19 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
else if ( installFlag == ifk_installcat ) {
// . copy catdb files to all hosts
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
if ( h2->m_hostId == 0 ) {
sprintf(tmp,
"cp "
"content.rdf.u8 "
"structure.rdf.u8 "
"gbdmoz.structure.dat "
"gbdmoz.content.dat "
"%scatdb/",
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
continue;
}
sprintf(tmp,
"rcp "
"%scatdb/content.rdf.u8 "

488
qa.cpp
View File

@ -4,15 +4,18 @@
static long s_failures = 0;
bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
bool getUrl( char *path ,
void (* callback) (void *state, TcpSocket *sock) ,
char *post = NULL ) {
SafeBuf sb;
sb.safePrintf ( "http://%s:%li%s"
, iptoa(g_hostdb.m_myHost->m_ip)
, (long)g_hostdb.m_myHost->m_port
, (long)g_hostdb.m_myHost->m_httpPort
, path
);
Url u;
u.set ( sb.getBufStart() );
log("qa: getting %s",sb.getBufStart());
if ( ! g_httpServer.getDoc ( u.getUrl() ,
0 , // ip
0 , // offset
@ -25,7 +28,13 @@ bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
0, // proxyport
-1, // maxtextdoclen
-1, // maxotherdoclen
NULL ) ) // useragent
NULL , // useragent
"HTTP/1.0" , // protocol
true , // doPost
NULL , // cookie
NULL , // additionalHeader
NULL , // fullRequest
post ) )
return false;
// error?
log("qa: getUrl error: %s",mstrerror(g_errno));
@ -34,27 +43,90 @@ bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
bool qatest ( ) ;
void qatestWrapper ( void *state , TcpSocket *sock ) { qatest(); }
void markOut ( char *reply , char *needle ) {
// return false if blocked, true otherwise
bool addColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addcoll?c=qatest123" , qatestWrapper );
if ( ! reply ) return;
char *s = strstr ( reply , needle );
if ( ! s ) return;
for ( ; *s && ! is_digit(*s); s++ );
// find end of digit stream
//char *end = s;
//while ( ; *end && is_digit(*s); end++ );
// just bury the digit stream now; zeroing it out did not give
// a consistent LENGTH if we had 10 hits vs 9, making the hash
// different
// space out digits
for ( ; *s && is_digit(*s); s++ ) *s = ' ';
}
// hash the reply, skipping repeated spaces so marked-out digit runs compare equal
long qa_hash32 ( char *s ) {
unsigned long h = 0;
long k = 0;
for ( long i = 0 ; s[i] ; i++ ) {
// skip back-to-back spaces; only the first space in a run is hashed
if ( s[i] == ' ' && i>0 && s[i-1]==' ' ) continue;
h ^= g_hashtab [(unsigned char)k] [(unsigned char)s[i]];
k++;
}
return h;
}
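A minimal sketch (not called anywhere, added here only for illustration) of why the digits are spaced out rather than zeroed: two replies that differ only in a count hash to the same checksum after markOut(), because qa_hash32() skips the extra back-to-back space. It assumes g_hashtab has been initialized via hashinit(), as the qa command in main.cpp does; the function name is hypothetical.
// illustration: replies differing only in a marked-out count
// produce the same qa checksum.
void qaChecksumSketch ( ) {
char a[] = "<hits>9</hits>";
char b[] = "<hits>10</hits>";
markOut ( a , "<hits>" ); // becomes "<hits> </hits>"
markOut ( b , "<hits>" ); // becomes "<hits>  </hits>"
if ( qa_hash32 ( a ) != qa_hash32 ( b ) )
log("qa: checksum sketch mismatch");
}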
long s_replyCRC = 0;
TcpSocket *s_sock = NULL;
void qatestWrapper ( void *state , TcpSocket *sock ) {
log("qa: got reply(%li)=%s",sock->m_readOffset,sock->m_readBuf);
// get mime
HttpMime mime;
mime.set ( sock->m_readBuf , sock->m_readOffset , NULL );
// only hash content since mime has a timestamp in it
char *content = mime.getContent();
long contentLen = mime.getContentLen();
if ( content[contentLen] ) { char *xx=NULL;*xx=0; }
char *reply = sock->m_readBuf;
// take out <currentTimeUTC> and <responseTimeMS>
markOut ( reply , "<currentTimeUTC>");
markOut ( reply , "<responseTimeMS>");
// until i figure this one out, take it out
markOut ( reply , "<docsInCollection>");
// until i figure this one out, take it out
markOut ( reply , "<hits>");
// make checksum. we ignore back to back spaces so this
// hash works for <docsInCollection>10 vs <docsInCollection>9
s_replyCRC = qa_hash32 ( content );
// this too is used for recording the reply into a file on disk
s_sock = sock;
// continue qa loop
qatest();
}
// first inject a set list of urls
static char **s_urlPtrs = NULL;
static long s_numUrls = 0;
static char **s_contentPtrs = NULL;
static SafeBuf s_ubuf1;
static SafeBuf s_ubuf2;
static SafeBuf s_cbuf2;
bool loadUrls ( ) {
static bool s_loaded = false;
if ( s_loaded ) return true;
s_loaded = true;
// use injectme3 file
s_ubuf1.load("./injectme3");
// scan for +++URL: xxxxx
@ -62,6 +134,8 @@ bool loadUrls ( ) {
for ( ; *s ; s++ ) {
if ( strncmp(s,"+++URL: ",8) ) continue;
// got one
// \0 term it for s_contentPtrs below
*s = '\0';
// find end of it
s += 8;
char *e = s;
@ -72,27 +146,16 @@ bool loadUrls ( ) {
s_ubuf2.pushLong((long)s);
// skip past that
s = e;
// point to content
s_cbuf2.pushLong((long)(s+1));
}
// make array of url ptrs
s_urlPtrs = (char **)s_ubuf2.getBufStart();
s_contentPtrs= (char **)s_cbuf2.getBufStart();
return true;
}
bool injectUrls ( ) {
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_numUrls ; ) {
// pre-inc it
s_ii++;
// inject using html api
SafeBuf sb;
sb.safePrintf("/admin/inject?c=qatest123&delete=0&u=");
sb.urlEncode ( s_urlPtrs[s_ii] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
/*
static char *s_queries[] = {
"the",
"+the",
@ -106,116 +169,7 @@ static char *s_queries[] = {
"cat -dog",
"site:wisc.edu"
};
static long s_checksums[] = {
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
};
static long s_qi1 = 0;
void doneSearching1 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi1 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}
// ensure search results are consistent
bool searchTest1 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi1 < nq ; ) {
// pre-inc it
s_qi1++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
sb.urlEncode ( s_queries[s_qi1] );
return getUrl ( sb.getBufStart() , doneSearching1 );
}
return true;
}
static long s_qi2 = 0;
void doneSearching2 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi2 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}
// ensure search results are consistent
bool searchTest2 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi2 < nq ; ) {
// pre-inc it
s_qi2++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
sb.urlEncode ( s_queries[s_qi2] );
return getUrl ( sb.getBufStart() , doneSearching2 );
}
return true;
}
bool deleteUrls ( ) {
static long s_ii2 = 0;
for ( ; s_ii2 < s_numUrls ; ) {
// pre-inc it
s_ii2++;
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
sb.urlEncode ( s_urlPtrs[s_ii2] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
*/
#include "Msg0.h"
static Msg0 s_msg0;
@ -371,67 +325,238 @@ bool checkSpidersDone ( ) {
return false;
}
bool delColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/delcoll?c=qatest123" , qatestWrapper );
//static long s_phase = -1;
void checkCRC ( long needCRC ) {
// and our current reply
SafeBuf fb2;
fb2.safeMemcpy(s_sock->m_readBuf,s_sock->m_readOffset);
fb2.nullTerm();
if ( s_replyCRC == needCRC ) {
// save reply if good
char fn3[1024];
sprintf(fn3,"%sqa/reply.%li",g_hostdb.m_dir,needCRC);
File ff; ff.set ( fn3 );
if ( ff.doesExist() ) return;
// if not there yet then save it
fb2.save(fn3);
return;
}
const char *emsg = "qa: bad replyCRC of %li should be %li "
"\n";//"phase=%li\n";
fprintf(stderr,emsg,s_replyCRC,needCRC);//,s_phase-1);
// get response on file
SafeBuf fb1;
char fn1[1024];
sprintf(fn1,"%sqa/reply.%li",g_hostdb.m_dir,needCRC);
fb1.load(fn1);
fb1.nullTerm();
// break up into lines
char fn2[1024];
sprintf(fn2,"/tmp/reply.%li",s_replyCRC);
fb2.save ( fn2 );
// do the diff between the two replies so we can see what changed
char cmd[1024];
sprintf(cmd,"diff %s %s",fn1,fn2);
fprintf(stderr,"%s\n",cmd);
system(cmd);
// if this is zero allow it to slide by. it is learning mode i guess.
// so we can learn what crc we need to use.
if ( needCRC == 0 ) return;
// otherwise, stop right there for debugging
exit(1);
}
#undef usleep
static long s_rdbId1 = 0;
static long s_rdbId2 = 0;
//static long s_rdbId3 = 0;
//
// the injection qa test suite
//
bool qainject () {
// . run a series of tests to ensure that gb is functioning properly
// . use s_urls[] array of urls for injecting and spider seeding
// . keep an archive copy of all webpages in the injectme3 file and
// in pagearchive1.txt file
// . while initially spidering store pages in pagearchive1.txt so we can
// replay later. store up to 100,000 pages in there.
bool qatest ( ) {
static bool s_x1 = false;
if ( ! s_x1 ) {
s_x1 = true;
return getUrl ( "/admin/delcoll?delcoll=qatest123" ,
qatestWrapper );
}
//
// add the 'qatest123' collection
if ( ! addColl () ) return false;
//
static bool s_x2 = false;
if ( ! s_x2 ) {
s_x2 = true;
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
qatestWrapper ) )
return false;
}
//
// check addcoll reply
//
static bool s_x3 = false;
if ( ! s_x3 ) {
s_x3 = true;
checkCRC ( 238170006 );
}
//
// inject urls, return false if not done yet
if ( ! injectUrls ( ) ) return false;
//
static bool s_x4 = false;
if ( ! s_x4 ) {
// TODO: try delimiter based injection too
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_ubuf2.length()/(long)sizeof(char *) ; ) {
// inject using html api
SafeBuf sb;
sb.safePrintf("&c=qatest123&deleteurl=0&"
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_ii] );
// the content
sb.safePrintf("&hasmime=1");
sb.safePrintf("&content=");
sb.urlEncode(s_contentPtrs[s_ii] );
sb.nullTerm();
// pre-inc it in case getUrl() blocks
s_ii++;
getUrl("/admin/inject",qatestWrapper,sb.getBufStart());
return false;
}
s_x4 = true;
}
// +the
static bool s_x5 = false;
if ( ! s_x5 ) {
usleep(500000);
s_x5 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
qatestWrapper );
return false;
}
static bool s_x6 = false;
if ( ! s_x6 ) { s_x6 = true ; checkCRC ( -1452050577 ); }
// sports news
static bool s_x7 = false;
if ( ! s_x7 ) {
s_x7 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports+news",
qatestWrapper );
return false;
}
static bool s_x8 = false;
if ( ! s_x8 ) { s_x8 = true; checkCRC ( -1586622518 ); }
//
// eject/delete the urls
//
static long s_ii2 = 0;
for ( ; s_ii2 < s_ubuf2.length()/(long)sizeof(char *) ; ) {
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_ii2] );
sb.nullTerm();
// pre-inc it in case getUrl() blocks
s_ii2++;
getUrl ( sb.getBufStart() , qatestWrapper );
return false;
}
//
// make sure no results left, +the
//
static bool s_x9 = false;
if ( ! s_x9 ) {
usleep(500000);
s_x9 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
qatestWrapper );
return false;
}
// seems to have <docsInCollection>2</>
static bool s_y1 = false;
if ( ! s_y1 ) { s_y1 = true; checkCRC ( -1672870556 ); }
//
// try delimiter based injecting
//
static bool s_y2 = false;
if ( ! s_y2 ) {
s_y2 = true;
SafeBuf sb;
// delim=+++URL:
sb.safePrintf("&c=qatest123&deleteurl=0&"
"delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&"
"hasmime=1&content=");
// use injectme3 file
SafeBuf ubuf;
ubuf.load("./injectme3");
sb.urlEncode(ubuf.getBufStart());
getUrl ( "/admin/inject",qatestWrapper,sb.getBufStart());
return false;
}
// check the reply, seems to have only a single docid in it...
static bool s_y3 = false;
if ( ! s_y3 ) { s_y3 = true; checkCRC ( -1970198487 ); }
// now query check
static bool s_y4 = false;
if ( ! s_y4 ) {
usleep(500000);
s_y4 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
qatestWrapper );
return false;
}
// check search results crc
static bool s_y5 = false;
if ( ! s_y5 ) { s_y5 = true; checkCRC ( -480078278 ); }
// test search results
if ( ! searchTest1 () ) return false;
// delete all urls cleanly now
if ( ! deleteUrls ( ) ) return false;
// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId1 ) ) return false;
//if ( ! checkRdbLists ( &s_rdbId1 ) ) return false;
// dump, tight merge and ensure no data in our rdbs for this coll
if ( ! dumpTreesToDisk() ) return false;
//if ( ! dumpTreesToDisk() ) return false;
// wait for tight merge to complete
if ( ! waitForMergeToFinish() ) return false;
//if ( ! waitForMergeToFinish() ) return false;
// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId2 ) ) return false;
//if ( ! checkRdbLists ( &s_rdbId2 ) ) return false;
// reset the collection so we can test spidering
if ( ! resetColl ( ) ) return false;
//if ( ! resetColl ( ) ) return false;
// add urls to seed spider with. make msg13.cpp recognize qatest123
// collection and return 404 on urls not in our official list so
// we can ensure search result consistency. msg13.cpp will initially
// store the pages in a file, like the first 1,000 or so pages.
if ( ! addUrlTest () ) return false;
//if ( ! addUrlTest () ) return false;
// wait for spidering to complete. sleep callback. # of spidered urls
// will be x, so we know when to stop
if ( ! checkSpidersDone() ) return false;
// . now search again on the large collection most likely
// . store search queries and checksum into queries2.txt
// . a 0 (or no) checksum means we should fill it in
if ( ! searchTest2 () ) return false;
//if ( ! checkSpidersDone() ) return false;
// try a query delete
//if ( ! queryDeleteTest() ) return false;
@ -440,7 +565,30 @@ bool qatest ( ) {
//if ( ! checkRdbLists ( &s_rdbId3 ) ) return false;
// delete the collection
if ( ! delColl() ) return false;
static bool s_fee = false;
if ( ! s_fee ) {
s_fee = true;
return getUrl ( "/admin/delcoll?delcoll=qatest123" ,
qatestWrapper );
}
static bool s_fee2 = false;
if ( ! s_fee2 ) {
s_fee2 = true;
fprintf(stderr,"\n\n\nSUCCESSFULLY COMPLETED QA TEST\n\n\n");
exit(0);
}
return true;
}
// . run a series of tests to ensure that gb is functioning properly
// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
// ensure consistency between tests for exact replays
bool qatest ( ) {
return qainject();
}