open-source-search-engine/PageCrawlBot.cpp
Matt Wells cd6069e5a6 send single space to socket if not streaming
and search results still not ready after 10 seconds.
send it every 10 seconds to prevent client from closing socket.
sped up all downloads, json and csv, by not doing "fuzzy"
deduping of search results, just deduping on page
content hash. added TcpSocket::m_numDestroys to ensure we
do not send heartbeat on a socket that was closed and
re-opened for another client.
2014-02-13 08:45:13 -08:00


// diffbot api implementation
//
// WHAT APIs are here?
//
// . 1. the CrawlBot API to start a crawl
// . 2. To directly process a provided URL (injection)
// . 3. the Cache API so phantomjs can quickly check the cache for files
// and quickly add files to the cache.
//
// Related pages:
//
// * http://diffbot.com/dev/docs/ (Crawlbot API tab, and others)
// * http://diffbot.com/dev/crawl/
#include "PageCrawlBot.h"
#include "TcpServer.h"
#include "HttpRequest.h"
#include "HttpServer.h"
#include "Pages.h" // g_msg
#include "XmlDoc.h" // for checkRegex()
#include "PageInject.h" // Msg7
//#include "Json.h"
#include "Parms.h"
// so user can specify the format of the reply/output
#define FMT_HTML 1
#define FMT_XML 2
#define FMT_JSON 3
#define FMT_CSV 4
#define FMT_TXT 5
void doneSendingWrapper ( void *state , TcpSocket *sock ) ;
bool sendBackDump ( TcpSocket *s,HttpRequest *hr );
CollectionRec *addNewDiffbotColl ( char *addColl , char *token,char *name ,
class HttpRequest *hr ) ;
bool resetUrlFilters ( CollectionRec *cr ) ;
bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
HttpRequest *hr ,
CollectionRec *cr ) ;
////////////////
//
// SUPPORT FOR DOWNLOADING an RDB DUMP
//
// We ask each shard for a chunk of Spiderdb records (m_minRecSizes bytes,
// currently ~100KB; see sendBackDump2()). If a full chunk was returned
// then we repeat. Every time we get a chunk from each shard we print the
// Spiderdb records out into "safebuf" and transmit it to the user. Once
// the buffer has been transmitted we ask the shards for another chunk's
// worth of spider records.
//
////////////////
// use this as a state while dumping out spiderdb for a collection
class StateCD {
public:
StateCD () { m_needsMime = true; };
void sendBackDump2 ( ) ;
bool readDataFromRdb ( ) ;
bool sendList ( ) ;
void printSpiderdbList ( RdbList *list , SafeBuf *sb ,
char **lastKeyPtr ) ;
void printTitledbList ( RdbList *list , SafeBuf *sb ,
char **lastKeyPtr );
bool printJsonItemInCsv ( char *json , SafeBuf *sb ) ;
long long m_lastUh48;
long m_lastFirstIp;
long long m_prevReplyUh48;
long m_prevReplyFirstIp;
long m_prevReplyError;
time_t m_prevReplyDownloadTime;
char m_fmt;
Msg4 m_msg4;
HttpRequest m_hr;
Msg7 m_msg7;
long m_dumpRound;
long long m_accumulated;
WaitEntry m_waitEntry;
bool m_isFirstTime;
bool m_printedFirstBracket;
bool m_printedEndingBracket;
bool m_printedItem;
bool m_needHeaderRow;
SafeBuf m_seedBank;
SafeBuf m_listBuf;
bool m_needsMime;
char m_rdbId;
bool m_downloadJSON;
collnum_t m_collnum;
long m_numRequests;
long m_numReplies;
long m_minRecSizes;
bool m_someoneNeedsMore;
TcpSocket *m_socket;
Msg0 m_msg0s[MAX_HOSTS];
key128_t m_spiderdbStartKeys[MAX_HOSTS];
key_t m_titledbStartKeys[MAX_HOSTS];
RdbList m_lists[MAX_HOSTS];
bool m_needMore[MAX_HOSTS];
};
// . basically dump out spiderdb
// . returns urls, pages or data in the requested format in reply to a
// "GET /api/download/%s_data.json"
// "GET /api/download/%s_data.csv"
// "GET /api/download/%s_urls.csv"
// "GET /api/download/%s_urls.txt"
// "GET /api/download/%s_pages.txt"
// where %s is the collection name
// . the ordering of the urls is not specified so whatever order they are
// in spiderdb will do
// . the gui that lists the urls as they are spidered in real time when you
// do a test crawl will just have to call this repeatedly. it shouldn't
// be too slow because of disk caching, and, most likely, the spider requests
// will all be in spiderdb's rdbtree anyhow
// . because we are distributed we have to send a msg0 request to each
// shard/group asking for all the spider urls. dan says 30MB is typical
// for a csv file, so for now we will just try to do a single spiderdb
// request.
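// . illustrative requests (the host:port and the collection name
// "token-mycrawl" are hypothetical; a collection is named <token>-<name>):
// curl "http://<host>:<port>/api/download/token-mycrawl_urls.csv"
// curl "http://<host>:<port>/api/download/token-mycrawl_data.json"
// curl "http://<host>:<port>/api/download/token-mycrawl_pages.txt"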
bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
char *path = hr->getPath();
long pathLen = hr->getPathLen();
char *pathEnd = path + pathLen;
char *str = strstr ( path , "/download/" );
if ( ! str ) {
char *msg = "bad download request";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
// when downloading csv the socket can close because we can take minutes
// before we send over the first byte, so try to keep it open
//int parm = 1;
//if(setsockopt(sock->m_sd,SOL_TCP,SO_KEEPALIVE,&parm,sizeof(int))<0){
// log("crawlbot: setsockopt: %s",mstrerror(errno));
// errno = 0;
//}
//long pathLen = hr->getPathLen();
char rdbId = RDB_NONE;
bool downloadJSON = false;
long fmt;
char *xx;
if ( ( xx = strstr ( path , "_data.json" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_TXT;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
}
// sanity, must be one of the download calls above
if ( rdbId == RDB_NONE ) {
char *msg ;
msg = "usage: downloadurls, downloadpages, downloaddata";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
char *coll = str + 10;
if ( coll >= pathEnd ) {
char *msg = "bad download request2";
log("crawlbot: %s",msg);
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
// get coll
char *collEnd = xx;
//CollectionRec *cr = getCollRecFromHttpRequest ( hr );
CollectionRec *cr = g_collectiondb.getRec ( coll , collEnd - coll );
if ( ! cr ) {
char *msg = "token or id (crawlid) invalid";
log("crawlbot: invalid token or crawlid to dump");
g_httpServer.sendErrorReply(sock,500,msg);
return true;
}
// . if doing download of csv, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&"
// dedup. since pss=0 below this will
// dedup on page content hash only
// which is super fast.
"dr=1&"
"c=%s&n=1000000&"
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
"ns=0&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
"\r\n\r\n"
, cr->m_coll
);
HttpRequest hr2;
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
return sendPageResults ( sock , &hr2 );
}
// . if doing download of json, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_JSON && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&"
// dedup. since stream=1 and pss=0 below
// this will dedup on page content hash only
// which is super fast.
"dr=1&"
"c=%s&n=1000000&"
// we can stream this because unlike csv it
// has no header row that needs to be
// computed from all results.
"stream=1&"
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
"ns=0&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
"\r\n\r\n"
, cr->m_coll
);
HttpRequest hr2;
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
return sendPageResults ( sock , &hr2 );
}
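// for reference, the request built above for a hypothetical collection
// named "token-mycrawl" expands to:
// GET /search.csv?icc=1&format=json&sc=0&dr=1&c=token-mycrawl&n=1000000&stream=1&pss=0&dsrt=0&ns=0&q=gbsortby%3Agbspiderdate&prepend=type%3Ajson
// the csv variant above is identical except it uses format=csv and no stream=1.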
//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )
// rdbId = RDB_SPIDERDB;
//if ( strncmp ( path ,"/crawlbot/downloadpages",23 ) == 0 )
// rdbId = RDB_TITLEDB;
//if ( strncmp ( path ,"/crawlbot/downloaddata",22 ) == 0 ) {
// downloadJSON = true;
// rdbId = RDB_TITLEDB;
//}
StateCD *st;
try { st = new (StateCD); }
catch ( ... ) {
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
}
mnew ( st , sizeof(StateCD), "statecd");
// initialize the new state
st->m_rdbId = rdbId;
st->m_downloadJSON = downloadJSON;
st->m_socket = sock;
// the name of the collections whose spiderdb we read from
st->m_collnum = cr->m_collnum;
st->m_fmt = fmt;
st->m_isFirstTime = true;
st->m_printedFirstBracket = false;
st->m_printedItem = false;
st->m_printedEndingBracket = false;
// for csv...
st->m_needHeaderRow = true;
st->m_lastUh48 = 0LL;
st->m_lastFirstIp = 0;
st->m_prevReplyUh48 = 0LL;
st->m_prevReplyFirstIp = 0;
st->m_prevReplyError = 0;
st->m_prevReplyDownloadTime = 0LL;
st->m_dumpRound = 0;
st->m_accumulated = 0LL;
// debug
//log("mnew1: st=%lx",(long)st);
// begin the possible segmented process of sending back spiderdb
// to the user's browser
st->sendBackDump2();
// i don't think this return value matters at all since httpserver.cpp
// does not look at it when it calls sendReply()
return true;
}
// . all wrappers call this
// . returns false if would block, true otherwise
bool readAndSendLoop ( StateCD *st , bool readFirst ) {
subloop:
// if we had a broken pipe on the sendChunk() call then hopefully
// this will kick in...
if ( g_errno ) {
log("crawlbot: readAndSendLoop: %s",mstrerror(g_errno));
readFirst = true;
st->m_someoneNeedsMore = false;
}
// wait if some are outstanding. how can this happen?
if ( st->m_numRequests > st->m_numReplies ) {
log("crawlbot: only got %li of %li replies. waiting for "
"all to come back in.",
st->m_numReplies,st->m_numRequests);
return false;
}
// are we all done? we still have to call sendList() to
// set socket's streamingMode to false to close things up
if ( readFirst && ! st->m_someoneNeedsMore ) {
log("crawlbot: done sending for download request");
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
return true;
}
// begin reading from each shard and sending the spiderdb records
// over the network. return if that blocked
if ( readFirst && ! st->readDataFromRdb ( ) ) return false;
// send it to the browser socket. returns false if blocks.
if ( ! st->sendList() ) return false;
// read again i guess
readFirst = true;
// hey, it did not block... tcpserver caches writes...
goto subloop;
}
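// rough control flow for one download, derived from the functions in this
// file (illustrative):
//
// sendBackDump() -> StateCD::sendBackDump2() -> readAndSendLoop()
// -> StateCD::readDataFromRdb() (one Msg0 per shard)
// -> gotListWrapper7() (once every shard has replied)
// -> StateCD::sendList() (print lists, tcp->sendChunk())
// -> doneSendingWrapper() (chunk flushed to the browser socket)
// -> readAndSendLoop() again until no shard needs more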
void StateCD::sendBackDump2 ( ) {
m_numRequests = 0;
m_numReplies = 0;
// read 10MB from each shard's spiderdb at a time
//m_minRecSizes = 9999999;
// 100k to be more fluid
m_minRecSizes = 99999;
// we stop reading from all shards when this becomes false
m_someoneNeedsMore = true;
// initialize the spiderdb startkey "cursor" for each shard's spiderdb
for ( long i = 0 ; i < g_hostdb.m_numShards ; i++ ) {
m_needMore[i] = true;
KEYMIN((char *)&m_spiderdbStartKeys[i],sizeof(key128_t));
KEYMIN((char *)&m_titledbStartKeys[i],sizeof(key_t));
}
// begin reading from the shards and transmitting back on m_socket
readAndSendLoop ( this , true );
}
static void gotListWrapper7 ( void *state ) {
// get the Crawler dump State
StateCD *st = (StateCD *)state;
// inc it up here
st->m_numReplies++;
// wait for all
if ( st->m_numReplies < st->m_numRequests ) return;
// read and send loop
readAndSendLoop( st , false );
}
bool StateCD::readDataFromRdb ( ) {
// set end key to max key. we are limiting using m_minRecSizes for this
key128_t ek; KEYMAX((char *)&ek,sizeof(key128_t));
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
// top:
// launch one request to each shard
for ( long i = 0 ; i < g_hostdb.m_numShards ; i++ ) {
// reset each one
m_lists[i].freeList();
// if last list was exhausted don't bother
if ( ! m_needMore[i] ) continue;
// count it
m_numRequests++;
// this is the least nice. crawls will yield to it mostly.
long niceness = 0;
// point to right startkey
char *sk ;
if ( m_rdbId == RDB_SPIDERDB )
sk = (char *)&m_spiderdbStartKeys[i];
else
sk = (char *)&m_titledbStartKeys[i];
// get host
Host *h = g_hostdb.getLiveHostInShard(i);
// show it
long ks = getKeySizeFromRdbId(m_rdbId);
log("dump: asking host #%li for list sk=%s",
h->m_hostId,KEYSTR(sk,ks));
// msg0 uses multicast in case one of the hosts in a shard is
// dead or dies during this call.
if ( ! m_msg0s[i].getList ( h->m_hostId , // use multicast
h->m_ip,
h->m_port,
0, // maxcacheage
false, // addtocache?
m_rdbId,
cr->m_coll,
&m_lists[i],
sk,
(char *)&ek,
// get at most about
// "minRecSizes" worth of spiderdb
// records
m_minRecSizes,
this,
gotListWrapper7 ,
niceness ) ) {
log("crawlbot: blocked getting list from shard");
// continue if it blocked
continue;
}
log("crawlbot: did not block getting list from shard err=%s",
mstrerror(g_errno));
// we got a reply back right away...
m_numReplies++;
}
// all done? return if still waiting on more msg0s to get their data
if ( m_numReplies < m_numRequests ) return false;
// i guess did not block, empty single shard? no, must have been
// error because sendList() would have sent back on the tcp
// socket and blocked and returned false if not error sending
return true;
}
bool StateCD::sendList ( ) {
// get the Crawler dump State
// inc it
//m_numReplies++;
// show it
log("crawlbot: got list from shard. req=%li rep=%li",
m_numRequests,m_numReplies);
// return if still awaiting more replies
if ( m_numReplies < m_numRequests ) return false;
SafeBuf sb;
//sb.setLabel("dbotdmp");
char *ct = "text/csv";
if ( m_fmt == FMT_JSON )
ct = "application/json";
if ( m_fmt == FMT_XML )
ct = "text/xml";
if ( m_fmt == FMT_TXT )
ct = "text/plain";
if ( m_fmt == FMT_CSV )
ct = "text/csv";
// . if we haven't yet sent an http mime back to the user
// then do so here, the content-length will not be in there
// because we might have to call for more spiderdb data
if ( m_needsMime ) {
m_needsMime = false;
HttpMime mime;
mime.makeMime ( -1, // total content-length is unknown!
0 , // do not cache (cacheTime)
0 , // lastModified
0 , // offset
-1 , // bytesToSend
NULL , // ext
false, // POSTReply
ct, // "text/csv", // contenttype
"utf-8" , // charset
-1 , // httpstatus
NULL ); //cookie
sb.safeMemcpy(mime.getMime(),mime.getMimeLen() );
}
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! m_printedFirstBracket && m_fmt == FMT_JSON ) {
sb.safePrintf("[\n");
m_printedFirstBracket = true;
}
// these are csv files not xls
//if ( ! m_printedFirstBracket && m_fmt == FMT_CSV ) {
// sb.safePrintf("sep=,\n");
// m_printedFirstBracket = true;
//}
// we set this to true below if any one shard has more spiderdb
// records left to read
m_someoneNeedsMore = false;
//
// got all replies... create the HTTP reply and send it back
//
for ( long i = 0 ; i < g_hostdb.m_numShards ; i++ ) {
if ( ! m_needMore[i] ) continue;
// get the list from that group
RdbList *list = &m_lists[i];
// should we try to read more?
m_needMore[i] = false;
// report it
log("dump: got list of %li bytes from host #%li round #%li",
list->getListSize(),i,m_dumpRound);
if ( list->isEmpty() ) {
list->freeList();
continue;
}
// get the format
//char *format = cr->m_diffbotFormat.getBufStart();
//if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
//char *format = NULL;
// this cores because msg0 does not transmit lastkey
//char *ek = list->getLastKey();
char *lastKeyPtr = NULL;
// now print the spiderdb list out into "sb"
if ( m_rdbId == RDB_SPIDERDB ) {
// print SPIDERDB list into "sb"
printSpiderdbList ( list , &sb , &lastKeyPtr );
// update spiderdb startkey for this shard
KEYSET((char *)&m_spiderdbStartKeys[i],lastKeyPtr,
sizeof(key128_t));
// advance by 1
m_spiderdbStartKeys[i] += 1;
}
else if ( m_rdbId == RDB_TITLEDB ) {
// print TITLEDB list into "sb"
printTitledbList ( list , &sb , &lastKeyPtr );
// update titledb startkey for this shard
KEYSET((char *)&m_titledbStartKeys[i],lastKeyPtr,
sizeof(key_t));
// advance by 1
m_titledbStartKeys[i] += 1;
}
else { char *xx=NULL;*xx=0; }
// figure out why we do not get the full list????
//if ( list->m_listSize >= 0 ) { // m_minRecSizes ) {
m_needMore[i] = true;
m_someoneNeedsMore = true;
//}
// save mem
list->freeList();
}
m_dumpRound++;
//log("rdbid=%li fmt=%li some=%li printed=%li",
// (long)m_rdbId,(long)m_fmt,(long)m_someoneNeedsMore,
// (long)m_printedEndingBracket);
m_socket->m_streamingMode = true;
// if nobody needs to read more...
if ( ! m_someoneNeedsMore && ! m_printedEndingBracket ) {
// use this for printing out urls.csv as well...
m_printedEndingBracket = true;
// end array of json objects. might be empty!
if ( m_rdbId == RDB_TITLEDB && m_fmt == FMT_JSON )
sb.safePrintf("\n]\n");
//log("adding ]. len=%li",sb.length());
// i'd like to exit streaming mode here. i fixed tcpserver.cpp
// so if we are called from makecallback() there it won't
// call destroysocket if we WERE in streamingMode just yet
m_socket->m_streamingMode = false;
}
TcpServer *tcp = &g_httpServer.m_tcp;
// . transmit the chunk in sb
// . steals the allocated buffer from sb and stores in the
// TcpSocket::m_sendBuf, which it frees when socket is
// ultimately destroyed or we call sendChunk() again.
// . when TcpServer is done transmitting, it does not close the
// socket but rather calls doneSendingWrapper() which can call
// this function again to send another chunk
if ( ! tcp->sendChunk ( m_socket ,
&sb ,
this ,
doneSendingWrapper ) )
return false;
// we are done sending this chunk, i guess tcp write was cached
// in the network card buffer or something
return true;
}
// TcpServer.cpp calls this when done sending TcpSocket's m_sendBuf
void doneSendingWrapper ( void *state , TcpSocket *sock ) {
StateCD *st = (StateCD *)state;
// error on socket?
//if ( g_errno ) st->m_socketError = g_errno;
//TcpSocket *socket = st->m_socket;
st->m_accumulated += sock->m_totalSent;
log("crawlbot: done sending on socket %li/%li [%lli] bytes",
sock->m_totalSent,
sock->m_sendBufUsed,
st->m_accumulated);
readAndSendLoop ( st , true );
return;
}
void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// declare these up here
SpiderRequest *sreq = NULL;
SpiderReply *srep = NULL;
long badCount = 0;
long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
long lastSpidered = 0;
// parse through it
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// this record is either a SpiderRequest or SpiderReply
char *rec = list->getCurrentRec();
// save it
*lastKeyPtr = rec;
// we encounter the spiderreplies first then the
// spiderrequests for the same url
if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
srep = (SpiderReply *)rec;
if ( sreq ) lastSpidered = 0;
sreq = NULL;
if ( lastSpidered == 0 )
lastSpidered = srep->m_spideredTime;
else if ( srep->m_spideredTime > lastSpidered )
lastSpidered = srep->m_spideredTime;
m_prevReplyUh48 = srep->getUrlHash48();
m_prevReplyFirstIp = srep->m_firstIp;
// 0 means indexed successfully. not sure if
// this includes http status codes like 404 etc.
// i don't think it includes those types of errors!
m_prevReplyError = srep->m_errCode;
m_prevReplyDownloadTime = srep->m_spideredTime;
continue;
}
// ok, we got a spider request
sreq = (SpiderRequest *)rec;
// sanity check
if ( srep && srep->getUrlHash48() != sreq->getUrlHash48()){
badCount++;
//log("diffbot: had a spider reply with no "
// "corresponding spider request for uh48=%lli"
// , srep->getUrlHash48());
//char *xx=NULL;*xx=0;
}
// print the url if not yet printed
long long uh48 = sreq->getUrlHash48 ();
long firstIp = sreq->m_firstIp;
bool printIt = false;
// there can be multiple spiderrequests for the same url!
if ( m_lastUh48 != uh48 ) printIt = true;
// sometimes the same url has different firstips now that
// we have the EFAKEFIRSTIP spider error to avoid spidering
// seeds twice...
if ( m_lastFirstIp != firstIp ) printIt = true;
if ( ! printIt ) continue;
m_lastUh48 = uh48;
m_lastFirstIp = firstIp;
// make sure spiderreply is for the same url!
if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() )
srep = NULL;
if ( ! srep )
lastSpidered = 0;
bool isProcessed = false;
if ( srep ) isProcessed = srep->m_sentToDiffbot;
if ( srep && srep->m_hadDiffbotError )
isProcessed = false;
// debug point
//if ( strstr(sreq->m_url,"chief") )
// log("hey");
// 1 means spidered, 0 means not spidered, -1 means error
long status = 1;
// if unspidered, then we don't match the prev reply
// so set "status" to 0 to indicate hasn't been
// downloaded yet.
if ( m_lastUh48 != m_prevReplyUh48 ) status = 0;
if ( m_lastFirstIp != m_prevReplyFirstIp ) status = 0;
// if it matches, perhaps an error spidering it?
if ( status && m_prevReplyError ) status = -1;
// use the time it was added to spiderdb if the url
// was not spidered
time_t time = sreq->m_addedTime;
// if it was spidered, successfully or got an error,
// then use the time it was spidered
if ( status ) time = m_prevReplyDownloadTime;
char *msg = "Successfully Downloaded";//Crawled";
if ( status == 0 ) msg = "Not downloaded";//Unexamined";
if ( status == -1 ) msg = mstrerror(m_prevReplyError);
if ( srep && srep->m_hadDiffbotError )
msg = "Diffbot processing error";
// matching url filter, print out the expression
long ufn ;
ufn = ::getUrlFilterNum(sreq,
srep,
nowGlobalMS,
false,
MAX_NICENESS,
cr,
false, // isoutlink?
NULL);
char *expression = NULL;
long priority = -4;
// sanity check
if ( ufn >= 0 ) {
expression = cr->m_regExs[ufn].getBufStart();
priority = cr->m_spiderPriorities[ufn];
}
if ( ! expression ) {
expression = "error. matches no expression!";
priority = -4;
}
// when spidering rounds we use the
// lastspidertime>={roundstart} --> spiders disabled rule
// so that we do not spider a url twice in the same round
if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) {
// we set this to 0 instead of using the checkbox
cr->m_maxSpidersPerRule[ufn] <= 0 ) {
priority = -5;
}
char *as = "discovered";
if ( sreq &&
( sreq->m_isInjecting ||
sreq->m_isAddUrl ) ) {
as = "manually added";
}
// print column headers?
if ( m_isFirstTime ) {
m_isFirstTime = false;
sb->safePrintf("\"Url\","
"\"Entry Method\","
);
if ( cr->m_isCustomCrawl )
sb->safePrintf("\"Processed?\",");
sb->safePrintf(
"\"Add Time\","
"\"Last Crawled\","
"\"Last Status\","
"\"Matching Expression\","
"\"Matching Action\"\n");
}
// "csv" is default if json not specified
if ( m_fmt == FMT_JSON ) {
// separate items with a comma so the enclosing
// "[" ... "]" array printed by sendList() stays
// valid json
if ( m_printedItem )
sb->safePrintf(",\n");
m_printedItem = true;
sb->safePrintf("{"
"\"url\":"
"\"%s\","
"\"time\":"
"\"%lu\","
"\"status\":"
"\"%li\","
"\"statusMsg\":"
"\"%s\""
"}"
, sreq->m_url
// when was it first added to spiderdb?
, sreq->m_addedTime
, status
, msg
);
}
// but default to csv
else {
sb->safePrintf("\"%s\",\"%s\","
, sreq->m_url
, as
);
if ( cr->m_isCustomCrawl )
sb->safePrintf("%li,",(long)isProcessed);
sb->safePrintf(
"%lu,%lu,\"%s\",\"%s\",\""
//",%s"
//"\n"
// when was it first added to spiderdb?
, sreq->m_addedTime
// last time spidered, 0 if none
, lastSpidered
//, status
, msg
// the url filter expression it matches
, expression
// the priority
//, priorityMsg
//, iptoa(sreq->m_firstIp)
);
// print priority
if ( priority == SPIDER_PRIORITY_FILTERED )
sb->safePrintf("url ignored");
else if ( priority == SPIDER_PRIORITY_BANNED )
sb->safePrintf("url banned");
else if ( priority == -4 )
sb->safePrintf("error");
else if ( priority == -5 )
sb->safePrintf("will spider next round");
else
sb->safePrintf("%li",priority);
sb->safePrintf("\""
"\n");
}
}
if ( ! badCount ) return;
log("diffbot: had a spider reply with no "
"corresponding spider request %li times", badCount);
}
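// example of the csv produced above (values are made up for illustration;
// a custom diffbot crawl also gets a "Processed?" column after
// "Entry Method"):
//
// "Url","Entry Method","Add Time","Last Crawled","Last Status","Matching Expression","Matching Action"
// "http://example.com/","discovered",1392300000,1392301000,"Successfully Downloaded","default","50"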
void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
XmlDoc xd;
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
// save it
*lastKeyPtr = NULL;
// parse through it
for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
// this record is either a SpiderRequest or SpiderReply
char *rec = list->getCurrentRec();
// skip if negative
if ( (rec[0] & 0x01) == 0x00 ) continue;
// set it
*lastKeyPtr = rec;
// reset first since set2() can't call reset()
xd.reset();
// uncompress it
if ( ! xd.set2 ( rec ,
0, // maxSize unused
cr->m_coll ,
NULL , // ppbuf
0 , // niceness
NULL ) ) { // spiderRequest
log("diffbot: error setting titlerec in dump");
continue;
}
// must be of type json to be a diffbot json object
if ( m_downloadJSON && xd.m_contentType != CT_JSON ) continue;
// or if downloading web pages...
if ( ! m_downloadJSON ) {
// skip if json object content type
if ( xd.m_contentType == CT_JSON ) continue;
// . just print the cached page
// . size should include the \0
sb->safeStrcpy ( xd.m_firstUrl.m_url);
// then \n
sb->pushChar('\n');
// then page content
sb->safeStrcpy ( xd.ptr_utf8Content );
// null term just in case
//sb->nullTerm();
// separate pages with \0 i guess
sb->pushChar('\0');
// \n
sb->pushChar('\n');
continue;
}
// skip if not a diffbot json url
if ( ! xd.m_isDiffbotJSONObject ) continue;
// get the json content
char *json = xd.ptr_utf8Content;
// empty?
if ( xd.size_utf8Content <= 1 )
continue;
// if not json, just print the json item out in csv
// moved into PageResults.cpp...
//if ( m_fmt == FMT_CSV ) {
// printJsonItemInCsv ( json , sb );
// continue;
//}
// just print that out. encode \n's and \r's back to \\n \\r
// and backslash to a \\ ...
// but if they originally had a \u<backslash> encoding and
// we made into utf8, do not put that back into the \u
// encoding because it is not necessary.
// print in json
if ( m_printedItem )
sb->safePrintf("\n,\n");
m_printedItem = true;
//if ( ! sb->safeStrcpyPrettyJSON ( json ) )
// log("diffbot: error printing json in dump");
sb->safeStrcpy ( json );
sb->nullTerm();
// separate each JSON object with \n i guess
//sb->pushChar('\n');
}
}
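// output layout produced above (illustrative):
// . plain page dump (_pages.txt): <url>\n<page content>\0\n per record
// . json dump (_data.json): raw diffbot json objects separated by "\n,\n"
// inside the enclosing [ ... ] array written by sendList()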
/*
////////////////
//
// SUPPORT FOR GET /api/crawls and /api/activecrawls
//
// Just scan each collection record whose collection name includes the
// provided "token" of the user. then print out the stats of just
//
////////////////
// example output for http://live.diffbot.com/api/crawls?token=matt
// [{"id":"c421f09d-7c31-4131-9da2-21e35d8130a9","finish":1378233585887,"matched":274,"status":"Stopped","start":1378233159848,"token":"matt","parameterMap":{"token":"matt","seed":"www.techcrunch.com","api":"article"},"crawled":274}]
// example output from activecrawls?id=....
// {"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":null,"matched":27,"status":"Crawling","start":1378322184332,"token":"matt","parameterMap":{"token":"matt","seed":"www.alleyinsider.com","api":"article"},"crawled":34}
// NOTE: it does not seem to include active crawls! bad!! like if you lost
// the crawlid...
// "cr" is NULL if showing all crawls!
bool showAllCrawls ( TcpSocket *s , HttpRequest *hr ) {
long tokenLen = 0;
char *token = hr->getString("token",&tokenLen);
// token MUST be there because this function's caller checked for it
if ( ! token ) { char *xx=NULL;*xx=0; }
// store the crawl stats as html into "sb"
SafeBuf sb;
// scan the collection recs
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
// get it
CollectionRec *cr = g_collectiondb.m_recs[i];
// skip if empty
if ( ! cr ) continue;
// get name
char *coll = cr->m_coll;
//long collLen = cr->m_collLen;
// skip if first 16 or whatever characters does not match
// the user token because the name of a collection is
// <TOKEN>-<CRAWLID>
if ( coll[0] != token[0] ) continue;
if ( coll[1] != token[1] ) continue;
if ( coll[2] != token[2] ) continue;
// scan the rest
bool match = true;
for ( long i = 3 ; coll[i] && token[i] ; i++ ) {
// the name of a collection is <TOKEN>-<CRAWLID>
// so if we hit the hyphen we are done
if ( coll[i] == '-' ) break;
if ( coll[i] != token[i] ) { match = false; break; }
}
if ( ! match ) continue;
// we got a match, print them out
printCrawlStats ( &sb , cr );
}
// and send back now
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),
sb.length(),
-1);// cachetime
}
*/
/*
char *getTokenFromHttpRequest ( HttpRequest *hr ) {
// provided directly?
char *token = hr->getString("token",NULL,NULL);
if ( token ) return token;
// extract token from coll?
char *c = hr->getString("c",NULL,NULL);
// try new "id" approach
if ( ! c ) c = hr->getString("id",NULL,NULL);
if ( ! c ) return NULL;
CollectionRec *cr = g_collectiondb.getRec(c);
if ( ! cr ) return NULL;
if ( cr->m_diffbotToken.length() <= 0 ) return NULL;
token = cr->m_diffbotToken.getBufStart();
return token;
}
CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) {
// if we have the collection name explicitly, get the coll rec then
char *c = hr->getString("c",NULL,NULL);
// try new "id" approach
if ( ! c ) c = hr->getString("id",NULL,NULL);
if ( c ) return g_collectiondb.getRec ( c );
// no matches
return NULL;
}
*/
/*
// doesn't have to be fast, so just do a scan
CollectionRec *getCollRecFromCrawlId ( char *crawlId ) {
long idLen = gbstrlen(crawlId);
// scan collection names
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
// get it
CollectionRec *cr = g_collectiondb.m_recs[i];
// skip if empty
if ( ! cr ) continue;
// get name
char *coll = cr->m_coll;
long collLen = cr->m_collLen;
if ( collLen < 16 ) continue;
// skip if first 16 or whatever characters does not match
// the user token because the name of a collection is
// <TOKEN>-<CRAWLID>
if ( coll[collLen-1] != crawlId[idLen-1] ) continue;
if ( coll[collLen-2] != crawlId[idLen-2] ) continue;
if ( coll[collLen-3] != crawlId[idLen-3] ) continue;
if ( ! strstr ( coll , crawlId ) ) continue;
return cr;
}
return NULL;
}
void printCrawlStatsWrapper ( void *state ) {
StateXX *sxx = (StateXX *)state;
// get collection rec
CollectionRec *cr = g_collectiondb.getRec(sxx->m_collnum);
// print out the crawl
SafeBuf sb;
printCrawlStats ( &sb , cr );
// save before nuking state
TcpSocket *sock = sxx->m_socket;
// nuke the state
mdelete ( sxx , sizeof(StateXX) , "stxx" );
delete sxx;
// and send back now
g_httpServer.sendDynamicPage ( sock ,
sb.getBufStart(),
sb.length(),
-1 ); // cachetime
}
void printCrawlStats ( SafeBuf *sb , CollectionRec *cr ) {
// if we are the first, print a '[' to start a json thingy
if ( sb->length() == 0 )
sb->pushChar('[');
// otherwise, remove the previous ']' since we are not the last
else {
char *p = sb->getBufStart();
long plen = sb->length();
if ( p[plen-1]=='[' )
sb->incrementLength(-1);
}
sb->safePrintf( "{"
"\"id\":\""
);
// get the token from coll name
char *token = cr->m_coll;
// and the length, up to the hyphen that separates it from crawl id
long tokenLen = 0;
for ( ; token[tokenLen] && token[tokenLen] != '-' ; tokenLen++ );
// now crawl id
char *crawlId = token + tokenLen;
// skip hyphen
if ( crawlId[0] == '-' ) crawlId++;
// print crawl id out
sb->safeStrcpy ( crawlId );
// end its quote
sb->safeStrcpy ( "\",");
// now the time the crawl finished.
if ( cr->m_spideringEnabled )
sb->safePrintf("\"finish\":null,");
else
sb->safePrintf("\"finish\":%lli,",cr->m_diffbotCrawlEndTime);
// how many urls we handoff to diffbot api. that implies successful
// download and that it matches the url crawl pattern and
// url process pattern and content regular expression pattern.
//
// NOTE: pageProcessAttempts can be higher than m_pageDownloadAttempts
// when we call getMetaList() on an *old* (in titledb) xmldoc,
// where we just get the cached content from titledb to avoid a
// download, but we still call getDiffbotReply(). perhaps reconstruct
// the diffbot reply from XmlDoc::m_diffbotJSONCount
//
// "processed" here corresponds to the "maxProcessed" cgi parm
// specified when instantiating the crawl parms for the first time.
//
// likewise "crawled" corresponds to "maxCrawled"
//
sb->safePrintf("\"processedAttempts\":%lli,",
cr->m_globalCrawlInfo.m_pageProcessAttempts);
sb->safePrintf("\"processed\":%lli,",
cr->m_globalCrawlInfo.m_pageProcessSuccesses);
sb->safePrintf("\"crawlAttempts\":%lli,",
cr->m_globalCrawlInfo.m_pageDownloadAttempts);
sb->safePrintf("\"crawled\":%lli,",
cr->m_globalCrawlInfo.m_pageDownloadSuccesses);
sb->safePrintf("\"urlsConsidered\":%lli,",
cr->m_globalCrawlInfo.m_urlsConsidered);
// how many spiders outstanding for this coll right now?
SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);
long spidersOut = sc->getTotalOutstandingSpiders();
// . status of the crawl: "Stopped" or "Active"?
// . TODO: check with dan to see if Active is correct and
// ShuttingDown is allowable
if ( cr->m_spideringEnabled )
sb->safePrintf("\"status\":\"Active\",");
else if ( spidersOut )
sb->safePrintf("\"status\":\"ShuttingDown\",");
else
sb->safePrintf("\"status\":\"Stopped\",");
// spider crawl start time
sb->safePrintf("\"start\":%lli,",cr->m_diffbotCrawlStartTime);
// the token
sb->safePrintf("\"token\":\"");
sb->safeMemcpy(token,tokenLen);
sb->safePrintf("\",");
//
// BEGIN parameter map
//
// the token again
sb->safePrintf("{");
sb->safePrintf("\"token\":\"");
sb->safeMemcpy(token,tokenLen);
sb->safePrintf("\",");
// the seed url
sb->safePrintf("\"seed\":\"%s\",",cr->m_diffbotSeed.getBufStart());
// the api
sb->safePrintf("\"api\":\"%s\",",cr->m_diffbotApi.getBufStart());
sb->safePrintf("},");
//
// END parameter map
//
// crawl count. counts non-errors. successful downloads.
//sb->safePrintf("\"crawled\":%lli",
// cr->m_globalCrawlInfo.m_pageCrawlAttempts);
sb->safePrintf("}");
// assume we are the last json object in the array
sb->pushChar(']');
}
*/
////////////////
//
// **** THE CRAWLBOT CONTROL PANEL *****
//
// . Based on http://diffbot.com/dev/crawl/ page.
// . got to /dev/crawl to see this!
//
////////////////
/*
// generate a random collection name
char *getNewCollName ( ) { // char *token , long tokenLen ) {
// let's create a new crawl id. dan was making it 32 characters
// with 4 hyphens in it for a total of 36 bytes, but since
// MAX_COLL_LEN, the maximum length of a collection name, is just
// 64 bytes, and the token is already 32, let's limit to 16 bytes
// for the crawlerid. so if we print that out in hex, 16 hex chars
// 0xffffffff 0xffffffff is 64 bits. so let's make a random 64-bit
// value here.
unsigned long r1 = rand();
unsigned long r2 = rand();
unsigned long long crawlId64 = (unsigned long long) r1;
crawlId64 <<= 32;
crawlId64 |= r2;
static char s_collBuf[MAX_COLL_LEN+1];
//long tokenLen = gbstrlen(token);
// include a +5 for "-test"
// include 16 for crawlid (16 char hex #)
//if ( tokenLen + 16 + 5>= MAX_COLL_LEN ) { char *xx=NULL;*xx=0;}
// ensure the crawlid is the full 16 characters long so we
// can quickly extricate the crawlid from the collection name
//memcpy ( s_collBuf, token, tokenLen );
//sprintf(s_collBuf + tokenLen ,"-%016llx",crawlId64);
sprintf(s_collBuf ,"%016llx",crawlId64);
return s_collBuf;
}
*/
//////////////////////////////////////////
//
// MAIN API STUFF I GUESS
//
//////////////////////////////////////////
bool sendReply2 (TcpSocket *socket , long fmt , char *msg ) {
// log it
log("crawlbot: %s",msg);
char *ct = "text/html";
// send this back to browser
SafeBuf sb;
if ( fmt == FMT_JSON ) {
sb.safePrintf("{\n\"response\":\"success\",\n"
"\"message\":\"%s\"\n}\n"
, msg );
ct = "application/json";
}
else
sb.safePrintf("<html><body>"
"success: %s"
"</body></html>"
, msg );
//return g_httpServer.sendErrorReply(socket,500,sb.getBufStart());
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0, // cachetime
false, // POST reply?
ct);
}
bool sendErrorReply2 ( TcpSocket *socket , long fmt , char *msg ) {
// log it
log("crawlbot: sending back 500 http status '%s'",msg);
char *ct = "text/html";
// send this back to browser
SafeBuf sb;
if ( fmt == FMT_JSON ) {
sb.safePrintf("{\"error\":\"%s\"}\n"
, msg );
ct = "application/json";
}
else
sb.safePrintf("<html><body>"
"failed: %s"
"</body></html>"
, msg );
// log it
//log("crawlbot: %s",msg );
//return g_httpServer.sendErrorReply(socket,500,sb.getBufStart());
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0, // cachetime
false, // POST reply?
ct ,
500 ); // error! not 200...
}
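// so an error reply in json format looks like (illustrative):
// HTTP status 500, body: {"error":"crawlbot: name is over 30 chars"}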
bool printCrawlBotPage2 ( class TcpSocket *s ,
class HttpRequest *hr ,
char fmt,
class SafeBuf *injectionResponse ,
class SafeBuf *urlUploadResponse ,
collnum_t collnum ) ;
void addedUrlsToSpiderdbWrapper ( void *state ) {
StateCD *st = (StateCD *)state;
SafeBuf rr;
rr.safePrintf("Successfully added urls for spidering.");
printCrawlBotPage2 ( st->m_socket,
&st->m_hr ,
st->m_fmt,
NULL ,
&rr ,
st->m_collnum );
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
//log("mdel2: st=%lx",(long)st);
}
/*
void injectedUrlWrapper ( void *state ) {
StateCD *st = (StateCD *)state;
Msg7 *msg7 = &st->m_msg7;
// the doc we injected...
XmlDoc *xd = &msg7->m_xd;
// make a status msg for the url
SafeBuf sb;
SafeBuf js; // for json reply
if ( xd->m_indexCode == 0 ) {
sb.safePrintf("<b><font color=black>"
"Successfully added ");
js.safePrintf("Seed Successful. ");
}
else if ( xd->m_indexCode == EDOCFILTERED ) {
sb.safePrintf("<b><font color=red>"
"Error: <i>%s</i> by matching "
"url filter #%li "
"when adding "
, mstrerror(xd->m_indexCode)
// divide by 2 because we add a
// "manualadd &&" rule with every url filter
// that the client adds
, (xd->m_urlFilterNum - 2) / 2
);
js.safePrintf("Seed URL filtered by URL filter #%li"
, (xd->m_urlFilterNum - 2) / 2 );
}
else {
sb.safePrintf("<b><font color=red>"
"Error: <i>%s</i> when adding "
, mstrerror(xd->m_indexCode) );
js.safePrintf("Error adding seed url: %s"
, mstrerror(xd->m_indexCode) );
}
sb.safeTruncateEllipsis(xd->m_firstUrl.getUrl(),60);
if ( xd->m_indexCode == 0 ) {
if ( xd->m_numOutlinksAddedValid ) {
sb.safePrintf(" &nbsp; (added %li outlinks)"
,(long)xd->m_numOutlinksAdded);
js.safePrintf("Added %li outlinks from same domain. "
"%li outlinks were filtered."
,(long)xd->m_numOutlinksAddedFromSameDomain
,(long)xd->m_numOutlinksFiltered
);
}
else {
sb.safePrintf(" &nbsp; (added 0 outlinks)");
js.safePrintf("Added 0 outlinks from same domain. "
"0 links were filtered." );
}
}
sb.safePrintf("</font></b>");
sb.nullTerm();
js.nullTerm();
// send back the html or json response?
SafeBuf *response = &sb;
if ( st->m_fmt == FMT_JSON ) response = &js;
// . this will call g_httpServer.sendReply()
// . pass it in the injection response, "sb"
printCrawlBotPage2 ( st->m_socket,
&st->m_hr ,
st->m_fmt,
response,
NULL ,
st->m_collnum );
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
}
*/
class HelpItem {
public:
char *m_parm;
char *m_desc;
};
static class HelpItem s_his[] = {
{"format","Use &format=html to show HTML output. Default is JSON."},
{"token","Required for all operations below."},
{"name","Name of the crawl. If missing will just show "
"all crawls owned by the given token."},
{"delete=1","Deletes the crawl."},
{"reset=1","Resets the crawl. Removes all seeds."},
{"restart=1","Restarts the crawl. Keeps the seeds."},
{"pause",
"Specify 1 or 0 to pause or resume the crawl respectively."},
{"repeat","Specify number of days as floating point to "
"recrawl the pages. Set to 0.0 to NOT repeat the crawl."},
{"crawlDelay","Wait this many seconds between crawling urls from the "
"same IP address. Can be a floating point number."},
//{"deleteCrawl","Same as delete."},
//{"resetCrawl","Same as delete."},
//{"pauseCrawl","Same as pause."},
//{"repeatCrawl","Same as repeat."},
{"seeds","Whitespace separated list of URLs used to seed the crawl. "
"Will only follow outlinks on the same domain of seed URLs."
},
{"spots",
"Whitespace separated list of URLs to add to the crawl. "
"Outlinks will not be followed." },
{"urls",
"Same as spots."},
//{"spiderLinks","Use 1 or 0 to spider the links or NOT spider "
// "the links, respectively, from "
// "the provided seed or addUrls parameters. "
// "The default is 1."},
{"maxToCrawl", "Specify max pages to successfully download."},
//{"maxToDownload", "Specify max pages to successfully download."},
{"maxToProcess", "Specify max pages to successfully process through "
"diffbot."},
{"maxRounds", "Specify maximum number of crawl rounds. Use "
"-1 to indicate no max."},
{"onlyProcessIfNew", "Specify 1 to avoid re-processing pages "
"that have already been processed once before."},
{"notifyEmail","Send email alert to this email when crawl hits "
"the maxtocrawl or maxtoprocess limit, or when the crawl "
"completes."},
{"notifyWebhook","Fetch this URL when crawl hits "
"the maxtocrawl or maxtoprocess limit, or when the crawl "
"completes."},
{"obeyRobots","Obey robots.txt files?"},
{"restrictDomain","Restrict downloaded urls to domains of seeds?"},
{"urlCrawlPattern","List of || separated strings. If the url "
"contains any of these then we crawl the url, otherwise, we do not. "
"An empty pattern matches all urls."},
{"urlProcessPattern","List of || separated strings. If the url "
"contains any of these then we send url to diffbot for processing. "
"An empty pattern matches all urls."},
{"pageProcessPattern","List of || separated strings. If the page "
"contains any of these then we send it to diffbot for processing. "
"An empty pattern matches all pages."},
{"urlCrawlRegEx","Regular expression that the url must match "
"in order to be crawled. If present then the urlCrawlPattern will "
"be ignored. "
"An empty regular expression matches all urls."},
{"urlProcessRegEx","Regular expression that the url must match "
"in order to be processed. "
"If present then the urlProcessPattern will "
"be ignored. "
"An empty regular expression matches all urls."},
{"apiUrl","Diffbot api url to use. We automatically append "
"token and url to it."},
//{"expression","A pattern to match in a URL. List up to 100 "
// "expression/action pairs in the HTTP request. "
// "Example expressions:"},
//{"action","Take the appropriate action when preceeding pattern is "
// "matched. Specify multiple expression/action pairs to build a "
// "table of filters. Each URL being spidered will take the given "
// "action of the first expression it matches. Example actions:"},
{NULL,NULL}
};
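// illustrative request using the parameters above (token, crawl name,
// seed url and host:port are hypothetical):
//
// GET /crawlbot?token=mytoken&name=mycrawl&seeds=http://www.example.com/&maxToCrawl=1000&format=json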
/*
// get the input string from the httprequest or the json post
char *getInputString ( char *string , HttpRequest *hr , Json *JS ) {
// try to get it from http request
char *val = hr->getString(string);
// if token in json post, use that
if ( ! val ) {
JsonItem *ji = JS.getItem(string);
if ( ji ) val = ji->getValue();
}
return val;
}
*/
void collOpDoneWrapper ( void *state ) {
StateCD *st = (StateCD *)state;
TcpSocket *socket = st->m_socket;
log("crawlbot: done with blocked op.");
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
//log("mdel3: st=%lx",(long)st);
g_httpServer.sendDynamicPage (socket,"OK",2);
}
// . when we receive the request from john we call broadcastRequest() from
// Pages.cpp. then msg28 sends this request with a &cast=0 appended to it
// to every host in the network. then when msg28 gets back replies from all
// those hosts it calls sendPageCrawlbot() here but without a &cast=0
// . so if no &cast is present we are the original!!!
bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// print help
long help = hr->getLong("help",0);
if ( help ) {
SafeBuf sb;
sb.safePrintf("<html>"
"<title>Crawlbot API</title>"
"<h1>Crawlbot API</h1>"
"<b>Use the parameters below on the "
"<a href=\"/crawlbot\">/crawlbot</a> page."
"</b><br><br>"
"<table>"
);
for ( long i = 0 ; i < 1000 ; i++ ) {
HelpItem *h = &s_his[i];
if ( ! h->m_parm ) break;
sb.safePrintf( "<tr>"
"<td>%s</td>"
"<td>%s</td>"
"</tr>"
, h->m_parm
, h->m_desc
);
}
sb.safePrintf("</table>"
"</html>");
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0); // cachetime
}
// . Pages.cpp by default broadcasts all PageCrawlbot /crawlbot
// requests to every host in the network unless a cast=0 is
// explicitly given
// . Msg28::massConfig() puts a &cast=0 on the secondary requests
// sent to each host in the network
//long cast = hr->getLong("cast",1);
// httpserver/httprequest should not try to decode post if
// it's application/json.
//char *json = hr->getPOST();
//Json JS;
//if ( json ) JS.parseJsonStringIntoJsonItems ( json );
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
char fmt = FMT_JSON;
// token is always required. get from json or html form input
//char *token = getInputString ( "token" );
char *token = hr->getString("token");
char *name = hr->getString("name");
// . try getting token-name from ?c=
// . the name of the collection is encoded as <token>-<crawlname>
char *c = hr->getString("c");
char tmp[MAX_COLL_LEN+100];
if ( ! token && c ) {
strncpy ( tmp , c , MAX_COLL_LEN );
token = tmp;
name = strstr(tmp,"-");
if ( name ) {
*name = '\0';
name++;
}
// change default formatting to html
fmt = FMT_HTML;
}
char *fs = hr->getString("format",NULL,NULL);
// give john a json api
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
// if we got json as input, give it as output
//if ( JS.getFirstItem() ) fmt = FMT_JSON;
if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
}
if ( ! token ) {
// print token form if html
SafeBuf sb;
sb.safePrintf("In order to use crawlbot you must "
"first LOGIN:"
"<form action=/crawlbot method=get>"
"<br>"
"<input type=text name=token size=50>"
"<input type=submit name=submit value=OK>"
"</form>"
"<br>"
"<b>- OR -</b>"
"<br> SIGN UP"
"<form action=/crawlbot method=get>"
"Name: <input type=text name=name size=50>"
"<br>"
"Email: <input type=text name=email size=50>"
"<br>"
"<input type=submit name=submit value=OK>"
"</form>"
"</body>"
"</html>");
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0); // cachetime
}
if ( gbstrlen(token) > 32 ) {
//log("crawlbot: token is over 32 chars");
char *msg = "crawlbot: token is over 32 chars";
return sendErrorReply2 (socket,fmt,msg);
}
char *seeds = hr->getString("seeds");
char *spots = hr->getString("spots");
// just existence is the operation
//bool delColl = hr->hasField("deleteCrawl");
//bool resetColl = hr->hasField("resetCrawl");
// /v2/bulk api support:
if ( ! spots ) spots = hr->getString("urls");
if ( spots && ! spots[0] ) spots = NULL;
if ( seeds && ! seeds[0] ) seeds = NULL;
//if ( ! delColl ) delColl = hr->hasField("delete");
//if ( ! resetColl ) resetColl = hr->hasField("reset");
bool restartColl = hr->hasField("restart");
//if ( delColl && ! && cast == 0 ) {
// log("crawlbot: no collection found to delete.");
// char *msg = "Could not find crawl to delete.";
// return sendErrorReply2 (socket,fmt,msg);
//}
// just send back a list of all the collections after the delete
//if ( delColl && cast && fmt == FMT_JSON ) {
// char *msg = "Collection deleted.";
// return sendReply2 (socket,fmt,msg);
//}
// default name to next available collection crawl name in the
// case of a delete operation...
char *msg = NULL;
if ( hr->hasField("delete") ) msg = "deleted";
// need to re-add urls for a restart
//if ( hr->hasField("restart") ) msg = "restarted";
if ( hr->hasField("reset") ) msg = "reset";
if ( msg ) { // delColl && cast ) {
// this was deleted... so is invalid now
name = NULL;
// no longer a delete function, we need to set "name" below
//delColl = false;//NULL;
// john wants just a brief success reply
SafeBuf tmp;
tmp.safePrintf("{\"response\":\"Successfully %s job.\"}",
msg);
char *reply = tmp.getBufStart();
return g_httpServer.sendDynamicPage( socket,
reply,
gbstrlen(reply),
0, // cacheTime
false, // POSTReply?
"application/json"
);
}
// if name is missing default to name of first existing
// collection for this token.
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { // cast
if ( name ) break;
// do not do this if doing an
// injection (seed) or add url or del coll or reset coll !!
if ( seeds ) break;
if ( spots ) break;
//if ( delColl ) break;
//if ( resetColl ) break;
if ( restartColl ) break;
CollectionRec *cx = g_collectiondb.m_recs[i];
// deleted collections leave a NULL slot
if ( ! cx ) continue;
// skip if token does not match
if ( strcmp ( cx->m_diffbotToken.getBufStart(),token) )
continue;
// got it
name = cx->m_diffbotCrawlName.getBufStart();
break;
}
if ( ! name ) {
// if the token is valid
char *ct = "application/json";
char *msg = "{}\n";
return g_httpServer.sendDynamicPage ( socket,
msg,
gbstrlen(msg) ,
-1 , // cachetime
false ,
ct ,
200 ); // http status
//log("crawlbot: no crawl name given");
//char *msg = "invalid or missing name";
//return sendErrorReply2 (socket,fmt,msg);
}
if ( gbstrlen(name) > 30 ) {
//log("crawlbot: name is over 30 chars");
char *msg = "crawlbot: name is over 30 chars";
return sendErrorReply2 (socket,fmt,msg);
}
// make the collection name so it includes the token and crawl name
char collName[MAX_COLL_LEN+1];
// sanity
if ( MAX_COLL_LEN < 64 ) { char *xx=NULL;*xx=0; }
// make a compound name for collection of token and name
sprintf(collName,"%s-%s",token,name);
// if they did not specify the token/name of an existing collection
// then cr will be NULL and we'll add it below
CollectionRec *cr = g_collectiondb.getRec(collName);
// i guess bail if not there?
if ( ! cr ) {
char *msg = "invalid or missing collection rec";
return sendErrorReply2 (socket,fmt,msg);
}
// if no token... they need to login or signup
//char *token = getTokenFromHttpRequest ( hr );
// get coll name if any
//char *c = hr->getString("c");
//if ( ! c ) c = hr->getString("id");
// get some other parms provided optionally
//char *addColl = hr->getString("addcoll");
// try json
//if ( JS.getInputString("addNewCrawl") ) addColl = collName;
//if ( JS.getInputString("deleteCrawl") ) delColl = true;
//if ( JS.getInputString("resetCrawl") ) resetColl = true;
//if ( resetColl && ! cr ) {
// //log("crawlbot: no collection found to reset.");
// char *msg = "Could not find crawl to reset.";
// return sendErrorReply2 (socket,fmt,msg);
//}
//if ( restartColl && ! cr ) {
// char *msg = "Could not find crawl to restart.";
// return sendErrorReply2 (socket,fmt,msg);
//}
// make a new state
StateCD *st;
try { st = new (StateCD); }
catch ( ... ) {
return sendErrorReply2 ( socket , fmt , mstrerror(g_errno));
}
mnew ( st , sizeof(StateCD), "statecd");
// debug
//log("mnew2: st=%lx",(long)st);
// copy crap
st->m_hr.copy ( hr );
st->m_socket = socket;
st->m_fmt = fmt;
if ( cr ) st->m_collnum = cr->m_collnum;
else st->m_collnum = -1;
// save seeds
if ( cr && restartColl ) { // && cast ) {
// bail on OOM saving seeds
if ( ! st->m_seedBank.safeMemcpy ( &cr->m_diffbotSeeds ) ||
! st->m_seedBank.pushChar('\0') )
return sendErrorReply2(socket,fmt,mstrerror(g_errno));
}
//
// if we can't compile the provided regexes, return error
//
if ( cr ) {
char *rx1 = hr->getString("urlCrawlRegEx",NULL);
if ( rx1 && ! rx1[0] ) rx1 = NULL;
char *rx2 = hr->getString("urlProcessRegEx",NULL);
if ( rx2 && ! rx2[0] ) rx2 = NULL;
// this will store the compiled regular expression into ucr
regex_t re1;
regex_t re2;
long status1 = 0;
long status2 = 0;
if ( rx1 )
status1 = regcomp ( &re1 , rx1 ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB);
if ( rx2 )
status2 = regcomp ( &re2 , rx2 ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB);
if ( rx1 ) regfree ( &re1 );
if ( rx2 ) regfree ( &re2 );
SafeBuf em;
if ( status1 ) {
log("xmldoc: regcomp %s failed.",rx1);
em.safePrintf("Invalid regular expresion: %s",rx1);
}
else if ( status2 ) {
log("xmldoc: regcomp %s failed.",rx2);
em.safePrintf("Invalid regular expresion: %s",rx2);
}
if ( status1 || status2 ) {
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
char *msg = em.getBufStart();
return sendErrorReply2(socket,fmt,msg);
}
}
// . if this is a cast=0 request it is received by all hosts in the
// network
// . this code is the only code run by EVERY host in the network
// . the other code is just run once by the receiving host
// . so we gotta create a coll rec on each host etc.
// . no need to update collectionrec parms here since Pages.cpp calls
// g_parms.setFromRequest() for us before calling this function,
// pg->m_function(). even though maxtocrawl is on "PAGE_NONE"
// hopefully it will still be set
// . but we should take care of add/del/reset coll here.
// . i guess this will be handled by the new parm syncing logic
// which deals with add/del coll requests
/*
if ( cast == 0 ) {
// add a new collection by default
if ( ! cr && name && name[0] )
cr = addNewDiffbotColl ( collName , token , name, hr );
// also support the good 'ole html form interface
if ( cr ) setSpiderParmsFromHtmlRequest ( socket , hr , cr );
// . we can't sync these operations on a dead host when it
// comes back up yet. we can only sync parms, not collection
// adds/deletes/resets
// . TODO: make new collections just a list of rdb records,
// then they can leverage the msg4 and addsinprogress.dat
// functionality we have for getting dead hosts back up to
// sync. Call it Colldb.
// . PROBLEM: when just starting up seems like hasDeadHost()
// is returning true because it has not yet received its
// first ping reply
//if ( addColl || delColl || resetColl ) {
// // if any host in network is dead, do not do this
// if ( g_hostdb.hasDeadHost() ) {
// char *msg = "A host in the network is dead.";
// // log it
// log("crawlbot: %s",msg);
// // make sure this returns in json if required
// return sendErrorReply2(socket,fmt,msg);
// }
//}
// problem?
if ( ! cr ) {
// send back error
char *msg = "Collection add failed";
if ( delColl ) msg = "No such collection";
if ( resetColl ) msg = "No such collection";
if ( restartColl ) msg = "No such collection";
// nuke it
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
// log it
log("crawlbot: cr is null. %s",msg);
// make sure this returns in json if required
return sendErrorReply2(socket,fmt,msg);
}
// set this up
WaitEntry *we = &st->m_waitEntry;
we->m_state = st;
we->m_callback = collOpDoneWrapper;
// this won't work, collname is on the stack!
//we->m_coll = collName;
we->m_coll = cr->m_coll;
if ( delColl ) {
// note it
log("crawlbot: deleting coll");
// delete collection name
// this can block if tree is saving, it has to wait
// for tree save to complete before removing old
// collnum recs from tree
if ( ! g_collectiondb.deleteRec ( collName , we ) )
return false;
// nuke it
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
// all done
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
if ( resetColl || restartColl ) {
// note it
log("crawlbot: resetting/restarting coll");
//cr = g_collectiondb.getRec ( resetColl );
// this can block if tree is saving, it has to wait
// for tree save to complete before removing old
// collnum recs from tree
bool purgeSeeds = true;
if ( restartColl ) purgeSeeds = false;
if ( ! g_collectiondb.resetColl ( collName ,
we ,
purgeSeeds ) )
return false;
// it is a NEW ptr now!
cr = g_collectiondb.getRec( collName );
// if reset from crawlbot api page then enable spiders
// to avoid user confusion
if ( cr ) cr->m_spideringEnabled = 1;
// nuke it
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
// all done
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
// nuke it
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
// this will set the the collection parms from json
//setSpiderParmsFromJSONPost ( socket , hr , cr , &JS );
// this is a cast, so just return simple response
return g_httpServer.sendDynamicPage (socket,"OK",2);
}
*/
/////////
//
// after all hosts have replied to the request, we finally send the
// request here, with no &cast=0 appended to it. so there is where we
// send the final reply back to the browser
//
/////////
/*
// in case collection was just added above... try this!!
cr = g_collectiondb.getRec(collName);
// collectionrec must be non-null at this point. i.e. we added it
if ( ! cr ) {
char *msg = "Crawl name was not found.";
if ( name && name[0] )
msg = "Failed to add crawl. Crawl name is illegal.";
// nuke it
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
//log("crawlbot: no collection found. need to add a crawl");
return sendErrorReply2(socket,fmt, msg);
}
//char *spots = hr->getString("spots",NULL,NULL);
//char *seeds = hr->getString("seeds",NULL,NULL);
*/
// check seed bank now too for restarting a crawl
if ( st->m_seedBank.length() && ! seeds )
seeds = st->m_seedBank.getBufStart();
if ( seeds )
log("crawlbot: adding seeds=\"%s\"",seeds);
if ( spots )
log("crawlbot: got spots to add");
///////
//
// handle file of urls upload. can be HUGE!
//
///////
if ( spots || seeds ) {
// . avoid spidering links for these urls? i would say
// . default is to NOT spider the links...
// . support camel case and all lower case
//long spiderLinks = hr->getLong("spiderLinks",1);
//spiderLinks = hr->getLong("spiderlinks",spiderLinks);
//bool spiderLinks = false;
// make a list of spider requests from these urls
//SafeBuf listBuf;
// this returns NULL with g_errno set
bool status = true;
if ( ! getSpiderRequestMetaList ( seeds,
&st->m_listBuf ,
true , // spiderLinks?
cr ) )
status = false;
// do not spider links for spots
if ( ! getSpiderRequestMetaList ( spots,
&st->m_listBuf ,
false , // spiderLinks?
NULL ) )
status = false;
// empty?
long size = st->m_listBuf.length();
// error?
if ( ! status ) {
// nuke it
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
return sendErrorReply2(socket,fmt,mstrerror(g_errno));
}
// if the list is empty
if ( ! size ) {
// nuke it
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
return sendErrorReply2(socket,fmt,"no urls found");
}
// add to spiderdb
if ( ! st->m_msg4.addMetaList( st->m_listBuf.getBufStart() ,
st->m_listBuf.length(),
cr->m_coll,
st ,
addedUrlsToSpiderdbWrapper,
0 // niceness
) )
// blocked!
return false;
// did not block, print page!
addedUrlsToSpiderdbWrapper(st);
return true;
}
/////////
//
// handle direct injection of a url. looks at "spiderlinks=1" parm
// and all the other parms in Msg7::inject() in PageInject.cpp.
//
//////////
/*
if ( injectUrl ) {
// a valid collection is required
if ( ! cr )
return sendErrorReply2(socket,fmt,
"invalid collection");
// begin the injection
if ( ! st->m_msg7.inject ( st->m_socket,
&st->m_hr,
st ,
injectedUrlWrapper ,
1 , // spiderLinks default is on
collName ) ) // coll override
// if blocked, return now
return false;
// otherwise send back reply
injectedUrlWrapper ( st );
return true;
}
*/
// the state is no longer needed past this point
////////////
//
// print the html or json page of all the data
//
printCrawlBotPage2 ( socket,hr,fmt,NULL,NULL,cr->m_collnum);
// get rid of that state
mdelete ( st , sizeof(StateCD) , "stcd" );
delete st;
//log("mdel4: st=%lx",(long)st);
return true;
}
/*
bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
if ( fmt == FMT_JSON )
sb.safePrintf("\"urlFilters\":[");
// skip first filters that are:
// 0. ismedia->ignore and
// 1. !isonsamedomain->ignore
// 2. lastspidertime or !isindexed
// 3. errorcount rule
// 4. errorcount rule
long istart = 5;
// if respidering then we added an extra filter
// lastspidertime>={roundstart} --> FILTERED
//if ( cr->m_collectiveRespiderFrequency > 0.0 )
// istart++;
for ( long i = istart ; i < cr->m_numRegExs ; i++ ) {
//sb.safePrintf
char *expression = cr->m_regExs[i].getBufStart();
// do not allow nulls
if ( ! expression ) expression = "";
// skip spaces
if ( *expression && is_wspace_a(*expression) ) expression++;
if ( strcmp(expression,"default") == 0 ) expression = "*";
char *action = cr->m_spiderDiffbotApiUrl[i].getBufStart();
// do not allow nulls
if ( ! action ) action = "";
// skip spaces
if ( *action && is_wspace_a(*action) ) action++;
// if no diffbot api url specified, do not process
if ( ! *action ) action = "doNotProcess";
// if filtered from crawling, do not even spider
long priority = cr->m_spiderPriorities[i];
if ( priority == SPIDER_PRIORITY_FILTERED ) // -3
action = "doNotCrawl";
// we add this supplemental expression/action for every
// one the user adds in order to give manually added
// urls higher spider priority, so skip it
if ( strncmp(expression,"ismanualadd && ",15) == 0 )
continue;
if ( fmt == FMT_HTML ) {
sb.safePrintf("<tr>"
"<td>Expression "
"<input type=text "
"name=expression size=30 "
"value=\"%s\"> "
"</td><td>"
"Action "
"<input type=text name=action size=50 "
"value=\"%s\">"
"</td>"
"</tr>\n"
, expression
, action
);
continue;
}
// show it
sb.safePrintf("{\"expression\":\"%s\",",expression);
sb.safePrintf("\"action\":\"%s\"}",action);
// more follow?
sb.pushChar(',');
sb.pushChar('\n');
}
if ( fmt == FMT_JSON ) {
// remove trailing comma
sb.removeLastChar('\n');
sb.removeLastChar(',');
sb.safePrintf("]\n");
}
return true;
}
*/
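// Illustrative sketch of the json object emitted by
// printCrawlDetailsInJson() below for one crawl. The field names come
// from the safePrintf() calls in that function; the values shown are
// made-up placeholders, not real output:
//
//   {
//     "name":"mycrawl",
//     "type":"crawl",
//     "jobStatus":{"status":0,"message":"..."},
//     "sentJobDoneNotification":0,
//     "objectsFound":123,
//     "urlsHarvested":4567,
//     "pageCrawlAttempts":1000,
//     "pageCrawlSuccesses":990,
//     ...
//     "downloadJson":"http://api.diffbot.com/v2/crawl/download/...",
//     "notifyEmail":"",
//     "notifyWebhook":""
//   }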
bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
SafeBuf tmp;
long crawlStatus = -1;
getSpiderStatusMsg ( cx , &tmp , &crawlStatus );
CrawlInfo *ci = &cx->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
char *crawlTypeStr = "crawl";
//char *nomen = "crawl";
if ( cx->m_isCustomCrawl == 2 ) {
crawlTypeStr = "bulk";
//nomen = "job";
}
sb.safePrintf("\n\n{"
"\"name\":\"%s\",\n"
"\"type\":\"%s\",\n"
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
"\"jobStatus\":{" // nomen = jobStatus / crawlStatus
"\"status\":%li,"
"\"message\":\"%s\"},\n"
"\"sentJobDoneNotification\":%li,\n"
//"\"crawlingPaused\":%li,\n"
"\"objectsFound\":%lli,\n"
"\"urlsHarvested\":%lli,\n"
//"\"urlsExamined\":%lli,\n"
"\"pageCrawlAttempts\":%lli,\n"
"\"pageCrawlSuccesses\":%lli,\n"
"\"pageCrawlSuccessesThisRound\":%lli,\n"
"\"pageProcessAttempts\":%lli,\n"
"\"pageProcessSuccesses\":%lli,\n"
"\"pageProcessSuccessesThisRound\":%lli,\n"
"\"maxRounds\":%li,\n"
"\"repeat\":%f,\n"
"\"crawlDelay\":%f,\n"
//,cx->m_coll
, cx->m_diffbotCrawlName.getBufStart()
, crawlTypeStr
//, alias
//, (long)cx->m_spideringEnabled
, crawlStatus
, tmp.getBufStart()
, sentAlert
//, (long)paused
, cx->m_globalCrawlInfo.m_objectsAdded -
cx->m_globalCrawlInfo.m_objectsDeleted
, cx->m_globalCrawlInfo.m_urlsHarvested
//,cx->m_globalCrawlInfo.m_urlsConsidered
, cx->m_globalCrawlInfo.m_pageDownloadAttempts
, cx->m_globalCrawlInfo.m_pageDownloadSuccesses
, cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cx->m_globalCrawlInfo.m_pageProcessAttempts
, cx->m_globalCrawlInfo.m_pageProcessSuccesses
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
, (long)cx->m_maxCrawlRounds
, cx->m_collectiveRespiderFrequency
, cx->m_collectiveCrawlDelay
);
sb.safePrintf("\"obeyRobots\":%li,\n"
, (long)cx->m_useRobotsTxt );
// if not a "bulk" injection, show crawl stats
if ( cx->m_isCustomCrawl != 2 ) {
sb.safePrintf(
// settable parms
"\"maxToCrawl\":%lli,\n"
"\"maxToProcess\":%lli,\n"
"\"restrictDomain\":%li,\n"
"\"onlyProcessIfNew\":%li,\n"
, cx->m_maxToCrawl
, cx->m_maxToProcess
, (long)cx->m_restrictDomain
, (long)cx->m_diffbotOnlyProcessIfNewUrl
);
sb.safePrintf("\"seeds\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotSeeds.getBufStart());
sb.safePrintf("\",\n");
}
sb.safePrintf("\"roundsCompleted\":%li,\n",
cx->m_spiderRoundNum);
sb.safePrintf("\"roundStartTime\":%lu,\n",
cx->m_spiderRoundStartTime);
sb.safePrintf("\"currentTime\":%lu,\n",
getTimeGlobal() );
sb.safePrintf("\"currentTimeUTC\":%lu,\n",
getTimeGlobal() );
sb.safePrintf("\"apiUrl\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotApiUrl.getBufStart() );
sb.safePrintf("\",\n");
sb.safePrintf("\"urlCrawlPattern\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotUrlCrawlPattern.getBufStart() );
sb.safePrintf("\",\n");
sb.safePrintf("\"urlProcessPattern\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotUrlProcessPattern.getBufStart() );
sb.safePrintf("\",\n");
sb.safePrintf("\"pageProcessPattern\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotPageProcessPattern.getBufStart() );
sb.safePrintf("\",\n");
sb.safePrintf("\"urlCrawlRegEx\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotUrlCrawlRegEx.getBufStart() );
sb.safePrintf("\",\n");
sb.safePrintf("\"urlProcessRegEx\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotUrlProcessRegEx.getBufStart() );
sb.safePrintf("\",\n");
char *token = cx->m_diffbotToken.getBufStart();
char *name = cx->m_diffbotCrawlName.getBufStart();
char *mt = "crawl";
if ( cx->m_isCustomCrawl == 2 ) mt = "bulk";
sb.safePrintf("\"downloadJson\":"
"\"http://api.diffbot.com/v2/%s/download/"
"%s-%s_data.json\",\n"
, mt
, token
, name
);
sb.safePrintf("\"downloadUrls\":"
"\"http://api.diffbot.com/v2/%s/download/"
"%s-%s_urls.csv\",\n"
, mt
, token
, name
);
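	// Example expansion of the two download links above, assuming a
	// hypothetical token "TOKEN" and crawl name "mycrawl" (the first
	// %s is "crawl" or "bulk"):
	//   http://api.diffbot.com/v2/crawl/download/TOKEN-mycrawl_data.json
	//   http://api.diffbot.com/v2/crawl/download/TOKEN-mycrawl_urls.csv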
sb.safePrintf("\"notifyEmail\":\"");
sb.safeUtf8ToJSON ( cx->m_notifyEmail.getBufStart() );
sb.safePrintf("\",\n");
sb.safePrintf("\"notifyWebhook\":\"");
sb.safeUtf8ToJSON ( cx->m_notifyUrl.getBufStart() );
sb.safePrintf("\"\n");
//sb.safePrintf("\",\n");
/////
//
// show url filters table. kinda hacky!!
//
/////
/*
g_parms.sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
NULL ,
&sb ,
cr->m_coll, // coll override
true // isJSON?
);
*/
//printUrlFilters ( sb , cx , FMT_JSON );
// end that collection rec
sb.safePrintf("}\n");
return true;
}
bool printCrawlBotPage2 ( TcpSocket *socket ,
HttpRequest *hr ,
char fmt, // format
SafeBuf *injectionResponse ,
SafeBuf *urlUploadResponse ,
collnum_t collnum ) {
// store output into here
SafeBuf sb;
if ( fmt == FMT_HTML )
sb.safePrintf(
"<html>"
"<title>Crawlbot - "
"Web Data Extraction and Search Made "
"Easy</title>"
"<body>"
);
CollectionRec *cr = g_collectiondb.m_recs[collnum];
// was coll deleted while adding urls to spiderdb?
if ( ! cr ) {
g_errno = EBADREQUEST;
char *msg = "invalid crawl. crawl was deleted.";
return sendErrorReply2(socket,fmt,msg);
}
char *token = cr->m_diffbotToken.getBufStart();
char *name = cr->m_diffbotCrawlName.getBufStart();
// this is useful
SafeBuf hb;
hb.safePrintf("<input type=hidden name=name value=\"%s\">"
"<input type=hidden name=token value=\"%s\">"
"<input type=hidden name=format value=\"html\">"
, name
, token );
hb.nullTerm();
// and this
SafeBuf lb;
lb.safePrintf("name=");
lb.urlEncode(name);
lb.safePrintf ("&token=");
lb.urlEncode(token);
if ( fmt == FMT_HTML ) lb.safePrintf("&format=html");
lb.nullTerm();
// set this to current collection. if only token was provided
// then it will return the first collection owned by token.
// if token has no collections it will be NULL.
//if ( ! cr )
// cr = getCollRecFromHttpRequest ( hr );
//if ( ! cr ) {
// char *msg = "failed to add new collection";
// g_msg = " (error: crawlbot failed to allocate crawl)";
// return sendErrorReply2 ( socket , fmt , msg );
//}
if ( fmt == FMT_HTML ) {
sb.safePrintf("<table border=0>"
"<tr><td>"
"<b><font size=+2>"
"<a href=/crawlbot?token=%s>"
"Crawlbot</a></font></b>"
"<br>"
"<font size=-1>"
"Crawl, Datamine and Index the Web"
"</font>"
"</td></tr>"
"</table>"
, token
);
sb.safePrintf("<center><br>");
// first print help
sb.safePrintf("[ <a href=/crawlbot?help=1>"
"api help</a> ] &nbsp; "
// json output
"[ <a href=\"/crawlbot?token=%s&format=json&"
"name=%s\">"
"json output"
"</a> ] &nbsp; "
, token
, name );
// random coll name to add
unsigned long r1 = rand();
unsigned long r2 = rand();
unsigned long long rand64 = (unsigned long long) r1;
rand64 <<= 32;
rand64 |= r2;
char newCollName[MAX_COLL_LEN+1];
snprintf(newCollName,MAX_COLL_LEN,"%s-%016llx",
token , rand64 );
// first print "add new collection"
sb.safePrintf("[ <a href=/crawlbot?name=%016llx&token=%s&"
"format=html&addCrawl=%s>"
"add new crawl"
"</a> ] &nbsp; "
"[ <a href=/crawlbot?token=%s>"
"show all crawls"
"</a> ] &nbsp; "
, rand64
, token
, newCollName
, token
);
}
bool firstOne = true;
//
// print list of collections controlled by this token
//
for ( long i = 0 ; fmt == FMT_HTML && i<g_collectiondb.m_numRecs;i++ ){
CollectionRec *cx = g_collectiondb.m_recs[i];
if ( ! cx ) continue;
// get its token if any
char *ct = cx->m_diffbotToken.getBufStart();
if ( ! ct ) continue;
// skip if token does not match
if ( strcmp(ct,token) )
continue;
// highlight the tab if it is what we selected
bool highlight = false;
if ( cx == cr ) highlight = true;
char *style = "";
if ( highlight ) {
style = "style=text-decoration:none; ";
sb.safePrintf ( "<b><font color=red>");
}
// print the crawl id. collection name minus <TOKEN>-
sb.safePrintf("<a %shref=/crawlbot?token=", style);
sb.urlEncode(token);
sb.safePrintf("&name=");
sb.urlEncode(cx->m_diffbotCrawlName.getBufStart());
sb.safePrintf("&format=html>"
"%s (%li)"
"</a> &nbsp; "
, cx->m_diffbotCrawlName.getBufStart()
, (long)cx->m_collnum
);
if ( highlight )
sb.safePrintf("</font></b>");
}
if ( fmt == FMT_HTML )
sb.safePrintf ( "</center><br/>" );
// the ROOT JSON [
if ( fmt == FMT_JSON )
sb.safePrintf("{\n");
// injection is currently not in use, so this is an artifact:
if ( fmt == FMT_JSON && injectionResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, injectionResponse->getBufStart() );
if ( fmt == FMT_JSON && urlUploadResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, urlUploadResponse->getBufStart() );
//////
//
// print collection summary page
//
//////
// the items in the array now have type:bulk or type:crawl
// so call them 'jobs'
if ( fmt == FMT_JSON )
sb.safePrintf("\"jobs\":[");//\"collections\":");
long summary = hr->getLong("summary",0);
// enter summary mode for json
if ( fmt != FMT_HTML ) summary = 1;
// start the table
if ( summary && fmt == FMT_HTML ) {
sb.safePrintf("<table border=1 cellpadding=5>"
"<tr>"
"<td><b>Collection</b></td>"
"<td><b>Objects Found</b></td>"
"<td><b>URLs Harvested</b></td>"
"<td><b>URLs Examined</b></td>"
"<td><b>Page Download Attempts</b></td>"
"<td><b>Page Download Successes</b></td>"
"<td><b>Page Download Successes This Round"
"</b></td>"
"<td><b>Page Process Attempts</b></td>"
"<td><b>Page Process Successes</b></td>"
"<td><b>Page Process Successes This Round"
"</b></td>"
"</tr>"
);
}
char *name3 = hr->getString("name");
// scan each coll and get its stats
for ( long i = 0 ; summary && i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cx = g_collectiondb.m_recs[i];
if ( ! cx ) continue;
// must belong to us
if ( strcmp(cx->m_diffbotToken.getBufStart(),token) )
continue;
// just print out single crawl info for json
if ( fmt != FMT_HTML && cx != cr && name3 )
continue;
// if json, print each collectionrec
if ( fmt == FMT_JSON ) {
if ( ! firstOne )
sb.safePrintf(",\n\t");
firstOne = false;
//char *alias = "";
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
//long paused = 1;
//if ( cx->m_spideringEnabled ) paused = 0;
printCrawlDetailsInJson ( sb , cx );
// print the next one out
continue;
}
// print in table
sb.safePrintf("<tr>"
"<td>%s</td>"
"<td>%lli</td>"
"<td>%lli</td>"
//"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
"</tr>"
, cx->m_coll
, cx->m_globalCrawlInfo.m_objectsAdded -
cx->m_globalCrawlInfo.m_objectsDeleted
, cx->m_globalCrawlInfo.m_urlsHarvested
//, cx->m_globalCrawlInfo.m_urlsConsidered
, cx->m_globalCrawlInfo.m_pageDownloadAttempts
, cx->m_globalCrawlInfo.m_pageDownloadSuccesses
, cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cx->m_globalCrawlInfo.m_pageProcessAttempts
, cx->m_globalCrawlInfo.m_pageProcessSuccesses
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
if ( summary && fmt == FMT_HTML ) {
sb.safePrintf("</table></html>" );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0); // cachetime
}
if ( fmt == FMT_JSON )
// end the array of collection objects
sb.safePrintf("\n]\n");
///////
//
// end print collection summary page
//
///////
//
// show urls being crawled (ajax) (from Spider.cpp)
//
if ( fmt == FMT_HTML ) {
sb.safePrintf ( "<table width=100%% cellpadding=5 "
"style=border-width:1px;border-style:solid;"
"border-color:black;>"
//"bgcolor=#%s>\n"
"<tr><td colspan=50>"// bgcolor=#%s>"
"<b>Last 10 URLs</b> (%li spiders active)"
//,LIGHT_BLUE
//,DARK_BLUE
,(long)g_spiderLoop.m_numSpidersOut);
char *str = "<font color=green>Resume Crawl</font>";
long pval = 0;
if ( cr->m_spideringEnabled ) {
str = "<font color=red>Pause Crawl</font>";
pval = 1;
}
sb.safePrintf(" "
"<a href=/crawlbot?%s"
"&pauseCrawl=%li><b>%s</b></a>"
, lb.getBufStart() // has &name=&token= encoded
, pval
, str
);
sb.safePrintf("</td></tr>\n" );
// the table headers so SpiderRequest::printToTableSimple() works
if ( ! SpiderRequest::printTableHeaderSimple(&sb,true) )
return false;
// shortcut
XmlDoc **docs = g_spiderLoop.m_docs;
// row count
long j = 0;
// first print the spider recs we are spidering
for ( long i = 0 ; i < (long)MAX_SPIDERS ; i++ ) {
// get it
XmlDoc *xd = docs[i];
// skip if empty
if ( ! xd ) continue;
// sanity check
if ( ! xd->m_sreqValid ) { char *xx=NULL;*xx=0; }
// skip if not our coll rec!
//if ( xd->m_cr != cr ) continue;
if ( xd->m_collnum != cr->m_collnum ) continue;
// grab it
SpiderRequest *oldsr = &xd->m_sreq;
// get status
char *status = xd->m_statusMsg;
// show that
if ( ! oldsr->printToTableSimple ( &sb , status,xd,j))
return false;
j++;
}
// end the table
sb.safePrintf ( "</table>\n" );
sb.safePrintf ( "<br>\n" );
} // end html format
// this is for making sure the search results are not cached
unsigned long r1 = rand();
unsigned long r2 = rand();
unsigned long long rand64 = (unsigned long long) r1;
rand64 <<= 32;
rand64 |= r2;
if ( fmt == FMT_HTML ) {
sb.safePrintf("<br>"
"<table border=0 cellpadding=5>"
// OBJECT search input box
"<form method=get action=/search>"
"<tr>"
"<td>"
"<b>Search Objects:</b>"
"</td><td>"
"<input type=text name=q size=50>"
// site clustering off
"<input type=hidden name=sc value=0>"
// dup removal off
"<input type=hidden name=dr value=0>"
"<input type=hidden name=c value=\"%s\">"
"<input type=hidden name=rand value=%lli>"
// bypass ajax, searchbox, logo, etc.
"<input type=hidden name=id value=12345>"
// restrict search to json objects
"<input type=hidden name=prepend "
"value=\"type:json |\">"
" "
"<input type=submit name=submit value=OK>"
"</tr>"
"</form>"
// PAGE search input box
"<form method=get action=/search>"
"<tr>"
"<td>"
"<b>Search Pages:</b>"
"</td><td>"
"<input type=text name=q size=50>"
// site clustering off
"<input type=hidden name=sc value=0>"
// dup removal off
"<input type=hidden name=dr value=0>"
"<input type=hidden name=c value=\"%s\">"
"<input type=hidden name=rand value=%lli>"
// bypass ajax, searchbox, logo, etc.
"<input type=hidden name=id value=12345>"
// restrict search to NON json objects
"<input type=hidden "
"name=prepend value=\"-type:json |\">"
" "
"<input type=submit name=submit value=OK>"
"</tr>"
"</form>"
// add url input box
"<form method=get action=/crawlbot>"
"<tr>"
"<td>"
"<b>Add Seed Urls: </b>"
"</td><td>"
"<input type=text name=seeds size=50>"
"%s" // hidden tags
" "
"<input type=submit name=submit value=OK>"
//" &nbsp; &nbsp; <input type=checkbox "
//"name=spiderLinks value=1 "
//"checked>"
//" <i>crawl links on this page?</i>"
, cr->m_coll
, rand64
, cr->m_coll
, rand64
, hb.getBufStart() // hidden tags
);
}
if ( injectionResponse && fmt == FMT_HTML )
sb.safePrintf("<br><font size=-1>%s</font>\n"
,injectionResponse->getBufStart()
);
if ( fmt == FMT_HTML )
sb.safePrintf(//"<input type=hidden name=c value=\"%s\">"
//"<input type=hidden name=crawlbotapi value=1>"
"</td>"
"</tr>"
//"</form>"
"<tr>"
"<td><b>Add Spot URLs:</b></td>"
"<td>"
// this page will call
// printCrawlBotPage2(uploadResponse) to display it
//"<form method=post action=/crawlbot>"
//"<input type=file name=spots size=40>"
"<input type=text name=spots size=50> "
"<input type=submit name=submit value=OK>"
"%s" // hidden tags
//" &nbsp; &nbsp; <input type=checkbox "
//"name=spiderLinks value=1 "
//"checked>"
//" <i>crawl links on those pages?</i>"
"</form>"
"</td>"
"</tr>"
"</table>"
"<br>"
//, cr->m_coll
, hb.getBufStart()
);
//
// show stats
//
if ( fmt == FMT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
SafeBuf tmp;
long crawlStatus = -1;
getSpiderStatusMsg ( cr , &tmp , &crawlStatus );
CrawlInfo *ci = &cr->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
sb.safePrintf(
"<form method=get action=/crawlbot>"
"%s"
, hb.getBufStart() // hidden input token/name/..
);
sb.safePrintf("<TABLE border=0>"
"<TR><TD valign=top>"
"<table border=0 cellpadding=5>"
//
"<tr>"
"<td><b>Crawl Name:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Type:</td>"
"<td>%li</td>"
"</tr>"
//"<tr>"
//"<td><b>Collection Alias:</td>"
//"<td>%s%s</td>"
//"</tr>"
"<tr>"
"<td><b>Token:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Seeds:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Status:</td>"
"<td>%li</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Status Msg:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Rounds Completed:</td>"
"<td>%li</td>"
"</tr>"
"<tr>"
"<td><b>Has Urls Ready to Spider:</td>"
"<td>%li</td>"
"</tr>"
// this will have to be in crawlinfo too!
//"<tr>"
//"<td><b>pages indexed</b>"
//"<td>%lli</td>"
//"</tr>"
"<tr>"
"<td><b>Objects Found</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>URLs Harvested</b> (inc. dups)</td>"
"<td>%lli</td>"
"</tr>"
//"<tr>"
//"<td><b>URLs Examined</b></td>"
//"<td>%lli</td>"
//"</tr>"
"<tr>"
"<td><b>Page Crawl Attempts</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Crawl Successes</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Crawl Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Attempts</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
, (long)cr->m_isCustomCrawl
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
, cr->m_globalCrawlInfo.m_objectsAdded -
cr->m_globalCrawlInfo.m_objectsDeleted
, cr->m_globalCrawlInfo.m_urlsHarvested
//, cr->m_globalCrawlInfo.m_urlsConsidered
, cr->m_globalCrawlInfo.m_pageDownloadAttempts
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cr->m_globalCrawlInfo.m_pageProcessAttempts
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
, cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
long now = getTimeGlobalNoCore();
sb.safePrintf("<tr>"
"<td><b>Download Objects:</b> "
"</td><td>"
"<a href=/crawlbot/download/%s_data.csv>"
"csv</a>"
" &nbsp; "
"<a href=/crawlbot/download/%s_data.json>"
"json full dump</a>"
" &nbsp; "
, cr->m_coll
, cr->m_coll
);
sb.safePrintf(
// newest json on top of results
"<a href=/search?icc=1&format=json&sc=0&dr=0&"
"c=%s&n=10000000&rand=%llu&scores=0&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
">"
"json full search (newest on top)</a>"
" &nbsp; "
// newest json on top of results, last 30 seconds
"<a href=/search?icc=1&format=json&"
// disable site clustering
"sc=0&"
// dodupcontentremoval:
"dr=1&"
"c=%s&n=10000000&rand=%llu&scores=0&id=1&"
"stream=1&" // stream results back as we get them
"q="
// put NEWEST on top
"gbsortbyint%%3Agbspiderdate+"
// min spider date = now - 30 seconds
"gbminint%%3Agbspiderdate%%3A%li&"
//"debug=1"
"prepend=type%%3Ajson"
">"
"json search (last 30 seconds)</a>"
"</td>"
"</tr>"
// json search with gbsortby:gbspiderdate
, cr->m_coll
, rand64
// json search with gbmin:gbspiderdate
, cr->m_coll
, rand64
, now - 30 // last 30 seconds
);
sb.safePrintf (
"<tr>"
"<td><b>Download Products:</b> "
"</td><td>"
// make it search.csv so excel opens it
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
"c=%s&n=10000000&rand=%llu&scores=0&id=1&"
"q=gbrevsortby%%3Aproduct.offerPrice&"
"prepend=type%%3Ajson"
//"+type%%3Aproduct%%7C"
">"
"csv</a>"
" &nbsp; "
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
"c=%s&n=10000000&rand=%llu&scores=0&id=1&"
"q=gbrevsortby%%3Aproduct.offerPrice&"
"prepend=type%%3Ajson"
">"
"html</a>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Download Urls:</b> "
"</td><td>"
"<a href=/crawlbot/download/%s_urls.csv>"
"csv</a>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Latest Objects:</b> "
"</td><td>"
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
"c=%s&n=10&rand=%llu&scores=0&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
">"
"csv</a>"
" &nbsp; "
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
"c=%s&n=10rand=%llu&scores=0&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
">"
"html</a>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Latest Products:</b> "
"</td><td>"
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
"c=%s&n=10&rand=%llu&scores=0&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson+type%%3Aproduct"
">"
"csv</a>"
" &nbsp; "
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
"c=%s&n=10&rand=%llu&scores=0&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson+type%%3Aproduct"
">"
"html</a>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Download Pages:</b> "
"</td><td>"
"<a href=/crawlbot/download/%s_pages.txt>"
"txt</a>"
//
"</td>"
"</tr>"
"</table>"
"</TD>"
// download products html
, cr->m_coll
, rand64
, cr->m_coll
, rand64
//, cr->m_coll
//, cr->m_coll
//, cr->m_coll
, cr->m_coll
// latest objects in html
, cr->m_coll
, rand64
// latest objects in csv
, cr->m_coll
, rand64
// latest products in html
, cr->m_coll
, rand64
// latest products in csv
, cr->m_coll
, rand64
// download pages
, cr->m_coll
);
// spacer column
sb.safePrintf("<TD>"
"&nbsp;&nbsp;&nbsp;&nbsp;"
"&nbsp;&nbsp;&nbsp;&nbsp;"
"</TD>"
);
// what diffbot api to use?
/*
char *api = cr->m_diffbotApi.getBufStart();
char *s[10];
for ( long i = 0 ; i < 10 ; i++ ) s[i] = "";
if ( api && strcmp(api,"all") == 0 ) s[0] = " selected";
if ( api && strcmp(api,"article") == 0 ) s[1] = " selected";
if ( api && strcmp(api,"product") == 0 ) s[2] = " selected";
if ( api && strcmp(api,"image") == 0 ) s[3] = " selected";
if ( api && strcmp(api,"frontpage") == 0 ) s[4] = " selected";
if ( api && strcmp(api,"none") == 0 ) s[5] = " selected";
if ( ! api || ! api[0] ) s[5] = " selected";
*/
sb.safePrintf( "<TD valign=top>"
"<table cellpadding=5 border=0>"
/*
"<tr>"
"<td>"
"Diffbot API"
"</td><td>"
"<select name=diffbotapi>"
"<option value=all%s>All</option>"
"<option value=article%s>Article</option>"
"<option value=product%s>Product</option>"
"<option value=image%s>Image</option>"
"<option value=frontpage%s>FrontPage</option>"
"<option value=none%s>None</option>"
"</select>"
"</td>"
"</tr>"
, s[0]
, s[1]
, s[2]
, s[3]
, s[4]
, s[5]
*/
);
//char *alias = "";
//if ( cr->m_collectionNameAlias.length() > 0 )
// alias = cr->m_collectionNameAlias.getBufStart();
//char *aliasResponse = "";
//if ( alias && ! isAliasUnique(cr,token,alias) )
// aliasResponse = "<br><font size=1 color=red>"
// "Alias not unique</font>";
char *urtYes = " checked";
char *urtNo = "";
if ( ! cr->m_useRobotsTxt ) {
urtYes = "";
urtNo = " checked";
}
char *rdomYes = " checked";
char *rdomNo = "";
if ( ! cr->m_restrictDomain ) {
rdomYes = "";
rdomNo = " checked";
}
char *isNewYes = "";
char *isNewNo = " checked";
if ( cr->m_diffbotOnlyProcessIfNewUrl ) {
isNewYes = " checked";
isNewNo = "";
}
char *api = cr->m_diffbotApiUrl.getBufStart();
if ( ! api ) api = "";
SafeBuf apiUrl;
apiUrl.htmlEncode ( api , gbstrlen(api), true , 0 );
apiUrl.nullTerm();
char *px1 = cr->m_diffbotUrlCrawlPattern.getBufStart();
if ( ! px1 ) px1 = "";
SafeBuf ppp1;
ppp1.htmlEncode ( px1 , gbstrlen(px1) , true , 0 );
ppp1.nullTerm();
char *px2 = cr->m_diffbotUrlProcessPattern.getBufStart();
if ( ! px2 ) px2 = "";
SafeBuf ppp2;
ppp2.htmlEncode ( px2 , gbstrlen(px2) , true , 0 );
ppp2.nullTerm();
char *px3 = cr->m_diffbotPageProcessPattern.getBufStart();
if ( ! px3 ) px3 = "";
SafeBuf ppp3;
ppp3.htmlEncode ( px3 , gbstrlen(px3) , true , 0 );
ppp3.nullTerm();
char *rx1 = cr->m_diffbotUrlCrawlRegEx.getBufStart();
if ( ! rx1 ) rx1 = "";
SafeBuf rrr1;
rrr1.htmlEncode ( rx1 , gbstrlen(rx1), true , 0 );
char *rx2 = cr->m_diffbotUrlProcessRegEx.getBufStart();
if ( ! rx2 ) rx2 = "";
SafeBuf rrr2;
rrr2.htmlEncode ( rx2 , gbstrlen(rx2), true , 0 );
char *notifEmail = cr->m_notifyEmail.getBufStart();
char *notifUrl = cr->m_notifyUrl.getBufStart();
if ( ! notifEmail ) notifEmail = "";
if ( ! notifUrl ) notifUrl = "";
sb.safePrintf(
//
//
"<tr>"
"<td><b>Repeat Crawl:</b> "
"</td><td>"
"<input type=text name=repeat "
"size=10 value=\"%f\"> "
"<input type=submit name=submit value=OK>"
" days"
"</td>"
"</tr>"
"<tr>"
"<td><b>Diffbot API Url:</b> "
"</td><td>"
"<input type=text name=apiUrl "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Url Crawl Pattern:</b> "
"</td><td>"
"<input type=text name=urlCrawlPattern "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Url Process Pattern:</b> "
"</td><td>"
"<input type=text name=urlProcessPattern "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Pattern:</b> "
"</td><td>"
"<input type=text name=pageProcessPattern "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Url Crawl RegEx:</b> "
"</td><td>"
"<input type=text name=urlCrawlRegEx "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Url Process RegEx:</b> "
"</td><td>"
"<input type=text name=urlProcessRegEx "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Only Process If New:</b> "
"</td><td>"
"<input type=radio name=onlyProcessIfNew "
"value=1%s> yes &nbsp; "
"<input type=radio name=onlyProcessIfNew "
"value=0%s> no &nbsp; "
"</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Delay (seconds):</b> "
"</td><td>"
"<input type=text name=crawlDelay "
"size=9 value=%f> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Max Page Crawl Successes:</b> "
"</td><td>"
"<input type=text name=maxToCrawl "
"size=9 value=%lli> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Max Page Process Successes:</b>"
"</td><td>"
"<input type=text name=maxToProcess "
"size=9 value=%lli> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Max Rounds:</b>"
"</td><td>"
"<input type=text name=maxRounds "
"size=9 value=%li> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Notification Email:</b>"
"</td><td>"
"<input type=text name=notifyEmail "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Notification URL:</b>"
"</td><td>"
"<input type=text name=notifyWebhook "
"size=20 value=\"%s\"> "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>"
"<tr><td>"
"<b>Use Robots.txt when crawling?</b> "
"</td><td>"
"<input type=radio name=obeyRobots "
"value=1%s> yes &nbsp; "
"<input type=radio name=obeyRobots "
"value=0%s> no &nbsp; "
"</td>"
"</tr>"
"<tr><td>"
"<b>Restrict domain to seeds?</b> "
"</td><td>"
"<input type=radio name=restrictDomain "
"value=1%s> yes &nbsp; "
"<input type=radio name=restrictDomain "
"value=0%s> no &nbsp; "
"</td>"
"</tr>"
//"<tr><td>"
//"Use spider proxies on AWS? "
//"</td><td>"
//"<input type=checkbox name=usefloaters checked>
//"</td>"
//"</tr>"
"</table>"
"</TD>"
"</TR>"
"</TABLE>"
, cr->m_collectiveRespiderFrequency
, apiUrl.getBufStart()
, ppp1.getBufStart()
, ppp2.getBufStart()
, ppp3.getBufStart()
, rrr1.getBufStart()
, rrr2.getBufStart()
, isNewYes
, isNewNo
, cr->m_collectiveCrawlDelay
, cr->m_maxToCrawl
, cr->m_maxToProcess
, (long)cr->m_maxCrawlRounds
, notifEmail
, notifUrl
, urtYes
, urtNo
, rdomYes
, rdomNo
);
}
// xml or json does not show the input boxes
//if ( format != FMT_HTML )
// return g_httpServer.sendDynamicPage ( s,
// sb.getBufStart(),
// sb.length(),
// -1 ); // cachetime
//
// print url filters. use "multimedia" to handle jpg etc.
//
// use "notindexable" for images/movies/css etc.
// add a "process" column to send to diffbot...
//
//
/*
char *s1 = "Show";
char *s2 = "none";
if ( hr->getLongFromCookie("showtable",0) ) {
s1 = "Hide";
s2 = "";
}
if ( fmt == FMT_HTML )
sb.safePrintf(
"<a onclick="
"\""
"var e = document.getElementById('filters');"
"var m = document.getElementById('msg');"
"if ( e.style.display == 'none' ){"
"e.style.display = '';"
"m.innerHTML='Hide URL Filters Table';"
"document.cookie = 'showtable=1;';"
"}"
"else {"
"e.style.display = 'none';"
"m.innerHTML='Show URL Filters Table';"
"document.cookie = 'showtable=0;';"
"}"
"\""
" "
"style="
"cursor:hand;"
"cursor:pointer;"
"color:blue;>"
"<u><b>"
"<div id=msg>"
"%s URL Filters Table"
"</div>"
"</b></u>"
"</a>"
"<div id=filters style=display:%s;>"
"<form method=get action=/crawlbot>"
"<input type=hidden name=c value=\"%s\">"
"<input type=hidden name=showtable value=1>"
, s1
, s2
, cr->m_coll
);
//
// print url filters. HACKy...
//
if ( fmt == FMT_HTML )
g_parms.sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
NULL ,
&sb ,
cr->m_coll, // coll override
false ); // isJSON?
//
// end HACKy hack
//
if ( fmt == FMT_HTML )
sb.safePrintf(
"</form>"
"</div>"
"<br>"
"<br>"
);
*/
//
// add search box to your site
//
/*
sb.safePrintf("<br>"
"<table>"
"<tr>"
"<td><a onclick=unhide();>"
"Add this search box to your site"
"</a>"
"</td>"
"</tr>"
"</table>");
*/
//
// show simpler url filters table
//
if ( fmt == FMT_HTML ) {
/*
sb.safePrintf ( "<table>"
"<tr><td colspan=2>"
"<b>URL Filters</b>"
"</td></tr>\n"
);
// true means its html input
printUrlFilters ( sb , cr , fmt );
// for adding new rule
sb.safePrintf("<tr>"
"<td>Expression "
"<input type=text name=expression size=30 "
"value=\"\"> "
"</td><td>"
"Action <input type=text name=action size=50 "
"value=\"\">"
" "
"<input type=submit name=submit value=OK>"
"</td>"
"</tr>\n"
);
//sb.safePrintf("<tr><td colspan=2><font size=-1><i>U
sb.safePrintf("</table>\n");
*/
//
// END THE BIG FORM
//
sb.safePrintf("</form>");
}
//
// show reset and delete crawl buttons
//
if ( fmt == FMT_HTML ) {
sb.safePrintf(
"<table cellpadding=5>"
"<tr>"
"<td>"
// reset collection form
"<form method=get action=/crawlbot>"
"%s" // hidden tags
, hb.getBufStart()
);
sb.safePrintf(
"<input type=hidden name=reset value=1>"
// also show it in the display, so set "c"
"<input type=submit name=button value=\""
"Reset this collection\">"
"</form>"
// end reset collection form
"</td>"
"<td>"
// delete collection form
"<form method=get action=/crawlbot>"
"%s"
//, (long)cr->m_collnum
, hb.getBufStart()
);
sb.safePrintf(
"<input type=hidden name=delete value=1>"
"<input type=submit name=button value=\""
"Delete this collection\">"
"</form>"
// end delete collection form
"</td>"
// restart collection form
"<td>"
"<form method=get action=/crawlbot>"
"%s"
"<input type=hidden name=restart value=1>"
"<input type=submit name=button value=\""
"Restart this collection\">"
"</form>"
"</td>"
"</tr>"
"</table>"
//, (long)cr->m_collnum
, hb.getBufStart()
//, (long)cr->m_collnum
);
}
// the ROOT JSON }
if ( fmt == FMT_JSON )
sb.safePrintf("}\n");
char *ct = "text/html";
if ( fmt == FMT_JSON ) ct = "application/json";
if ( fmt == FMT_XML ) ct = "text/xml";
if ( fmt == FMT_CSV ) ct = "text/csv";
// this could be in html json or xml
return g_httpServer.sendDynamicPage ( socket,
sb.getBufStart(),
sb.length(),
-1 , // cachetime
false ,
ct );
/*
"<h1>API for Diffbot</h1>"
"<form action=/api/diffbot>"
"<input type=text name=url size=100>"
"<input type=submit name=inject value=\"Inject\">"
"</form>"
"<br>"
"<h1>API for Crawlbot</h1>"
// "<form id=\"addCrawl\" onSubmit=\"addCrawlFromForm(); return false;\">"
"<form action=/api/startcrawl method=get>"
"<div class=\"control-group well\">"
"<div id=\"apiSelection\" class=\"titleColumn\">"
"<div class=\"row \">"
"Token: <input type=text name=token><br><br>"
"API: <input type=text name=api> <i>(article, product)</i><br><br>"
"<div class=\"span2\"><label class=\"on-default-hide\">Page-type</label></div>"
"<div class=\"input-append span7\">"
"<select id=\"apiSelect\" name=\"api\" class=\"span2\" value=\"sds\">"
"<option value=\"\" disabled=\"disabled\" selected=\"selected\">Select pages to process and extract</option>"
"<option class=\"automatic\" value=\"article\">Article</option>"
"<option class=\"automatic\" value=\"frontpage\">Frontpage</option>"
"<option class=\"automatic\" value=\"image\">Image</option>"
"<option class=\"automatic\" value=\"product\">Product</option>"
"</select>"
"<span id=\"formError-apiSelect\" class=\"formError\">Page-type is required</span>"
"<span class=\"inputNote\">API calls will be made using your current token.</span>"
"</div>"
"</div>"
"</div>"
"<div id=\"apiQueryString\" class=\"titleColumn\">"
"<div class=\"row \">"
"<div class=\"span2\"><label class=\"on-default-hide\">API Querystring</label></div>"
"<div class=\"input-prepend span7\">"
"<span class=\"add-on\">?</span><input class=\"span6 search-input\" name=\"apiQueryString\" size=\"16\" type=\"text\" placeholder=\"Enter a querystring to specify Diffbot API parameters\">"
"</div>"
"</div>"
"</div>"
"<hr>"
"<div id=\"seedUrl\" class=\"titleColumn\">"
"<div class=\"row \">"
"<div class=\"span2\"><label class=\"on-default-hide\">Seed URL</label></div>"
"<div class=\"input-append span7\">"
"<input class=\"span6 search-input\" name=\"seed\" size=\"16\" type=\"text\" placeholder=\"Enter a seed URL\">"
"<span id=\"formError-seedUrl\" class=\"formError\"><br>Seed URL is required</span>"
"</div>"
"</div>"
"</div>"
"<hr>"
"<div id=\"headerRow\" class=\"titleColumn\">"
"<div class=\"row \">"
"<div class=\"span2\"><label class=\"on-default-hide\"><strong>Crawl Filters</strong></label></div>"
"</div>"
"</div>"
"<div id=\"urlCrawlPattern\" class=\"titleColumn\">"
"<div class=\"regex-edit row \">"
"<div class=\"span2\"><label class=\"on-default-hide\">URL Regex</label></div>"
"<div class=\"input-append span7\">"
"<input class=\"span6\" name=\"urlCrawlPattern\" size=\"16\" type=\"text\" placeholder=\"Only crawl pages whose URLs match this regex\" value=\"\">"
"<span class=\"inputNote\">Diffbot uses <a href=\"http://www.regular-expressions.info/refflavors.html\" target=\"_blank\">Java regex syntax</a>. Be sure to escape your characters.</span>"
"</div>"
"</div>"
"</div>"
"<div id=\"maxCrawled\" class=\"titleColumn\">"
"<div class=\"regex-edit row \"><div class=\"span2\"><label class=\"on-default-hide\">Max Pages Crawled</label></div> <div class=\"input-append span7\"> <input class=\"span1\" name=\"maxCrawled\" size=\"\" type=\"text\" value=\"\"> </div> </div> </div> <div id=\"headerRow\" class=\"titleColumn\"> <div class=\"row \"> <div class=\"span2\"><label class=\"on-default-hide\"><strong>Processing Filters</strong></label></div> </div> </div> <div id=\"classify\" class=\"titleColumn\"> <div class=\"row\"> <div class=\"span2\" id=\"smartProcessLabel\"><label class=\"on-default-hide\">Smart Processing</label></div> <div class=\"span7\"><label class=\"checkbox\"><input id=\"smartProcessing\" type=\"checkbox\" name=\"classify\"><span id=\"smartProcessAutomatic\">Only process pages that match the selected page-type. Uses <a href=\"/our-apis/classifier\">Page Classifier API</a>.</span><span id=\"smartProcessCustom\">Smart Processing only operates with Diffbot <a href=\"/products/automatic\">Automatic APIs.</a></span></label></div> </div> </div> <div id=\"urlProcessPattern\" class=\"titleColumn\"> <div class=\"regex-edit row \"> <div class=\"span2\"><label class=\"on-default-hide\">URL Regex</label></div> <div class=\"input-append span7\"> <input class=\"span6\" name=\"urlProcessPattern\" size=\"16\" type=\"text\" placeholder=\"Only process pages whose URLs match this regex\" value=\"\"> </div> </div> </div> <div id=\"pageProcessPattern\" class=\"titleColumn\"> <div class=\"regex-edit row \"> <div class=\"span2\"><label class=\"on-default-hide\">Page-Content Regex</label></div> <div class=\"input-append span7\"> <input class=\"span6\" name=\"pageProcessPattern\" size=\"16\" type=\"text\" placeholder=\"Only process pages whose content contains a match to this regex\" value=\"\"> </div> </div> </div> <div id=\"maxMatches\" class=\"titleColumn\"> <div class=\"regex-edit row \"> <div class=\"span2\"><label class=\"on-default-hide\">Max Pages Processed</label></div> <div class=\"input-append span7\"> <input class=\"span1\" name=\"maxProcessed\" size=\"16\" type=\"text\" value=\"\"> </div> </div> </div> <hr> <div class=\"controls row\"> <div class=\"span2\">&nbsp;</div> <div class=\"span7\" id=\"startCrawlButtons\"> <button id=\"testButton\" class=\"btn\" type=\"button\" onclick=\"testcrawl(formToData());clicky.log('/dev/crawl#testCrawl','Test Crawl');\">Test</button> "
"<!--<button id=\"submitButton\" class=\"btn btn-info\" type=\"button\" onclick=\"addCrawlFromForm()\" >Start Crawl</button>-->"
"<input type=submit name=start value=\"Start Crawl\">"
" </div> </div> </div> <div id=\"hiddenTestDiv\" style=\"display: none;\"></div> </form> </div><!-- end Crawler tab -->" );
*/
}
// . do not add dups into m_diffbotSeeds safebuf
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
long isInSeedBuf ( CollectionRec *cr , Url *url ) {
HashTableX *ht = &cr->m_seedHashTable;
// if table is empty, populate it
if ( ht->m_numSlotsUsed <= 0 ) {
// initialize the hash table
if ( ! ht->set(8,0,1024,NULL,0,false,1,"seedtbl") )
return -1;
// populate it from list of seed urls
char *p = cr->m_diffbotSeeds.getBufStart();
for ( ; p && *p ; ) {
// get url
char *purl = p;
// advance to next
for ( ; *p && !is_wspace_a(*p) ; p++ );
// mark the end
char *end = p;
// skip possible white space. might be \0.
if ( *p ) p++;
// hash it
long long h64 = hash64 ( purl , end-purl );
if ( ! ht->addKey ( &h64 ) ) return -1;
}
}
// is this url in the hash table?
long long u64 = hash64 ( url->getUrl() , url->getUrlLen() );
if ( ht->isInTable ( &u64 ) ) return 1;
// add it to hashtable
if ( ! ht->addKey ( &u64 ) ) return -1;
// WAS not in table
return 0;
}
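// Illustrative usage of isInSeedBuf() above (hypothetical sketch, not
// called anywhere in the code):
//
//   Url u;
//   u.set ( "http://example.com/" , 19 );
//   long s1 = isInSeedBuf ( cr , &u ); // 0 = not in table, now added
//   long s2 = isInSeedBuf ( cr , &u ); // 1 = already in table, skip
//
// so the caller appends to cr->m_diffbotSeeds only when 0 is returned.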
// just use "fakeips" based on the hash of each url hostname/subdomain
// so we don't waste time doing ip lookups.
bool getSpiderRequestMetaList ( char *doc ,
SafeBuf *listBuf ,
bool spiderLinks ,
CollectionRec *cr ) {
if ( ! doc ) return true;
// . scan the list of urls
// . assume separated by white space \n \t or space
char *p = doc;
long now = getTimeGlobal();
// a big loop
while ( true ) {
// skip white space (\0 is not a whitespace)
for ( ; is_wspace_a(*p) ; p++ );
// all done?
if ( ! *p ) break;
// save it
char *saved = p;
// advance to next white space
for ( ; ! is_wspace_a(*p) && *p ; p++ );
// set end
char *end = p;
// get that url
Url url;
url.set ( saved , end - saved );
// if not legit skip
if ( url.getUrlLen() <= 0 ) continue;
// need this
long long probDocId = g_titledb.getProbableDocId(&url);
// make it
SpiderRequest sreq;
sreq.reset();
sreq.m_firstIp = url.getHostHash32(); // fakeip!
// avoid ips of 0 or -1
if ( sreq.m_firstIp == 0 || sreq.m_firstIp == -1 )
sreq.m_firstIp = 1;
sreq.m_hostHash32 = url.getHostHash32();
sreq.m_domHash32 = url.getDomainHash32();
sreq.m_siteHash32 = url.getHostHash32();
sreq.m_probDocId = probDocId;
sreq.m_hopCount = 0; // we're a seed
sreq.m_hopCountValid = true;
sreq.m_addedTime = now;
sreq.m_isNewOutlink = 1;
sreq.m_isWWWSubdomain = url.isSimpleSubdomain();
// treat seed urls as being on same domain and hostname
sreq.m_sameDom = 1;
sreq.m_sameHost = 1;
sreq.m_sameSite = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_isAddUrl = 1;
// spider links?
if ( ! spiderLinks )
sreq.m_avoidSpiderLinks = 1;
// save the url!
strcpy ( sreq.m_url , url.getUrl() );
// finally, we can set the key. isDel = false
sreq.setKey ( sreq.m_firstIp , probDocId , false );
if ( ! listBuf->reserve ( 100 + sreq.getRecSize() ) )
// return false with g_errno set
return false;
// store rdbid first
if ( ! listBuf->pushChar(RDB_SPIDERDB) )
// return false with g_errno set
return false;
// store it
if ( ! listBuf->safeMemcpy ( &sreq , sreq.getRecSize() ) )
// return false with g_errno set
return false;
if ( ! cr ) continue;
// do not add dups into m_diffbotSeeds safebuf
long status = isInSeedBuf ( cr , &url );
// error?
if ( status == -1 ) {
log ( "crawlbot: error adding seed to table: %s",
mstrerror(g_errno) );
return true;
}
// already in buf
if ( status == 1 ) continue;
// add url into m_diffbotSeeds, \n separated list
if ( cr->m_diffbotSeeds.length() )
// use a space, not \n, so it reads better in the
// json output
cr->m_diffbotSeeds.pushChar(' '); // \n
cr->m_diffbotSeeds.safeMemcpy (url.getUrl(), url.getUrlLen());
cr->m_diffbotSeeds.nullTerm();
}
// all done
return true;
}
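// Minimal sketch of the "fake ip" trick used in
// getSpiderRequestMetaList() above (illustration only, not called
// anywhere): the 32-bit host hash stands in for the first ip so no
// dns lookup is needed, with 0 and -1 reserved as invalid values:
//
//   long fakeFirstIp ( Url *u ) {
//           long ip = u->getHostHash32();
//           if ( ip == 0 || ip == -1 ) ip = 1;
//           return ip;
//   }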
/*
bool isAliasUnique ( CollectionRec *cr , char *token , char *alias ) {
// scan all collections
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cx = g_collectiondb.m_recs[i];
if ( ! cx ) continue;
// must belong to us
if ( strcmp(cx->m_diffbotToken.getBufStart(),token) )
continue;
// skip if collection we are putting alias on
if ( cx == cr ) continue;
// does it match?
if ( cx->m_collectionNameAlias.length() <= 0 ) continue;
// return false if it matches! not unique
if ( strcmp ( cx->m_collectionNameAlias.getBufStart() ,
alias ) == 0 )
return false;
}
return true;
}
*/
// json can be provided via get or post but content type must be
// url-encoded so we can test with a simple html form page.
/*
bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
HttpRequest *hr ,
CollectionRec *cr ) {
// get the json
char *json = hr->getString("json");
if ( ! json )
return sendReply2 ( socket,
FMT_JSON,
"No &json= provided in request.");
Json JP;
bool status = JP.parseJsonStringIntoJsonItems ( json );
// wtf?
if ( ! status )
return sendReply2 ( socket, FMT_JSON,
"Error with JSON parser.");
// error adding it?
if ( ! cr )
return sendReply2 ( socket,FMT_JSON,
"Failed to create new collection.");
ji = JP.getFirstItem();
char *seed = NULL;
// traverse the json
for ( ; ji ; ji = ji->m_next ) {
// just get STRINGS or NUMS
if ( ji->m_type != JT_STRING && ji->m_type != JT_NUMBER )
continue;
// check name
char *name = ji->m_name;
char *val = ji->getValue();
if ( strcmp(name,"seed") == 0 )
seed = val;
if ( strcmp(name,"email") == 0 )
cr->m_notifyEmail.set(val);
if ( strcmp(name,"webhook") == 0 )
cr->m_notifyUrl.set(val);
if ( strcmp(name,"frequency") == 0 )
cr->m_collectiveRespiderFrequency = atof(val);
if ( strcmp(name,"maxToCrawl") == 0 )
cr->m_maxToCrawl = atoll(val);
if ( strcmp(name,"maxToProcess") == 0 )
cr->m_maxToProcess = atoll(val);
if ( strcmp(name,"pageProcessPattern") == 0 )
cr->m_diffbotPageProcessPattern.set(val);
if ( strcmp(name,"obeyRobots") == 0 ) {
if ( val[0]=='t' || val[0]=='T' || val[0]==1 )
cr->m_useRobotsTxt = true;
else
cr->m_useRobotsTxt = false;
}
if ( strcmp(name,"onlyProcessNew") == 0 ) {
if ( val[0]=='t' || val[0]=='T' || val[0]==1 )
cr->m_diffbotOnlyProcessIfNew = true;
else
cr->m_diffbotOnlyProcessIfNew = false;
}
if ( strcmp(name,"pauseCrawl") == 0 ) {
if ( val[0]=='t' || val[0]=='T' || val[0]==1 )
cr->m_spideringEnabled = 0;
else
cr->m_spideringEnabled = 1;
}
}
// set collective respider in case just that was passed
for ( long i =0 ; i < MAX_FILTERS ; i++ )
cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency;
// if url filters not specified, we are done
if ( ! JP.getItem("urlFilters") )
return true;
// reset the url filters here to the default set.
// we will append the client's filters below them below.
resetUrlFilters ( cr );
char *expression = NULL;
char *action = NULL;
// start over at top
ji = JP.getFirstItem();
// "urlFilters": [
// {
// "value": "*", // MDW - this matches all urls! ("default")
// "action": "http://www.diffbot.com/api/analyze?mode=auto"
// }
// {
// "value": "company",
// "action" : "http://www.diffbot.com/api/article?tags&meta"
// }
// {
// "value": "^http://www",
// "action": "doNotProcess"
// }
// {
// "value": "$.html && category",
// "action": "doNotCrawl"
// }
// {
// "value": "!$.html && $.php",
// "action": "doNotCrawl"
// }
// ]
// how many filters do we have so far?
long nf = cr->m_numRegExs;
for ( ; ji ; ji = ji->m_next ) {
// just get STRINGS only
if ( ji->m_type != JT_STRING ) continue;
// must be right now
char *name = ji->m_name;
char *value = ji->getValue();
if ( strcmp(name,"value")==0 )
expression = value;
if ( strcmp(name,"action")==0 )
action = ji->getValue();
// need both
if ( ! action ) continue;
if ( ! expression ) continue;
// they use "*" instead of "default" so put that back
if ( expression[0] == '*' )
expression = "default";
// deal with it
cr->m_regExs[1].set(expression);
cr->m_numRegExs++;
long priority = 50;
// default diffbot api call:
char *api = NULL;
if ( strcasecmp(action,"donotcrawl") == 0 )
priority = SPIDER_PRIORITY_FILTERED;
//if ( strcasecmp(action,"donotprocess") == 0 )
// api = NULL;
// a new diffbot url?
if ( strcasecmp(action,"http") == 0 )
api = action;
// add the new filter
cr->m_regExs [nf].set(expression);
cr->m_spiderPriorities [nf] = priority;
cr->m_spiderDiffbotApiUrl[nf].set(api);
nf++;
// add a mirror of that filter but for manually added,
// i.e. injected or via add url,
if ( priority < 0 ) continue;
// make the priority higher!
cr->m_regExs[nf].safePrintf("ismanualadd && %s",expression);
cr->m_spiderPriorities [nf] = 70;
cr->m_spiderDiffbotApiUrl[nf].set(api); // appends \0
nf++;
// NULL out again
action = NULL;
expression = NULL;
if ( nf < MAX_FILTERS ) continue;
log("crawlbot: too many url filters!");
break;
}
// update the counts
cr->m_numRegExs = nf;
cr->m_numRegExs2 = nf;
cr->m_numRegExs3 = nf;
cr->m_numRegExs10 = nf;
cr->m_numRegExs5 = nf;
cr->m_numRegExs6 = nf;
cr->m_numRegExs7 = nf;
cr->m_numRegExs11 = nf;
// set collective respider
for ( long i =0 ; i < nf ; i++ )
cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency;
return true;
}
*/
/*
THIS IS NOW AUTOMATIC from new Parms.cpp broadcast logic
bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
HttpRequest *hr ,
CollectionRec *cr ) {
// update the url filters for now since that is complicated
// supply "cr" directly since "c" may not be in the http
// request if addcoll=xxxxxx (just created a new rec)
//long page = PAGE_FILTERS;
//WebPage *pg = g_pages.getPage ( page ) ;
//g_parms.setFromRequest ( hr , socket , pg->m_function, cr );
bool rebuild = false;
//
// set other diffbot parms for this collection
//
long maxToCrawl = hr->getLongLong("maxToCrawl",-1LL);
if ( maxToCrawl == -1 )
maxToCrawl = hr->getLongLong("maxToDownload",-1LL);
if ( maxToCrawl != -1 ) {
cr->m_maxToCrawl = maxToCrawl;
cr->m_needsSave = 1;
}
long maxToProcess = hr->getLongLong("maxToProcess",-1LL);
if ( maxToProcess != -1 ) {
cr->m_maxToProcess = maxToProcess;
cr->m_needsSave = 1;
}
// -1 means no max, so use -2 as default here
long maxCrawlRounds = hr->getLongLong("maxCrawlRounds",-2LL);
if ( maxCrawlRounds == -2 )
maxCrawlRounds = hr->getLongLong("maxRounds",-2LL);
if ( maxCrawlRounds != -2 ) {
cr->m_maxCrawlRounds = maxCrawlRounds;
cr->m_needsSave = 1;
}
char *email = hr->getString("notifyEmail",NULL,NULL);
if ( email ) {
cr->m_notifyEmail.set(email);
cr->m_needsSave = 1;
}
char *url = hr->getString("notifyWebHook",NULL,NULL);
if ( ! url ) url = hr->getString("notifyWebhook",NULL,NULL);
if ( url ) {
// assume url is invalid, purge it
cr->m_notifyUrl.purge();
// normalize
Url norm;
norm.set ( url );
if ( norm.getDomainLen() > 0 &&
norm.getHostLen() > 0 )
// set the ssafebuf to it. will \0 terminate it.
cr->m_notifyUrl.set(norm.getUrl());
// save the collection rec
cr->m_needsSave = 1;
}
long pause = hr->getLong("pauseCrawl",-1);
// /v2/bulk api support
if ( pause == -1 ) pause = hr->getLong("pause",-1);
if ( pause == 0 ) { cr->m_needsSave = 1; cr->m_spideringEnabled = 1; }
if ( pause == 1 ) { cr->m_needsSave = 1; cr->m_spideringEnabled = 0; }
long obeyRobots = hr->getLong("obeyRobots",-1);
if ( obeyRobots == -1 ) obeyRobots = hr->getLong("robots",-1);
if ( obeyRobots != -1 ) {
cr->m_useRobotsTxt = obeyRobots;
cr->m_needsSave = 1;
}
long restrictDomain = hr->getLong("restrictDomain",-1);
if ( restrictDomain != -1 ) {
cr->m_restrictDomain = restrictDomain;
cr->m_needsSave = 1;
rebuild = true;
}
char *api = hr->getString("apiUrl",NULL);
if ( api ) {
cr->m_diffbotApiUrl.set(api);
cr->m_needsSave = 1;
}
char *ppp1 = hr->getString("urlCrawlPattern",NULL);
if ( ppp1 ) {
cr->m_diffbotUrlCrawlPattern.set(ppp1);
cr->m_needsSave = 1;
rebuild = true;
}
char *ppp2 = hr->getString("urlProcessPattern",NULL);
if ( ppp2 ) {
cr->m_diffbotUrlProcessPattern.set(ppp2);
cr->m_needsSave = 1;
}
char *ppp3 = hr->getString("pageProcessPattern",NULL);
if ( ppp3 ) {
cr->m_diffbotPageProcessPattern.set(ppp3);
cr->m_needsSave = 1;
}
// reg ex support
char *rx1 = hr->getString("urlCrawlRegEx",NULL);
// clear what we had
if ( rx1 && cr->m_hasucr ) {
regfree ( &cr->m_ucr );
cr->m_hasucr = false;
cr->m_diffbotUrlCrawlRegEx.purge();
cr->m_needsSave = 1;
rebuild = true;
}
// add a new one if not blank
if ( rx1 && rx1[0] ) {
cr->m_diffbotUrlCrawlRegEx.set(rx1);
cr->m_needsSave = 1;
// this will store the compiled regular expression into ucr
if ( regcomp ( &cr->m_ucr ,
// the regular expression to compile
rx1 ,
// some flags
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
regfree ( &cr->m_ucr);
// should never fail!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx1,mstrerror(errno));
}
cr->m_hasucr = true;
}
char *rx2 = hr->getString("urlProcessRegEx",NULL);
// clear what we had
if ( rx2 && cr->m_hasupr ) {
regfree ( &cr->m_upr );
cr->m_hasupr = false;
cr->m_diffbotUrlProcessRegEx.purge();
cr->m_needsSave = 1;
}
// add a new one if not blank
if ( rx2 && rx2[0] ) {
cr->m_diffbotUrlProcessRegEx.set(rx2);
cr->m_needsSave = 1;
// this will store the compiled regular expression into upr
if ( regcomp ( &cr->m_upr ,
// the regular expression to compile
rx2 ,
// some flags
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
regfree ( &cr->m_upr);
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx2,mstrerror(errno));
}
cr->m_hasupr = true;
}
float respider = hr->getFloat("repeatJob",-1.0);
if ( respider == -1.0 ) respider = hr->getFloat("repeat",-1.0);
if ( respider == -1.0 ) respider = hr->getFloat("repeatCrawl",-1.0);
if ( respider >= 0.0 ) {
// if not 0, then change this by the delta
if ( cr->m_spiderRoundStartTime ) {
// convert from days into seconds
float rfOld = cr->m_collectiveRespiderFrequency;
float rfNew = respider;
// 86400 seconds in a day
long secondsOld = (long)(rfOld * 86400);
long secondsNew = (long)(rfNew * 86400);
// remove old one.
cr->m_spiderRoundStartTime -= secondsOld;
// add in new one
cr->m_spiderRoundStartTime += secondsNew;
}
// if 0 that means NO recrawling
if ( respider == 0.0 ) {
cr->m_spiderRoundStartTime = 0;//getTimeGlobal();
}
cr->m_collectiveRespiderFrequency = respider;
cr->m_needsSave = 1;
}
float delay = hr->getFloat("crawlDelay",-1.0);
//long crawlWait = hr->getLong("wait",-1);
if ( delay >= 0.0 ) {
rebuild = true;
cr->m_collectiveCrawlDelay = delay;
}
long onlyProcessNew = hr->getLong("onlyProcessIfNew",-1);
if ( onlyProcessNew != -1 ) {
cr->m_diffbotOnlyProcessIfNew = onlyProcessNew;
cr->m_needsSave = 1;
}
// set collective respider
//for ( long i =0 ; i < cr->m_numRegExs ; i++ ) {
// if ( cr->m_collectiveRespiderFrequency == 0.0 )
// cr->m_spiderFreqs[i] = 0.000;
// else
// cr->m_spiderFreqs[i] = 0.001;
// //cr->m_collectiveRespiderFrequency;
//}
char *path = hr->getPath();
bool isBulkApi = false;
if ( path && strncmp(path,"/v2/bulk",8)==0 ) isBulkApi = true;
// were any url filteres specified? if not, don't reset them
//if ( ! hr->hasField("action") )
// return true;
// reset the url filters here to the default set.
// we will append the client's filters below them below.
resetUrlFilters ( cr );
// if it was not recrawling and we made it start we have
// to repopulate waiting tree because most entries will
// need to be re-added!
// really, anytime we change url filters we have to repopulate
// the waiting tree
SpiderColl *sc = cr->m_spiderColl;
if ( sc && rebuild ) {
// this is causing a bulk job not to complete because
// jenkins keeps checking it every 10 seconds
sc->m_waitingTreeNeedsRebuild = true;
}
return true;
// "urlFilters": [
// {
// "value": "*", // MDW - this matches all urls! ("default")
// "action": "http://www.diffbot.com/api/analyze?mode=auto"
// }
// {
// "value": "company",
// "action" : "http://www.diffbot.com/api/article?tags&meta"
// }
// {
// "value": "^http://www",
// "action": "doNotProcess"
// }
// {
// "value": "$.html && category",
// "action": "doNotCrawl"
// }
// {
// "value": "!$.html && $.php",
// "action": "doNotCrawl"
// }
// ]
char *expression = NULL;
char *action = NULL;
// how many filters do we have so far?
long nf = cr->m_numRegExs;
// delete the 3rd default filter cuz we should re-add it below
// to the bottom of the list.
if ( nf >= 3 ) nf--;
bool addedDefault = false;
// loop over the cgi parms
for ( long i = 0 ; i < hr->getNumFields() ; i++ ) {
// get cgi parm name
char *field = hr->getField ( i );
//long flen = hr->getFieldLen ( i );
if ( strcmp(field,"expression") == 0 )
expression = hr->getValue(i);
if ( strcmp(field,"action") == 0 )
action = hr->getValue(i);
// need both
if ( ! action ) continue;
// no! the /v2/bulk api just has a single action
if ( isBulkApi ) expression = "*";
// action before expression???? set action to NULL then?
if ( ! expression ) continue;
//else continue;// { action = NULL; continue; }
// skip whitespace
while ( is_wspace_a(*expression) ) expression++;
while ( is_wspace_a(*action) ) action++;
// skip if expression is empty
if ( ! expression[0] ) {
action = NULL; expression = NULL; continue; }
// they use "*" instead of "default" so put that back
if ( expression[0] == '*' ) {
expression = "default";
addedDefault = true;
}
// deal with it
long priority = 50;
// default diffbot api call:
//char *api = NULL;
if ( strcasecmp(action,"donotcrawl") == 0 )
priority = SPIDER_PRIORITY_FILTERED;
//if ( strcasecmp(action,"donotprocess") == 0 )
// api = NULL;
// a new diffbot url?
//if ( strncasecmp(action,"http",4) == 0 )
//api = action;
// add a mirror of that filter but for manually added,
// i.e. injected or via add url,
if ( priority >= 0 ) {
// purge because might have been the last "default"
// filter that we did nf-- above on.
cr->m_regExs [nf].purge();
// make the priority higher!
cr->m_regExs [nf].safePrintf("ismanualadd && %s",
expression);
cr->m_spiderPriorities [nf] = 70;
cr->m_spiderDiffbotApiUrl[nf].set(action); // appends\0
cr->m_spiderFreqs[nf]=
cr->m_collectiveRespiderFrequency;
nf++;
}
// add the new filter
cr->m_regExs [nf].set(expression);
cr->m_spiderPriorities [nf] = priority;
cr->m_spiderDiffbotApiUrl[nf].set(action);
cr->m_spiderFreqs [nf] = cr->m_collectiveRespiderFrequency;
nf++;
// NULL out again
action = NULL;
expression = NULL;
if ( nf < MAX_FILTERS ) continue;
log("crawlbot: too many url filters!");
break;
}
// if no '*' line was provided, add it here
if ( ! addedDefault ) {
cr->m_regExs [nf].set("default");
cr->m_spiderPriorities [nf] = 50;
cr->m_spiderDiffbotApiUrl[nf].set(NULL);
cr->m_spiderFreqs[nf] = cr->m_collectiveRespiderFrequency;
nf++;
}
// update the counts
cr->m_numRegExs = nf;
cr->m_numRegExs2 = nf;
cr->m_numRegExs3 = nf;
cr->m_numRegExs10 = nf;
cr->m_numRegExs5 = nf;
cr->m_numRegExs6 = nf;
cr->m_numRegExs7 = nf;
cr->m_numRegExs11 = nf;
// set collective respider
//for ( long i =0 ; i < nf ; i++ )
// cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency;
return true;
}
*/
///////////
//
// SUPPORT for getting the last 100 spidered urls
//
// . sends request to each node
// . each node returns top 100 after scanning spiderdb (cache for speed)
// . master node gets top 100 of the top 100s
// . sends pretty html or json back to socket
// . then user can see why their crawl isn't working
// . also since we are scanning spiderdb indicate how many urls are
// ignored because they match "ismedia" or "!isonsamedomain" etc. so
// show each url filter expression then show how many urls matched that.
// when doing this make the spiderReply null, b/c the purpose is to see
//   what urls are waiting to be spidered
// . BUT url may never be attempted because it matches "ismedia" so that kind
// of thing might have to be indicated on the spiderdb dump above, not here.
//
//////////
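// Hypothetical sketch of the "top 100 of the top 100s" merge described
// above (illustration only; the handler below was never finished).
// Assume each host replies with up to 100 entries sorted newest-first:
//
//   struct Entry { long m_spiderTime; char *m_url; };
//   // qsort comparator: newest (largest spider time) first
//   static int cmpNewestFirst ( const void *a , const void *b ) {
//           long ta = ((Entry *)a)->m_spiderTime;
//           long tb = ((Entry *)b)->m_spiderTime;
//           if ( tb > ta ) return  1;
//           if ( tb < ta ) return -1;
//           return 0;
//   }
//   // master: concatenate every host's entries into all[n], then
//   //   qsort ( all , n , sizeof(Entry) , cmpNewestFirst );
//   //   if ( n > 100 ) n = 100;
//   // and print the surviving entries as html or json to the socket.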
//bool sendPageLast100Urls ( TcpSocket *socket , HttpRequest *hr ) {