open-source-search-engine/main.cpp
Matt 09de59f026 do not store cblock, etc. tags into tagdb to save
disk space. added tagdb file cache for better performance,
less disk accesses. will help reduce disk load.
put file cache sizes in master controls and if they change
then update the cache size dynamically.
2015-09-10 12:46:00 -06:00

18769 lines
535 KiB
C++

//
// Matt Wells, copyright Sep 2001
//
#include "gb-include.h"
#include <sched.h> // clone()
// declare this stuff up here for call the pread() in our seek test below
//
// maybe we should put this in a common header file so we don't have
// certain files compiled with the platform default, and some not -partap
#include "Version.h" // getVersion()
#include "Mem.h"
#include "Conf.h"
#include "Threads.h"
#include "Hostdb.h"
#include "Indexdb.h"
#include "Posdb.h"
#include "Cachedb.h"
#include "Monitordb.h"
#include "Datedb.h"
#include "Titledb.h"
#include "Revdb.h"
#include "Tagdb.h"
#include "Catdb.h"
#include "Users.h"
//#include "Tfndb.h"
#include "Spider.h"
//#include "Doledb.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Sections.h"
#include "Statsdb.h"
#include "UdpServer.h"
#include "PingServer.h"
#include "Repair.h"
#include "DailyMerge.h"
#include "MsgC.h"
#include "HttpServer.h"
#include "Loop.h"
#include "Spider.h"
#include <sys/resource.h> // setrlimit
#include "Stats.h"
#include "Spider.h"
//#include "GBVersion.h"
#include "Speller.h" // g_speller
//#include "Thesaurus.h" // g_thesaurus
//#include "Synonyms.h" // g_synonyms
#include "Wiki.h" // g_wiki
#include "Wiktionary.h" // g_wiktionary
#include "Scraper.h" // g_scraper
//#include "QueryRouter.h"
#include "Categories.h"
#include "CountryCode.h"
#include "Pos.h"
#include "Title.h"
#include "Speller.h"
//#include "Syncdb.h"
// include all msgs that have request handlers, cuz we register them with g_udp
#include "Msg0.h"
#include "Msg1.h"
#include "Msg4.h"
//#include "Msg6.h"
//#include "Msg7.h"
//#include "Msg11.h"
//#include "Msg12.h"
#include "Msg13.h"
#include "Msg20.h"
#include "Msg22.h"
//#include "Msg23.h"
#include "Msg2a.h"
#include "Msg36.h"
#include "Msg39.h"
#include "Msg40.h" // g_resultsCache
#include "Msg9b.h"
#include "Msg17.h"
//#include "Msg34.h"
//#include "Msg35.h"
//#include "Msg24.h"
//#include "Msg28.h"
//#include "Msg30.h"
//#include "MsgB.h"
//#include "Msg3e.h"
#include "Parms.h"
//#include "Msg50.h"
//#include "MsgF.h"
//#include "Msg33.h"
//#include "mmseg.h" // open_lexicon(), etc. for Chinese parsing
//#include "PageTopDocs.h"
//#include "PageNetTest.h"
//#include "Sync.h"
#include "Pages.h"
//#include "Msg1c.h"
//#include "Msg2e.h"
//#include "Msg6a.h"
#include "Unicode.h"
//#include <pthread.h>
#include "AutoBan.h"
//#include "SiteBonus.h"
#include "Msg1f.h"
#include "Profiler.h"
//#include "HashTableT.h"
//#include "Classifier.h"
#include "Blaster.h"
#include "Proxy.h"
//#include "HtmlCarver.h"
//#include "Matchers.h"
#include "linkspam.h"
#include "Process.h"
#include "sort.h"
//#include "SiteBonus.h"
#include "Ads.h"
#include "LanguagePages.h"
//#include "Msg3b.h"
#include "ValidPointer.h"
#include "RdbBuckets.h"
//#include "PageTurk.h"
//#include "QAClient.h"
//#include "Diff.h"
#include "Placedb.h"
#include "Test.h"
#include "seo.h"
#include "Json.h"
#include "SpiderProxy.h"
//#include "Facebook.h"
//#include "Accessdb.h"
// ---------------------------------------------------------------------
// Forward declarations. The command-line modes of gb (dump, install,
// the various tests, ...) are implemented further down in this file;
// these prototypes let main2() dispatch to them.
// ---------------------------------------------------------------------
// from qa.cpp
//bool qainject ( ) ;
//bool qatest ( ) ;
// call this to shut everything down
bool mainShutdown ( bool urgent ) ;
//bool mainShutdown2 ( bool urgent ) ;
// register request handlers with g_udp (split into numbered stages;
// see the Msg*.h includes above for the messages being registered)
bool registerMsgHandlers ( ) ;
bool registerMsgHandlers1 ( ) ;
bool registerMsgHandlers2 ( ) ;
bool registerMsgHandlers3 ( ) ;
// makes a default conf file and saves into confFilename
//void makeNewConf ( int32_t hostId , char *confFilename );
// Loop/fd callbacks (int fd, void *state signature used by Loop)
void getPageWrapper ( int fd , void *state ) ;
void allExitWrapper ( int fd , void *state ) ;
//bool QuerySerializeTest( char *ff ); // Query.cpp
void rmTest();
// non-zero while a memcpy is in flight; presumably read by diagnostic
// code elsewhere in the file -- TODO(review) confirm exact purpose
int g_inMemcpy=0;
// ---------------------------------------------------------------------
// "gb dump <db> ..." helpers. Common parameter convention:
//   coll        - collection name
//   sfn         - start file number (first rdb file to scan)
//   numFiles    - how many files to scan (-1 = all)
//   includeTree - also dump records still in the in-memory tree
// ---------------------------------------------------------------------
//#ifndef _LARS_
static void dumpTitledb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
int64_t docId , char justPrintDups ,
bool dumpSentences ,
bool dumpWords );
//static void dumpTfndb (char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
// bool verify);
static int32_t dumpSpiderdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
char printStats , int32_t firstIp );
static void dumpSectiondb( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
static void dumpRevdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
// tagdb dump; rdbId defaults to RDB_TAGDB, site optionally restricts output
static void dumpTagdb ( char *coll,
int32_t sfn,
int32_t numFiles,
bool includeTree,
int32_t c,
char rec=0,
int32_t rdbId = RDB_TAGDB ,
char *site = NULL );
static void dumpIndexdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
int64_t termId ) ;
void dumpPosdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
int64_t termId , bool justVerify ) ;
static void dumpWaitingTree( char *coll );
static void dumpDoledb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
void dumpDatedb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
int64_t termId , bool justVerify ) ;
void dumpClusterdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
//void dumpChecksumdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree);
//void dumpStatsdb ( int32_t startFileNum, int32_t numFiles, bool includeTree,
// int test );
void dumpLinkdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
char *url );
// Callback that terminates the process immediately with status 0.
// The opaque state argument is ignored.
void exitWrapper ( void *state ) {
	exit(0);
}
// true when gb was started with "-r" (recovering from an unclean
// shutdown); set in main2() below
bool g_recoveryMode = false;
// severity parsed from "-rN" in main2(): defaults to 1 when -r is
// given, clamped to a minimum of 0
int32_t g_recoveryLevel = 0;
bool isRecoveryFutile ( ) ;
int copyFiles ( char *dstDir ) ;
//////
//
// if seo.o is being linked to it needs to override these weak stubs:
//
//////
bool loadQueryLog() __attribute__((weak));
void runSEOQueryLoop ( int fd, void *state ) __attribute__((weak));
bool sendPageSEO(TcpSocket *, HttpRequest *) __attribute__((weak));
void handleRequest8e(UdpSlot *, int32_t netnice ) __attribute__((weak));
void handleRequest4f(UdpSlot *, int32_t netnice ) __attribute__((weak));
void handleRequest95(UdpSlot *, int32_t netnice ) __attribute__((weak));
// Default (weak) SEO stubs. When seo.o is linked in, its strong
// definitions override every one of these; otherwise gb runs with SEO
// support disabled.
bool loadQueryLog() {
	// nothing to load without seo support
	return true;
}
void runSEOQueryLoop ( int fd, void *state ) {
	// no-op without seo support
}
bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {
	// seo.o not linked: report the feature as unavailable
	return g_httpServer.sendErrorReply(s,500,"Seo support not present");
}
void handleRequest8e(UdpSlot *, int32_t netnice ) {
}
void handleRequest4f(UdpSlot *, int32_t netnice ) {
}
void handleRequest95(UdpSlot *, int32_t netnice ) {
}
// returns the current working directory -- see definition below
char *getcwd2 ( char *arg ) ;
// for cleaning up indexdb
void dumpMissing ( char *coll );
void dumpDups ( char *coll );
void removeDocIds ( char *coll , char *filename );
static void dumpIndexdbFile ( int32_t fn , int64_t off , char *f , int32_t ks ,
char *NAME = NULL );
//static void dumpCachedRecs ( char *coll,int32_t sfn,int32_t numFiles,bool includeTree,
// int64_t docId );
//static bool testBoolean() ;
//static void qaTest(char *s1, char *s2, char *u, char *q);
//static void xmlDiffTest(char *f1, char *f2, DiffOpt *opt);
//void testSpamRules(char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree,
// int64_t docid);
//void takeSnapshotWrapper( int status, void *state);
// JAB: warning abatement
//static bool checkDataParity ( ) ;
//#endif
// verifies the working dir is readable/writable before startup
static int32_t checkDirPerms ( char *dir ) ;
//static bool fixTitleRecs( char *coll ) ;
//static int32_t getRecSize ( BigFile *f , int64_t off ) ;
//static bool addToChecksumdb ( char *coll , TitleRec *tr ) ;
//static bool addToSpiderdb ( char *coll , TitleRec *tr ) ;
//Need these two if tr's in addtospiderdb are getting their quality from
// their root urls.
/*static HashTableT <int64_t,char> s_rootUrls;
static bool loadRootUrls ( char *filename);*/
//static bool addToTfndb ( char *coll , TitleRec *tr , int32_t id2 ) ;
//static bool addToTfndb2 ( char *coll , SpiderRec *sr , int32_t id2 ) ;
//static bool mergeChecksumFiles ( ) ;
//static bool genDbs ( char *coll ) ;
//static bool genTfndb ( char *coll ) ;
//static bool fixTfndb ( char *coll ) ;
//static bool makeClusterdb ( char *coll ) ;
//static bool genDateRange ( char *coll ) ;
// diff with indexdb in sync/ dir
//bool syncIndexdb ( );
//bool gbgzip (char *filename);
//bool gbgunzip (char *filename);
//bool trietest ( ) ;
//bool matchertest ( int argc, char* argv[] );
// -------- self-test entry points ("gb treetest", "gb hashtest", ...) --------
// benchmark RdbTree::addRecord() for indexdb
bool treetest ( ) ;
// with a dbname argument: validate that db's saved buckets instead
bool bucketstest ( char *dbname ) ;
bool hashtest ( ) ;
// how fast to parse the content of this docId?
bool parseTest ( char *coll , int64_t docId , char *query );
//bool carveTest ( uint32_t radius, char *fname, char* query );
bool summaryTest1 ( char *rec, int32_t listSize, char *coll , int64_t docId ,
char *query );
//bool summaryTest2 ( char *rec, int32_t listSize, char *coll , int64_t docId ,
// char *query );
//bool summaryTest3 ( char *rec, int32_t listSize, char *coll , int64_t docId ,
// char *query );
// time a big write, read and then seeks
bool thrutest ( char *testdir , int64_t fileSize ) ;
void seektest ( char *testdir , int32_t numThreads , int32_t maxReadSize ,
char *filename );
bool pingTest ( int32_t hid , uint16_t clientPort );
bool memTest();
bool cacheTest();
bool ramdiskTest();
// "gb countdomains": tally domains/IPs in the first numRecs titledb recs
void countdomains( char* coll, int32_t numRecs, int32_t verb, int32_t output );
UdpProtocol g_dp; // Default Proto
//void zlibtest ( );
// installFlag konstants
// Selects the action performed by install() below (one value per
// "gb install...", "gb start...", "gb backup..." style command).
// NOTE: values after ifk_install are implicit (2, 3, ...), so the
// relative order of these enumerators is load-bearing -- do not
// reorder or insert in the middle.
typedef enum {
ifk_install = 1,
ifk_start ,
ifk_installgb ,
ifk_installgbrcp ,
ifk_installconf ,
ifk_gendbs ,
ifk_fixtfndb ,
ifk_gentfndb ,
ifk_installcat,
ifk_installnewcat,
ifk_genclusterdb ,
ifk_distributeC ,
ifk_installgb2 ,
ifk_dsh ,
ifk_dsh2 ,
ifk_backupcopy ,
ifk_backupmove ,
ifk_backuprestore ,
ifk_proxy_start ,
ifk_installconf2 ,
ifk_installcat2 ,
ifk_kstart ,
ifk_dstart ,
ifk_installnewcat2 ,
ifk_dumpmissing ,
ifk_removedocids ,
ifk_dumpdups ,
//ifk_install2,
ifk_tmpstart ,
ifk_installtmpgb ,
ifk_proxy_kstart ,
ifk_start2
} install_flag_konst_t;
// perform the cluster-wide action selected by installFlag on hostId
// (or on all hosts; see the help text in main2 for the conventions)
int install ( install_flag_konst_t installFlag , int32_t hostId ,
char *dir = NULL , char *coll = NULL , int32_t hostId2 = -1 ,
char *cmd = NULL );
int scale ( char *newhostsconf , bool useShotgunIp );
int collinject ( char *newhostsconf );
int collcopy ( char *newHostsConf , char *coll , int32_t collnum ) ;
// send an admin command (stop/save/spiderson/...) to hosts and/or proxies
bool doCmd ( const char *cmd , int32_t hostId , char *filename , bool sendToHosts,
bool sendToProxies, int32_t hostId2=-1 );
// "gb inject": index documents from filename into the host(s) at ips
int injectFile ( char *filename , char *ips ,
//int64_t startDocId ,
//int64_t endDocId ,
//bool isDelete ) ;
char *coll );
int injectFileTest ( int32_t reqLen , int32_t hid ); // generates the file
void membustest ( int32_t nb , int32_t loops , bool readf ) ;
// "gb dosopen": open numSocks idle tcp connections to targetIp:port
bool dosOpen(int32_t targetIp, uint16_t port, int numSocks);
//void tryMergingWrapper ( int fd , void *state ) ;
void saveRdbs ( int fd , void *state ) ;
bool shutdownOldGB ( int16_t port ) ;
//void resetAll ( );
//void spamTest ( ) ;
// per-module reset hooks defined in other translation units;
// presumably invoked during shutdown/cleanup -- TODO(review) confirm
extern void resetPageAddUrl ( );
extern void resetHttpMime ( );
extern void reset_iana_charset ( );
extern void resetAdultBit ( );
extern void resetDomains ( );
extern void resetEntities ( );
extern void resetQuery ( );
extern void resetStopWords ( );
extern void resetUnicode ( );
extern void tryToSyncWrapper ( int fd , void *state ) ;
// Disabled probe that reserved a huge automatic array to inspect the
// stack's address range. NOTE(review): the (unsigned int) casts of the
// addresses would truncate pointers on 64-bit builds -- fix before
// ever re-enabling this.
#if 0
void stack_test();
void stack_test(){
char *dummy[7000000];
dummy[0] = '\0';
dummy[6999999] = '\0';
printf("dummy: 0x%x = 0x%x",
(unsigned int)&(dummy[0]), (unsigned int)&(dummy[6999999]));
}
#endif
// the real startup logic; main() is only a thin wrapper around it
int main2 ( int argc , char *argv[] ) ;
// Program entry point: all real work happens in main2(); report and
// propagate its status.
int main ( int argc , char *argv[] ) {
	//fprintf(stderr,"Starting gb.\n");
	int ret = main2 ( argc , argv );
	// non-zero means startup failed
	if ( ret ) fprintf(stderr,"Failed to start gb. Exiting.\n");
	// BUGFIX: previously fell off the end, so C++ implicitly returned 0
	// and the shell always saw success -- keepalive/start scripts could
	// not detect a failed start. Propagate main2()'s status instead.
	return ret;
}
int main2 ( int argc , char *argv[] ) {
g_conf.m_runAsDaemon = false;
g_conf.m_logToFile = false;
#ifndef CYGWIN
// appears that linux 2.4.17 kernel would crash with this?
// let's try again on gk127 to make sure
// YES! gk0 cluster has run for months with this just fine!!
mlockall(MCL_CURRENT|MCL_FUTURE);
#endif
//g_timedb.makeStartKey ( 0 );
// Anchor the stack start point at the first stack variable
// in main.
char stackPointTestAnchor;
g_mem.setStackPointer( &stackPointTestAnchor );
// record time for uptime
g_stats.m_uptimeStart = time(NULL);
// malloc test for efence
//char *ff = (char *)mmalloc(100,"efence");
//ff[100] = 1;
// Begin Pointer Check setup
//uint32_t firstArg = 0;
//ValidPointer vpointerObject((void*)&firstArg);
//vpointerObject.isValidPointer(&vpointerObject); // whiny compiler
// End Pointer Check setup
if (argc < 0) {
printHelp:
SafeBuf sb;
sb.safePrintf(
"\n"
"Usage: gb [CMD]\n");
sb.safePrintf(
"\n"
"\tgb will first try to load "
"the hosts.conf in the same directory as the "
"gb binary. "
"Then it will determine its hostId based on "
"the directory and IP address listed in the "
"hosts.conf file it loaded. Things in []'s "
"are optional.");
/*
sb.safePrintf(
"\n\t"
"[hostsConf] is the hosts.conf config file as "
"described in overview.html. If not provided then "
"it is assumed to be ./hosts.conf. If "
"./localhosts.conf exists then that will be "
"used instead of ./hosts.conf. That is "
"convenient to use since it will not be "
"overwritten from git pulls.\n\n" );
*/
sb.safePrintf(
"[CMD] can have the following values:\n\n"
"-h\tPrint this help.\n\n"
"-v\tPrint version and exit.\n\n"
//"<hostId>\n"
//"\tstart the gb process for this <hostId> locally."
//" <hostId> is 0 to run as host #0, for instance."
//"\n\n"
//"<hostId> -d\n\trun as daemon.\n\n"
"-d\tRun as daemon.\n\n"
//"-o\tprint the overview documentation in HTML. "
//"Contains the format of hosts.conf.\n\n"
// "<hostId> -r\n\tindicates recovery mode, "
// "sends email to addresses "
// "specified in Conf.h upon startup.\n\n"
// "-r\tindicates recovery mode, "
// "sends email to addresses "
// "specified in Conf.h upon startup.\n\n"
"start [hostId]\n"
"\tStart the gb process on all hosts or just on "
"[hostId], if specified, using an ssh command. Runs "
"each gb process in a keepalive loop under bash.\n\n"
"start <hostId1-hostId2>\n"
"\tLike above but just start gb on the supplied "
"range of hostIds.\n\n"
"dstart [hostId]\n"
"\tLike above but do not use a keepalive loop. So "
"if gb crashes it will not auto-resstart.\n\n"
/*
"kstart [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified using an ssh command and "
"if the gb process cores then restart it. k stands "
"for keepalive.\n\n"
*/
"stop [hostId]\n"
"\tSaves and exits for all gb hosts or "
"just on [hostId], if specified.\n\n"
"stop <hostId1-hostId2>\n"
"\tTell gb to save and exit on the given range of "
"hostIds.\n\n"
"save [hostId]\n"
"\tJust saves for all gb hosts or "
"just on [hostId], if specified.\n\n"
/*
"tmpstart [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified, but "
"use the ports specified in hosts.conf PLUS one. "
"Then you can switch the "
"proxy over to point to those and upgrade the "
"original cluster's gb. "
"That can be done in the Master Controls of the "
"proxy using the 'use "
"temporary cluster'. Also, this assumes the binary "
"name is tmpgb not gb.\n\n"
"tmpstop [hostId]\n"
"\tsaves and exits for all gb hosts or "
"just on [hostId] if specified, for the "
"tmpstart command.\n\n"
*/
"spidersoff [hostId]\n"
"\tDisables spidering for all gb hosts or "
"just on [hostId], if specified.\n\n"
"spiderson [hostId]\n"
"\tEnables spidering for all gb hosts or "
"just on [hostId], if specified.\n\n"
/*
"cacheoff [hostId]\n"
"\tdisables all disk PAGE caches on all hosts or "
"just on [hostId] if specified.\n\n"
"freecache [maxShmid]\n"
"\tfinds and frees all shared memory up to shmid "
"maxShmid, default is 3000000.\n\n"
*/
/*
"ddump [hostId]\n"
"\tdump all b-trees in memory to sorted files on "
"disk. "
"Will likely trigger merges on files on disk. "
"Restrict to just host [hostId] if given.\n\n"
*/
/*
"pmerge [hostId|hostId1-hostId2]\n"
"\tforce merge of posdb files "
"just on [hostId] if specified.\n\n"
"smerge [hostId|hostId1-hostId2]\n"
"\tforce merge of sectiondb files "
"just on [hostId] if specified.\n\n"
"tmerge [hostId|hostId1-hostId2]\n"
"\tforce merge of titledb files "
"just on [hostId] if specified.\n\n"
"merge [hostId|hostId1-hostId2]\n"
"\tforce merge of all rdb files "
"just on [hostId] if specified.\n\n"
*/
"dsh <CMD>\n"
"\tRun this command on the primary IPs of "
"all active hosts in hosts.conf. It will be "
"executed in the gigablast working directory on "
"each host. Example: "
"gb dsh 'ps auxw; uptime'\n\n"
/*
"dsh2 <CMD>\n"
"\trun this command on the secondary IPs of "
"all active hosts in hosts.conf. Example: "
"gb dsh2 'ps auxw; uptime'\n\n"
*/
"install [hostId]\n"
"\tInstall all required files for gb from "
"current working directory of the gb binary "
"to [hostId]. If no [hostId] is specified, install "
"to ALL hosts.\n\n"
/*
"install2 [hostId]\n"
"\tlike above, but use the secondary IPs in the "
"hosts.conf.\n\n"
*/
"installgb [hostId]\n"
"\tLike above, but install just the gb executable.\n\n"
"installgbrcp [hostId]\n"
"\tLike above, but install just the gb executable "
"and using rcp.\n\n"
/*
"installgb2 [hostId]\n"
"\tlike above, but use the secondary IPs in the "
"hosts.conf.\n\n"
"installtmpgb [hostId]\n"
"\tlike above, but install just the gb executable "
"as tmpgb (for tmpstart).\n\n"
*/
"installconf [hostId]\n"
"\tlike above, but install hosts.conf and gb.conf\n\n"
/*
"installconf2 [hostId]\n"
"\tlike above, but install hosts.conf and gbN.conf "
"to the secondary IPs.\n\n"
"installcat [hostId]\n"
"\tlike above, but install just the catdb files.\n\n"
"installcat2 [hostId]\n"
"\tlike above, but install just the catdb files to "
"the secondary IPs.\n\n"
"installnewcat [hostId]\n"
"\tlike above, but install just the new catdb files."
"\n\n"
"installnewcat2 [hostId]\n"
"\tlike above, but install just the new catdb files "
"to the secondary IPs.\n\n"
"backupcopy <backupSubdir>\n"
"\tsave a copy of all xml, config, data and map files "
"into <backupSubdir> which is relative "
"to the working dir. Done for all hosts.\n\n"
"backupmove <backupSubdir>\n"
"\tmove all all xml, config, data and map files "
"into <backupSubdir> which is relative "
"to the working dir. Done for all hosts.\n\n"
"backuprestore <backupSubdir>\n"
"\tmove all all xml, config, data and map files "
"in <backupSubdir>, which is relative "
"to the working dir, into the working dir. "
"Will NOT overwrite anything. Done for all "
"hosts.\n\n"
"proxy start [proxyId]\n"
"\tStart a proxy that acts as a frontend to gb "
"and passes on requests to random machines on "
"the cluster given in hosts.conf. Helps to "
"distribute the load evenly across all machines.\n\n"
"proxy load <proxyId>\n"
"\tStart a proxy process directly without calling "
"ssh. Called by 'gb proxy start'.\n\n"
"proxy stop [proxyId]\n"
"\tStop a proxy that acts as a frontend to gb.\n\n"
"blasterdiff [-v] [-j] [-p] <file1> <file2> "
"<maxNumThreads> <wait>\n"
"\tcompare search results between urls in file1 and"
"file2 and output the search results in the url"
" from file1 not found in the url from file2 "
"maxNumThreads is the number of concurrent "
"comparisons "
"that should be done at one time and wait is the"
"time to wait between comparisons. -v is for "
"verbose "
" and -j is to just display links not found and "
"not "
"search for them on server2. If you do not want to"
" use the proxy server "
"on gk10, use -p\n\n"
*/
/*
"blaster [-l|-u|-i] <file> <maxNumThreads> <wait>\n"
"\tget documents from the urls given in file. The "
"-l argument is to "
"automatically get documents "
"from the gigablast log file.\n"
"\t-u means to inject/index the url into gb.\n"
"\t-i means to inject/index the url into gb AND "
"add all of its outlinks to\n"
"\tspiderdb for spidering, "
"which also entails a DNS lookup on each outlink.\n"
"\tmaxNumThreads is the"
" number of concurrent threads at one time and wait "
" is the time to wait between threads.\n\n"
*/
/*
"scale <newHosts.conf>\n"
"\tGenerate a script to be called to migrate the "
"data to the new places. Remaining hosts will "
"keep the data they have, but it will be "
"filtered during the next merge operations.\n\n"
"collcopy <newHosts.conf> <coll> <collnum>\n"
"\tGenerate a script to copy the collection data on "
"the cluster defined by newHosts.conf to the "
"current cluster. Remote network must have "
"called \"gb ddump\" twice in a row just before to "
"ensure all of its data is on disk.\n\n"
*/
// gb inject <file> <ip:port> [startdocid]
// gb inject titledb <newhosts.conf> [startdocid]
"inject <filename> "
"<ip:port> [collection]\n"
"\tInject all documents in <filename> into the gb "
"host at ip:port. File must be in WARC format. "
"Uses collection of 'main' if not specified. If "
"ip:port is a hosts.conf file then a round-robin "
"approach will be used."
// "Each document listed in the file "
// "must be preceeded by a valid HTTP mime with "
// "a Content-Length: field. WARC files are also ok."
"\n\n"
/*
"inject titledb-<DIR> <newhosts.conf> [startdocid]\n"
"\tInject all pages from all the titledb "
"files in the <DIR> directory into the appropriate "
"host defined by the newhosts.conf config file. This "
"is useful for populating one search engine with "
"another. "
"\n\n"
"injecttest <requestLen> [hostId]\n"
"\tinject random documents into [hostId]. If [hostId] "
"not given 0 is assumed.\n\n"
"ping <hostId> [clientport]\n"
"\tperforms pings to <hostId>. [clientport] defaults "
"to 2050.\n\n"
*/
/*
"spellcheck <file>\n"
"\tspellchecks the the queries in <file>.\n\n"
"dictlookuptest <file>\n"
"\tgets the popularities of the entries in the "
"<file>. Used to only check performance of "
"getPhrasePopularity.\n\n"
//"stemmertest <file>\n"
//"\truns the stemmer on words in <file>.\n\n"
//"queryserializetest <file>\n"
//"\tserializes every query in <file> and tracks "
//"statistics, as well as \t\nverifying consistency; "
//"takes raw strings or URLs as input\n\n"
// less common things
"gendict <coll> [numWordsToDump]\n\tgenerate "
"dictionary used for spellchecker "
"from titledb files in collection <coll>. Use "
"first [numWordsToDump] words.\n\n"
//#ifndef _LARS_
//"gendbs <coll> [hostId]\n\tgenerate missing spiderdb, "
//"tfndb and checksumdb files from titledb files.\n\n"
//"gentfndb <coll> [hostId]\n\tgenerate missing tfndb. "
//"titledb disk dumps and tight merges are no "
//"longer necessary. Also "
//"generates tfndb from spiderdb. tfndb-saved.dat "
//"and all tfndb* files in the collection subdir "
//"must not exist, so move them to a temp dir.\n\n"
//"fixtfndb <coll> [hostId]\n\tremove tfndb recs "
//"referring to non-existent titledb recs.\n\n"
//"genclusterdb <coll> [hostId]\n\tgenerate missing "
//"clusterdb.\n\n"
//"gendaterange <coll> [hostId]\n\tgenerate missing "
//"date range terms in all title recs.\n\n"
//"update\tupdate titledb0001.dat\n\n"
//"mergechecksumdb\tmerge checksumdb flat files\n\n"
"treetest\n\ttree insertion speed test\n\n"
"bucketstest [dbname]\n\tcompare speed and accuracy of "
"buckets vs tree in add, getList and deleteList. "
"With an argument, test validity of db's saved buckets\n\n"
"hashtest\n\tadd and delete into hashtable test\n\n"
"parsetest <docIdToTest> [coll] [query]\n\t"
"parser speed tests\n\n"
*/
/*
"thrutest [dir] [fileSize]\n\tdisk write/read speed "
"test\n\n"
"seektest [dir] [numThreads] [maxReadSize] "
"[filename]\n"
"\tdisk seek speed test\n\n"
"memtest\n"
"\t Test how much memory we can use\n\n"
*/
/*
// Quality Tests
"countdomains <coll> <X>\n"
"\tCounts the domains and IPs in collection coll and "
"in the first X titledb records. Results are sorted"
"by popularity and stored in the log file. \n\n"
"cachetest\n\t"
"cache stability and speed tests\n\n"
"ramdisktest\n\t"
"test ramdisk functionality\n\n"
"dosopen <ip> <port> <numThreads>\n"
"\tOpen numThreads tcp sockets to ip:port and just "
"sit there. For testingthe robustness of gb.\n\n"
"xmldiff [-td] <file1> <file2>\n"
"\tTest xml diff routine on file1 and file2.\n"
"\t-t: only show diffs in tag structure.\n"
"\t-d: run as daemon.\n"
"\n"
"dump e <coll> <UTCtimestamp>\n\tdump all events "
"as if the time is UTCtimestamp.\n\n"
"dump es <coll> <UTCtimestamp>\n\tdump stats for "
"all events as if the time is UTCtimestamp.\n\n"
*/
/*
#ifdef _CLIENT_
//there was <hostId> in this command but it
// wasn't used in the program, so deleting it from
// here
"dump <V> [C [X [Y [Z]]]]\n\tdump a db in "
#else
*/
//"dump <db> <collection> [T]\n\tDump a db from disk. "
"dump <db> <collection>\n\tDump a db from disk. "
"Example: gb dump t main\n"
"\t<collection> is the name of the collection.\n"
"\t<db> is s to dump spiderdb."
//"set [T] to 1 to print "
//"new stats. 2 to print old stats. "
//"T is ip of firstip."
"\n"
"\t<db> is t to dump titledb. "
//"\tT is the first docId to dump. Applies only to "
//"titledb. "
"\n"
"\t<db> is p to dump posdb (the index)."
//"\tOptional: T is the termid to dump."
"\n"
"\t<db> is D to dump duplicate docids in titledb.\n"
"\t<db> is c to dump checksumdb.\n"
"\t<db> is S to dump tagdb.\n"
"\t<db> is W to dump tagdb for wget.\n"
"\t<db> is x to dump doledb.\n"
"\t<db> is w to dump waiting tree.\n"
"\t<db> is B to dump sectiondb.\n"
"\t<db> is C to dump catdb.\n"
"\t<db> is l to dump clusterdb.\n"
"\t<db> is z to dump statsdb all keys.\n"
"\t<db> is Z to dump statsdb all keys and "
"data samples.\n"
"\t<db> is L to dump linkdb.\n"
/*
"dump <V> [C [X [Y [Z [T]]]]]\n\tdump a db in "
//#endif
"working directory.\n"
//#ifndef _CLIENT_
//#ifndef _METALINCS_
//"\tV is u to dump tfndb.\n"
"\tV is d to dump datedb.\n"
//#endif
//#endif
"\tV is s to dump spiderdb. set [T] to 1 to print "
"new stats. 2 to print old stats. T is ip of firstip."
"\n"
"\tV is t to dump titledb.\n"
//"\tV is ts to dump sentences from events.\n"
//"\tV is tw to dump words from events.\n"
"\tV is D to dump duplicate docids in titledb.\n"
"\tV is c to dump checksumdb.\n"
"\tV is S to dump tagdb.\n"
"\tV is W to dump tagdb for wget.\n"
//"\tV is V to dump revdb.\n"
"\tV is x to dump doledb.\n"
"\tV is w to dump waiting tree.\n"
"\tV is B to dump sectiondb.\n"
"\tV is C to dump catdb.\n"
"\tV is l to dump clusterdb.\n"
"\tV is z to dump statsdb all keys.\n"
"\tV is Z to dump statsdb all keys and data samples.\n"
"\tV is L to dump linkdb.\n"
//"\tV is u to dump tfndb.\n"
//"\tV is vu to verify tfndb.\n"
"\tC is the name of the collection.\n"
"\tX is start file num. (default 0)\n"
"\tY is num files. (default -1)\n"
"\tZ is 1 to include tree. (default 1)\n"
//#ifndef _CLIENT_
//#ifndef _METALINCS_
//#ifndef _GLOBALSPEC_
"\tT is the termid to dump. Applies only to indexdb.\n"
//#endif
//#endif
//#endif
"\tT is the first docId to dump. Applies only to "
"titledb. "
//"(default none)\n\n"
"\tV is c to dump cached recs.\n"
"\n"
"dump s [X [Y [Z [C]]]\n"
"\tdump spider in working directory.\n"
"\tC is the collection name. (default none)\n"
"\tX is start file num. (default 0)\n"
"\tY is num files. (default -1)\n"
"\tZ is 1 to include tree. (default 1)\n"
//"\tA is 1 for new urls, 0 for old. (default 1)\n"
//"\tA is -1 to dump all urls in all queues.\n"
//"\tB is priority of urls. (default -1)\n"
//"\tB is -1 to dump all priorities\n"
"\tC is 1 to just show the stats. (default 0)\n"
"\n"
*/
//"dump i X Y Z t\n\tdump indexdb termId t in working "
//"directory.\n"
//"\tX is start file num. (default 0)\n"
//"\tY is num files. (default -1)\n"
//"\tZ is 1 to include tree. (default 1)\n"
//"\tt is the termid to dump. (default none)\n\n"
//#ifndef _CLIENT_
//#ifndef _METALINCS_
/*
"dump I [X [V]]\n\tdump indexdb in working "
"directory at "
"an offset.\n"
//#endif
//#endif
"\tX is the file NAME. (default NULL)\n"
"\tV is the start offset. (default 0)\n"
*/
/*
"\n"
"dumpmissing <coll> [hostId]\n\t"
"dump the docIds in indexdb but not "
"in tfndb/titledb to stderr. "
" Used for passing in to removedocids.\n"
"\n"
"dumpdups <coll> [hostId]\n\t"
"dump the docIds in duplicated in indexdb when "
"they should not be to stderr. Usually a sign "
"of mis-indexing. Used for passing in to "
"removedocids.\n"
"\n"
"removedocids <coll> <fileOfDocIds> "
"[hostId|hostId1-hostId2]"
"\n\tremoves the docids in fileOfDocIds from indexdb, "
"clusterdb, checksumdb and tfndb. Effectively "
"completely deleting that docid. "
"fileOfDocIds contains one "
"docId per line, and nothing more.\n"
"\n"
"setnote <hostid> <note>"
"\n\tsets the note for host with hostid <hostid> to "
"the given note <note>.\n"
"\n"
"setsparenote <spareid> <note>"
"\n\tsets the note for spare with spareid <spareid> to "
"the given note <note>.\n"
"\n"
"replacehost <hostid> <spareid>"
"\n\treplaces host with hostid <hostid> with the "
"spare that has the spareid <spareid>. the host "
"being replaced should already be shut down or dead.\n"
"\n"
"synchost <hostid>"
"\n\trecopies this host from its twin. host directory "
"must be empty and the host must be marked as dead "
"in the current gb. Use synchost2 to use secondary "
"IPs.\n"
"\n"
*/
//#endif
);
SafeBuf sb2;
sb2.brify2 ( sb.getBufStart() , 60 , "\n\t" , false );
fprintf(stdout,"%s",sb2.getBufStart());
// disable printing of used memory
g_mem.m_used = 0;
return 0;
}
//SafeBuf tt;
//tt.base64Encode("any carnal pleas",16);
//fprintf(stderr,"%s\n",tt.getBufStart());
//exit(0);
// get hosts.conf file
//char *hostsConf = "./hosts.conf";
//int32_t hostId = -1;
int32_t cmdarg = 0;
//char *workingDir = NULL;
//if(argc >= 3 && argv[1][0]=='-'&&argv[1][1]=='w'&&argv[1][2]=='\0') {
// //hostsConf = argv[2];
// workingDir = argv[2];
// cmdarg = 3;
// }
// get command
//if ( argc <= cmdarg ) goto printHelp;
// it might not be there, might be a simple "./gb"
char *cmd = "";
if ( argc >= 2 ) {
cmdarg = 1;
cmd = argv[1];
}
char *cmd2 = "";
if ( argc >= 3 )
cmd2 = argv[2];
int32_t arch = 64;
if ( sizeof(char *) == 4 ) arch = 32;
// help
if ( strcmp ( cmd , "-h" ) == 0 ) goto printHelp;
// version
if ( strcmp ( cmd , "-v" ) == 0 ) {
fprintf(stdout,"Gigablast Version: %s\n",getVersion());
fprintf(stdout,"Gigablast Architecture: %"INT32"-bit\n",arch);
// fprintf(stderr,"Gigablast %s\nMD5KEY: %s\n"
// "TAG: %s\nPATH: %s\n",
// GBVersion, GBCommitID, GBTag, GBBuildPath);
return 0;
}
// print overview
//if ( strcmp ( cmd , "-o" ) == 0 ) {
// //printOverview ( );
// return 0;
//}
//bool hadHostId = false;
// assume our hostId is the command!
// now we advance 'cmd' past the hostId if we detect
// the presence of more args.
// WE NO LONGER do it this way...
// if ( is_digit(argv[cmdarg][0]) ) {
// hostId = atoi(argv[cmdarg]);
// if(argc > cmdarg+1) {
// cmd = argv[++cmdarg];
// }
// hadHostId = true;
// }
if ( strcmp ( cmd , "dosopen" ) == 0 ) {
int32_t ip;
int16_t port = 8000;
int32_t numSockets = 100;
if ( cmdarg + 1 < argc )
ip = atoip(argv[cmdarg+1],gbstrlen(argv[cmdarg+1]));
else goto printHelp;
if ( cmdarg + 2 < argc )
port = (int16_t)atol ( argv[cmdarg+2] );
if ( cmdarg + 3 < argc )
numSockets = atol ( argv[cmdarg+3] );
return dosOpen(ip, port, numSockets);
}
//SafeBuf sb;
//char *str = "fun glassblowing now";
//sb.truncateLongWords ( str , strlen(str),10);
//send an email on startup for -r, like if we are recovering from an
//unclean shutdown.
g_recoveryMode = false;
char *cc = NULL;
if ( strncmp ( cmd , "-r" ,2 ) == 0 ) cc = cmd;
if ( strncmp ( cmd2 , "-r",2 ) == 0 ) cc = cmd2;
if ( cc ) {
g_recoveryMode = true;
g_recoveryLevel = 1;
if ( cc[2] ) g_recoveryLevel = atoi(cc+2);
if ( g_recoveryLevel < 0 ) g_recoveryLevel = 0;
}
// run as daemon? then we have to fork
if ( strcmp ( cmd , "-d" ) == 0 ) g_conf.m_runAsDaemon = true;
if ( strcmp ( cmd2 , "-d" ) == 0 ) g_conf.m_runAsDaemon = true;
if ( strcmp ( cmd , "-l" ) == 0 ) g_conf.m_logToFile = true;
if ( strcmp ( cmd2 , "-l" ) == 0 ) g_conf.m_logToFile = true;
bool testMandrill = false;
if ( strcmp ( cmd , "emailmandrill" ) == 0 ) {
testMandrill = true;
}
/*
class foo {
public:
int32_t poo;
};
class fart {
public:
int16_t fart3;
char fart1;
char fart2;
};
foo xxx;
xxx.poo = 38123;
fart *yyy = (fart *)&xxx;
fprintf(stderr,"fart1=%"INT32" fart2=%"INT32" fart3=%"INT32"\n",
(int32_t)yyy->fart1,(int32_t)yyy->fart2,(int32_t)yyy->fart3);
exit(0);
*/
// gb gendbs, preset the hostid at least
if ( //strcmp ( cmd , "gendbs" ) == 0 ||
//strcmp ( cmd , "gentfndb" ) == 0 ||
//strcmp ( cmd , "fixtfndb" ) == 0 ||
strcmp ( cmd , "dumpmissing" ) == 0 ||
strcmp ( cmd , "dumpdups" ) == 0 ||
//strcmp ( cmd , "gencatdb" ) == 0 ||
//strcmp ( cmd , "genclusterdb" ) == 0 ||
//strcmp ( cmd , "gendaterange" ) == 0 ||
strcmp ( cmd , "distributeC" ) == 0 ) {
// ensure we got a collection name after the cmd
if ( cmdarg + 2 > argc ) goto printHelp;
// may also have an optional hostid
//if ( cmdarg + 3 == argc ) hostId = atoi ( argv[cmdarg+2] );
}
if( (strcmp( cmd, "countdomains" ) == 0) && (argc >= (cmdarg + 2)) ) {
uint32_t tmp = atoi( argv[cmdarg+2] );
if( (tmp * 10) > g_mem.m_memtablesize )
g_mem.m_memtablesize = tmp * 10;
}
// set it for g_hostdb and for logging
//g_hostdb.m_hostId = hostId;
//if ( strcmp ( cmd , "gzip" ) == 0 ) {
// if ( argc > cmdarg+1 ) gbgzip(argv[cmdarg+1]);
// else goto printHelp;
// return 0;
//}
//if ( strcmp ( cmd , "gunzip" ) == 0 ) {
// if ( argc > cmdarg+1 ) gbgunzip(argv[cmdarg+1]);
// else goto printHelp;
// return 0;
//}
// these tests do not need a hosts.conf
/*
if ( strcmp ( cmd , "trietest" ) == 0 ) {
trietest();
return 0;
}
if (strcmp ( cmd, "matchertest" ) == 0 ) {
matchertest(argc - 2, argv + 2);
return 0;
}
*/
/*
char cmd3[2048];
snprintf(cmd3,2047,
"ulimit -v 25000 ; "
"ulimit -t 30 ; "
"ulimit -a; "
"export ANTIWORDHOME=%s/antiword-dir ; "
"rm poo.txt ; "
"timeout 10s nice -n 19 %s/antiword %s> %s" ,
"/home/mwells/master-testing/" ,
"/home/mwells/master-testing/" ,
"/home/mwells/testing/poo.doc",
"/home/mwells/master-testing/poo.txt ; "
"cat poo.txt"
);
system(cmd3);
exit(-1);
*/
if ( strcmp ( cmd , "bucketstest" ) == 0 ) {
if ( argc > cmdarg+1 ) bucketstest(argv[cmdarg+1]);
else if( argc == cmdarg+1 ) bucketstest(NULL);
else goto printHelp;
return 0;
}
// these tests do not need a hosts.conf
if ( strcmp ( cmd , "treetest" ) == 0 ) {
if ( argc > cmdarg+1 ) goto printHelp;
treetest();
return 0;
}
// these tests do not need a hosts.conf
if ( strcmp ( cmd , "hashtest" ) == 0 ) {
if ( argc > cmdarg+1 ) goto printHelp;
hashtest();
return 0;
}
// these tests do not need a hosts.conf
if ( strcmp ( cmd , "memtest" ) == 0 ) {
if ( argc > cmdarg+1 ) goto printHelp;
memTest();
return 0;
}
if ( strcmp ( cmd , "cachetest" ) == 0 ) {
if ( argc > cmdarg+1 ) goto printHelp;
cacheTest();
return 0;
}
if ( strcmp ( cmd , "ramdisktest" ) == 0 ) {
if ( argc > cmdarg+1 ) goto printHelp;
ramdiskTest();
return 0;
}
if ( strcmp ( cmd , "parsetest" ) == 0 ) {
if ( cmdarg+1 >= argc ) goto printHelp;
// load up hosts.conf
//if ( ! g_hostdb.init(hostId) ) {
// log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
int64_t docid = atoll1(argv[cmdarg+1]);
char *coll = "";
char *query = "";
if ( cmdarg+3 <= argc ) coll = argv[cmdarg+2];
if ( cmdarg+4 == argc ) query = argv[cmdarg+3];
parseTest( coll, docid, query );
return 0;
}
/*
if ( strcmp ( cmd , "carvetest" ) == 0 ) {
if ( ! g_hostdb.init(hostsConf, hostId) ) {
log("db: hostdb init failed." ); return 1; }
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
if (cmdarg+2 >= argc) {
log("usage: gb carvetest qt1 ..." ); return 2; }
uint32_t radius = atoi(argv[cmdarg+1]);
char* fname = argv[cmdarg+2];
char buf[65535];
*buf = '\0';
int virgin = 1;
for (int i = cmdarg+3; i < argc; i++) {
if (!virgin)
strcat(buf, " ");
else
virgin = 0;
strcat(buf, argv[i]);
}
printf("file: '%s' query: '%s'\n", fname, buf);
carveTest(radius, fname, buf);
return 0;
}
*/
if ( strcmp ( cmd , "booltest" ) == 0 ){
//if ( ! g_hostdb.init(hostId) ) {
// log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
//testBoolean();
return 0;
}
/*
// test json parser error with bad json
Json jp;
char xxx[1024];
//sprintf(xxx,"\"categories\":[\"shop\"");
sprintf(xxx,"\"too small\"");
jp.parseJsonStringIntoJsonItems(xxx,0);
JsonItem *ji = jp.getFirstItem();
for ( ; ji ; ji = ji->m_next ) {
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
}
*/
/*
if ( strcmp ( cmd , "querytest" ) == 0){
if ( ! g_hostdb.init(hostsConf, hostId) ) {
log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
queryTest();
return 0;
}
*/
if ( strcmp ( cmd ,"isportinuse") == 0 ) {
if ( cmdarg+1 >= argc ) goto printHelp;
int port = atol ( argv[cmdarg+1] );
// make sure port is available. returns false if in use.
if ( ! g_httpServer.m_tcp.testBind(port,false) )
// and we should return with 1 so the keep alive
// script will exit
exit (1);
// port is not in use, return 0
exit(0);
}
// need threads here for tests?
// gb thrutest <testDir> <fileSize>
if ( strcmp ( cmd , "thrutest" ) == 0 ) {
if ( cmdarg+2 >= argc ) goto printHelp;
char *testdir = argv[cmdarg+1];
int64_t fileSize = atoll1 ( argv[cmdarg+2] );
thrutest ( testdir , fileSize );
return 0;
}
// gb seektest <testdir> <numThreads> <maxReadSize>
if ( strcmp ( cmd , "seektest" ) == 0 ) {
char *testdir = "/tmp/";
int32_t numThreads = 20; //30;
int64_t maxReadSize = 20000;
char *filename = NULL;
if ( cmdarg+1 < argc ) testdir = argv[cmdarg+1];
if ( cmdarg+2 < argc ) numThreads = atol(argv[cmdarg+2]);
if ( cmdarg+3 < argc ) maxReadSize = atoll1(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) filename = argv[cmdarg+4];
seektest ( testdir , numThreads , maxReadSize , filename );
return 0;
}
/*
if ( strcmp ( cmd, "qa" ) == 0 ) {
if ( ! g_hostdb.init(hostsConf, hostId) ) {
log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
char *s1 = NULL;
char *s2 = NULL;
char *u = NULL;
char *q = NULL;
if ( cmdarg+1 < argc ) s1 = argv[cmdarg+1];
if ( cmdarg+2 < argc ) s2 = argv[cmdarg+2];
if ( cmdarg+3 < argc ) u = argv[cmdarg+3];
if ( cmdarg+4 < argc ) q = argv[cmdarg+4];
qaTest(s1, s2, u, q);
return 0;
}
// gb xmldiff file1 file2
if (strcmp ( cmd, "xmldiff" ) == 0 ) {
if ( cmdarg+2 >= argc ) goto printHelp;
// init our table for doing zobrist hashing
if ( ! g_hostdb.init(hostsConf, hostId) ) {
log("db: hostdb init failed." ); return 1; }
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
DiffOpt opt;
int nextArg = cmdarg+1;
while ( argc > nextArg && argv[nextArg][0] == '-'){
char *c = argv[nextArg] + 1;
while (*c){
switch(*c++){
case 't': opt.m_tagOnly = true; break;
case 'd': opt.m_debug++ ; break;
case 'c': opt.m_context++ ; break;
default: goto printHelp;
}
}
nextArg++;
}
if ( nextArg+1 >= argc ) goto printHelp;
char *file1 = argv[nextArg ];
char *file2 = argv[nextArg+1];
xmlDiffTest(file1, file2, &opt);
return 0;
}
*/
// note the stack size for debug purposes
struct rlimit rl;
getrlimit(RLIMIT_STACK, &rl);
log(LOG_INFO,"db: Stack size is %"INT64".", (int64_t)rl.rlim_cur);
// let's ensure our core file can dump
struct rlimit lim;
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
if ( setrlimit(RLIMIT_CORE,&lim) )
log("db: setrlimit: %s.", mstrerror(errno) );
// limit fds
// try to prevent core from systems where it is above 1024
// because our FD_ISSET() libc function will core! (it's older)
int32_t NOFILE = 1024;
lim.rlim_cur = lim.rlim_max = NOFILE;
if ( setrlimit(RLIMIT_NOFILE,&lim))
log("db: setrlimit RLIMIT_NOFILE %"INT32": %s.",
NOFILE,mstrerror(errno) );
struct rlimit rlim;
getrlimit ( RLIMIT_NOFILE,&rlim);
if ( (int32_t)rlim.rlim_max > NOFILE || (int32_t)rlim.rlim_cur > NOFILE ) {
log("db: setrlimit RLIMIT_NOFILE failed!");
char *xx=NULL;*xx=0;
}
// set the s_pages array for print admin pages
g_pages.init ( );
bool isProxy = false;
if ( strcmp( cmd , "proxy" ) == 0 &&
strcmp( argv[cmdarg+1] , "load" ) == 0 ) {
isProxy = true;
// we need to parse out the hostid too!
//if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
}
// this is just like starting up a gb process, but we add one to
// each port, we are a dummy machine in the dummy cluster.
// gb -w <workingdir> tmpstart [hostId]
char useTmpCluster = 0;
if ( strcmp ( cmd , "tmpstart" ) == 0 )
useTmpCluster = 1;
// gb -w <workingdir> tmpstop [hostId]
if ( strcmp ( cmd , "tmpstop" ) == 0 )
useTmpCluster = 1;
// gb -w <workingdir> tmpstarthost <hostId>
if ( strcmp ( cmd , "tmpstarthost" ) == 0 ) {
useTmpCluster = 1;
// we need to parse out the hostid too!
//if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
//else goto printHelp;
}
// gb inject <file> <ip:port> [startdocid]
// gb inject titledb-coll.main.0 <newhosts.conf> [startdocid]
// gb inject titledb-somedir <newhosts.conf> [startdocid]
// gb inject titledb-coll.foobar.5 <newhosts.conf> [startdocid]
if ( strcmp ( cmd , "inject" ) == 0 ) {
if ( argc != cmdarg+3 &&
argc != cmdarg+4 &&
argc != cmdarg+5 )
goto printHelp;
char *file = argv[cmdarg+1];
char *ips = argv[cmdarg+2];
char *coll = argv[cmdarg+3];
// int64_t startDocId = 0LL;
// int64_t endDocId = DOCID_MASK;
// if ( cmdarg+3 < argc ) startDocId = atoll(argv[cmdarg+3]);
// if ( cmdarg+4 < argc ) endDocId = atoll(argv[cmdarg+4]);
//injectFile ( file , ips , startDocId , endDocId , false );
injectFile ( file , ips , coll );
return 0;
}
//
// get current working dir that the gb binary is in. all the data
// files should in there too!!
//
//if ( ! workingDir ) workingDir = getcwd2 ( argv[0] );
char *workingDir = getcwd2 ( argv[0] );
if ( ! workingDir ) {
fprintf(stderr,"could not get working dir. Exiting.\n");
return 1;
}
//log("host: working directory is %s",workingDir);
// load up hosts.conf
// . it will determine our hostid based on the directory path of this
// gb binary and the ip address of this server
if ( ! g_hostdb.init(-1, // we don't know it!!!hostId,
NULL,
isProxy,
useTmpCluster,
workingDir)){
log("db: hostdb init failed." ); return 1; }
Host *h9 = g_hostdb.m_myHost;
// set clock file name so gettimeofdayInMmiilisecondsGlobal()
// see g_clockInSync to be true... unless clockadjust.dat is more
// than 2 days old in which case not!
if ( g_hostdb.m_myHost->m_hostId != 0 ) {
// host #0 does not need this, everyone syncs with him
setTimeAdjustmentFilename(g_hostdb.m_dir , "clockadjust.dat");
// might as well load it i guess
loadTimeAdjustment();
}
// the supporting network, used by gov.gigablast.com to get link text
// from the larger main index. g_hostdb2. we don't care if this load
// fails or not.
//char h2[128];
//sprintf ( h2 , "%shosts2.conf" , g_hostdb.m_dir );
//if ( ! g_hostdb2.init(h2, 0 ,"external") ) {
// log("db: hosts2.conf hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
// . hashinit() calls srand() w/ a fixed number
// . let's mix it up again
srand ( time(NULL) );
// do not save conf if any core dump occurs starting here
// down to where we set this back to true
g_conf.m_save = false;
//
// run our smoketests
//
/*
if ( strcmp ( cmd, "qa" ) == 0 ||
strcmp ( cmd, "qainject" ) == 0 ||
strcmp ( cmd, "qaspider" ) == 0 ) {
// let's ensure our core file can dump
struct rlimit lim;
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
if ( setrlimit(RLIMIT_CORE,&lim) )
log("qa::setrlimit: %s", mstrerror(errno) );
// in build mode we store downloaded http replies in the
// /qa subdir
//g_conf.m_qaBuildMode = 0;
//if ( cmdarg+1 < argc )
// g_conf.m_qaBuildMode = atoi(argv[cmdarg+1]);
// 50MB
g_conf.m_maxMem = 50000000;
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("qa::hashinit failed" ); return 0; }
// init memory class after conf since it gets maxMem from Conf
if ( ! g_mem.init ( 200000000 ) ) {
log("qa::Mem init failed" ); return 0; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
g_conf.m_askRootNameservers = true;
//g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 );
//g_conf.m_dnsClientPort = 9909;
g_conf.m_dnsMaxCacheMem = 1024*10;
// hack http server port to -1 (none)
//g_conf.m_httpPort = 0;
g_conf.m_httpMaxSockets = 200;
//g_conf.m_httpMaxReadBufSize = 102*1024*1024;
g_conf.m_httpMaxSendBufSize = 16*1024;
// init the loop
if ( ! g_loop.init() ) {
log("qa::Loop init failed" ); return 0; }
// . then dns client
// . server should listen to a socket and register with g_loop
if ( ! g_dns.init(14834) ) {
log("qa::Dns client init failed" ); return 0; }
// . then webserver
// . server should listen to a socket and register with g_loop
// . use -1 for both http and https ports to mean do not
// listen on any ports. we are a client only.
if ( ! g_httpServer.init( -1 , -1 ) ) {
log("qa::HttpServer init failed" ); return 0; }
// set our new pid
g_mem.setPid();
g_threads.setPid();
g_log.setPid();
//
// begin the qaloop
//
if ( strcmp(cmd,"qa") == 0 )
qatest();
else if ( strcmp(cmd,"qaspider") == 0 )
qaspider();
else if ( strcmp(cmd,"qainject") == 0 )
qainject();
//
// wait for some i/o signals
//
if ( ! g_loop.runLoop() ) {
log("db: runLoop failed." );
return 1;
}
// no error, return 0
return 0;
}
*/
//Put this here so that now we can log messages
if ( strcmp ( cmd , "proxy" ) == 0 ) {
if (argc < 3){
goto printHelp;
exit (1);
}
int32_t proxyId = -1;
if ( cmdarg+2 < argc ) proxyId = atoi ( argv[cmdarg+2] );
if ( strcmp ( argv[cmdarg+1] , "start" ) == 0 ) {
return install ( ifk_proxy_start , proxyId );
}
if ( strcmp ( argv[cmdarg+1] , "dstart" ) == 0 ) {
return install ( ifk_proxy_kstart , proxyId );
}
else if ( strcmp ( argv[cmdarg+1] , "stop" ) == 0 ) {
g_proxy.m_proxyRunning = true;
return doCmd ( "save=1" , proxyId , "master" ,
false,//sendtohosts
true);//sendtoproxies
}
else if ( strcmp ( argv[cmdarg+1] , "replacehost" ) == 0 ) {
g_proxy.m_proxyRunning = true;
int32_t hostId = -1;
int32_t spareId = -1;
if ( cmdarg + 2 < argc )
hostId = atoi ( argv[cmdarg+2] );
if ( cmdarg + 2 < argc )
spareId = atoi ( argv[cmdarg+3] );
char replaceCmd[256];
sprintf(replaceCmd, "replacehost=1&rhost=%"INT32"&rspare=%"INT32"",
hostId, spareId);
return doCmd ( replaceCmd, -1, "admin/hosts" ,
false,//sendtohosts
true);//sendtoproxies
}
else if ( proxyId == -1 || strcmp ( argv[cmdarg+1] , "load" ) != 0 ) {
goto printHelp;
exit(1);
}
int32_t yippyPort;
if ( g_isYippy ) {
yippyPort = proxyId;
proxyId = 0;
}
Host *h = g_hostdb.getProxy( proxyId );
uint16_t httpPort = h->m_httpPort;
if ( g_isYippy ) httpPort = yippyPort;
uint16_t httpsPort = h->m_httpsPort;
//we need udpserver for addurl and udpserver2 for pingserver
uint16_t udpPort = h->m_port;
//uint16_t udpPort2 = h->m_port2;
// g_conf.m_maxMem = 2000000000;
if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) {
log("db: Conf init failed." ); return 1; }
// init the loop before g_process since g_process
// registers a sleep callback!
if ( ! g_loop.init() ) {
log("db: Loop init failed." ); return 1; }
//if ( ! g_threads.init() ) {
// log("db: Threads init failed." ); return 1; }
g_process.init();
if ( ! g_process.checkNTPD() )
return log("db: ntpd not running on proxy");
if ( ! g_isYippy && !ucInit(g_hostdb.m_dir))
return log("db: Unicode initialization failed!");
// load speller unifiedDict for spider compression proxy
//if ( g_hostdb.m_myHost->m_type & HT_SCPROXY )
// g_speller.init();
if ( ! g_udpServer.init( g_hostdb.getMyPort() ,
&g_dp,
0 , // niceness
20000000 , // readBufSIze
20000000 , // writeBufSize
20 , // pollTime in ms
3500 , // max udp slots
false )){ // is dns?
log("db: UdpServer init failed." ); return 1; }
if (!g_proxy.initProxy (proxyId, udpPort, 0, &g_dp))
return log("proxy: init failed");
// initialize Users
if ( ! g_users.init() ){
log("db: Users init failed. "); return 1;}
// then statsdb
if ( ! g_isYippy && ! g_statsdb.init() ) {
log("db: Statsdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("db: Failed to init hashtable." ); return 1; }
// Msg13.cpp now uses the address class so it needs this
//if ( ! initPlaceDescTable ( ) ) {
// log("events: places table init failed"); return 1; }
tryagain:
if ( ! g_proxy.initHttpServer( httpPort, httpsPort ) ) {
log("db: HttpServer init failed. Another gb "
"already running? If not, try editing "
"./hosts.conf to "
"change the port from %"INT32" to something bigger. "
"Or stop gb by running 'gb stop' or by "
"clicking 'save & exit' in the master controls."
, (int32_t)httpPort );
// this is dangerous!!! do not do the shutdown thing
return 1;
// just open a socket to port X and send
// GET /master?save=1
if ( shutdownOldGB(httpPort) ) goto tryagain;
log("db: Shutdown failed.");
return 1;
}
//we should save gb.conf right ?
g_conf.m_save = true;
// initialize Users
//if ( ! g_users.init() ){
//log("db: Users init failed. "); return 1;}
if ( ! g_loop.runLoop() ) {
log("db: runLoop failed." );
return 1;
}
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
if(strcmp(cmd, "catlang") == 0) {
log(LOG_INFO, "cat: Building the DMOZ category language tables...\n");
g_categories->initLangTables();
log(LOG_INFO, "cat: Done.\n");
return(0);
}
if(strcmp(cmd, "catcountry") == 0) {
// Load categories and generate country table
char structureFile[256];
g_conf.m_maxMem = 1000000000LL; // 1G
//g_mem.m_maxMem = 1000000000LL; // 1G
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
g_categories = &g_categories1;
if (g_categories->loadCategories(structureFile) != 0) {
log("cat: Loading Categories From %s Failed.", structureFile);
return(0);
}
log(LOG_INFO, "cat: Building the DMOZ category country table...\n");
g_countryCode.createHashTable();
log(LOG_INFO, "cat: Done.\n");
return(0);
}
if ( strcmp ( cmd , "blaster" ) == 0 ) {
int32_t i=cmdarg+1;
bool isLogFile=false;
bool injectUrlWithLinks=false;
bool injectUrl=false;
int32_t wait = 0;
if ( strcmp (argv[i],"-l") == 0 ){
isLogFile=true;
i++;
}
if ( strcmp (argv[i],"-i") == 0 ){
injectUrlWithLinks=true;
i++;
}
if ( strcmp (argv[i],"-u") == 0 ){
injectUrl=true;
i++;
}
char *filename = argv[i];
int32_t maxNumThreads=1;
if (argv[i+1]) maxNumThreads=atoi(argv[i+1]);
if (argv[i+2]) wait=atoi(argv[i+2]);
g_conf.m_maxMem = 2000000000;
//wait at least 10 msec before you start again.
if (wait<1000) wait=10;
g_blaster.runBlaster (filename,NULL,
maxNumThreads,wait,
isLogFile,false,false,false,
injectUrlWithLinks,
injectUrl);
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
if ( strcmp ( cmd , "blasterdiff" ) == 0 ) {
int32_t i=cmdarg+1;
bool verbose=false;
bool justDisplay=false;
bool useProxy=true;
//cycle through the arguments to check for -v,-j,-p
while (argv[i] && argv[i][0]=='-'){
if ( strcmp (argv[i],"-v") == 0 ){
verbose=true;
}
else if ( strcmp (argv[i],"-j") == 0 ){
justDisplay=true;
}
else if ( strcmp (argv[i],"-p") == 0){
useProxy=false;
}
i++;
}
char *file1 = argv[i];
char *file2 = argv[i+1];
int32_t maxNumThreads=1;
if (argv[i+2]) maxNumThreads=atoi(argv[i+2]);
int32_t wait;
if (argv[i+3]) wait=atoi(argv[i+3]);
//wait at least 1 sec before you start again.
if (wait<1000) wait=1000;
g_blaster.runBlaster(file1,file2,
maxNumThreads,wait,false,
verbose,justDisplay,useProxy);
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
// g_conf.init was here
// now that we have init'd g_hostdb and g_log, call this for an ssh
//if ( strcmp ( cmd , "gendbs" ) == 0 && cmdarg + 2 == argc )
// return install ( ifk_gendbs , -1 , NULL ,
// argv[cmdarg+1] ); // coll
if( strcmp(cmd, "distributeC") == 0 && cmdarg +2 == argc )
return install ( ifk_distributeC, -1, NULL, argv[cmdarg+1] );
//if ( strcmp ( cmd , "gentfndb" ) == 0 && cmdarg + 2 == argc )
// return install ( ifk_gentfndb , -1 , NULL ,
// argv[cmdarg+1] ); // coll
//if ( strcmp ( cmd , "fixtfndb" ) == 0 && cmdarg + 2 == argc )
// return install ( ifk_fixtfndb , -1 , NULL ,
// argv[cmdarg+1] ); // coll
//if ( strcmp ( cmd, "genclusterdb" ) == 0 && cmdarg + 2 == argc )
// return install ( ifk_genclusterdb , -1 , NULL ,
// argv[cmdarg+1] ); // coll
// . dumpmissing <coll> [hostid]
// . if hostid not there, ssh to all using install()
if ( strcmp ( cmd, "dumpmissing" ) == 0 && cmdarg + 2 == argc )
return install ( ifk_dumpmissing , -1 , NULL ,
argv[cmdarg+1] ); // coll
if ( strcmp ( cmd, "dumpdups" ) == 0 && cmdarg + 2 == argc )
return install ( ifk_dumpdups , -1 , NULL ,
argv[cmdarg+1] ); // coll
// . gb removedocids <coll> <docIdsFilename> [hostid1-hostid2]
// . if hostid not there, ssh to all using install()
// . use removedocids below if only running locally
// . cmdarg+3 can be 4 or 5, depending if [hostid1-hostid2] is present
// . argc is 5 if [hostid1-hostid2] is present, 4 if not
if ( strcmp ( cmd, "removedocids" ) == 0 && cmdarg + 3 >= 4 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 3 < argc ) hostId = atoi ( argv[cmdarg+3] );
// might have a range
if ( cmdarg + 3 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+3],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return install ( ifk_removedocids ,
h1,
argv[cmdarg+2], // filename
argv[cmdarg+1], // coll
h2 );
}
// if we had no hostid given, cast to all
if ( hostId == -1 )
return install ( ifk_removedocids ,
-1 , // hostid1
argv[cmdarg+2], // filename
argv[cmdarg+1], // coll
-1 ); // hostid2
// otherwise, a hostid was given and we will call
// removedocids() directly below
}
// gb ping [hostId] [clientPort]
if ( strcmp ( cmd , "ping" ) == 0 ) {
int32_t hostId = 0;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
uint16_t port = 2050;
if ( cmdarg + 2 < argc )
port = (uint16_t)atoi ( argv[cmdarg+2] );
pingTest ( hostId , port );
return 0;
}
// gb injecttest <requestLen> [hostId]
if ( strcmp ( cmd , "injecttest" ) == 0 ) {
if ( cmdarg+1 >= argc ) goto printHelp;
int32_t hostId = 0;
if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
int32_t reqLen = atoi ( argv[cmdarg+1] );
if ( reqLen == 0 ) goto printHelp;
injectFileTest ( reqLen , hostId );
return 0;
}
// gb updatetitledb
/*
if ( strcmp ( cmd , "updatetitledb" ) == 0 ) {
if ( cmdarg+1 != argc ) goto printHelp;
log(LOG_INIT,"db: *-*-*-* Updating Titledb et al.");
g_conf.m_spiderdbMinFilesToMerge = 5;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
//g_conf.m_tfndbMaxTreeMem = 100*1024*1024;
// . re-write all the keys so that they contain the site and
// content hashes in the low bits
// . there should only be one file for this since we don't
// support negatives
fixTitleRecs ( "" ); // coll
return 0;
}
*/
// this is a hack too!
/*
if ( strcmp ( cmd , "mergechecksumdb" ) == 0 ) {
if ( cmdarg+1 != argc ) goto printHelp;
log(LOG_INIT,"db: *-*-*-* Merging checksumdb flat files.");
int32_t old = g_conf.m_checksumdbMinFilesToMerge ;
g_conf.m_checksumdbMinFilesToMerge = 50;
// set up checksumdb
g_conf.m_checksumdbMaxTreeMem = 50000000; // 50M
g_conf.m_maxMem = 1000000000LL; // 1G
g_mem.m_maxMem = 1000000000LL; // 1G
// init it
if ( ! g_checksumdb.init ( ) ) {
log("db: Checksumdb init failed for merge." );
return 1;
}
g_collectiondb.init(true);
g_checksumdb.getRdb()->addRdbBase1 ( "finalmerge" );
// no, otherwise won't be able to load into tree!
//g_conf.m_checksumdbMaxTreeMem = 50*1024*1024;
mergeChecksumFiles();
// reset so when we save value goes back to original
g_conf.m_checksumdbMinFilesToMerge = old;
// save tree to disk
Rdb *r = g_checksumdb.getRdb();
r->m_tree.fastSave ( r->getDir() ,
r->m_dbname , // &m_saveFile ,
false , // useThread ,
NULL , // this ,
NULL );// doneSaving ) )
return 0;
}
*/
/*
// gb inject <file> <ip:port> [startdocid]
// gb inject titledb <newhosts.conf> [startdocid]
if ( strcmp ( cmd , "inject" ) == 0 ) {
if ( argc != cmdarg+3 &&
argc != cmdarg+4 &&
argc != cmdarg+5 )
goto printHelp;
char *file = argv[cmdarg+1];
char *ips = argv[cmdarg+2];
int64_t startDocId = 0LL;
int64_t endDocId = DOCID_MASK;
if ( cmdarg+3 < argc ) startDocId = atoll(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) endDocId = atoll(argv[cmdarg+4]);
injectFile ( file , ips , startDocId , endDocId , false );
return 0;
}
*/
/*
if ( strcmp ( cmd , "reject" ) == 0 ) {
if ( argc != cmdarg+3 &&
argc != cmdarg+4 &&
argc != cmdarg+5 )
goto printHelp;
char *file = argv[cmdarg+1];
char *ips = argv[cmdarg+2];
int64_t startDocId = 0LL;
int64_t endDocId = DOCID_MASK;
//if ( cmdarg+3 < argc ) startDocId = atoll(argv[cmdarg+3]);
//if ( cmdarg+4 < argc ) endDocId = atoll(argv[cmdarg+4]);
injectFile ( file , ips , startDocId , endDocId , true );
return 0;
}
*/
// gb dsh
if ( strcmp ( cmd , "dsh" ) == 0 ) {
// get hostId to install TO (-1 means all)
//int32_t hostId = -1;
if ( cmdarg+1 >= argc ) goto printHelp;
char *cmd = argv[cmdarg+1];
return install ( ifk_dsh , -1,NULL,NULL,-1, cmd );
}
// gb dsh2
if ( strcmp ( cmd , "dsh2" ) == 0 ) {
// get hostId to install TO (-1 means all)
//int32_t hostId = -1;
if ( cmdarg+1 >= argc ) goto printHelp;
char *cmd = argv[cmdarg+1];
return install ( ifk_dsh2 , -1,NULL,NULL,-1, cmd );
}
// gb copyfiles, like gb install but takes a dir not a host #
if ( strcmp ( cmd , "copyfiles" ) == 0 ) {
if ( cmdarg + 1 >= argc ) goto printHelp;
char *dir = argv[cmdarg+1];
return copyFiles ( dir );
}
// gb install
if ( strcmp ( cmd , "install" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t h1 = -1;
int32_t h2 = -1;
if ( cmdarg + 1 < argc ) h1 = atoi ( argv[cmdarg+1] );
// might have a range
if (cmdarg + 1 < argc && strstr(argv[cmdarg+1],"-") )
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
return install ( ifk_install , h1 , NULL , NULL , h2 );
}
// gb install
// if ( strcmp ( cmd , "install2" ) == 0 ) {
// // get hostId to install TO (-1 means all)
// int32_t hostId = -1;
// if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// return install ( ifk_install2 , hostId );
// }
// gb installgb
if ( strcmp ( cmd , "installgb" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installgb , hostId );
}
// gb installgbrcp
if ( strcmp ( cmd , "installgbrcp" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installgbrcp , hostId );
}
// gb installgb
if ( strcmp ( cmd , "installgb2" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installgb2 , hostId );
}
// gb installtmpgb
if ( strcmp ( cmd , "installtmpgb" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installtmpgb , hostId );
}
// gb installconf
if ( strcmp ( cmd , "installconf" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installconf , hostId );
}
// gb installconf2
if ( strcmp ( cmd , "installconf2" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installconf2 , hostId );
}
// gb installcat
if ( strcmp ( cmd , "installcat" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installcat , hostId );
}
// gb installcat2
if ( strcmp ( cmd , "installcat2" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installcat2 , hostId );
}
// gb installnewcat
if ( strcmp ( cmd , "installnewcat" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installnewcat , hostId );
}
// gb installnewcat2
if ( strcmp ( cmd , "installnewcat2" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
return install ( ifk_installnewcat2 , hostId );
}
// gb start [hostId]
if ( strcmp ( cmd , "start" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
//
// default to keepalive start for now!!
//
return install ( ifk_kstart , h1,
NULL,NULL,h2 );
}
// if it is us, do it
//if ( hostId != -1 ) goto mainStart;
//
// default to keepalive start for now!! (was ifk_start)
//
return install ( ifk_kstart , hostId );
}
// gb astart [hostId] (non-keepalive start)
if ( strcmp ( cmd , "nstart" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return install ( ifk_start , h1,
NULL,NULL,h2 );
}
// if it is us, do it
//if ( hostId != -1 ) goto mainStart;
return install ( ifk_start , hostId );
}
// gb tmpstart [hostId]
if ( strcmp ( cmd , "tmpstart" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return install ( ifk_tmpstart , h1,
NULL,NULL,h2 );
}
// if it is us, do it
//if ( hostId != -1 ) goto mainStart;
return install ( ifk_tmpstart, hostId );
}
if ( strcmp ( cmd , "tmpstop" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return doCmd ( "save=1" , h1 , "master" ,
true , //sendtohosts
false,//sendtoproxies
h2 );
}
return doCmd ( "save=1" , hostId , "master" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb start2 [hostId]
if ( strcmp ( cmd , "start2" ) == 0 ) {
// get hostId to install TO (-1 means all)
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return install ( ifk_start2 , h1,
NULL,NULL,h2 );
}
// if it is us, do it
//if ( hostId != -1 ) goto mainStart;
return install ( ifk_start2 , hostId );
}
//keep alive start... not!
if ( strcmp ( cmd , "dstart" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return install ( ifk_dstart , h1,
NULL,NULL,h2 );
}
return install ( ifk_dstart , hostId );
}
if ( strcmp ( cmd , "kstop" ) == 0 ) {
//same as stop, here for consistency
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return doCmd ( "save=1" , h1 , "master" ,
true , //sendtohosts
false,//sendtoproxies
h2 );
}
return doCmd ( "save=1" , hostId , "master" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb backupcopy [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backupcopy" ) == 0 ) {
if ( cmdarg + 1 >= argc ) goto printHelp;
return install ( ifk_backupcopy , -1 , argv[cmdarg+1] );
}
// gb backupmove [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backupmove" ) == 0 ) {
if ( cmdarg + 1 >= argc ) goto printHelp;
return install ( ifk_backupmove , -1 , argv[cmdarg+1] );
}
// gb backupmove [hostId] <backupSubdirName>
if ( strcmp ( cmd , "backuprestore" ) == 0 ) {
if ( cmdarg + 1 >= argc ) goto printHelp;
return install ( ifk_backuprestore, -1 , argv[cmdarg+1] );
}
// gb scale <hosts.conf>
if ( strcmp ( cmd , "scale" ) == 0 ) {
if ( cmdarg + 1 >= argc ) goto printHelp;
return scale ( argv[cmdarg+1] , true );
}
if ( strcmp ( cmd , "collinject" ) == 0 ) {
if ( cmdarg + 1 >= argc ) goto printHelp;
return collinject ( argv[cmdarg+1] );
}
// gb collcopy <hosts.conf> <coll> <collnum>>
if ( strcmp ( cmd , "collcopy" ) == 0 ) {
if ( cmdarg + 4 != argc ) goto printHelp;
char *hostsconf = argv[cmdarg+1];
char *coll = argv[cmdarg+2];
int32_t collnum = atoi(argv[cmdarg+3]);
return collcopy ( hostsconf , coll , collnum );
}
// gb stop [hostId]
if ( strcmp ( cmd , "stop" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return doCmd ( "save=1" , h1 , "master" ,
true , //sendtohosts
false,//sendtoproxies
h2 );
}
return doCmd ( "save=1" , hostId , "master" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb save [hostId]
if ( strcmp ( cmd , "save" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return doCmd ( "js=1" , h1 , "master" ,
true , //sendtohosts
false,//sendtoproxies
h2 );
}
return doCmd ( "js=1" , hostId , "master" ,
true , //sendtohosts
false );//sendtoproxies
}
	// gb spidersoff [hostId]
	// broadcast "se=0" (spidering enabled = off) to the master
	// controls page of all hosts, or just hostId if given
	if ( strcmp ( cmd , "spidersoff" ) == 0 ) {
		int32_t hostId = -1;
		if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
		return doCmd ( "se=0" , hostId , "master" ,
			       true , //sendtohosts
			       false );//sendtoproxies
	}
	// gb spiderson [hostid]
	// broadcast "se=1" to turn spidering back on
	if ( strcmp ( cmd , "spiderson" ) == 0 ) {
		int32_t hostId = -1;
		if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
		return doCmd ( "se=1" , hostId , "master" ,
			       true , //sendtohosts
			       false );//sendtoproxies
	}
	// gb cacheoff [hostId]
	// NOTE(review): "dpco=1" presumably disables a page cache --
	// confirm against the master controls parm list
	if ( strcmp ( cmd , "cacheoff" ) == 0 ) {
		int32_t hostId = -1;
		if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
		return doCmd ( "dpco=1" , hostId , "master" ,
			       true , //sendtohosts
			       false );//sendtoproxies
	}
	// gb freecache [hostId]
	// historical no-op: the freeAllSharedMem() call is commented out,
	// so this just parses the optional arg and reports success
	if ( strcmp ( cmd , "freecache" ) == 0 ) {
		int32_t max = 7000000;
		if ( cmdarg + 1 < argc ) max = atoi ( argv[cmdarg+1] );
		//freeAllSharedMem( max );
		return true;
	}
	// gb ddump [hostId]
	// broadcast the "dump=1" master command (disk dump of rdb data)
	if ( strcmp ( cmd , "ddump" ) == 0 ) {
		int32_t hostId = -1;
		if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
		return doCmd ( "dump=1" , hostId , "master" ,
			       true , //sendtohosts
			       false );//sendtoproxies
	}
// gb pmerge [hostId]
if ( strcmp ( cmd , "pmerge" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return doCmd ( "pmerge=1",h1,"master",
true , //sendtohosts
false ,//sendtoproxiesh2
h2 );
}
return doCmd ( "pmerge=1" , hostId , "master" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb smerge [hostId]
if ( strcmp ( cmd , "smerge" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return doCmd ( "smerge=1",h1,"master",
true , //sendtohosts
false ,//sendtoproxies
h2 );
}
return doCmd ( "smerge=1" , hostId , "master" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb tmerge [hostId]
if ( strcmp ( cmd , "tmerge" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return doCmd ( "tmerge=1",h1,"master",
true , //sendtohosts
false, //sendtoproxies
h2);
}
return doCmd ( "tmerge=1" , hostId , "master" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb merge [hostId]
if ( strcmp ( cmd , "merge" ) == 0 ) {
int32_t hostId = -1;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
// might have a range
if ( cmdarg + 1 < argc ) {
int32_t h1 = -1;
int32_t h2 = -1;
sscanf ( argv[cmdarg+1],"%"INT32"-%"INT32"",&h1,&h2);
if ( h1 != -1 && h2 != -1 && h1 <= h2 )
return doCmd ( "merge=1",h1,"master",
true , //sendtohosts
false,//sendtoproxies
h2);
}
return doCmd ( "merge=1" , hostId , "master" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb setnote <hostid> <note>
if ( strcmp ( cmd, "setnote" ) == 0 ) {
int32_t hostId;
char *note;
if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
else return false;
if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
else return false;
char urlnote[1024];
urlEncode(urlnote, 1024, note, gbstrlen(note));
log ( LOG_INIT, "conf: setnote %"INT32": %s", hostId, urlnote );
char setnoteCmd[256];
sprintf(setnoteCmd, "setnote=1&host=%"INT32"&note=%s",
hostId, urlnote);
return doCmd ( setnoteCmd, -1, "admin/hosts" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb setsparenote <spareid> <note>
if ( strcmp ( cmd, "setsparenote" ) == 0 ) {
int32_t spareId;
char *note;
if ( cmdarg + 1 < argc ) spareId = atoi ( argv[cmdarg+1] );
else return false;
if ( cmdarg + 2 < argc ) note = argv[cmdarg+2];
else return false;
char urlnote[1024];
urlEncode(urlnote, 1024, note, gbstrlen(note));
log(LOG_INIT, "conf: setsparenote %"INT32": %s", spareId, urlnote);
char setnoteCmd[256];
sprintf(setnoteCmd, "setsparenote=1&spare=%"INT32"&note=%s",
spareId, urlnote);
return doCmd ( setnoteCmd, -1, "admin/hosts" ,
true , //sendtohosts
false );//sendtoproxies
}
	// gb replacehost <hostid> <spareid>
	// asks the admin/hosts page on all hosts AND proxies to replace
	// host <hostid> with spare <spareid>; missing args default to -1
	if ( strcmp ( cmd, "replacehost" ) == 0 ) {
		int32_t hostId = -1;
		int32_t spareId = -1;
		if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
		if ( cmdarg + 2 < argc ) spareId = atoi ( argv[cmdarg+2] );
		// two int32s cannot overflow this buffer
		char replaceCmd[256];
		sprintf(replaceCmd, "replacehost=1&rhost=%"INT32"&rspare=%"INT32"",
			hostId, spareId);
		return doCmd ( replaceCmd, -1, "admin/hosts" ,
			       true , //sendtohosts
			       true );//sendtoproxies
	}
	// gb synchost <hostid>
	// sends "synchost=1&shost=<hostid>" to our own host's admin/hosts
	// page; returns false if the hostid arg is missing
	if ( strcmp ( cmd, "synchost" ) == 0 ) {
		int32_t hostId = -1;
		if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
		else return false;
		char syncCmd[256];
		sprintf(syncCmd, "synchost=1&shost=%"INT32"", hostId);
		return doCmd ( syncCmd, g_hostdb.m_hostId, "admin/hosts" ,
			       true , //sendtohosts
			       false );//sendtoproxies
	}
	// gb synchost2 <hostid>
	// same as synchost but sends mode 2 ("synchost=2")
	if ( strcmp ( cmd, "synchost2" ) == 0 ) {
		int32_t hostId = -1;
		if ( cmdarg + 1 < argc ) hostId = atoi ( argv[cmdarg+1] );
		else return false;
		char syncCmd[256];
		sprintf(syncCmd, "synchost=2&shost=%"INT32"", hostId);
		return doCmd ( syncCmd, g_hostdb.m_hostId, "admin/hosts" ,
			       true, //sendToHosts
			       false );// sendtoproxies
	}
// gb startclassifier coll ruleset [hostId]
/*
if ( strcmp ( cmd , "startclassifier" ) == 0 ) {
int32_t hostId = 0;
char *coll;
char *ruleset;
char *siteListFile = NULL;
if ( cmdarg + 1 < argc ) coll = argv[cmdarg+1];
else return false;
if ( cmdarg + 2 < argc ) ruleset = argv[cmdarg+2];
else return false;
if ( cmdarg + 3 < argc ) hostId = atoi ( argv[cmdarg+3] );
if ( cmdarg + 4 < argc ) siteListFile = argv[cmdarg+4];
char classifierCmd[512];
if ( ! siteListFile )
sprintf(classifierCmd, "startclassifier=1&c=%s"
"&ruleset=%s", coll, ruleset);
else
sprintf(classifierCmd, "startclassifier=1&c=%s"
"&ruleset=%s&sitelistfile=%s",
coll, ruleset, siteListFile );
return doCmd(classifierCmd , hostId , "admin/tagdb" ,
true , //sendtohosts
false );//sendtoproxies
}
// gb stopclassifier [hostId]
if ( strcmp ( cmd , "stopclassifier" ) == 0 ) {
char *coll;
if ( cmdarg + 1 < argc ) coll = argv[cmdarg+1];
else return false;
int32_t hostId = 0;
if ( cmdarg + 2 < argc ) hostId = atoi ( argv[cmdarg+2] );
char classifierCmd[512];
sprintf(classifierCmd, "stopclassifier=1&c=%s", coll );
return doCmd(classifierCmd , hostId , "admin/tagdb" ,
true , //sendtohosts
false );//sendtoproxies
}
*/
// gb [-h hostsConf] <hid>
// mainStart:
// get host info for this host
//Host *h = g_hostdb.getHost ( hostId );
//if ( ! h ) { log("db: No host has id %"INT32".",hostId); return 1;}
	// once we are in recovery mode, that means we are being restarted
	// from having cored, so to prevent immediate core and restart
	// ad infinitum, look for "sigbadhandler" at the end of the
	// last 5 logs in the last 60 seconds. if we see that then something
	// is preventing us from starting up, so give up and exit gracefully
if ( g_recoveryMode && isRecoveryFutile () )
// exiting with 0 means no error and should tell our
// keep alive loop to not restart us and exit himself.
exit (0);
// HACK: enable logging for Conf.cpp, etc.
g_process.m_powerIsOn = true;
// . read in the conf file
// . this now initializes from a dir and hostId, they should all be
// name gbHID.conf
// . now that hosts.conf has more of the burden, all gbHID.conf files
// can be identical
if ( ! g_conf.init ( h9->m_dir ) ) { // , h->m_hostId ) ) {
log("db: Conf init failed." ); return 1; }
//if ( ! g_hostdb.validateIps ( &g_conf ) ) {
// log("db: Failed to validate ips." ); return 1;}
//if ( ! g_hostdb2.validateIps ( &g_conf ) ) {
// log("db: Failed to validate ips." ); return 1;}
// put in read only mode
if ( useTmpCluster )
g_conf.m_readOnlyMode = true;
if ( useTmpCluster )
g_conf.m_sendEmailAlerts = false;
// log how much mem we can use
//log(LOG_INIT,"conf: Max mem allowed to use is %"INT64"\n",
//g_conf.m_maxMem);
// load the language specific pages
g_languagePages.reloadPages();
// init the loop, needs g_conf
if ( ! g_loop.init() ) {
log("db: Loop init failed." ); return 1; }
	// test the infinite keep-alive bug fix. is recovery futile bug.
//char *xx=NULL;*xx=0;
// the new way to save all rdbs and conf
// if g_process.m_powerIsOn is false, logging will not work, so init
// this up here. must call after Loop::init() so it can register
// its sleep callback
g_process.init();
// set up the threads, might need g_conf
// avoid logging threads msgs to stderr if not actually starting up
// a gb daemon...
//if(cmd && cmd[0] && ! is_digit(cmd[0]) && ! g_threads.init() ) {
//if ( ! g_threads.init() ) {
// log("db: Threads init failed." ); return 1; }
// gb gendict
if ( strcmp ( cmd , "gendict" ) == 0 ) {
// get hostId to install TO (-1 means all)
if ( argc != cmdarg + 2 &&
argc != cmdarg + 3 ) goto printHelp; // take no other args
char *coll = argv[cmdarg+1];
// get numWordsToDump
int32_t nn = 10000000;
if ( argc == cmdarg + 3 ) nn = atoi ( argv[cmdarg+2] );
// . generate the dict files
// . use the first 100,000,000 words/phrases to make them
g_speller.generateDicts ( nn , coll );
return 0;
}
if ( strcmp ( cmd , "dumpmissing" ) == 0 ) {
// got collection and hostid in here
if ( argc != cmdarg + 3 ) goto printHelp;
char *coll = argv[cmdarg+1];
dumpMissing ( coll );
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
if ( strcmp ( cmd , "dumpdups" ) == 0 ) {
// got collection and hostid in here
if ( argc != cmdarg + 3 ) goto printHelp;
char *coll = argv[cmdarg+1];
dumpDups ( coll );
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
// removedocids <coll> <filename> <hostid>
if ( strcmp ( cmd , "removedocids" ) == 0 ) {
if ( argc != cmdarg + 4 ) goto printHelp;
char *coll = argv[cmdarg+1];
char *file = argv[cmdarg+2];
removeDocIds ( coll , file );
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
#ifndef _CLIENT_
#ifndef _METALINCS_
// gb dump i [fileNum] [off]
if ( strcmp ( cmd , "dump" ) == 0 && argc > cmdarg + 1 &&
argv[cmdarg+1][0]=='I') {
//if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
//}
int32_t fileNum = 0;
int64_t off = 0LL;
char *NAME = NULL;
//if ( cmdarg + 2 < argc ) fileNum = atoi (argv[cmdarg+2]);
if ( cmdarg + 2 < argc ) NAME = argv[cmdarg+2];
if ( cmdarg + 3 < argc ) off = atoll1(argv[cmdarg+3]);
dumpIndexdbFile ( fileNum , off , "indexdb" , 12 , NAME );
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
if ( strcmp ( cmd , "rmtest" ) == 0 ) {
rmTest();
return 0;
}
if ( strcmp ( cmd , "dump" ) == 0 && argc > cmdarg + 1 &&
argv[cmdarg+1][0]=='T') {
//if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
//}
int32_t fileNum = 0;
int64_t off = 0LL;
if ( cmdarg + 2 < argc ) fileNum = atoi (argv[cmdarg+2]);
if ( cmdarg + 3 < argc ) off = atoll1(argv[cmdarg+3]);
dumpIndexdbFile ( fileNum , off , "datedb" , 16 );
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
#endif
#endif
// . gb dump [dbLetter][coll][fileNum] [numFiles] [includeTree][termId]
// . spiderdb is special:
// gb dump s [coll][fileNum] [numFiles] [includeTree] [0=old|1=new]
// [priority] [printStats?]
if ( strcmp ( cmd , "dump" ) == 0 ) {
// if ( ! hadHostId ) {
// log("you must supply hostid in the dump cmd");
// return 0;
// }
//
// tell Collectiondb, not to verify each rdb's data
//
g_dumpMode = true;
if ( cmdarg+1 >= argc ) goto printHelp;
int32_t startFileNum = 0;
int32_t numFiles = -1;
int32_t includeTree = 1;
int64_t termId = -1;
char *coll = "";
// so we do not log every collection coll.conf we load
g_conf.m_doingCommandLine = true;
// we have to init collection db because we need to know if
// the collnum is legit or not in the tree
if ( ! g_collectiondb.loadAllCollRecs() ) {
log("db: Collectiondb init failed." ); return 1; }
if ( cmdarg+2 < argc ) coll = argv[cmdarg+2];
if ( cmdarg+3 < argc ) startFileNum = atoi(argv[cmdarg+3]);
if ( cmdarg+4 < argc ) numFiles = atoi(argv[cmdarg+4]);
if ( cmdarg+5 < argc ) includeTree = atoi(argv[cmdarg+5]);
if ( cmdarg+6 < argc ) {
char *targ = argv[cmdarg+6];
if ( is_alpha_a(targ[0]) ) {
char *colon = strstr(targ,":");
int64_t prefix64 = 0LL;
if ( colon ) {
*colon = '\0';
prefix64 = hash64n(targ);
targ = colon + 1;
}
// hash the term itself
termId = hash64n(targ);
// hash prefix with termhash
if ( prefix64 )
termId = hash64(termId,prefix64);
termId &= TERMID_MASK;
}
else {
termId = atoll1(targ);
}
}
if ( argv[cmdarg+1][0] == 't' ) {
int64_t docId = 0LL;
if ( cmdarg+6 < argc ) docId = atoll1(argv[cmdarg+6]);
bool justPrintSentences = false;
bool justPrintWords = false;
// support "ts"
if ( argv[cmdarg+1][1] == 's' )
justPrintSentences = true;
// support "tw"
if ( argv[cmdarg+1][1] == 'w' )
justPrintWords = true;
dumpTitledb (coll,startFileNum,numFiles,includeTree,
docId,0,
justPrintSentences,
justPrintWords);
}
else if ( argv[cmdarg+1][0] == 'D' ) {
int64_t docId = 0LL;
if ( cmdarg+6 < argc ) docId = atoll1(argv[cmdarg+6]);
dumpTitledb(coll,startFileNum,numFiles,includeTree,
docId,1,false,false);
}
//else if(argv[cmdarg+1][0] == 'v' && argv[cmdarg+1][1] =='u' )
// dumpTfndb (coll,startFileNum,numFiles,includeTree,1);
//else if ( argv[cmdarg+1][0] == 'u' )
// dumpTfndb (coll,startFileNum,numFiles,includeTree,0);
else if ( argv[cmdarg+1][0] == 'w' )
dumpWaitingTree(coll);
else if ( argv[cmdarg+1][0] == 'x' )
dumpDoledb (coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 's' ) {
//int32_t isNew = 1;
//int32_t priority = -1;
char printStats = 0;
int32_t firstIp = 0;
//char *coll = NULL;
//if(cmdarg+6 < argc ) isNew = atol(argv[cmdarg+6]);
//if(cmdarg+7 < argc ) priority = atol(argv[cmdarg+7]);
if ( cmdarg+6 < argc ){
printStats= atol(argv[cmdarg+6]);
// it could be an ip instead of printstats
if ( strstr(argv[cmdarg+6],".") ) {
printStats = 0;
firstIp = atoip(argv[cmdarg+6]);
}
}
//if ( cmdarg+7 < argc ) coll = argv[cmdarg+7];
int32_t ret = dumpSpiderdb ( coll,startFileNum,numFiles,
includeTree ,
printStats ,
firstIp );
if ( ret == -1 )
fprintf(stdout,"error dumping spiderdb\n");
}
else if ( argv[cmdarg+1][0] == 'B' )
dumpSectiondb(coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 'V' )
dumpRevdb(coll,startFileNum,numFiles,includeTree);
else if ( argv[cmdarg+1][0] == 'S' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) site = argv[cmdarg+6];
dumpTagdb(coll,
startFileNum,
numFiles,
includeTree,
0,
0,
RDB_TAGDB,
site);
}
else if ( argv[cmdarg+1][0] == 'z' ) {
char *site = NULL;
if ( cmdarg+6 < argc ) site = argv[cmdarg+6];
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
'z',RDB_TAGDB,site);
}
else if ( argv[cmdarg+1][0] == 'A' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
'A');
else if ( argv[cmdarg+1][0] == 'a' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
'D');
else if ( argv[cmdarg+1][0] == 'G' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
'G');
else if ( argv[cmdarg+1][0] == 'W' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,1);
else if ( argv[cmdarg+1][0] == 'C' )
dumpTagdb (coll,startFileNum,numFiles,includeTree,0,
0,RDB_CATDB);
else if ( argv[cmdarg+1][0] == 'l' )
dumpClusterdb (coll,startFileNum,numFiles,includeTree);
//else if ( argv[cmdarg+1][0] == 'c' )
// dumpChecksumdb(coll,startFileNum,numFiles,includeTree);
//else if ( argv[cmdarg+1][0] == 'z' )
// dumpStatsdb(startFileNum,numFiles,includeTree,2);
//else if ( argv[cmdarg+1][0] == 'Z' )
// dumpStatsdb(startFileNum,numFiles,includeTree,4);
else if ( argv[cmdarg+1][0] == 'L' ) {
char *url = NULL;
if ( cmdarg+6 < argc ) url = argv[cmdarg+6];
dumpLinkdb(coll,startFileNum,numFiles,includeTree,url);
}
#ifndef _CLIENT_
#ifndef _METALINCS_
#ifndef _GLOBALSPEC_
else if ( argv[cmdarg+1][0] == 'i' )
dumpIndexdb (coll,startFileNum,numFiles,includeTree,
termId);
else if ( argv[cmdarg+1][0] == 'p' )
dumpPosdb (coll,startFileNum,numFiles,includeTree,
termId,false);
else if ( argv[cmdarg+1][0] == 'd' )
dumpDatedb (coll,startFileNum,numFiles,includeTree,
termId,false);
#endif
#endif
#endif
/*
else if ( argv[cmdarg+1][0] == 'c' ) {
int64_t docId = 0LL;
if ( cmdarg+6 < argc ) docId = atoll1(argv[cmdarg+6]);
dumpCachedRecs (coll,startFileNum,numFiles,includeTree,
docId);
}
*/
/*
else if ( argv[cmdarg+1][0] == 'R' ) {
int64_t docId = 0LL;
if ( cmdarg+6 < argc ) docId = atoll1(argv[cmdarg+6]);
testSpamRules (coll,startFileNum,numFiles,includeTree,
docId);
}
*/
else goto printHelp;
// disable any further logging so final log msg is clear
g_log.m_disabled = true;
return 0;
}
if( strcmp( cmd, "countdomains" ) == 0 && argc >= (cmdarg + 2) ) {
char *coll = "";
int32_t verb;
int32_t outpt;
coll = argv[cmdarg+1];
if( argv[cmdarg+2][0] < 0x30 && argv[cmdarg+2][0] > 0x39 )
goto printHelp;
int32_t numRecs = atoi( argv[cmdarg+2] );
if( argc > (cmdarg + 2) ) verb = atoi( argv[cmdarg+2] );
else verb = 0;
if( argc > (cmdarg + 3) ) outpt = atoi( argv[cmdarg+3] );
else outpt = 0;
log( LOG_INFO, "cntDm: Allocated Larger Mem Table for: %"INT32"",
g_mem.m_memtablesize );
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
countdomains( coll, numRecs, verb, outpt );
g_log.m_disabled = true;
return 0;
}
//log("db: RLIMIT_NOFILE = %"INT32"",(int32_t)rlim.rlim_max);
//exit(0);
// . disable o/s's and hard drive's read ahead
// . set multcount to 16 --> 1 interrupt for every 16 sectors read
// . multcount of 16 reduces OS overhead by 30%-50% (more throughput)
// . use hdparm -i to find max mult count
// . -S 100 means turn off spinning if idle for 500 seconds
// . this should be done in /etc/rc.sysinit or /etc/sysconfig/harddisks
//system("hdparm -a 0 -A 0 -m 16 -S 100 /dev/hda");
//system("hdparm -a 0 -A 0 -m 16 -S 100 /dev/hdb");
//system("hdparm -a 0 -A 0 -m 16 -S 100 /dev/hdc");
//system("hdparm -a 0 -A 0 -m 16 -S 100 /dev/hdd");
//system ("rm /gigablast/*.dat");
//system ("rm /gigablast/*.map");
//if ( g_hostdb.m_hostId == 0 ) g_conf.m_logDebugUdp = 1;
//g_conf.m_spideringEnabled = 1;
//g_conf.m_logDebugBuild = 1;
// temp merge test
//RdbList list;
//list.testIndexMerge();
// file creation test, make sure we have dir control
if ( checkDirPerms ( g_hostdb.m_dir ) < 0 ) return 1;
// . make sure we have critical files
// . make sure elvtune is in the /etc/rcS.d/S99local if need be
//if ( ! checkFiles ( g_hostdb.m_dir ) ) return 1;
if ( ! g_process.checkFiles ( g_hostdb.m_dir ) ) return 1;
// load the appropriate dictionaries
//g_speller.init();
//if ( !g_speller.init ( ) ) return 1;
g_errno = 0;
//g_speller.test ( );
//exit(-1);
/*
char dst[1024];
char test[1024];
spellLoop:
test[0] = '\0';
gets ( test );
if ( test[gbstrlen(test)-1] == '\n' ) test[gbstrlen(test)-1] = '\0';
Query qq;
qq.set ( test , gbstrlen(test) , NULL , 0 , false );
if ( g_speller.getRecommendation ( &qq , dst , 1000 ) )
log("spelling suggestion: %s", dst );
goto spellLoop;
*/
//if ( strcmp ( cmd , "fixtfndb" ) == 0 ) {
// char *coll = argv[cmdarg+1];
// // clean out tfndb*.dat
// fixTfndb ( coll ); // coll
//}
// make sure port is available, no use loading everything up then
// failing because another process is already running using this port
//if ( ! g_udpServer.testBind ( g_hostdb.getMyPort() ) )
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort,
true)) // printmsg?
return 1;
int32_t *ips;
//if ( strcmp ( cmd , "gendbs" ) == 0 ) goto jump;
//if ( strcmp ( cmd , "gentfndb" ) == 0 ) goto jump;
if ( strcmp ( cmd , "gencatdb" ) == 0 ) goto jump;
//if ( strcmp ( cmd , "genclusterdb" ) == 0 ) goto jump;
// if ( cmd && ! is_digit(cmd[0]) ) goto printHelp;
log("db: Logging to file %s.",
g_hostdb.m_logFilename );
if ( ! g_conf.m_runAsDaemon )
log("db: Use 'gb -d' to run as daemon. Example: "
"gb -d");
/*
// tmp stuff to generate new query log
if ( ! ucInit(g_hostdb.m_dir, true)) return 1;
if ( ! g_wiktionary.load() ) return 1;
if ( ! g_wiktionary.test() ) return 1;
if ( ! g_wiki.load() ) return 1;
if ( ! g_speller.init() && g_conf.m_isLive ) return 1;
if ( ! g_langList.loadLists ( ) ) log("init: loadLists Failed");
if ( ! loadQueryLog() ) return 1;
return 0;
*/
// start up log file
if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
fprintf (stderr,"db: Log file init failed. Exiting.\n" );
return 1;
}
// in case we do not have one, we need it for Images.cpp
if ( ! makeTrashDir() ) {
fprintf (stderr,"db: failed to make trash dir. Exiting.\n" );
return 1;
}
g_errno = 0;
//
// run as daemon now
//
//fprintf(stderr,"running as daemon\n");
if ( g_conf.m_runAsDaemon ) {
pid_t pid, sid;
pid = fork();
if ( pid < 0 ) exit(EXIT_FAILURE);
// seems like we core unless parent sets this to NULL.
// it does not affect the child.
//if ( pid > 0 ) g_hostdb.m_myHost = NULL;
// child gets a 0, parent gets the child's pid, so exit
if ( pid > 0 ) exit(EXIT_SUCCESS);
// change file mode mask
umask(0);
sid = setsid();
if ( sid < 0 ) exit(EXIT_FAILURE);
//fprintf(stderr,"done\n");
// set our new pid
g_mem.setPid();
g_threads.setPid();
g_log.setPid();
// if we do not do this we don't get sigalarms or quickpolls
// when running as 'gb -d'
g_loop.init();
}
// initialize threads down here now so it logs to the logfile and
// not stderr
//if ( ( ! cmd || !cmd[0]) && ! g_threads.init() ) {
// log("db: Threads init failed." ); return 1; }
g_log.m_logTimestamps = true;
// log the version
log(LOG_INIT,"conf: Gigablast Version: %s",getVersion());
log(LOG_INIT,"conf: Gigablast Architecture: %"INT32"-bit\n",arch);
// show current working dir
log("host: Working directory is %s",workingDir);
log("host: Using %shosts.conf",g_hostdb.m_dir);
{
pid_t pid = getpid();
log("host: Process ID is %"UINT64"",(int64_t)pid);
}
// from Hostdb.cpp
ips = getLocalIps();
for ( ; ips && *ips ; ips++ )
log("host: Detected local ip %s",iptoa(*ips));
// show it
log("host: Running as host id #%"INT32"",g_hostdb.m_hostId );
if (!ucInit(g_hostdb.m_dir, true)) {
log("Unicode initialization failed!");
return 1;
}
// some tests. the greek letter alpha with an accent mark (decompose)
/*
{
char us[] = {0xe1,0xbe,0x80};
UChar32 uc = utf8Decode(us);//,&next);
UChar32 ttt[32];
int32_t klen = recursiveKDExpand(uc,ttt,256);
char obuf[64];
for ( int32_t i = 0 ; i < klen ; i++ ) {
UChar32 ui = ttt[i];
int32_t blen = utf8Encode(ui,obuf);
obuf[blen]=0;
int32_t an = ucIsAlpha(ui);
fprintf(stderr,"#%"INT32"=%s (alnum=%"INT32")\n",i,obuf,an);
}
fprintf(stderr,"hey\n");
exit(0);
}
*/
/*
PRINT OUT all Unicode characters and their decompositions
{
for ( int32_t uc = 0 ; uc < 0xe01ef ; uc++ ) {
//if ( ! ucIsAlnum(uc) ) continue;
UChar32 ttt[32];
int32_t klen = recursiveKDExpand(uc,ttt,256);
char obuf[64];
int32_t clen = utf8Encode(uc,obuf);
obuf[clen]=0;
// print utf8 char we are decomposing
fprintf(stderr,"%"XINT32") %s --> ",uc,obuf);
// sanity
if ( klen > 1 && ttt[0] == (UChar32)uc ) {
fprintf(stderr,"SAME\n");
continue;
}
// print decomposition
for ( int32_t i = 0 ; i < klen ; i++ ) {
UChar32 ui = ttt[i];
char qbuf[64];
int32_t blen = utf8Encode(ui,qbuf);
qbuf[blen]=0;
fprintf(stderr,"%s",qbuf);
// show the #
fprintf(stderr,"{%"XINT32"}",(int32_t)ui);
if ( i+1<klen ) fprintf(stderr,", ");
}
// show utf8 rep
fprintf(stderr," [");
for ( int32_t i = 0 ; i < clen ; i++ ) {
fprintf(stderr,"0x%hhx",(int)obuf[i]);
if ( i+1<clen) fprintf(stderr," ");
}
fprintf(stderr,"]");
fprintf(stderr,"\n");
}
exit(0);
}
*/
// the wiktionary for lang identification and alternate word forms/
// synonyms
if ( ! g_wiktionary.load() ) return 1;
if ( ! g_wiktionary.test() ) return 1;
// . load synonyms, synonym affinity, and stems
// . now we are using g_synonyms
//g_thesaurus.init();
//g_synonyms.init();
// the wiki titles
if ( ! g_wiki.load() ) return 1;
// the query log split
//if ( ! loadQueryLog() ) return 1;
jump:
// force give up on dead hosts to false
g_conf.m_giveupOnDeadHosts = 0;
// shout out if we're in read only mode
if ( g_conf.m_readOnlyMode )
log("db: -- Read Only Mode Set. Can Not Add New Data. --");
//#ifdef SPLIT_INDEXDB
//if ( g_hostdb.m_indexSplits > 1 )
// log("db: -- Split Index ENABLED. Split count set to: %"INT32" --",
// g_hostdb.m_indexSplits);
//#endif
// . set up shared mem now, only on udpServer2
// . will only set it up if we're the lowest hostId on this ip
//if ( ! g_udpServer2.setupSharedMem() ) {
// log("db: SharedMem init failed" ); return 1; }
// the robots.txt db
//if ( ! g_robotdb.init() ) {
// log("db: Robotdb init failed." ); return 1; }
// . collectiondb, does not use rdb, loads directly from disk
// . do this up here so RdbTree::fixTree() can fix RdbTree::m_collnums
// . this is a fake init, cuz we pass in "true"
if ( ! g_isYippy && ! g_collectiondb.loadAllCollRecs() ) {
log("db: Collectiondb load failed." ); return 1; }
// a hack to rename files that were not renamed because of a bug
// in the repair/build process
/*
if ( ! g_titledb2.init2 ( 100000000 ) ) {
log("db: Titledb init2 failed." ); return 1; }
if ( ! g_titledb2.addRdbBase1 ( "mainRebuild" ) ) {
log("db: Titledb addcoll failed." ); return 1; }
g_titledb2
// get the base
RdbBase *base = g_titledb2.m_rdb.m_bases[1];
// panic?
if ( ! base ) { log("db: titledb2: no base."); return 1; }
// now clean them up
base->removeRebuildFromFilenames ( ) ;
// stop
return 1;
*/
// then statsdb
if ( ! g_statsdb.init() ) {
log("db: Statsdb init failed." ); return 1; }
// allow adds to statsdb rdb tree
g_process.m_powerIsOn = true;
// then indexdb
//if ( ! g_indexdb.init() ) {
// log("db: Indexdb init failed." ); return 1; }
if ( ! g_posdb.init() ) {
log("db: Posdb init failed." ); return 1; }
// for sorting results by date
//if ( ! g_datedb.init() ) {
// log("db: Datedb init failed." ); return 1; }
// for sorting events by time
//if ( ! g_timedb.init() ) {
// log("db: Datedb init failed." ); return 1; }
// then titledb
if ( ! g_titledb.init() ) {
log("db: Titledb init failed." ); return 1; }
// then revdb
//if ( ! g_revdb.init() ) {
// log("db: Revdb init failed." ); return 1; }
// then tagdb
if ( ! g_tagdb.init() ) {
log("db: Tagdb init failed." ); return 1; }
// the catdb, it's an instance of tagdb, pass RDB_CATDB
if ( ! g_catdb.init() ) {
log("db: Catdb1 init failed." ); return 1; }
// initialize Users
if ( ! g_users.init() ){
log("db: Users init failed. "); return 1;}
// int64_t uu = gettimeofdayInMilliseconds();
// for ( int i = 0 ; i < 10000000 ; i++ )
// bool x = g_threads.amThread();
// int64_t uu2 = gettimeofdayInMilliseconds();
// log("tod: took %"INT64,uu2-uu);
//if ( ! g_syncdb.init() ) {
// log("db: Syncdb init failed." ); return 1; }
// if generating spiderdb/tfndb/checksumdb, boost minfiles
//if ( strcmp ( cmd, "gendbs" ) == 0 ) {
// // don't let spider merge all the time!
// g_conf.m_spiderdbMinFilesToMerge = 20;
// g_conf.m_tfndbMinFilesToMerge = 5;
// // set up spiderdb
// g_conf.m_spiderdbMaxTreeMem = 200000000; // 200M
// g_conf.m_maxMem = 2950000000LL; // 2G
// g_mem.m_maxMem = 2950000000LL; // 2G
//}
//if ( strcmp ( cmd, "gentfndb" ) == 0 ) {
// g_conf.m_tfndbMinFilesToMerge = 20;
// // set up tfndb
// g_conf.m_tfndbMaxTreeMem = 200000000; // 200M
// g_conf.m_maxMem = 2000000000LL; // 2G
// g_mem.m_maxMem = 2000000000LL; // 2G
//}
// then tfndb
//if ( ! g_tfndb.init() ) {
// log("db: Tfndb init failed." ); return 1; }
// then spiderdb
if ( ! g_spiderdb.init() ) {
log("db: Spiderdb init failed." ); return 1; }
// then doledb
if ( ! g_doledb.init() ) {
log("db: Doledb init failed." ); return 1; }
// the spider cache used by SpiderLoop
if ( ! g_spiderCache.init() ) {
log("db: SpiderCache init failed." ); return 1; }
if ( ! g_test.init() ) {
log("db: test init failed" ); return 1; }
// then checksumdb
//if ( ! g_checksumdb.init() ) {
// log("db: Checksumdb init failed." ); return 1; }
// ensure clusterdb tree is big enough for quicker generation
//if ( strcmp ( cmd, "genclusterdb" ) == 0 ) {
// g_conf.m_clusterdbMinFilesToMerge = 20;
// // set up clusterdb
// g_conf.m_clusterdbMaxTreeMem = 50000000; // 50M
// g_conf.m_maxMem = 2000000000LL; // 2G
// g_mem.m_maxMem = 2000000000LL; // 2G
//}
// site clusterdb
if ( ! g_clusterdb.init() ) {
log("db: Clusterdb init failed." ); return 1; }
// linkdb
if ( ! g_linkdb.init() ) {
log("db: Linkdb init failed." ); return 1; }
// if ( ! g_cachedb.init() ) {
// log("db: Cachedb init failed." ); return 1; }
// if ( ! g_serpdb.init() ) {
// log("db: Serpdb init failed." ); return 1; }
// if ( ! g_monitordb.init() ) {
// log("db: Monitordb init failed." ); return 1; }
// use sectiondb again for its immense voting power for detecting and
// removing web page chrome, categories, etc. only use if
// CollectionRec::m_isCustomCrawl perhaps to save space.
if ( ! g_sectiondb.init() ) {
log("db: Sectiondb init failed." ); return 1; }
//if ( ! g_placedb.init() ) {
// log("db: Placedb init failed." ); return 1; }
// now clean the trees since all rdbs have loaded their rdb trees
// from disk, we need to remove bogus collection data from teh trees
// like if a collection was delete but tree never saved right it'll
// still have the collection's data in it
if ( ! g_collectiondb.addRdbBaseToAllRdbsForEachCollRec ( ) ) {
log("db: Collectiondb init failed." ); return 1; }
// . now read in a little bit of each db and make sure the contained
// records belong in our group
// . only do this if we have more than one group
// . we may have records from other groups if we are scaling, but
// if we cannot find *any* records in our group we probably have
// the wrong data files.
//if ( ! checkDataParity() ) return 1;
// init pageturk
//if ( ! g_pageTurk.init() ){
// log("db: PageTurk init failed. "); return 1;}
// init the vector cache
/*
if ( ! g_vectorCache.init ( g_conf.m_maxVectorCacheMem,
VECTOR_REC_SIZE-sizeof(key_t),
true,
g_conf.m_maxVectorCacheMem /
( sizeof(collnum_t) + 20 +
VECTOR_REC_SIZE ) ,
true,
"vector",
false,
12,
12 ) ) {
log("db: Vector Cache init failed." ); return 1; }
*/
// . gb gendbs
// . hostId should have already been picked up above, so it could be
// used to initialize all the rdbs
//if ( strcmp ( cmd , "gendbs" ) == 0 ) {
// char *coll = argv[cmdarg+1];
// // generate the dbs
// genDbs ( coll ); // coll
// g_log.m_disabled = true;
// return 0;
//}
//if ( strcmp ( cmd , "gentfndb" ) == 0 ) {
// char *coll = argv[cmdarg+1];
// genTfndb ( coll );
// g_log.m_disabled = true;
// return 0;
//}
//if ( strcmp ( cmd, "genclusterdb" ) == 0 ) {
// char *coll = argv[cmdarg+1];
// makeClusterdb ( coll );
// g_log.m_disabled = true;
// return 0;
//}
// test all collection dirs for write permission -- metalincs' request
int32_t pcount = 0;
for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cr = g_collectiondb.m_recs[i];
if ( ! cr ) continue;
if ( ++pcount >= 100 ) {
log("rdb: not checking directory permission for "
"more than first 100 collections to save time.");
break;
}
char tt[1024 + MAX_COLL_LEN ];
sprintf ( tt , "%scoll.%s.%"INT32"",
g_hostdb.m_dir, cr->m_coll , (int32_t)cr->m_collnum );
checkDirPerms ( tt ) ;
}
// and now that all rdbs have loaded lets count the gbeventcount
// keys we have in datedb. those represent the # of events we
// have indexed.
//g_collectiondb.countEvents();
//if (!ucInit(g_hostdb.m_dir, true)) {
// log("Unicode initialization failed!");
// return 1;
//}
//
// NOTE: ANYTHING THAT USES THE PARSER SHOULD GO BELOW HERE, UCINIT!
//
// load the appropriate dictionaries
if ( ! g_speller.init() && g_conf.m_isLive ) {
return 1;
}
// have to test after unified dict is loaded because if word is
// of unknown langid we try to get syns for it anyway if it has
// only one possible lang according to unified dict
//if ( ! g_wiktionary.test2() ) return 1;
/*
if ( strcmp ( cmd, "gendaterange" ) == 0 ) {
char *coll = argv[cmdarg+1];
genDateRange ( coll );
g_log.m_disabled = true;
return 0;
}
*/
// load language lists
if ( !g_langList.loadLists ( ) ) {
log("init: LangList loadLists Failed" );
//not really fatal, so carry on.
//return 1;
}
// the query log split. only for seo tools, so only do if
// we are running in Matt Wells's datacenter.
if ( g_conf.m_isMattWells && ! loadQueryLog() ) {
log("init: failed to load query log. continuing with seo "
"support.");
//return 1;
}
//if( !g_pageTopDocs.init() ) {
// log( "init: PageTopDocs init failed." );
// return 1;
//}
//if( !g_pageNetTest.init() ) {
// log( "init: PageNetTest init failed." );
// return 1;
//}
//if(!Msg6a::init()) {
// log( "init: Quality Agent init failed." );
//}
if ( ! g_scraper.init() ) return 1;
//if ( ! DateParse::init() ) {
// log("db: DateParse init failed." ); return 1;
//}
//countdomains was HERE, moved up to access more mem.
// load up the dmoz categories here
char structureFile[256];
sprintf(structureFile, "%scatdb/gbdmoz.structure.dat", g_hostdb.m_dir);
g_categories = &g_categories1;
if (g_categories->loadCategories(structureFile) != 0) {
log("cat: Loading Categories From %s Failed.",
structureFile);
//return 1;
}
log(LOG_INFO, "cat: Loaded Categories From %s.",
structureFile);
// Load the category language table
g_countryCode.loadHashTable();
int32_t nce = g_countryCode.getNumEntries();
//log(LOG_INFO, "cat: Loaded %"INT32" entries from Category country table.",
// g_countryCode.getNumEntries());
if ( nce != 544729 )
log("cat: unsupported catcountry.dat file with %"INT32" entries",
nce);
//g_siteBonus.init();
if(!g_autoBan.init()) {
log("autoban: init failed.");
return 1;
}
//if(!g_classifier.restore()) {
// log("classifier: init failed.");
// //return 1;
//}
// deprecated in favor of Msg13-based throttling
//if ( !g_msg6.init() ) {
// log ( "init: msg6 init failed." );
// return 1;
//}
// if(!g_profiler.init()) {
// log("profiler: init failed.");
// }
// g_profiler.readSymbolTable();
//exit(0);
// diff with indexdb in sync/ dir
//syncIndexdb ( );
//exit(-1);
// init the cache in Msg40 for caching search results
// if cache not initialized now then do it now
int32_t maxMem = g_conf.m_searchResultsMaxCacheMem;
if ( ! g_genericCache[SEARCHRESULTS_CACHEID].init (
maxMem , // max cache mem
-1 , // fixedDataSize
false , // support lists of recs?
maxMem/2048 , // max cache nodes
false , // use half keys?
"results" , // filename
//g_conf.m_searchResultsSaveCache ) ) {
true)){
log("db: ResultsCache: %s",mstrerror(g_errno));
return 1;
}
/*
maxMem = 40000000;
int32_t maxNodes2 = maxMem/(8+8+50*(8+4+4));
if ( ! g_genericCache[SEORESULTS_CACHEID].init (
maxMem , // max cache mem
-1 , // fixedDataSize
false , // support lists of recs?
maxNodes2 , // max cache nodes
false , // use half keys?
"seoresults" , // filename
true)){ // save to disk?
log("db: ResultsCache: %s",mstrerror(g_errno));
return 1;
}
*/
/*
int32_t maxMem1 = g_conf.m_siteLinkInfoMaxCacheMem;
if ( ! g_genericCache[SITELINKINFO_CACHEID].init (
maxMem1 , // max cache mem
4 , // fixedDataSize
false , // support lists of recs?
maxMem1/36 , // max cache nodes
false , // use half keys?
"sitelinkinfo" , // filename
//g_conf.m_siteLinkInfoSaveCache ) ) {
true)){
log("db: SiteLinkInfoCache: %s",mstrerror(g_errno));
return 1;
}
int32_t maxMem2a = g_conf.m_siteQualityMaxCacheMem;
if ( ! g_genericCache[SITEQUALITY_CACHEID].init (
maxMem2a , // max cache mem
1 , // fixedDataSize
false , // support lists of recs?
maxMem2a/36 , // max cache nodes
false , // use half keys?
"sitequality" , // filename
//g_conf.m_siteQualitySaveCache ) ) {
true)) {
log("db: SiteQualityCache: %s",mstrerror(g_errno));
return 1;
}
*/
/*
int32_t maxMem2b = g_conf.m_siteQualityMaxCacheMem * .10 ;
if ( ! g_genericCacheSmallLocal[SITEQUALITY_CACHEID].init (
maxMem2b , // max cache mem
1 , // fixedDataSize
false , // support lists of recs?
maxMem2b/36 , // max cache nodes
false , // use half keys?
"sitequality" , // filename
//g_conf.m_siteQualitySaveCache ) ) {
false)) {
log("db: SiteQualityCacheSmallLocal: %s",mstrerror(g_errno));
return 1;
}
*/
// init minsitenuminlinks buffer
if ( ! g_tagdb.loadMinSiteInlinksBuffer() ) {
log("db: failed to load sitelinks.txt data");
return 1;
}
// . then our main udp server
	// . must pass defaults since g_dns uses its own port/instance of it
// . server should listen to a socket and register with g_loop
// . sock read/write buf sizes are both 64000
// . poll time is 60ms
// . if the read/write bufs are too small it severely degrades
// transmission times for big messages. just use ACK_WINDOW *
// MAX_DGRAM_SIZE as the size so when sending you don't drop dgrams
// . the 400k size allows us to cover Sync.cpp's activity well
if ( ! g_udpServer.init( g_hostdb.getMyPort() ,&g_dp,2/*niceness*/,
20000000 , // readBufSIze
20000000 , // writeBufSize
20 , // pollTime in ms
3500 , // max udp slots
false )){ // is dns?
log("db: UdpServer init failed." ); return 1; }
// . this is the high priority udpServer, it's stuff is handled 1st
// sock read/write buf sizes are both almost 2 megs
// . a niceness of -1 means its signal won't be blocked, real time
// . poll time is 20ms
//if ( ! g_udpServer2.init( g_hostdb.getMyPort2(),&g_dp,-1/*niceness*/,
// 10000000 , // readBufSIze
// 10000000 , // writeBufSize
// 20 , // pollTime in ms
// 1000 )){ // max udp slots
// log("db: UdpServer2 init failed." ); return 1; }
// start pinging right away
if ( ! g_pingServer.init() ) {
log("db: PingServer init failed." ); return 1; }
// start up repair loop
if ( ! g_repair.init() ) {
log("db: Repair init failed." ); return 1; }
// start up repair loop
if ( ! g_dailyMerge.init() ) {
log("db: Daily merge init failed." ); return 1; }
// . then dns Distributed client
// . server should listen to a socket and register with g_loop
// . Only the distributed cache shall call the dns server.
if ( ! g_dns.init( h9->m_dnsClientPort ) ) {
log("db: Dns distributed client init failed." ); return 1; }
// . then dns Local client
//if ( ! g_dnsLocal.init( 0 , false ) ) {
// log("db: Dns local client init failed." ); return 1; }
// . then webserver
// . server should listen to a socket and register with g_loop
// again:
if ( ! g_httpServer.init( h9->m_httpPort, h9->m_httpsPort ) ) {
log("db: HttpServer init failed. Another gb already "
"running?" );
// this is dangerous!!! do not do the shutdown thing
return 1;
/*
// just open a socket to port X and send GET /master?save=1
if ( shutdownOldGB(h->m_httpPort) ) goto again;
log("db: Shutdown failed.");
resetAll();
return 1;
*/
}
if(!Msg1f::init()) {
log("logviewer: init failed.");
return 1;
}
// . now register all msg handlers with g_udp server
if ( ! registerMsgHandlers() ) {
log("db: registerMsgHandlers failed" ); return 1; }
// for Events.cpp event extraction we need to parse out "places" from
// each doc
//if ( ! initPlaceDescTable ( ) ) {
// log("events: places table init failed"); return 1; }
// init our city lists for mapping a lat/lon to nearest cityid
// for getting the timezone for getting all events "today".
// city lists are used by the get
//if ( ! initCityLists() ) {
// log("events: city lists init failed"); return 1; }
//if ( ! initCityLists_new() ) {
// log("events: city lists init failed"); return 1; }
// . get a doc every hour from gigablast.com as a registration thang
// . security, man
//if((int32_t) g_conf.m_mainExternalIp != atoip ( "207.114.174.29" ,14) )
g_loop.registerSleepCallback(5000, NULL, getPageWrapper);
// save our rdbs every 5 seconds and save rdb if it hasn't dumped
// in the last 10 mins
//if ( ! g_loop.registerSleepCallback(5, NULL, saveRdbs ) ) {
// return log("db: save register failed"); return 1; }
//
// the new way to save all rdbs and conf
//
//g_process.init();
// gb spellcheck
if ( strcmp ( cmd , "spellcheck" ) == 0 ) {
if ( argc != cmdarg + 2 ) goto printHelp; // take no other args
g_speller.test ( argv[cmdarg + 1] );
return 0;
}
// gb dictLookupTest
if ( strcmp ( cmd , "dictlookuptest" ) == 0 ) {
if ( argc != cmdarg + 2 ) goto printHelp; // take no other args
g_speller.dictLookupTest ( argv[cmdarg + 1] );
return 0;
}
// gb stemmertest
//if ( strcmp ( cmd , "stemmertest" ) == 0 ) {
// if ( argc != cmdarg + 2 ) goto printHelp;
// g_stemmer.test ( argv[cmdarg + 1] );
// return 0;
//}
// gb queryserializetest
/*
if ( strcmp ( cmd , "queryserializetest" ) == 0 ) {
if ( argc != cmdarg + 2 ) goto printHelp;
int64_t starttime = gettimeofdayInMilliseconds();
QuerySerializeTest( argv[cmdarg + 1] );
log(LOG_INFO, "query: took %"INT64"msecs for query serialize" \
"test on %s", gettimeofdayInMilliseconds() - starttime,
argv[cmdarg + 1]);
return 0;
}
*/
#ifdef _LIMIT10_
// how many pages have we indexed so far?
//int64_t numPages = g_titledb.getRdb()->getNumGlobalRecs();
int64_t numPages = g_clusterdb.getRdb()->getNumGlobalRecs();
if ( numPages > 10123466 )
log("WARNING: Over 10 million documents are in the index. "
"You have exceeded the terms of your license. "
"Please contact mwells@gigablast.com for a new license.");
#endif
// bdflush needs to be turned off because we need to control the
// writes directly. we do this by killing the write thread.
// we kill it when we need to do important reads, otherwise, if
// we cannot control the writes it fucks up our reading.
// no, now i use fsync(fd) in BigFile.cpp
//log("WARNING: burstify bdflush with a "
// "'echo 1 > /proc/sys/vm/bdflush' to optimize query response time "
// "during spidering.");
//log("WARNING: mount with noatime option to speed up writes.");
//log(" since we now call fsync(fd) after each write." );
// debug msgs
//log("REMINDER: make HOT again!");
//log("REMINDER: reinsert thread call failed warning in BigFile.cpp.");
//log("REMINDER: remove mem leack checking");
//log("REMINDER: put thread back in Msg39");
// . now check with gigablast.com (216.243.113.1) to see if we
// are licensed, for now, just get the doc
// . TODO: implement this (GET /license.html \r\n
// Host: www.gigablast.com\r\n\r)
// do the zlib test
//zlibtest();
// . now m_minToMerge might have changed so try to do a merge
// . only does one merge at a time
// . other rdb's will sleep and retry until it's their turn
//g_indexdb.getRdb()->m_minToMerge = 3;
//g_loop.registerSleepCallback ( 1000 ,
// NULL ,
// tryMergingWrapper );
// . register a callback to try to merge everything every 2 seconds
// . do not exit if we couldn't do this, not a huge deal
// . put this in here instead of Rdb.cpp because we don't want
// generator commands merging on us
// . the (void *)1 prevents gb from logging merge info every 2 seconds
// . niceness is 1
if ( ! g_loop.registerSleepCallback(2000,(void *)1,attemptMergeAll,1))
log("db: Failed to init merge sleep callback.");
// SEO MODULE
// . only use if we are in Matt Wells's data center
// and have access to the seo tools
if ( g_conf.m_isMattWells &&
! g_loop.registerSleepCallback(2000,(void *)1,runSEOQueryLoop))
log("db: Failed to register seo query loop");
// try to sync parms (and collection recs) with host 0
if ( ! g_loop.registerSleepCallback(1000,NULL,tryToSyncWrapper,0))
return false;
//if( !g_loop.registerSleepCallback(2000,(void *)1,controlDumpTopDocs) )
// log("db: Failed to init dump TopDocs sleep callback.");
// MTS: removing nettest, this breaks NetGear switches when all links
// are transmitting full bore and full duplex.
//if( !g_loop.registerSleepCallback(2000,(void *)1,controlNetTest) )
// log("db: Failed to init network test sleep callback.");
//if( !g_loop.registerSleepCallback(60000,(void *)1,takeSnapshotWrapper))
// log("db: Failed to init Statsdb snapshot sleep callback.");
// check to make sure we have the latest parms
//Msg3e msg3e;
//msg3e.checkForNewParms();
// this stuff is similar to alden's msg3e but will sync collections
	// that were added/deleted
//if ( ! g_parms.syncParmsWithHost0() ) {
// log("parms: error syncing parms: %s",mstrerror(g_errno));
// return 0;
//}
if(g_recoveryMode) {
//now that everything is init-ed send the message.
char buf[256];
log("admin: Sending emails.");
sprintf(buf, "Host %"INT32" respawning after crash.(%s)",
h9->m_hostId, iptoa(g_hostdb.getMyIp()));
g_pingServer.sendEmail(NULL, buf);
}
if ( testMandrill ) {
static EmailInfo ei;
//ei.m_cr = g_collectiondb.getRec(1);
ei.m_collnum = 1;
ei.m_fromAddress.safePrintf("support@diffbot.com");
ei.m_toAddress.safePrintf("matt@diffbot.com");
ei.m_callback = exitWrapper;
sendEmailThroughMandrill ( &ei );
g_conf.m_spideringEnabled = false;
g_conf.m_save = true;
}
Json json;
json.test();
json.reset();
// . start the spiderloop
// . comment out when testing SpiderCache
g_spiderLoop.startLoop();
// allow saving of conf again
g_conf.m_save = true;
// test speed of select statement used in Loop::doPoll()
// descriptor bits for calling select()
/*
fd_set readfds;
fd_set writefds;
fd_set exceptfds;
// clear fds for select()
FD_ZERO ( &readfds );
FD_ZERO ( &writefds );
FD_ZERO ( &exceptfds );
timeval v;
v.tv_sec = 0;
v.tv_usec = 1;
// set descriptors we should watch
for ( int32_t i = 0 ; i < MAX_NUM_FDS ; i++ ) {
if ( g_loop.m_readSlots [i] ) {
FD_SET ( i , &readfds );
FD_SET ( i , &exceptfds );
}
if ( g_loop.m_writeSlots[i] ) {
FD_SET ( i , &writefds );
FD_SET ( i , &exceptfds );
}
}
// . poll the fd's searching for socket closes
// . this takes 113ms with the FD_SET() stuff, and 35ms without
// for doing 10,000 loops... pretty fast.
int64_t t1 = gettimeofdayInMilliseconds();
int32_t i = 0;
for ( i = 0 ; i < 10000 ; i++ ) {
// descriptor bits for calling select()
fd_set readfds;
fd_set writefds;
fd_set exceptfds;
// clear fds for select()
FD_ZERO ( &readfds );
FD_ZERO ( &writefds );
FD_ZERO ( &exceptfds );
timeval v;
v.tv_sec = 0;
v.tv_usec = 1;
// set descriptors we should watch
for ( int32_t i = 0 ; i < MAX_NUM_FDS ; i++ ) {
if ( g_loop.m_readSlots [i] ) {
FD_SET ( i , &readfds );
FD_SET ( i , &exceptfds );
}
if ( g_loop.m_writeSlots[i] ) {
FD_SET ( i , &writefds );
FD_SET ( i , &exceptfds );
}
}
int32_t n = select (MAX_NUM_FDS,&readfds,&writefds,&exceptfds,&v);
if ( n >= 0 ) continue;
log("loop: select: %s.",strerror(g_errno));
break;
}
int64_t t2 = gettimeofdayInMilliseconds();
log(LOG_INFO,"loop: %"INT32" selects() called in %"INT64" ms.",i,t2-t1);
*/
//spamTest();
// flush stats
//g_statsdb.flush();
// ok, now activate statsdb
g_statsdb.m_disabled = false;
log("db: gb is now ready");
// sync loop
//if ( ! g_sync.init() ) {
// log("db: Sync init failed." ); return 1; }
// . now start g_loops main interrupt handling loop
// . it should block forever
// . when it gets a signal it dispatches to a server or db to handle it
if ( ! g_loop.runLoop() ) {
log("db: runLoop failed." ); return 1; }
// dummy return (0-->normal exit status for the shell)
return 0;
}
/*
void spamTest ( ) {
// quick test
// load in sample
char *filename = "/home/mwells/poo";
int fd = open ( filename , O_RDONLY );
char ppp[100000];
struct stat stats;
stat ( filename , &stats );
int32_t size = stats.st_size;
if ( size > 100000 ) size = 99999;
logf(LOG_INFO,"linkspam: Read %"INT32" bytes.",(int32_t)size);
// copy errno to g_errno
read ( fd , ppp , size );
ppp[size]=0;
Xml xml;
xml.set ( csUTF8,
ppp ,
size ,
false ,
size ,
false ,
TITLEREC_CURRENT_VERSION );
Url linker;
Url linkee;
char *lee = "www.viagrapunch.com";
linkee.set ( lee , gbstrlen ( lee ) );
char *rr = "http://www.propeciauk.co.uk/links.htm";
linker.set ( rr , gbstrlen(rr) );
char *note = NULL;
int32_t linkNode = -1;
Links links;
//int32_t siteFileNum = 48;//tr->getSiteFilenum();
//Xml *sx = g_tagdb.getSiteXml ( siteFileNum, "main" , 4 );
if (!links.set ( true , &xml , &linker ,
false, // includeLinkHashes
true , // useBaseHref?
TITLEREC_CURRENT_VERSION,
0 )) // niceness ))
return;
char linkText[1024];
if ( linkNode < 0 )
logf(LOG_INFO,"linkspam: linkee not found in content.");
//int32_t linkTextLen =
links.getLinkText ( &linkee ,
linkText ,
1023 ,
NULL,//&m_itemPtr ,
NULL,//&m_itemLen ,
&linkNode ,
0 ); // niceness );
bool ttt = isLinkSpam ( &linker ,
NULL , //class TitleRec *tr ,
&xml ,
&links ,
size ,
&note ,
&linkee ,
linkNode ,
"main" ,
0 ); // niceness
logf(LOG_INFO,"linkspam: linkNode=%"INT32" val=%"INT32" note=%s",
linkNode,(int32_t)ttt,note);
exit(0);
}
*/
// . verify that 'dir' is writable by creating and then removing a
//   scratch file named "tmpfile" inside it
// . a no-op (returns 0) when running in read-only mode
// . returns 0 on success, -1 if either the create or the delete fails
int32_t checkDirPerms ( char *dir ) {
	// nothing to verify if we never write to disk
	if ( g_conf.m_readOnlyMode ) return 0;
	// probe with a scratch file in the target directory
	File probe;
	probe.set ( dir , "tmpfile" );
	// creating the file proves we have write permission
	if ( ! probe.open ( O_RDWR | O_CREAT | O_TRUNC ) ) {
		log("disk: Unable to create %stmpfile. Need write permission "
		    "in this directory.",dir);
		return -1;
	}
	// removing it proves delete permission and cleans up the probe
	if ( ! probe.unlink() ) {
		log("disk: Unable to delete %stmpfile. Need write permission "
		    "in this directory.",dir);
		return -1;
	}
	// directory is writable
	return 0;
}
// save them all
static void doCmdAll ( int fd, void *state ) ;
// parameters for the pending admin-command broadcast; set by doCmd()
// below and consumed by doCmdAll() once the event loop fires
static bool s_sendToHosts;      // broadcast to regular hosts?
static bool s_sendToProxies;    // broadcast to proxy hosts?
static int32_t s_hostId;        // first hostId in target range (-1 = all)
static int32_t s_hostId2;       // last hostId in target range (-1 = all)
static const char *s_cmd ;      // raw command/query string being broadcast
static char s_buffer[128];      // "GET /<file>?<cmd> HTTP/1.0" request line
static HttpRequest s_r;         // fake http request parsed from s_buffer
// . broadcast an administrative command (e.g. "save=1") to hosts/proxies
// . cmd      - query string for the url, e.g. "save=1"
// . hostId   - first hostId to receive the command, -1 means all
// . filename - page to request, e.g. "master" -> "GET /master?save=1"
// . sendToHosts / sendToProxies - which host classes get the command
// . hostId2  - last hostId in the target range, -1 means all
// . stashes the parameters in the s_* statics, registers doCmdAll() to
//   fire on the first event-loop tick, then runs the loop; on success
//   the process exits from inside doneCmdAll()
// . returns false with a log message if initialization fails
bool doCmd ( const char *cmd , int32_t hostId , char *filename ,
	     bool sendToHosts , bool sendToProxies , int32_t hostId2 ) {
	// need loop to work
	if ( ! g_loop.init() ) return log("db: Loop init failed." );
	// save it for doCmdAll()
	s_cmd = cmd;
	// pass the target range and host classes on to doCmdAll()
	s_hostId = hostId;
	s_sendToHosts = sendToHosts;
	s_sendToProxies = sendToProxies;
	s_hostId2 = hostId2;
	// set stuff so http server client-side works right
	g_conf.m_httpMaxSockets = 512;
	sprintf ( g_conf.m_spiderUserAgent ,"Gigabot/1.0");
	// register sleep callback to get started once the loop runs
	if ( ! g_loop.registerSleepCallback(1, NULL, doCmdAll , 0 ) )
		return log("admin: Loop init failed.");
	// not it
	log(LOG_INFO,"admin: broadcasting %s",cmd);
	// . make a fake http request
	// . use snprintf so an oversized filename/cmd combination cannot
	//   overflow the 128-byte static s_buffer like the old unbounded
	//   sprintf could (truncation is safe: the request just fails)
	snprintf ( s_buffer , sizeof(s_buffer) ,
		   "GET /%s?%s HTTP/1.0" , filename , cmd );
	TcpSocket sock; sock.m_ip = 0;
	// make it local loopback so it passes the permission test in
	// doCmdAll()'s call to convertHttpRequestToParmList
	sock.m_ip = atoip("127.0.0.1");
	s_r.set ( s_buffer , gbstrlen ( s_buffer ) , &sock );
	// . run the loop; this blocks and doCmdAll() is invoked from
	//   inside it on the first tick
	if ( ! g_loop.runLoop() )
		return log("INJECT: loop run failed.");
	return true;
}
//static Msg28 s_msg28;
//static TcpSocket s_s;
// . invoked by Parms::broadcastParmList() once every targeted host has
//   acknowledged the broadcast command
// . this utility process has nothing left to do afterwards, so exit
void doneCmdAll ( void *state ) {
	log("cmd: completed command");
	exit ( 0 );
}
// . one-shot sleep callback registered by doCmd()
// . brings up a udp server, converts the fake http request in s_r into a
//   parm list and broadcasts it to the hosts/proxies selected by the s_*
//   statics; the process exits on any failure, or from doneCmdAll() when
//   the broadcast completes
void doCmdAll ( int fd, void *state ) {
	// do not keep calling it!
	g_loop.unregisterSleepCallback ( NULL, doCmdAll );
	// make port -1 to indicate none to listen on
	if ( ! g_udpServer.init( 18123 , // port to listen on
				 &g_dp,
				 0, // niceness
				 20000000 , // readBufSIze
				 20000000 , // writeBufSize
				 20 , // pollTime in ms
				 3500 , // max udp slots
				 false )){ // is dns?
		log("db: UdpServer init on port 18123 failed: %s" ,
		    mstrerror(g_errno));
		exit(0);
	}
	// udpserver::sendRequest() checks we have a handle for msgs we send!
	// so fake it out with this lest it cores
	g_udpServer.registerHandler(0x3f,handleRequest3f);
	SafeBuf parmList;
	// returns false and sets g_errno on error
	if (!g_parms.convertHttpRequestToParmList(&s_r,&parmList,0,NULL)){
		log("cmd: error converting command: %s",mstrerror(g_errno));
		exit(0);
	}
	// an empty parm list means the request named no known parms
	if ( parmList.length() <= 0 ) {
		log("cmd: no parmlist to send");
		exit(0);
	}
	// restrict broadcast to this hostid range!
	// returns true with g_errno set on error. uses g_udpServer
	if ( g_parms.broadcastParmList ( &parmList ,
					 NULL ,
					 doneCmdAll , // callback when done
					 s_sendToHosts ,
					 s_sendToProxies ,
					 s_hostId , // -1 means all
					 s_hostId2 ) ) { // -1 means all
		log("cmd: error sending command: %s",mstrerror(g_errno));
		exit(0);
		// NOTE(review): unreachable -- exit() above terminates
		return;
	}
	// broadcast is in flight; doneCmdAll() fires when all acks arrive
	log("cmd: sent command");
	/*
	bool status = true;
	if ( s_sendToHosts ){
		s_sendToHosts = false;
		status = s_msg28.massConfig ( &s_s, &s_r, s_hostId, NULL,
					      doneCmdAll,false,
					      false,s_hostId2);
	}
	else if ( s_sendToProxies ){
		s_sendToProxies = false;
		status = s_msg28.massConfig ( &s_s, &s_r, s_hostId, NULL,
					      doneCmdAll,false,
					      true,s_hostId2);
	}
	g_loop.unregisterSleepCallback ( NULL, doCmdAll );
	// if we did not block, call the callback directly
	if ( status ) doneCmdAll(NULL);
	*/
}
// copy a collection from one network to another (defined by 2 hosts.conf's)
int collcopy ( char *newHostsConf , char *coll , int32_t collnum ) {
Hostdb hdb;
//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
if ( ! hdb.init( 0/*assume we're zero*/) ) {
log("clusterCopy failed. Could not init hostdb with %s",
newHostsConf);
return -1;
}
// sanity check
if ( hdb.getNumShards() != g_hostdb.getNumShards() ) {
log("Hosts.conf files do not have same number of groups.");
return -1;
}
if ( hdb.getNumHosts() != g_hostdb.getNumHosts() ) {
log("Hosts.conf files do not have same number of hosts.");
return -1;
}
// host checks
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = &g_hostdb.m_hosts[i];
fprintf(stderr,"ssh %s '",iptoa(h->m_ip));
fprintf(stderr,"du -skc %scoll.%s.%"INT32" | tail -1 '\n",
h->m_dir,coll,collnum);
}
// loop over dst hosts
for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = &g_hostdb.m_hosts[i];
// get the src host from the provided hosts.conf
Host *h2 = &hdb.m_hosts[i];
// print the copy
//fprintf(stderr,"rcp %s:%s*db*.dat* ",
// iptoa( h->m_ip), h->m_dir );
fprintf(stderr,"nohup ssh %s '",iptoa(h->m_ip));
fprintf(stderr,"rcp -r ");
fprintf(stderr,"%s:%scoll.%s.%"INT32" ",
iptoa(h2->m_ip), h2->m_dir , coll, collnum );
fprintf(stderr,"%s' &\n", h->m_dir );
//fprintf(stderr," rcp -p %s*.map* ", h->m_dir );
//fprintf(stderr," rcp -r %scoll.* ", h->m_dir );
//fprintf(stderr,"%s:%s " ,iptoa(h2->m_ip), h2->m_dir );
}
return 1;
}
// generate the copies that need to be done to scale from oldhosts.conf
// to newhosts.conf topology.
// . prints, to stderr, one "ssh <host> \"rcp ...\" &" command line per
//   source host so an admin can shell-script the data migration
// . NOTE(review): the hdb.init() overload that took newHostsConf is
//   commented out, so hdb is loaded from the default hosts.conf;
//   newHostsConf is now only used in log messages -- confirm intended
// . returns 1 on success, -1 on failure
int scale ( char *newHostsConf , bool useShotgunIp) {
	g_hostdb.resetPortTables();
	Hostdb hdb;
	//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
	if ( ! hdb.init( 0/*assume we're zero*/) ) {
		log("Scale failed. Could not init hostdb with %s",
		    newHostsConf);
		return -1;
	}
	// ptrs to the two hostdb's
	Hostdb *hdb1 = &g_hostdb;
	Hostdb *hdb2 = &hdb;
	// this function was made to scale UP, but if scaling down
	// then swap them so hdb1 is always the smaller topology!
	if ( hdb1->m_numHosts > hdb2->m_numHosts ) {
		Hostdb *tmp = hdb1;
		hdb1 = hdb2;
		hdb2 = tmp;
	}
	// . ensure old hosts in g_hostdb are in a derivative groupId in
	//   newHostsConf
	// . old hosts may not even be present! consider them the same host,
	//   though, if have same ip and working dir, because that would
	//   interfere with a file copy.
	// . NOTE(review): all the actual checks inside this loop are
	//   commented out, so the loop currently has no effect
	for ( int32_t i = 0 ; i < hdb1->m_numHosts ; i++ ) {
		Host *h = &hdb1->m_hosts[i];
		// look in new guy
		for ( int32_t j = 0 ; j < hdb2->m_numHosts ; j++ ) {
			Host *h2 = &hdb2->m_hosts[j];
			// if a match, ensure same group
			if ( h2->m_ip != h->m_ip ) continue;
			if ( strcmp ( h2->m_dir , h->m_dir ) != 0 ) continue;
			// bitch if twins not preserved when scaling
			//if ( h2->m_group != h->m_group ) {
			/*
			if ( (h2->m_groupId & hdb1->m_groupMask) !=
			     (h->m_groupId & hdb1->m_groupMask) ) {
			log("Twins not preserved when scaling. New hosts.conf "
			    "must have same twins as old hosts.conf. That is, "
			    "if two hosts were in the same group (GRP) in the "
			    "old hosts.conf, they must be in the same group "
			    "in the new hosts.conf");
			return -1;
			}
			// bitch if a major group change
			if ( (h2->m_group & (hdb1->m_numGroups - 1)) ==
			     h->m_group ) continue;
			log ("hostId #%"INT32" (in group #%"INT32") in %s is not in a "
			     "derivative group of "
			     "hostId #%"INT32" (in group #%"INT32") in old hosts.conf.",
			     h2->m_hostId,h2->m_group,
			     newHostsConf,
			     h->m_hostId,h->m_group);
			return -1;
			*/
		}
	}
	// . ensure that:
	//   (h2->m_groupId & (hdb1->m_numGroups -1)) == h->m_groupId
	//   where h2 is in a derivative group of h.
	// . do a quick monte carlo test to make sure that a key in old
	//   group #0 maps to groups 0,8,16,24 for all keys and all dbs
	// . NOTE(review): the derivative-shard assertions are commented out,
	//   so shard1/shard2 are computed but never compared
	uint32_t shard1;
	uint32_t shard2;
	for ( int32_t i = 0 ; i < 1000 ; i++ ) {
		//key_t k;
		//k.n1 = rand(); k.n0 = rand(); k.n0 <<= 32; k.n0 |= rand();
		//key128_t k16;
		//k16.n0 = k.n0;
		//k16.n1 = rand(); k16.n1 <<= 32; k16.n1 |= k.n1;
		// random key of maximal width
		char k[MAX_KEY_BYTES];
		for ( int32_t ki = 0 ; ki < MAX_KEY_BYTES ; ki++ )
			k[ki] = rand() & 0xff;
		//char *k2;
		//if ( g_conf.m_checksumdbKeySize == 12 )
		//	k2 = (char *)&k;
		//else
		//	k2 = (char *)&k16;
		// get old shard (shard1) and new shard (shard2)
		shard1 = hdb1->getShardNum ( RDB_TITLEDB , k );//, hdb1 );
		shard2 = hdb2->getShardNum( RDB_TITLEDB , k );//, hdb2 );
		/*
		// ensure groupId2 is derivative of groupId1
		if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) {
			log("Bad engineer. Group id 0x%"XINT32" not derivative of "
			    "group id 0x%"XINT32" for titledb.",groupId2,groupId1);
			return -1;
		}
		*/
		/*
		// get old group (groupId1) and new group (groupId2)
		//groupId1 = g_checksumdb.getGroupId ( k , &g_hostdb );
		//groupId2 = g_checksumdb.getGroupId ( k , &hdb );
		groupId1 = hdb1->g_checksumdb.getGroupId ( k2 , hdb1 );
		groupId2 = hdb2->g_checksumdb.getGroupId ( k2 , hdb2 );
		// ensure groupId2 is derivative of groupId1
		if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) {
			log("Bad engineer. Group id 0x%"XINT32" not derivative of "
			    "group id 0x%"XINT32" for checksumdb.",
			    groupId2,groupId1);
			return -1;
		}
		*/
		/*
		// get old group (groupId1) and new group (groupId2)
		groupId1 = hdb1->getGroupId ( RDB_SPIDERDB , k );
		groupId2 = hdb2->getGroupId ( RDB_SPIDERDB , k );
		// ensure groupId2 is derivative of groupId1
		if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) {
			log("Bad engineer. Group id 0x%"XINT32" not derivative of "
			    "group id 0x%"XINT32" for spiderdb.",
			    groupId2,groupId1);
			return -1;
		}
		// get old group (groupId1) and new group (groupId2)
		groupId1 = hdb1->getGroupId ( RDB_POSDB , k );
		groupId2 = hdb2->getGroupId ( RDB_POSDB , k );
		// ensure groupId2 is derivative of groupId1
		if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) {
			log("Bad engineer. Group id 0x%"XINT32" not derivative of "
			    "group id 0x%"XINT32" for posdb.",
			    groupId2,groupId1);
			return -1;
		}
		// get old group (groupId1) and new group (groupId2)
		groupId1 = hdb1->getGroupId ( RDB_CLUSTERDB , k );
		groupId2 = hdb2->getGroupId ( RDB_CLUSTERDB , k );
		// ensure groupId2 is derivative of groupId1
		if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) {
			log("Bad engineer. Group id 0x%"XINT32" not derivative of "
			    "group id 0x%"XINT32" for clusterdb.",
			    groupId2,groupId1);
			return -1;
		}
		// get old group (groupId1) and new group (groupId2)
		groupId1 = hdb1->getGroupId ( RDB_TAGDB , k );
		groupId2 = hdb2->getGroupId ( RDB_TAGDB , k );
		// ensure groupId2 is derivative of groupId1
		if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) {
			log("Bad engineer. Group id 0x%"XINT32" not derivative of "
			    "group id 0x%"XINT32" for tagdb.",
			    groupId2,groupId1);
			return -1;
		}
		// get old group (groupId1) and new group (groupId2)
		groupId1 = hdb1->getGroupId ( RDB_SECTIONDB , k );
		groupId2 = hdb2->getGroupId ( RDB_SECTIONDB , k );
		// ensure groupId2 is derivative of groupId1
		if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) {
			log("Bad engineer. Group id 0x%"XINT32" not derivative of "
			    "group id 0x%"XINT32" for sectiondb.",
			    groupId2,groupId1);
			return -1;
		}
		// get old group (groupId1) and new group (groupId2)
		groupId1 = hdb1->getGroupId ( RDB_LINKDB , k );
		groupId2 = hdb2->getGroupId ( RDB_LINKDB , k );
		// ensure groupId2 is derivative of groupId1
		if ( (groupId2 & hdb1->m_groupMask) != groupId1 ) {
			log("Bad engineer. Group id 0x%"XINT32" not derivative of "
			    "group id 0x%"XINT32" for linkdb.",
			    groupId2,groupId1);
			return -1;
		}
		*/
	}
	// . now copy all titleRecs in old hosts to all derivatives
	// . going from 8 (3bits) hosts to 32 (5bits), for instance, old
	//   group id #0 would copy to group ids 0,8,16 and 24.
	// . 000 --> 00000(#0), 01000(#8), 10000(#16), 11000(#24)
	// . titledb and tfndb determine groupId by mod'ding the docid
	//   contained in their most significant key bits with the number
	//   of groups. see Titledb.h::getGroupId(docid)
	// . indexdb and tagdb mask the hi bits of the key with
	//   hdb1->m_groupMask, which is like a reverse mod'ding:
	//   000 --> 00000, 00001, 00010, 00011
	// per-destination "already handled" flags; supports up to 8196 hosts
	char done [ 8196 ];
	memset ( done , 0 , 8196 );
	for ( int32_t i = 0 ; i < hdb1->m_numHosts ; i++ ) {
		Host *h = &hdb1->m_hosts[i];
		// have we started an "ssh" command line for host h yet?
		char flag = 0;
		// look in new guy
		for ( int32_t j = 0 ; j < hdb2->m_numHosts ; j++ ) {
			Host *h2 = &hdb2->m_hosts[j];
			// do not copy to oneself
			if ( h2->m_ip == h->m_ip &&
			     strcmp ( h2->m_dir , h->m_dir ) == 0 ) continue;
			// skip if not derivative groupId for titledb
			//if ( (h2->m_groupId & hdb1->m_groupMask) !=
			//	h->m_groupId ) continue;
			// continue if already copying to here
			if ( done[j] ) continue;
			// mark as done
			done[j] = 1;
			/*
			// . don't copy to a twin in the old hosts.conf
			// . WE MUST preserve twins when scaling for this to work
			if ( h2->m_group == h->m_group ) {
			// only skip host h2 if he's in old hosts.conf
			// somewhere. does newhosts.conf contain hosts from
			// old hosts.conf?
			int32_t k = 0;
			for ( k = 0 ; k < hdb1->m_numHosts ; k++ ) {
				Host *h3 = &hdb1->m_hosts[k];
				if ( h2->m_ip == h3->m_ip &&
				     strcmp ( h2->m_dir , h3->m_dir ) == 0 )
					break;
			}
			if ( k < hdb1->m_numHosts )
				continue;
			}
			*/
			// skip local copies for now!!
			//if ( h->m_ip == h2->m_ip ) continue;
			// use ; separator between copies on the same ssh line
			if ( flag ) fprintf(stderr,"; ");
			//else fprintf(stderr,"ssh %s \"",iptoa(h->m_ip));
			else fprintf(stderr,"ssh %s \"",h->m_hostname);
			// flag
			flag = 1;
			// print the copy
			//fprintf(stderr,"rcp %s:%s*db*.dat* ",
			//	iptoa( h->m_ip), h->m_dir );
			// if same ip then do a 'cp' not rcp
			char *cmd = "rcp -r";
			if ( h->m_ip == h2->m_ip ) cmd = "cp -pr";
			fprintf(stderr,"%s %s*db*.dat* ", cmd, h->m_dir );
			if ( h->m_ip == h2->m_ip )
				fprintf(stderr,"%s ;", h2->m_dir );
			else {
				//int32_t ip = h2->m_ip;
				//if ( useShotgunIp ) ip = h2->m_ipShotgun;
				//fprintf(stderr,"%s:%s ;",iptoa(ip), h2->m_dir );
				// NOTE(review): both branches assign the same
				// m_hostname ("//2" suggests a hostname2 field
				// was intended) so useShotgunIp is a no-op here
				char *hn = h2->m_hostname;
				if ( useShotgunIp ) hn = h2->m_hostname;//2
				fprintf(stderr,"%s:%s ;",hn, h2->m_dir );
			}
			//fprintf(stderr," rcp -p %s*.map* ", h->m_dir );
			fprintf(stderr," %s %scoll.* ", cmd, h->m_dir );
			if ( h->m_ip == h2->m_ip )
				fprintf(stderr,"%s " , h2->m_dir );
			else {
				//int32_t ip = h2->m_ip;
				//if ( useShotgunIp ) ip = h2->m_ipShotgun;
				//fprintf(stderr,"%s:%s " ,iptoa(ip), h2->m_dir );
				// NOTE(review): same no-op as above
				char *hn = h2->m_hostname;
				if ( useShotgunIp ) hn = h2->m_hostname;//2;
				fprintf(stderr,"%s:%s " ,hn, h2->m_dir );
			}
			/*
			fprintf(stderr,"scp %s:%s/titledb* %s:%s\n",
				iptoa( h->m_ip), h->m_dir ,
				iptoa(h2->m_ip), h2->m_dir );
			fprintf(stderr,"scp %s:%s/tfndb* %s:%s\n",
				iptoa( h->m_ip), h->m_dir ,
				iptoa(h2->m_ip), h2->m_dir );
			fprintf(stderr,"scp %s:%s/indexdb* %s:%s\n",
				iptoa( h->m_ip), h->m_dir ,
				iptoa(h2->m_ip), h2->m_dir );
			fprintf(stderr,"scp %s:%s/spiderdb* %s:%s\n",
				iptoa( h->m_ip), h->m_dir ,
				iptoa(h2->m_ip), h2->m_dir );
			fprintf(stderr,"scp %s:%s/checksumdb* %s:%s\n",
				iptoa( h->m_ip), h->m_dir ,
				iptoa(h2->m_ip), h2->m_dir );
			fprintf(stderr,"scp %s:%s/clusterdb* %s:%s\n",
				iptoa( h->m_ip), h->m_dir ,
				iptoa(h2->m_ip), h2->m_dir );
			fprintf(stderr,"scp %s:%s/tagdb* %s:%s\n",
				iptoa( h->m_ip), h->m_dir ,
				iptoa(h2->m_ip), h2->m_dir );
			*/
		}
		// close this host's ssh command line and background it
		if ( flag ) fprintf(stderr,"\" &\n");
	}
	return 1;
}
// installFlag is 1 if we are really installing, 2 if just starting up gb's
// installFlag should be a member of the ifk_ enum defined above
// . runs one administrative action across a set of hosts by building shell
//   command strings (ssh/scp/rcp/cp/mv) and handing them to system()
// . installFlag selects the action (one of the ifk_* enum values; see the
//   comment above this function)
// . hostId/hostId2 select targets: hostId < 0 means all hosts; hostId >= 0
//   with hostId2 == -1 means just that host; otherwise the inclusive hostid
//   range [hostId,hostId2]
// . dir is the local source dir for copy-style flags, but is overloaded as
//   a backup dir name (backupcopy/backupmove/backuprestore) and as the
//   docids file (removedocids)
// . coll is the collection name forwarded to remote ./gb commands
// . cmd is the raw shell command used by the ifk_dsh/ifk_dsh2 flags
// . returns 0; most system() failures are ignored, but a failed proxy
//   start exit(-1)'s the whole process
// . SECURITY NOTE(review): every argument is interpolated unquoted into a
//   shell command executed via system(); callers must pass trusted input
int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
char *coll , int32_t hostId2 , char *cmd ) {
// use hostId2 to indicate the range hostId-hostId2, but if it is -1
// then it was not given, so restrict to just hostId
if ( hostId2 == -1 ) hostId2 = hostId;
// NOTE(review): fixed 1024-byte scratch buffer filled with sprintf()
// below; an unusually long hostname or working dir would overflow it
char tmp[1024];
/*
int32_t i,j;
if( installFlag == ifk_distributeC ) {
int32_t numGroups = g_hostdb.getNumShards();
char tmp2[100];
uint32_t groupId1, groupId2;
int32_t numHostsPerGroup = g_hostdb.getNumHostsPerShard();
log("distribute copying files to twins for each host");
for(i=0;i<numGroups;i++) {
groupId1 = g_hostdb.getGroupId(i);
Host *h1 = g_hostdb.getGroup(groupId1);
int32_t baseHostId = h1->m_hostId;
Host *h2 = h1;
h2++;
for(j=1; j< numHostsPerGroup; j++) {
sprintf(tmp,
"scp %s:%schecksumg%"INT32"h%"INT32"db ",
iptoa(h1->m_ip),
h1->m_dir,baseHostId,
(int32_t)h1->m_hostId);
sprintf(tmp2, "%s:%s &",
iptoa(h2->m_ip),
h2->m_dir);
strcat(tmp,tmp2);
log("distribute %s",tmp);
system(tmp);
h2++;
}
}
for(i=1;i<numGroups;i++) {
log("distribute i=%"INT32"",i);
for(j=0;j<numGroups;j++) {
groupId1 = g_hostdb.getGroupId(j);
Host *h1 = g_hostdb.getGroup(groupId1);
groupId2 = g_hostdb.getGroupId((j+i)%numGroups);
Host *h2 = g_hostdb.getGroup(groupId2);
int32_t baseHostId = h2->m_hostId;
for(int k=0;k<numHostsPerGroup; k++) {
sprintf(tmp,
"scp %s:%schecksumg%"INT32"h%"INT32"db ",
iptoa(h1->m_ip),
h1->m_dir,baseHostId,
(int32_t)h1->m_hostId);
if(j == numGroups-1 && k == numHostsPerGroup-1)
sprintf(tmp2, "%s:%s ",
iptoa(h2->m_ip),
h2->m_dir);
else
sprintf(tmp2, "%s:%s &",
iptoa(h2->m_ip),
h2->m_dir);
strcat(tmp,tmp2);
log("distribute %s",tmp);
system(tmp);
h2++;
}
}
}
return 0;
}
*/
// proxy flags iterate over the proxy host table, not the regular hosts,
// and return before the main host loop below
if ( installFlag == ifk_proxy_start ) {
for ( int32_t i = 0; i < g_hostdb.m_numProxyHosts; i++ ) {
Host *h2 = g_hostdb.getProxy(i);
// limit install to this hostId if it is >= 0
if ( hostId >= 0 && h2->m_hostId != hostId ) continue;
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
sprintf(tmp2,
"mv ./proxylog ./proxylog-`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'` ; " );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ; "
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; %s"
"./gb proxy load %"INT32" >& ./proxylog &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
tmp2 ,
i);
// log it
log(LOG_INIT,"%s", tmp);
// execute it
// NOTE: unlike the host loop below, a failed system() here
// aborts the whole process
int32_t ret = system ( tmp );
if ( ret < 0 ) {
fprintf(stderr,"Error loading proxy: %s\n",
mstrerror(errno));
exit(-1);
}
fprintf(stderr,"If proxy does not start, make sure "
"its ip is correct in hosts.conf\n");
}
return 0;
}
// like ifk_proxy_start but wraps ./gb in a restart-on-nonzero-exit
// keepalive shell loop on the remote side
if ( installFlag == ifk_proxy_kstart ) {
for ( int32_t i = 0; i < g_hostdb.m_numProxyHosts; i++ ) {
Host *h2 = g_hostdb.getProxy(i);
// limit install to this hostId if it is >= 0
if ( hostId >= 0 && h2->m_hostId != hostId ) continue;
// . save old log now, too
//char tmp2[1024];
//tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
//sprintf(tmp2,
// "mv ./proxylog ./proxylog-`date '+"
// "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
//to test add: ulimit -t 10; to the ssh cmd
sprintf(tmp,
"ssh %s \"cd %s ; "
"export MALLOC_CHECK_=0;"
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; "
"ADDARGS='' ; "
"EXITSTATUS=1 ; "
"while [ \\$EXITSTATUS != 0 ]; do "
"{ "
//"mv ./proxylog ./proxylog-\\`date '+"
//"%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; "
"./gb proxy load %"INT32" " // mdw
"\\$ADDARGS "
" >& ./proxylog ;"
"EXITSTATUS=\\$? ; "
"ADDARGS='-r' ; "
"} "
"done >& /dev/null & \" & ",
iptoa(h2->m_ip),
h2->m_dir ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
int32_t ret = system ( tmp );
if ( ret < 0 ) {
fprintf(stderr,"Error loading proxy: %s\n",
mstrerror(errno));
exit(-1);
}
fprintf(stderr,"If proxy does not start, make sure "
"its ip is correct in hosts.conf\n");
}
return 0;
}
// NOTE(review): iptab is set up but never read anymore; the per-ip
// throttling code that used it is commented out below (vestigial)
HashTableX iptab;
char tmpBuf[2048];
iptab.set(4,4,64,tmpBuf,2048,true,0,"iptsu");
// cap on how many remote commands run concurrently (see amp below)
int32_t maxOut = 6;
// this is a big scp so only do two at a time...
if ( installFlag == ifk_install ) maxOut = 1;
// same with this. takes too long on gk144, jams up
if ( installFlag == ifk_installgb ) maxOut = 4;
if ( installFlag == ifk_installgbrcp ) maxOut = 4;
//int32_t maxOutPerIp = 6;
// go through each host
for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) {
Host *h2 = g_hostdb.getHost(i);
// amp is appended to some commands: "&" backgrounds the command so
// the next one starts immediately; every maxOut-th command runs in
// the foreground, which stalls the loop and bounds concurrency
char *amp = " ";
// if i is NOT multiple of maxOut then use '&'
// even if all all different machines (IPs) scp chokes and so
// does rcp a little. so restrict to maxOut at a time.
if ( (i+1) % maxOut ) amp = "&";
// if host ip is like the 10th occurence then do
// not do ampersand. this is for hosts on the same IP.
//int32_t score = iptab.getScore32(&h2->m_ip);
//if ( (score % maxOutPerIp) ) amp = "&";
//iptab.addScore((int32_t *)&h2->m_ip);
// limit install to this hostId if it is >= 0
//if ( hostId >= 0 && h2->m_hostId != hostId ) continue;
// NOTE(review): hostId2 was defaulted to hostId above when -1, so
// the first branch here is effectively dead; the range branch
// handles the single-host case too
if ( hostId >= 0 && hostId2 == -1 ) {
if ( h2->m_hostId != hostId ) continue;
}
// if doing a range of hostid, hostId2 is >= 0
else if ( hostId >= 0 && hostId2 >= 0 ) {
if ( h2->m_hostId < hostId ) continue;
if ( h2->m_hostId > hostId2 ) continue;
}
// do not install to self
//if ( h2->m_hostId == g_hostdb.m_hostId ) continue;
// backupcopy: copy data/conf files into backup dir 'dir' on each host
if ( installFlag == ifk_backupcopy ) {
sprintf(tmp,
"ssh %s \"cd %s ; "
"mkdir %s ; "
"cp -ai *.dat* *.map gb.conf "
"hosts.conf %s\" &",
iptoa(h2->m_ip), h2->m_dir , dir , dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
continue;
}
// backupmove: like backupcopy but moves the data files
if ( installFlag == ifk_backupmove ) {
sprintf(tmp,
"ssh %s \"cd %s ; "
"mkdir %s ; "
"mv -i *.dat* *.map "
"%s\" &",
iptoa(h2->m_ip), h2->m_dir , dir , dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
continue;
}
// backuprestore: move files from backup dir 'dir' back into place
if ( installFlag == ifk_backuprestore ) {
sprintf(tmp,
"ssh %s \"cd %s ; cd %s ; "
"mv -i *.dat* *.map gb.conf "
"hosts.conf %s\" &",
iptoa(h2->m_ip), h2->m_dir , dir , h2->m_dir );
// log it
log ( "%s", tmp);
// execute it
system ( tmp );
continue;
}
// dumpmissing logic
else if ( installFlag == ifk_dumpmissing ) {
sprintf(tmp,
"ssh %s \"cd %s ; "
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; "
"./gb dumpmissing %s %"INT32" "
">& ./missing%"INT32" &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
//h2->m_dir ,
coll ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
// dumpdups: run remote ./gb dumpdups for collection 'coll'
else if ( installFlag == ifk_dumpdups ) {
sprintf(tmp,
"ssh %s \"cd %s ; "
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; "
"./gb dumpdups %s %"INT32" "
">& ./dups%"INT32" &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
//h2->m_dir ,
coll ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
// removedocids logic
else if ( installFlag == ifk_removedocids ) {
sprintf(tmp,
"ssh %s \"cd %s ; "
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; "
// hostid is now inferred from path
"./gb "//%"INT32" "
"removedocids %s %s %"INT32" "
">& ./removelog%03"INT32" &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
//h2->m_dir ,
//h2->m_hostId ,
coll ,
dir , // really docidsFile
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
// NOTE(review): this SHADOWS the 'dir' parameter for the rest of
// the loop body -- every flag below copies from the current
// working dir "./" regardless of the dir argument
char *dir = "./";
// install to it
if ( installFlag == ifk_install ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
char *srcDir = "./";
SafeBuf fileListBuf;
g_process.getFilesToCopy ( srcDir , &fileListBuf );
// include this one as well for install
//fileListBuf.safePrintf(" %shosts.conf",srcDir);
// the dmoz data dir if there
fileListBuf.safePrintf(" %scat",srcDir);
fileListBuf.safePrintf(" %shosts.conf",srcDir);
fileListBuf.safePrintf(" %sgb.conf",srcDir);
char *ipStr = iptoa(h2->m_ip);
SafeBuf tmpBuf;
tmpBuf.safePrintf(
// ensure directory is there, if
// not then make it
"ssh %s 'mkdir %s' ; "
"scp -r %s %s:%s"
, ipStr
, h2->m_dir
, fileListBuf.getBufStart()
// NOTE(review): iptoa() is called again here; ipStr
// above already holds this same dotted-quad string
, iptoa(h2->m_ip)
, h2->m_dir
);
// this 'tmp' shadows the outer tmp[1024] scratch buffer
char *tmp = tmpBuf.getBufStart();
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
/*
if ( installFlag == ifk_install2 ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"rcp -r "
"%sgb "
//"%sgbfilter "
"%shosts.conf "
"%shosts2.conf "
"%sgb.conf "
"%stmpgb "
//"%scollections.dat "
"%sgb.pem "
"%sdict "
"%sucdata "
"%stop100000Alexa.txt "
//"%slanglist "
"%santiword "
"%s.antiword "
"badcattable.dat "
"catcountry.dat "
"%spdftohtml "
"%spstotext "
"%sxlhtml "
"%sppthtml "
//"%stagdb*.xml "
"%shtml "
"%scat "
"%s:%s",
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
dir,
//iptoa(h2->m_ip2),
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp %sgb.conf %s:%sgb.conf",
dir ,
//h->m_hostId ,
//iptoa(h2->m_ip),
iptoa(h2->m_ipShotgun),
h2->m_dir);
//h2->m_hostId);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
*/
// scp the gb binary (gb.new if present, else gb) to the target as
// gb.installed; the start/kstart flags rename it into place later
else if ( installFlag == ifk_installgb ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
File f;
char *target = "gb.new";
f.set(g_hostdb.m_myHost->m_dir,target);
if ( ! f.doesExist() ) target = "gb";
sprintf(tmp,
"scp -c arcfour " // blowfish is faster
"%s%s "
"%s:%s/gb.installed%s",
dir,
target,
iptoa(h2->m_ip),
h2->m_dir,
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// same as ifk_installgb but uses rcp instead of scp
else if ( installFlag == ifk_installgbrcp ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
File f;
char *target = "gb.new";
f.set(g_hostdb.m_myHost->m_dir,target);
if ( ! f.doesExist() ) target = "gb";
sprintf(tmp,
"rcp "
"%s%s "
"%s:%s/gb.installed%s",
dir,
target,
iptoa(h2->m_ip),
h2->m_dir,
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// copy gb.new as tmpgb.installed, for the tmpstart test cluster
else if ( installFlag == ifk_installtmpgb ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"scp "
"%sgb.new "
"%s:%s/tmpgb.installed &",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// push local gb.conf and hosts.conf to the target host
else if ( installFlag == ifk_installconf ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"scp %sgb.conf %shosts.conf %s:%s %s",
dir ,
dir ,
//h->m_hostId ,
iptoa(h2->m_ip),
h2->m_dir,
//h2->m_hostId);
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// sprintf(tmp,
// "scp %shosts.conf %s:%shosts.conf &",
// dir ,
// iptoa(h2->m_ip),
// h2->m_dir);
// log(LOG_INIT,"admin: %s", tmp);
// system ( tmp );
// sprintf(tmp,
// "scp %shosts2.conf %s:%shosts2.conf &",
// dir ,
// iptoa(h2->m_ip),
// h2->m_dir);
// log(LOG_INIT,"admin: %s", tmp);
// system ( tmp );
}
// promote gb.installed to gb and start it once (no keepalive loop)
else if ( installFlag == ifk_start ) {
// . save old log now, too
//char tmp2[1024];
//tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
//sprintf(tmp2,
// "mv ./log%03"INT32" ./log%03"INT32"-`date '+"
// "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
// h2->m_hostId ,
// h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ; ulimit -c unlimited; "
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; " // %s"
//"./gb %"INT32" >& ./log%03"INT32" &\" %s",
// without "sleep 1" ssh seems to exit
// bash before it can start gb and gb does
// not start up.
// hostid is now inferred from path.
"./gb & sleep 1\" %s",
iptoa(h2->m_ip),
h2->m_dir ,
//tmp2 ,
//h2->m_dir ,
//h2->m_hostId ,
//h2->m_hostId ,
amp);
// log it
//log(LOG_INIT,"admin: %s", tmp);
fprintf(stdout,"admin: %s\n", tmp);
// execute it
system ( tmp );
}
/*
// SEQUENTIALLY start
else if ( installFlag == ifk_start2 ) {
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
sprintf(tmp2,
"mv ./log%03"INT32" ./log%03"INT32"-`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
h2->m_hostId ,
h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
char *amp = " &";
if ( i > 0 && (i%5) == 0 ) amp = "";
sprintf(tmp,
"ssh %s \"cd %s ; "
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; %s"
"./gb %"INT32" >& ./log%03"INT32" &\"%s",
iptoa(h2->m_ipShotgun),
h2->m_dir ,
tmp2 ,
//h2->m_dir ,
h2->m_hostId ,
h2->m_hostId ,
amp );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
*/
// start up a dummy cluster using hosts.conf ports + 1
else if ( installFlag == ifk_tmpstart ) {
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ; "
"cp -f tmpgb tmpgb.oldsave ; "
"mv -f tmpgb.installed tmpgb ; "
"%s/tmpgb tmpstarthost "
"%"INT32" >& ./tmplog%03"INT32" &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
h2->m_dir ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
// start gb inside a remote shell loop that restarts it with -rN
// whenever it exits nonzero (keepalive)
else if ( installFlag == ifk_kstart ) {
//keepalive
// . save old log now, too
//char tmp2[1024];
//tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
// we do not run as daemon so keepalive loop will
// work properly...
//sprintf(tmp2,
// "mv ./log%03"INT32" ./log%03"INT32"-`date '+"
// "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
// h2->m_hostId ,
// h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
//to test add: ulimit -t 10; to the ssh cmd
sprintf(tmp,
"ssh %s \"cd %s ; ulimit -c unlimited; "
"export MALLOC_CHECK_=0;"
"cp -f gb gb.oldsave ; "
"ADDARGS='' "
"INC=1 "
"EXITSTATUS=1 ; "
"while [ \\$EXITSTATUS != 0 ]; do "
"{ "
// if gb still running, then do not try to
// run it again. we
// probably double-called './gb start'.
// so see if the port is bound to.
// "./gb isportinuse %i ; "
// "if [ \\$? -eq 1 ] ; then "
// "echo \"gb or something else "
// "is already running on "
// "port %i. Not starting.\" ; "
// "exit 0; "
// "fi ; "
// ok, the port is available
//"echo \"Starting gb\"; "
//"exit 0; "
// in case gb was updated...
"mv -f gb.installed gb ; "
// move the log file
"mv ./log%03"INT32" ./log%03"INT32"-\\`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; "
// indicate -l so we log to a logfile
"./gb -l "//%"INT32" "
"\\$ADDARGS "
// no longer log to stderr so we can
// do log file rotation
//" >& ./log%03"INT32""
" ;"
"EXITSTATUS=\\$? ; "
"ADDARGS='-r'\\$INC ; "
"INC=\\$((INC+1));"
"} "
"done >& /dev/null & \" %s",
//"done & \" %s",
//"\" %s",
iptoa(h2->m_ip),
h2->m_dir ,
// for ./gb isportinuse %i
// h2->m_httpPort ,
// h2->m_httpPort ,
// for moving log file
h2->m_hostId ,
h2->m_hostId ,
//h2->m_dir ,
// hostid is now inferred from path
//h2->m_hostId ,
amp );
// log it
//log(LOG_INIT,"admin: %s", tmp);
fprintf(stdout,"admin: %s\n", tmp);
// execute it
system ( tmp );
}
// start gb as a daemon (./gb -d), no keepalive loop
else if ( installFlag == ifk_dstart ) {
//keepalive
// . save old log now, too
//char tmp2[1024];
//tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
// we do not run as daemon so keepalive loop will
// work properly...
//sprintf(tmp2,
// "mv ./log%03"INT32" ./log%03"INT32"-`date '+"
// "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
// h2->m_hostId ,
// h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
amp = "&";
//if ( i > 0 && (i%5) == 0 ) amp = "";
//to test add: ulimit -t 10; to the ssh cmd
sprintf(tmp,
"ssh %s \"cd %s ; ulimit -c unlimited; "
"export MALLOC_CHECK_=0;"
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; "
//"ADDARGS='' ; "
//"EXITSTATUS=1 ; "
// "while [ \\$EXITSTATUS != 0 ]; do "
// "{ "
// move the log file
//"mv ./log%03"INT32" ./log%03"INT32"-\\`date '+"
//"%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; "
"./gb -d "//%"INT32" "
//"\\$ADDARGS "
//" ;"
//" >& ./log%03"INT32" ;"
//"EXITSTATUS=\\$? ; "
//"ADDARGS='-r' ; "
//"} "
//"done >& /dev/null & \" %s",
"\" %s",
iptoa(h2->m_ip),
h2->m_dir ,
// for moving log file
// h2->m_hostId ,
// h2->m_hostId ,
//h2->m_dir ,
// hostid is now inferred from path
//h2->m_hostId ,
amp );
// log it
//log(LOG_INIT,"admin: %s", tmp);
fprintf(stdout,"admin: %s\n", tmp);
// execute it
system ( tmp );
}
/*
else if ( installFlag == ifk_gendbs ) {
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
sprintf(tmp2,
"mv ./log%03"INT32" ./log%03"INT32"-`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
h2->m_hostId ,
h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ; %s"
"./gb -c %shosts.conf gendbs %s %"INT32" >&"
"./log%03"INT32" &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
tmp2 ,
h2->m_dir ,
coll ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INFO,"installM %s",tmp);
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
else if ( installFlag == ifk_fixtfndb ) {
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
sprintf(tmp2,
"mv ./log%03"INT32" ./log%03"INT32"-`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
h2->m_hostId ,
h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ; %s"
"./gb -c %shosts.conf fixtfndb %s %"INT32" >&"
"./log%03"INT32" &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
tmp2 ,
h2->m_dir ,
coll ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
else if ( installFlag == ifk_gentfndb ) {
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
sprintf(tmp2,
"mv ./log%03"INT32" ./log%03"INT32"-`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
h2->m_hostId ,
h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ; %s"
"./gb -c %shosts.conf gentfndb %s %"INT32" >&"
"./log%03"INT32" &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
tmp2 ,
h2->m_dir ,
coll ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
*/
else if ( installFlag == ifk_installcat ) {
// . copy catdb files to all hosts
// don't copy to ourselves
// (host 0 is assumed to hold the master dmoz files)
if ( h2->m_hostId == 0 )
continue;
/*
if ( h2->m_hostId == 0 ) {
sprintf(tmp,
"cp "
"content.rdf.u8 "
"structure.rdf.u8 "
"gbdmoz.structure.dat "
"gbdmoz.content.dat "
"%scatdb/",
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
continue;
}
*/
sprintf(tmp,
"scp "
"%scatdb/content.rdf.u8 "
"%s:%scatdb/content.rdf.u8",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"%scatdb/structure.rdf.u8 "
"%s:%scatdb/structure.rdf.u8",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"%scatdb/gbdmoz.structure.dat "
"%s:%scatdb/gbdmoz.structure.dat",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// NOTE(review): this last scp is built and logged but its
// system() call is commented out, so gbdmoz.content.dat
// is never actually copied by this flag
sprintf(tmp,
"scp "
"%scatdb/gbdmoz.content.dat "
"%s:%scatdb/gbdmoz.content.dat",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
//system ( tmp );
//sprintf(tmp,
// "scp "
// "%scatdb/gbdmoz.content.dat.diff "
// "%s:%scatdb/gbdmoz.content.dat.diff",
// dir,
// iptoa(h2->m_ip),
// h2->m_dir);
//log(LOG_INIT,"admin: %s", tmp);
//system ( tmp );
}
// like ifk_installcat but for the ".new" dmoz file set
else if ( installFlag == ifk_installnewcat ) {
// . copy catdb files to all hosts
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"scp "
"%scatdb/content.rdf.u8.new "
"%s:%scatdb/content.rdf.u8.new",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"%scatdb/structure.rdf.u8.new "
"%s:%scatdb/structure.rdf.u8.new",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"%scatdb/gbdmoz.structure.dat.new "
"%s:%scatdb/gbdmoz.structure.dat.new",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"%scatdb/gbdmoz.content.dat.new "
"%s:%scatdb/gbdmoz.content.dat.new",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"scp "
"%scatdb/gbdmoz.content.dat.new.diff "
"%s:%scatdb/gbdmoz.content.dat.new.diff",
dir,
iptoa(h2->m_ip),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// run remote ./gb genclusterdb for collection 'coll'
else if ( installFlag == ifk_genclusterdb ) {
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
//sprintf(tmp2,
// "mv ./log%03"INT32" ./log%03"INT32"-`date '+"
// "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
// h2->m_hostId ,
// h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ;"
//"%s"
"./gb genclusterdb %s %"INT32" >&"
"./log%03"INT32"-genclusterdb &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
//h2->m_dir ,
//tmp2 ,
coll ,
h2->m_hostId ,
h2->m_hostId );
// log it
log(LOG_INIT,"admin: %s", tmp);
// execute it
system ( tmp );
}
/*
// SEQUENTIAL rcps
else if ( installFlag == ifk_installgb2 ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
char *amp = " &";
if ( i > 0 && (i%5) == 0 ) amp = "";
File f;
char *target = "gb.new";
f.set(h2->m_dir,target);
if ( ! f.doesExist() ) target = "gb";
sprintf(tmp,
"rcp "
"%s%s "
"%s:%s/gb.installed %s",
dir,
target ,
iptoa(h2->m_ipShotgun),
h2->m_dir,
amp);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
*/
// dsh: run arbitrary command 'cmd' in each host's working dir
else if ( installFlag == ifk_dsh ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"ssh %s 'cd %s ; %s' %s",
iptoa(h2->m_ip),
h2->m_dir,
cmd ,
amp );
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// dsh2: like dsh but always runs in the foreground (serialized)
else if ( installFlag == ifk_dsh2 ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
//sprintf(tmp,
// "ssh %s '%s' &",
// iptoa(h2->m_ipShotgun),
// cmd );
sprintf(tmp,
"ssh %s 'cd %s ; %s'",
iptoa(h2->m_ip),
h2->m_dir,
cmd );
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// installconf2: like installconf but over the shotgun (secondary)
// network via rcp, and also pushes hosts2.conf
else if ( installFlag == ifk_installconf2 ) {
// don't copy to ourselves
//if ( h2->m_hostId == h->m_hostId ) continue;
sprintf(tmp,
"rcp %sgb.conf %shosts.conf %shosts2.conf "
"%s:%s &",
dir ,
dir ,
dir ,
//h->m_hostId ,
iptoa(h2->m_ipShotgun),
h2->m_dir);
//h2->m_hostId);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
// installcat2: like installcat but via rcp over the shotgun ips
else if ( installFlag == ifk_installcat2 ) {
// . copy catdb files to all hosts
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"rcp "
"%scatdb/content.rdf.u8 "
"%s:%scatdb/content.rdf.u8",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp "
"%scatdb/structure.rdf.u8 "
"%s:%scatdb/structure.rdf.u8",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp "
"%scatdb/gbdmoz.structure.dat "
"%s:%scatdb/gbdmoz.structure.dat",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
// NOTE(review): as in ifk_installcat, this final command is
// logged but not executed (system() commented out)
sprintf(tmp,
"rcp "
"%scatdb/gbdmoz.content.dat "
"%s:%scatdb/gbdmoz.content.dat",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
//system ( tmp );
//sprintf(tmp,
// "rcp "
// "%scatdb/gbdmoz.content.dat.diff "
// "%s:%scatdb/gbdmoz.content.dat.diff",
// dir,
// iptoa(h2->m_ip),
// h2->m_dir);
//log(LOG_INIT,"admin: %s", tmp);
//system ( tmp );
}
// installnewcat2: like installnewcat but via rcp over shotgun ips
else if ( installFlag == ifk_installnewcat2 ) {
// . copy catdb files to all hosts
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
sprintf(tmp,
"rcp "
"%scatdb/content.rdf.u8.new "
"%s:%scatdb/content.rdf.u8.new",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp "
"%scatdb/structure.rdf.u8.new "
"%s:%scatdb/structure.rdf.u8.new",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp "
"%scatdb/gbdmoz.structure.dat.new "
"%s:%scatdb/gbdmoz.structure.dat.new",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp "
"%scatdb/gbdmoz.content.dat.new "
"%s:%scatdb/gbdmoz.content.dat.new",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
sprintf(tmp,
"rcp "
"%scatdb/gbdmoz.content.dat.new.diff "
"%s:%scatdb/gbdmoz.content.dat.new.diff",
dir,
iptoa(h2->m_ipShotgun),
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
}
}
// return 0 on success
return 0;
}
// . only call this once at start up
// . this wrapper logic is now in Rdb.cpp, attemptMergeAll()
/*
void tryMergingWrapper ( int fd , void *state ) {
g_tagdb.getRdb()->attemptMerge ( 1 , false );
g_catdb.getRdb()->attemptMerge ( 1 , false );
g_indexdb.getRdb()->attemptMerge ( 1 , false );
g_datedb.getRdb()->attemptMerge ( 1 , false );
g_titledb.getRdb()->attemptMerge ( 1 , false );
g_tfndb.getRdb()->attemptMerge ( 1 , false );
g_spiderdb.getRdb()->attemptMerge ( 1 , false );
g_checksumdb.getRdb()->attemptMerge ( 1 , false );
g_clusterdb.getRdb()->attemptMerge ( 1 , false );
g_loop.unregisterSleepCallback ( NULL , tryMergingWrapper );
}
*/
// as a security measure so we know who is using gigablast get a page
void getPageWrapper ( int fd , void *state ) {
	// only live production instances phone home
	if ( ! g_conf.m_isLive ) return;
	// fire-and-forget fetch of the registration page; no callback,
	// 30s timeout, 20k caps on doc size
	char *url = "http://www.gigablast.com/register.html";
	g_httpServer.getDoc ( url , 0 , 0 , -1 , 0 , NULL , NULL ,
			      30*1000 , 0 , 0 ,
			      20*1024 , 20*1024 );
	// drop whatever sleep interval got us here...
	g_loop.unregisterSleepCallback ( NULL , getPageWrapper );
	// ...and re-arm to fire every 10 hours from now on
	g_loop.registerSleepCallback ( 1000LL*60LL*60LL*10LL , NULL ,
				       getPageWrapper );
}
// take snapshot of g_stats
//void takeSnapshotWrapper( int status, void *state) {g_statsdb.takeSnapshot();}
// register all udp message handlers; returns false on the first failure
bool registerMsgHandlers ( ) {
	// the three batches must register in this order
	if ( ! registerMsgHandlers1() ||
	     ! registerMsgHandlers2() ||
	     ! registerMsgHandlers3() )
		return false;
	if ( ! g_pingServer.registerHandler() ) return false;
	// in SpiderProxy.cpp...
	initSpiderProxyStuff();
	return true;
}
bool registerMsgHandlers1(){
Msg20 msg20; if ( ! msg20.registerHandler () ) return false;
//Msg22 msg22; if ( ! msg22.registerHandler () ) return false;
//Msg23 msg23; if ( ! msg23.registerHandler () ) return false;
Msg2a msg2a; if ( ! msg2a.registerHandler () ) return false;
//Msg36 msg36; if ( ! msg36.registerHandler () ) return false;
//Msg30 msg30; if ( ! msg30.registerHandler () ) return false;
MsgC msgC ; if ( ! msgC.registerHandler () ) return false;
if ( ! Msg22::registerHandler() ) return false;
//Msg2e msg2e; if ( ! msg2e.registerHandler () ) return false;
// msg hanlder for pageturk
//Msg60 msg60; if ( ! msg60.registerHandler () ) return false;
return true;
}
bool registerMsgHandlers2(){
Msg0 msg0 ; if ( ! msg0.registerHandler () ) return false;
Msg1 msg1 ; if ( ! msg1.registerHandler () ) return false;
//Msg6 msg6 ; if ( ! msg6.registerHandler () ) return false;
//Msg7 msg7 ; if ( ! msg7.registerHandler () ) return false;
//Msg8a msg8a ;if ( ! msg8a.registerHandler () ) return false;
Msg8b msg8b ; if ( ! msg8b.registerHandler () ) return false;
//Msg10 msg10; if ( ! msg10.registerHandler () ) return false;
//Msg11 msg11; if ( ! msg11.registerHandler () ) return false;
//Msg12 msg12; if ( ! msg12.registerHandler () ) return false;
//Msg13 msg13; if ( ! msg13.registerHandler () ) return false;
//MsgE msge ; if ( ! msge.registerHandler () ) return false;
//Speller speller;if ( ! speller.registerHandler()) return false;
//Syncdb::registerHandlers();
if ( ! Msg13::registerHandler() ) return false;
//if ( ! MsgF ::registerHandler() ) return false;
//if(! g_udpServer.registerHandler(0x10,handleRequest10)) return false;
if ( ! g_udpServer.registerHandler(0xc1,handleRequestc1)) return false;
if ( ! g_udpServer.registerHandler(0x39,handleRequest39)) return false;
if ( ! g_udpServer.registerHandler(0x2c,handleRequest2c)) return false;
if ( ! g_udpServer.registerHandler(0x12,handleRequest12)) return false;
if ( ! registerHandler4 () ) return false;
// seo module handlers. this will just be stubs declared above
// if no seo module. the seo module is not part of the open source.
if(! g_udpServer.registerHandler(0x8e,handleRequest8e)) return false;
if(! g_udpServer.registerHandler(0x4f,handleRequest4f)) return false;
if(! g_udpServer.registerHandler(0x95,handleRequest95)) return false;
if(! g_udpServer.registerHandler(0x3e,handleRequest3e)) return false;
if(! g_udpServer.registerHandler(0x3f,handleRequest3f)) return false;
if ( ! g_udpServer.registerHandler(0x25,handleRequest25)) return false;
if ( ! g_udpServer.registerHandler(0x07,handleRequest7)) return false;
return true;
/*
// VALGRIND does not like this huge stack waster, aka, Msg39
Msg39 *msg39;
// Ha HA!!!
//msg39 = new Msg39();
msg39 = new ( Msg39 );
mnew (msg39 , sizeof(Msg39) , "mainmsg39" );
bool ret = msg39->registerHandler ();
mdelete (msg39 , sizeof(Msg39) , "mainmsg39" );
delete msg39;
return ret;
*/
}
bool registerMsgHandlers3(){
Msg17 msg17; if ( ! msg17.registerHandler () ) return false;
//Msg34 msg34; if ( ! msg34.registerHandler () ) return false;
//Msg35 msg35; if ( ! msg35.registerHandler () ) return false;
//Msg24 msg24; if ( ! msg24.registerHandler () ) return false;
//Msg40 msg40; if ( ! msg40.registerHandler () ) return false;
//MsgB msgb; if ( ! msgb.registerHandler () ) return false;
//Msg3e msg3e; if ( ! msg3e.registerHandler () ) return false;
//Msg42 msg42; if ( ! msg42.registerHandler () ) return false;
//Msg33 msg33; if ( ! msg33.registerHandler () ) return false;
//if ( ! g_pingServer.registerHandler() ) return false;
//if ( ! Msg1c::init() ) return false;
if ( ! Msg40::registerHandler() ) return false;
return true;
}
/*
void makeNewConf ( int32_t hostId , char *confFilename ) {
// read in the conf file
// if ( ! g_conf.init ( confFilename ) ) {
g_conf.init ( confFilename ) ;
// minimal non-default description into conf
char buf[1024];
sprintf ( buf ,
"<hostId> %"INT32"</>"
"<dnsIp>209.157.102.11</>" // ns2.best.com
, hostId );
// add it -- the rest will be filled in as defaults
g_conf.add ( buf );
// save it
g_conf.save ();
}
*/
// Shutdown entry point used by the signal/exit machinery.
// Simply forwards to the Process class; "urgent" selects the
// fast crash-style save instead of a graceful shutdown.
bool mainShutdown ( bool urgent ) {
	bool status = g_process.shutdown ( urgent );
	return status;
}
/*
static int32_t s_shutdownCount;
static void doneShutdownServerWrapper ( void *state ) ;
static bool doneShutdownServer ( ) ;
static void doneSavingWrapper ( void *state ) ;
static bool isAllClosed ( ) ;
bool closeAll ( void *state , void (* callback)(void *state) );
bool allExit ( ) ;
static bool s_urgent = false ;
static bool s_shutdownLock = false;
// call this from gdb if stuck in an infinite loop and we need to save all
bool mainShutdown2 ( ) {
s_shutdownLock = false;
g_indexdb.getRdb()->m_isClosed = false;
g_titledb.getRdb()->m_isClosed = false;
g_tfndb.getRdb()->m_isClosed = false;
g_clusterdb.getRdb()->m_isClosed = false;
g_linkdb.getRdb()->m_isClosed = false;
g_checksumdb.getRdb()->m_isClosed = false;
g_spiderdb.getRdb()->m_isClosed = false;
g_datedb.getRdb()->m_isClosed = false;
g_tagdb.getRdb()->m_isClosed = false;
g_statsdb.getRdb()->m_isClosed = false;
g_indexdb.getRdb()->m_tree.m_needsSave = false;
g_titledb.getRdb()->m_tree.m_needsSave = false;
g_tfndb.getRdb()->m_tree.m_needsSave = false;
g_clusterdb.getRdb()->m_tree.m_needsSave = false;
g_linkdb.getRdb()->m_needsSave = false;
g_checksumdb.getRdb()->m_tree.m_needsSave = false;
g_spiderdb.getRdb()->m_tree.m_needsSave = false;
g_datedb.getRdb()->m_tree.m_needsSave = false;
g_tagdb.getRdb()->m_tree.m_needsSave = false;
g_statsdb.getRdb()->m_tree.m_needsSave = false;
return mainShutdown ( true );
}
// . save and exit this server
// . if easydown is true, we broadcast to all others and wait to complete
// the necessary transactions in each udpServer
bool mainShutdown ( bool urgent ) {
// no longer allow threads to do this
if ( g_threads.amThread() ) return true;
// hack for now
//log("FIX THIS HACK");
//if ( urgent ) return true; //exit(-1);
// . turn off interrupts
// . we don't want to be interrupted in here!
// . this is really only useful if we're NOT in a thread cuz
// main process could still be interrupted
// . if we call it from a thread it just results in us getting an
// interrupt and since the g_interruptsOn flag is false we'll end
// up saying ?wtf?
if ( ! g_threads.amThread() ) g_loop.interruptsOff();
// ensure this is not re-entered
if ( s_shutdownLock ) return true;
s_shutdownLock = true;
// save current spidering process
g_spiderLoop.saveCurrentSpidering();
// save the Conf file now
g_conf.save();
// turn off spidering and addUrl (don't save these)
g_conf.m_spideringEnabled = 0;
// i keep forgetting to turn add url back on, so don't turn off now
//g_conf.m_addUrlEnabled = 0;
// save state for top docs
g_pageTopDocs.saveStateToDisk();
g_autoBan.save();
// save it
s_urgent = urgent;
// if we're going down hard don't bother waiting on transactions...
if ( s_urgent ) {
// disable threads from spawning
g_threads.disableThreads();
// . save the Conf file again since we turned off spider/addurl
// . we don't want them to be on after we recover from crash
g_conf.save();
// . try to save all rdbs
// . return false if blocked
if ( ! closeAll(NULL,doneSavingWrapper) ) {
fprintf(stderr,"why did this block? Please fix asap. "
"Important data is not getting saved.\n");
return false;
}
// we didn't block, so they must all be closed
return allExit ( );
}
// . close our tcp server
// . this will shut it down right away w/o worrying about completing
// transactions
//g_httpServer.reset();
// . send notes to all the hosts in the network telling them we're
// shutting down
// . this uses g_udpServer2
// . this returns false if it blocks
// . we don't care if it blocks or not
// . don't bother asking the hosts to send an email alert for us
// since we're going down gracefully by letting everyone know
g_pingServer.broadcastShutdownNotes ( false , // sendEmailAlert?
NULL ,
NULL );
// reset the shutdown count
s_shutdownCount = 0;
// log it
log(LOG_INFO,"udp: Shutting down servers.");
// start shutting down our high priority udp server
//if ( g_udpServer2.shutdown ( NULL , doneShutdownServerWrapper ) )
// s_shutdownCount++;
// and low priority
if ( g_udpServer.shutdown ( NULL , doneShutdownServerWrapper ) )
s_shutdownCount++;
if ( g_dnsUdpServer.shutdown ( NULL , doneShutdownServerWrapper ) )
s_shutdownCount++;
// bail if we're waiting to complete transactions or something
if ( s_shutdownCount < 2 ) return false;
// otherwise, did not block
return doneShutdownServer();
}
void doneShutdownServerWrapper ( void *state ) {
doneShutdownServer ( );
}
bool doneShutdownServer ( ) {
// inc count
s_shutdownCount++;
// return if one more to go
if ( s_shutdownCount < 2 ) return false;
// . otherwise, save contents of each rdb
// . this returns false if blocked, true otherwise
if ( ! closeAll(NULL,doneSavingWrapper) ) return false;
// do not exit if not all closed
if ( ! isAllClosed () ) {
log(LOG_LOGIC,"db: Not all closed but was exiting.");
return false;
}
// otherwise, nobody blocked
return allExit( );
}
// return false if blocked, true otherwise
bool closeAll ( void *state , void (* callback)(void *state) ) {
// TODO: why is this called like 100x per second when a merge is
// going on? why don't we sleep longer in between?
g_tagdb.getRdb()->close(state,callback,s_urgent,true);
g_catdb.getRdb()->close(state,callback,s_urgent,true);
g_indexdb.getRdb()->close(state,callback,s_urgent,true);
g_datedb.getRdb()->close(state,callback,s_urgent,true);
g_titledb.getRdb()->close(state,callback,s_urgent,true);
g_tfndb.getRdb()->close(state,callback,s_urgent,true);
g_spiderdb.getRdb()->close(state,callback,s_urgent,true);
g_checksumdb.getRdb()->close(state,callback,s_urgent,true);
g_clusterdb.getRdb()->close(state,callback,s_urgent,true);
g_statsdb.getRdb()->close(state,callback,s_urgent,true);
g_linkdb.getRdb()->close(state,callback,s_urgent,true);
g_tagdb2.getRdb()->close(state,callback,s_urgent,true);
//g_catdb2.getRdb()->close(state,callback,s_urgent,true);
g_indexdb2.getRdb()->close(state,callback,s_urgent,true);
g_datedb2.getRdb()->close(state,callback,s_urgent,true);
g_titledb2.getRdb()->close(state,callback,s_urgent,true);
g_tfndb2.getRdb()->close(state,callback,s_urgent,true);
g_spiderdb2.getRdb()->close(state,callback,s_urgent,true);
g_checksumdb2.getRdb()->close(state,callback,s_urgent,true);
g_clusterdb2.getRdb()->close(state,callback,s_urgent,true);
int32_t count = 0;
int32_t need = 0;
count += g_tagdb.getRdb()->isClosed(); need++;
count += g_catdb.getRdb()->isClosed(); need++;
count += g_indexdb.getRdb()->isClosed(); need++;
count += g_datedb.getRdb()->isClosed(); need++;
count += g_titledb.getRdb()->isClosed(); need++;
count += g_tfndb.getRdb()->isClosed(); need++;
count += g_spiderdb.getRdb()->isClosed(); need++;
count += g_checksumdb.getRdb()->isClosed(); need++;
count += g_clusterdb.getRdb()->isClosed(); need++;
count += g_statsdb.getRdb()->isClosed(); need++;
count += g_linkdb.getRdb()->isClosed(); need++;
count += g_tagdb2.getRdb()->isClosed(); need++;
//count += g_catdb2.getRdb()->isClosed(); need++;
count += g_indexdb2.getRdb()->isClosed(); need++;
count += g_datedb2.getRdb()->isClosed(); need++;
count += g_titledb2.getRdb()->isClosed(); need++;
count += g_tfndb2.getRdb()->isClosed(); need++;
count += g_spiderdb2.getRdb()->isClosed(); need++;
count += g_checksumdb2.getRdb()->isClosed(); need++;
count += g_clusterdb2.getRdb()->isClosed(); need++;
// . don't try saving collectiondb until everyone else is done
// . since we get called like 100x per second when a merge is
// going on, this is a good idea until we fix that problem!
if ( count < need ) return false;
// this one always blocks
g_collectiondb.save();
g_repair.save();
//this one too
g_classifier.save();
// close the Chinese parser lexicon stuff
//close_lexicon ();
// save our caches
for ( int32_t i = 0; i < MAX_GENERIC_CACHES; i++ ) {
if ( g_genericCache[i].useDisk() )
g_genericCache[i].save();
}
// save dns caches
RdbCache *c ;
c = g_dnsDistributed.getCache();
if ( c->useDisk() ) c->save();
// return true if all closed right away w/o blocking
return true;
}
void doneSavingWrapper ( void *state ) {
// are they all closed now?
if ( ! isAllClosed () ) return;
allExit ( );
return;
}
void resetAll ( ) {
g_log.reset();
g_hostdb.reset() ;
g_hostdb2.reset() ;
g_spiderLoop.reset();
g_indexdb.reset();
g_datedb.reset();
g_titledb.reset();
g_spiderdb.reset();
g_tfndb.reset();
g_checksumdb.reset();
g_clusterdb.reset();
g_linkdb.reset();
g_tagdb.reset();
g_catdb.reset();
g_statsdb.reset();
g_indexdb2.reset();
g_datedb2.reset();
g_titledb2.reset();
g_spiderdb2.reset();
g_tfndb2.reset();
g_checksumdb2.reset();
g_clusterdb2.reset();
g_tagdb2.reset();
//g_catdb2.reset();
g_collectiondb.reset();
g_categories1.reset();
g_categories2.reset();
g_robotdb.reset();
g_dnsDistributed.reset();
g_dnsLocal.reset();
g_udpServer.reset();
g_dnsUdpServer.reset();
//g_udpServer2.reset();
g_httpServer.reset();
g_loop.reset();
for ( int32_t i = 0; i < MAX_GENERIC_CACHES; i++ )
g_genericCache[i].reset();
g_speller.reset();
resetMsg6();
g_spiderCache.reset();
g_threads.reset();
g_ucUpperMap.reset();
g_ucLowerMap.reset();
g_ucProps.reset();
g_ucCombiningClass.reset();
g_ucScripts.reset();
g_profiler.reset();
g_pageTopDocs.destruct();
g_pageNetTest.destructor();
resetDecompTables();
resetCompositionTable();
g_langList.reset();
g_autoBan.reset();
resetPageAddUrl();
resetHttpMime();
reset_iana_charset();
resetAdultBit();
resetDomains();
resetEntities();
resetQuery();
resetStopWords();
resetUnicode();
resetMsg12();
}
void allExitWrapper ( int fd , void *state ) {
allExit();
}
// returns false if blocked, otherwise just exits
bool allExit ( ) {
// . wait for all renames and unlinks to complete
// . BUT don't wait more than 100 seconds, we need that core
//int32_t t = getTime();
static char s_registered = 0;
if ( g_unlinkRenameThreads > 0 ) { // && getTime()-t < 100 ) {
//static char s_flag = 1;
//if ( s_flag ) {
log("db: Waiting for file unlink/rename threads to "
"complete. numThreads=%"INT32".",(int32_t)g_unlinkRenameThreads);
//s_flag = 0;
//}
if ( ! s_registered &&
! g_loop.registerSleepCallback(1000,NULL,
allExitWrapper) ) {
log("db: Failed to register all exit wrapper. "
"Sleeping 30 seconds to make sure all unlink/"
"rename threads exit.");
sleep(30);
}
else {
s_registered = 1;
return false;
}
}
if ( s_registered )
g_loop.unregisterSleepCallback(NULL, allExitWrapper);
// . this one always blocks
// . save the "sync" file last, after all other files have saved
// successfully, because if one has a problem it will need to be
// sync'ed.
//g_sync.close();
g_collectiondb.save();
g_repair.save();
// . don't bother resetting if we're urgent
// . resetting makes it easier to see what memory has been leaked
if ( ! s_urgent ) {
resetAll();
// print out memory here, not from the destructor cuz it
// freezes in malloc for some reason sometimes
g_mem.printMem();
// . if we're not a panic/urgent dump, don't force dump core
// . exit cleanly (0 means no errors)
exit(0);
}
// . place breakpoint here for memory leak detection
// . then say "print g_mem.printMem()" from gdb
// . some TermTable's were not freed for stopWords, obsceneWords, ...
// . if we the main process we must kill all threads since linux
// has a bug that won't dump our core if threads are about
if ( ! g_threads.amThread () ) {
// . otherwise, we're the main process
// . linux has a bug where the core won't dump when threads
// are running
//pthread_kill_other_threads_np();
// print it
if ( g_loop.m_shutdown != 1 )
fprintf(stderr,"allExit: dumping core after saving\n");
}
// print out memory here, not from the destructor cuz it freezes
// in malloc for some reason sometimes
g_mem.printMem();
// . this forces an abnormal termination which will cause a core dump
// . do not dump core on SIGHUP signals any more though
if ( g_loop.m_shutdown != 1 ) abort();
else exit(0);
// a dummy return to keep compiler happy
return false;
}
// return false if one or more is still not closed yet
bool isAllClosed ( ) {
int32_t count = 0;
int32_t need = 0;
// this one always blocks
count += g_collectiondb.save(); need++;
count += g_tagdb.getRdb()->isClosed(); need++;
count += g_catdb.getRdb()->isClosed(); need++;
count += g_indexdb.getRdb()->isClosed(); need++;
count += g_datedb.getRdb()->isClosed(); need++;
count += g_titledb.getRdb()->isClosed(); need++;
count += g_tfndb.getRdb()->isClosed(); need++;
count += g_spiderdb.getRdb()->isClosed(); need++;
count += g_checksumdb.getRdb()->isClosed(); need++;
count += g_clusterdb.getRdb()->isClosed(); need++;
count += g_statsdb.getRdb()->isClosed(); need++;
count += g_linkdb.getRdb()->isClosed(); need++;
count += g_tagdb2.getRdb()->isClosed(); need++;
//count += g_catdb2.getRdb()->isClosed(); need++;
count += g_indexdb2.getRdb()->isClosed(); need++;
count += g_datedb2.getRdb()->isClosed(); need++;
count += g_titledb2.getRdb()->isClosed(); need++;
count += g_tfndb2.getRdb()->isClosed(); need++;
count += g_spiderdb2.getRdb()->isClosed(); need++;
count += g_checksumdb2.getRdb()->isClosed(); need++;
count += g_clusterdb2.getRdb()->isClosed(); need++;
// . the sync file is now saved in g_collectiondb.save()
// . this one always blocks
//g_sync.close();
// return and wait if not
return ( count >= need );
}
*/
//#include "./libmpm/mp_malloc.h"
/*
void zlibtest() {
char *ptrs[1000];
int32_t lens[1000];
for ( int32_t j = 0 ; j < 220000 ; j++ ) {
log("pass=%"INT32"",j);
Msg0 *m = new (Msg0);
delete (m);
}
return;
for ( int32_t j = 0 ; j < 120000 ; j++ ) {
log("pass=%"INT32"",j);
// malloc 1,000 bufs of size about 100-64k each
for ( int32_t i = 0 ; i < 100 ; i++ ) {
int32_t bufSize = 1000 + (rand() % 65000);
ptrs[i] = (char *)mmalloc ( bufSize , "ztest" );
if ( ! ptrs[i] ) {
log("no mem!"); exit(-1); }
lens[i] = bufSize;
// simple write
for ( int32_t k = 0 ; k < bufSize ; k+=900 )
ptrs[i][k] = 'a' + (rand() % 64);
}
// now free them
for ( int32_t i = 0 ; i < 100 ; i++ )
mfree (ptrs[i] , lens[i] , "ztest" );
}
}
*/
#include "Rdb.h"
#include "Xml.h"
//#include "Tfndb.h"
//#include "Checksumdb.h"
#include "Threads.h"
//
// dump routines here now
//
// . dump titledb records for collection "coll" to stdout, one summary
//   line per record (key, size, content hash, charset, language,
//   site inlinks, last spider time, ip, inlink count, hop count, url...)
// . scans files [startFileNum, startFileNum+numFiles) plus the in-memory
//   tree when includeTree is true
// . "docid" when non-zero starts the scan at that docid's first key
// . "justPrintDups" only prints records that share a docid with the
//   previous record, i.e. duplicate docids in titledb
// . justPrintSentences/justPrintWords are accepted for interface
//   compatibility but are not used in this routine
void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree,
	    int64_t docid , char justPrintDups ,
	    bool justPrintSentences,
	    bool justPrintWords ) {
	// unicode tables are needed to parse the stored documents
	if (!ucInit(g_hostdb.m_dir, true)) {
		log("Unicode initialization failed!");
		return;
	}
	// init our table for doing zobrist hashing
	if ( ! hashinit() ) {
		log("db: Failed to init hashtable." ); return ; }
	g_titledb.init ();
	g_titledb.getRdb()->addRdbBase1(coll);
	// make sure the collection exists BEFORE dereferencing its rec
	// below. previously an unknown collection name crashed on a NULL
	// cr->m_collnum dereference.
	CollectionRec *cr = g_collectiondb.getRec(coll);
	if ( ! cr ) {
		log("db: Collection \"%s\" does not exist.",coll);
		return;
	}
	key_t startKey ;
	key_t endKey ;
	key_t lastKey ;
	startKey.setMin();
	endKey.setMax();
	lastKey.setMin();
	// start at the given docid's first key (docid 0 means min key)
	startKey = g_titledb.makeFirstKey ( docid );
	// turn off threads so the Msg5 calls below complete synchronously
	g_threads.disableThreads();
	// get a meg at a time
	int32_t minRecSizes = 1024*1024;
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	int64_t prevId = 0LL;
	int32_t count = 0;
	// buffer holding the previous record's summary line for dup mode
	char ttt[2048+MAX_URL_LEN];
	HashTableX dedupTable;
	dedupTable.set(4,0,10000,NULL,0,false,0,"maintitledb");
	// the XmlDoc used to uncompress each title rec. heap-allocated
	// because XmlDoc is too large for the stack.
	XmlDoc *xd;
	try { xd = new (XmlDoc); }
	catch ( ... ) {
		fprintf(stdout,"could not alloc for xmldoc\n");
		exit(-1);
	}
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_TITLEDB ,
			      cr->m_collnum ,
			      &list ,
			      startKey ,
			      endKey ,
			      minRecSizes ,
			      includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      startFileNum ,
			      numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL , // cache key ptr
			      0 , // retry num
			      -1 , // maxRetries
			      true , // compensate for merge
			      -1LL , // sync point
			      &msg5b )){
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		char *rec = list.getCurrentRec();
		int32_t recSize = list.getCurrentRecSize();
		int64_t docId = g_titledb.getDocIdFromKey ( k );
		// keys must come back in strictly ascending order
		if ( k <= lastKey )
			log("key out of order. "
			    "lastKey.n1=%"XINT32" n0=%"XINT64" "
			    "currKey.n1=%"XINT32" n0=%"XINT64" ",
			    lastKey.n1,lastKey.n0,
			    k.n1,k.n0);
		lastKey = k;
		int32_t shard = g_hostdb.getShardNum ( RDB_TITLEDB , &k );
		// print deletes (negative keys have the low bit clear)
		if ( (k.n0 & 0x01) == 0) {
			fprintf(stdout,"n1=%08"XINT32" n0=%016"XINT64" docId=%012"INT64" "
				"shard=%"INT32" (del)\n",
				k.n1 , k.n0 , docId , shard );
			continue;
		}
		// free the mem from the previous record
		xd->reset();
		// uncompress the title rec. skip records that fail to parse.
		if ( ! xd->set2 ( rec , recSize , coll ,NULL , 0 ) )
			continue;
		// extract the url
		Url *u = xd->getFirstUrl();
		// get ip as a left-justified space-padded string
		char ipbuf [ 32 ];
		strcpy ( ipbuf , iptoa(u->getIp() ) );
		// pad with spaces
		int32_t blen = gbstrlen(ipbuf);
		while ( blen < 15 ) ipbuf[blen++]=' ';
		ipbuf[blen]='\0';
		if ( justPrintDups ) {
			// new docid: remember its summary line in "ttt".
			// it is only printed if the NEXT record turns out
			// to share the same docid (i.e. it is the first dup)
			if ( docId != prevId ) {
				// date indexed as local time, not GMT/UTC
				time_t ts = xd->m_spideredTime;
				struct tm *timeStruct = localtime ( &ts );
				char ppp[100];
				strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",
					 timeStruct);
				// NOTE(review): ptr_linkInfo1 may be NULL
				// for some records -- confirm before relying
				// on getNumGoodInlinks() below
				LinkInfo *info = xd->ptr_linkInfo1;
				char foo[1024];
				foo[0] = '\0';
				sprintf(foo,
					"sni=%"INT32" ",
					(int32_t)xd->m_siteNumInlinks);
				char *ru = xd->ptr_redirUrl;
				if ( ! ru ) ru = "";
				sprintf(ttt,
					"n1=%08"XINT32" n0=%016"XINT64" docId=%012"INT64" "
					"size=%07"INT32" "
					"ch32=%010"UINT32" "
					"clen=%07"INT32" "
					"cs=%04d "
					"lang=%02d "
					"sni=%03"INT32" "
					"lastspidered=%s "
					"ip=%s "
					"numLinkTexts=%04"INT32" "
					"%s"
					"version=%02"INT32" "
					"hc=%"INT32" "
					"redir=%s "
					"url=%s "
					"firstdup=1 "
					"shard=%"INT32" "
					"\n",
					k.n1 , k.n0 ,
					docId ,
					recSize - 16 ,
					xd->m_contentHash32,
					xd->size_utf8Content,
					xd->m_charset,
					xd->m_langId,
					(int32_t)xd->m_siteNumInlinks,
					ppp,
					iptoa(xd->m_ip),
					info->getNumGoodInlinks(),
					foo,
					(int32_t)xd->m_version,
					(int32_t)xd->m_hopCount,
					ru,
					u->getUrl() ,
					shard );
				prevId = docId;
				count = 0;
				continue;
			}
			// same docid as previous record: on the first dup
			// print the saved line, then fall through to print
			// this record's own line below
			if ( count++ == 0 ) printf ( "\n%s" , ttt );
		}
		// date indexed as local time, not GMT/UTC
		time_t ts = xd->m_spideredTime;
		struct tm *timeStruct = localtime ( &ts );
		char ppp[100];
		strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",timeStruct);
		// NOTE(review): ptr_linkInfo1 may be NULL for some records
		// -- confirm before relying on getNumGoodInlinks() below
		LinkInfo *info = xd->ptr_linkInfo1;
		char foo[1024];
		foo[0] = '\0';
		sprintf(foo,
			"sni=%"INT32" ",
			(int32_t)xd->m_siteNumInlinks);
		char *ru = xd->ptr_redirUrl;
		if ( ! ru ) ru = "";
		fprintf(stdout,
			"n1=%08"XINT32" n0=%016"XINT64" docId=%012"INT64" "
			"size=%07"INT32" "
			"ch32=%010"UINT32" "
			"clen=%07"INT32" "
			"cs=%04d "
			"ctype=%s "
			"lang=%02d "
			"sni=%03"INT32" "
			"lastspidered=%s "
			"ip=%s "
			"numLinkTexts=%04"INT32" "
			"%s"
			"version=%02"INT32" "
			"hc=%"INT32" "
			"shard=%"INT32" "
			"metadatasize=%"INT32" "
			"redir=%s "
			"url=%s\n",
			k.n1 , k.n0 ,
			docId ,
			recSize - 16 ,
			xd->m_contentHash32,
			xd->size_utf8Content,
			xd->m_charset,
			g_contentTypeStrings[xd->m_contentType],
			xd->m_langId,
			(int32_t)xd->m_siteNumInlinks,
			ppp,
			iptoa(xd->m_ip),
			info->getNumGoodInlinks(),
			foo,
			(int32_t)xd->m_version,
			(int32_t)xd->m_hopCount,
			shard,
			xd->size_metadata,
			ru,
			u->getUrl() );
		// free the mem
		xd->reset();
	}
	// advance to the key just past the last one we received
	startKey = *(key_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around
	if ( startKey < *(key_t *)list.getLastKey() ) return;
	goto loop;
}
/*
void dumpTfndb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree ,
bool verify) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
g_tfndb.init ();
//g_collectiondb.init(true);
g_tfndb.getRdb()->addRdbBase1(coll );
key_t startKey ;
key_t endKey ;
startKey.setMin();
endKey.setMax();
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
RdbList list;
key_t oldk; oldk.setMin();
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TFNDB ,
coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
includeTree ,
false , // add to cache?
0 , // max cache age
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key_t k = list.getCurrentKey();
if ( verify ) {
if ( oldk > k )
fprintf(stdout,"got bad key order. "
"%"XINT32"/%"XINT64" > %"XINT32"/%"XINT64"\n",
oldk.n1,oldk.n0,k.n1,k.n0);
oldk = k;
continue;
}
int64_t docId = g_tfndb.getDocId ( &k );
//int32_t e = g_tfndb.getExt ( k );
int32_t tfn = g_tfndb.getTfn ( &k );
//int32_t clean = 0 ; if ( g_tfndb.isClean ( k ) ) clean= 1;
int32_t half = 0 ; if ( k.n0 & 0x02 ) half = 1;
char *dd = "" ; if ( (k.n0 & 0x01) == 0 ) dd =" (del)";
fprintf(stdout,
"%08"XINT32" %016"XINT64" docId=%012"INT64" "
"tfn=%03"INT32" half=%"INT32" %s\n",
k.n1,k.n0,docId,tfn,half,dd);
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key_t *)list.getLastKey() ) return;
goto loop;
}
*/
void dumpWaitingTree (char *coll ) {
RdbTree wt;
if (!wt.set(0,-1,true,20000000,true,"waittree2",
false,"waitingtree",sizeof(key_t)))return;
collnum_t collnum = g_collectiondb.getCollnum ( coll );
// make dir
char dir[500];
sprintf(dir,"%scoll.%s.%"INT32"",g_hostdb.m_dir,coll,(int32_t)collnum);
// load in the waiting tree, IPs waiting to get into doledb
BigFile file;
file.set ( dir , "waitingtree-saved.dat" , NULL );
bool treeExists = file.doesExist() > 0;
// load the table with file named "THISDIR/saved"
RdbMem wm;
if ( treeExists && ! wt.fastLoad(&file,&wm) ) return;
// the the waiting tree
int32_t node = wt.getFirstNode();
for ( ; node >= 0 ; node = wt.getNextNode(node) ) {
// breathe
QUICKPOLL(MAX_NICENESS);
// get key
key_t *key = (key_t *)wt.getKey(node);
// get ip from that
int32_t firstIp = (key->n0) & 0xffffffff;
// get the time
uint64_t spiderTimeMS = key->n1;
// shift upp
spiderTimeMS <<= 32;
// or in
spiderTimeMS |= (key->n0 >> 32);
// get the rest of the data
fprintf(stdout,"time=%"UINT64" firstip=%s\n",
spiderTimeMS,
iptoa(firstIp));
}
}
// . dump doledb records (urls doled out for spidering) for collection
//   "coll" to stdout
// . each record prints its doledb key fields (priority, spider time,
//   url hash) followed by the embedded spider request
// . scans files [startFileNum, startFileNum+numFiles) plus the tree if
//   includeTree is true
void dumpDoledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree){
	g_doledb.init ();
	g_doledb.getRdb()->addRdbBase1(coll );
	// make sure the collection exists BEFORE dereferencing its rec
	// below. previously an unknown collection name crashed on a NULL
	// cr->m_collnum dereference.
	CollectionRec *cr = g_collectiondb.getRec(coll);
	if ( ! cr ) {
		log("db: Collection \"%s\" does not exist.",coll);
		return;
	}
	key_t startKey ;
	key_t endKey ;
	startKey.setMin();
	endKey.setMax();
	// turn off threads so the Msg5 calls below complete synchronously
	g_threads.disableThreads();
	// get a meg at a time
	int32_t minRecSizes = 1024*1024;
	Msg5 msg5;
	RdbList list;
	// previous key, for the ascending-order sanity check
	key_t oldk; oldk.setMin();
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_DOLEDB ,
			      cr->m_collnum ,
			      &list ,
			      startKey ,
			      endKey ,
			      minRecSizes ,
			      includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      startFileNum ,
			      numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false )){// err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// keys must come back in ascending order
		if ( oldk > k )
			fprintf(stdout,"got bad key order. "
				"%"XINT32"/%"XINT64" > %"XINT32"/%"XINT64"\n",
				oldk.n1,oldk.n0,k.n1,k.n0);
		oldk = k;
		// get it
		char *drec = list.getCurrentRec();
		// sanity check. doledb must not contain negative keys.
		if ( (drec[0] & 0x01) == 0x00 ) {char *xx=NULL;*xx=0; }
		// the spider request follows the 12-byte doledb key plus
		// the 4-byte dataSize
		char *srec = drec + 12 + 4;
		// print doledb info first then spider request
		fprintf(stdout,"dolekey=%s (n1=%"UINT32" n0=%"UINT64") "
			"pri=%"INT32" "
			"spidertime=%"UINT32" "
			"uh48=0x%"XINT64"\n",
			KEYSTR(&k,12),
			k.n1,
			k.n0,
			(int32_t)g_doledb.getPriority(&k),
			g_doledb.getSpiderTime(&k),
			g_doledb.getUrlHash48(&k));
		fprintf(stdout,"spiderkey=");
		// print it
		g_spiderdb.print ( srec );
		// the \n
		printf("\n");
		// must be a request -- for now, for stats
		if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) {
			char *xx=NULL;*xx=0; }
		// cast it
		SpiderRequest *sreq = (SpiderRequest *)srec;
		// embedded requests must be positive keys too
		if ( (sreq->m_key.n0 & 0x01) == 0x00 ) {
			char *xx=NULL;*xx=0; }
	}
	// advance to the key just past the last one we received
	startKey = *(key_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around
	if ( startKey < *(key_t *)list.getLastKey() ) return;
	goto loop;
}
// . dataSlot of the hashtable for spider stats in dumpSpiderdb
// . key is firstip
// . one UStat accumulates counts over every spiderdb record sharing
//   that firstip (see addUStat1/addUStat2 below)
class UStat {
public:
	// for spider requests:
	int32_t m_numRequests;          // total spider requests seen
	int32_t m_numRequestsWithReplies; // requests that already got a reply
	int32_t m_numWWWRoots;          // hopcount-0 requests on www subdomains
	int32_t m_numNonWWWRoots;       // hopcount-0 requests on other subdomains
	int32_t m_numHops1;             // requests at hop count 1
	int32_t m_numHops2;             // requests at hop count 2
	int32_t m_numHops3orMore;       // requests at hop count >= 3
	// ages in seconds relative to "now" at scan time; 0 = unset
	int32_t m_ageOfYoungestSpideredRequest;
	int32_t m_ageOfOldestUnspideredRequest;
	int32_t m_ageOfOldestUnspideredWWWRootRequest;
	// for spider replies:
	int32_t m_numGoodReplies;       // replies with no error code
	int32_t m_numErrorReplies;      // replies carrying an error code
};
// firstip -> UStat table populated while scanning spiderdb
static HashTableX g_ut;
// . fold one SpiderRequest into the per-firstip stats table g_ut
// . "hadReply" is true if a SpiderReply with the same url hash was seen
//   just before this request in the scan
// . "now" is the current local time, used to compute request ages
void addUStat1 ( SpiderRequest *sreq, bool hadReply , int32_t now ) {
	int32_t firstIp = sreq->m_firstIp;
	// lookup the existing stat slot for this firstip
	int32_t n = g_ut.getSlot ( &firstIp );
	UStat *us = NULL;
	UStat tmp;
	if ( n < 0 ) {
		// not in the table yet. add a zeroed-out entry first, then
		// point at the table's own copy so updates stick.
		us = &tmp;
		memset(us,0,sizeof(UStat));
		g_ut.addKey(&firstIp,us);
		us = (UStat *)g_ut.getValue ( &firstIp );
	}
	else {
		us = (UStat *)g_ut.getValueFromSlot ( n );
	}
	// if addKey failed (table full / out of memory) do not crash on a
	// NULL dereference; just drop this record's stats
	if ( ! us ) return;
	int32_t age = now - sreq->m_addedTime;
	// inc the counts
	us->m_numRequests++;
	if ( hadReply) us->m_numRequestsWithReplies++;
	if ( sreq->m_hopCount == 0 ) {
		if ( sreq->m_isWWWSubdomain ) us->m_numWWWRoots++;
		else                          us->m_numNonWWWRoots++;
	}
	else if ( sreq->m_hopCount == 1 ) us->m_numHops1++;
	else if ( sreq->m_hopCount == 2 ) us->m_numHops2++;
	else if ( sreq->m_hopCount >= 3 ) us->m_numHops3orMore++;
	// track the age of the youngest request that did get a reply
	if ( hadReply ) {
		if (age < us->m_ageOfYoungestSpideredRequest ||
		    us->m_ageOfYoungestSpideredRequest == 0 )
			us->m_ageOfYoungestSpideredRequest = age;
	}
	// and the oldest request still waiting on a reply
	if ( ! hadReply ) {
		if (age > us->m_ageOfOldestUnspideredRequest ||
		    us->m_ageOfOldestUnspideredRequest == 0 )
			us->m_ageOfOldestUnspideredRequest = age;
	}
	// same, restricted to unspidered www root urls
	if ( ! hadReply && sreq->m_hopCount == 0 && sreq->m_isWWWSubdomain ) {
		if (age > us->m_ageOfOldestUnspideredWWWRootRequest ||
		    us->m_ageOfOldestUnspideredWWWRootRequest == 0 )
			us->m_ageOfOldestUnspideredWWWRootRequest = age;
	}
}
// . fold one SpiderReply into the per-firstip stats table g_ut
// . counts the reply as good or error based on its error code
// . "now" is accepted for interface symmetry with addUStat1 but is
//   currently unused here
void addUStat2 ( SpiderReply *srep , int32_t now ) {
	int32_t firstIp = srep->m_firstIp;
	// lookup the existing stat slot for this firstip
	int32_t n = g_ut.getSlot ( &firstIp );
	UStat *us = NULL;
	UStat tmp;
	if ( n < 0 ) {
		// not in the table yet. add a zeroed-out entry first, then
		// point at the table's own copy so updates stick.
		us = &tmp;
		memset(us,0,sizeof(UStat));
		g_ut.addKey(&firstIp,us);
		us = (UStat *)g_ut.getValue ( &firstIp );
	}
	else {
		us = (UStat *)g_ut.getValueFromSlot ( n );
	}
	// if addKey failed (table full / out of memory) do not crash on a
	// NULL dereference; just drop this record's stats
	if ( ! us ) return;
	// inc the counts
	if ( srep->m_errCode )
		us->m_numErrorReplies++;
	else
		us->m_numGoodReplies++;
}
// . dump spiderdb records for collection "coll" to stdout, or aggregate
//   statistics about them, scanning with Msg5 in blocking mode
// . printStats == 0: print every raw spiderdb record (requests and replies)
// . printStats == 1: aggregate per-firstIp url stats into g_ut (UStat table)
//                    via addUStat1()/addUStat2() and print them at the end
// . printStats == 2: aggregate url counts per domain and per ip and print
//                    "IP domain urlCount" lines at the end
// . firstIp, if non-zero, restricts the scan to that one firstIp's key range
// . returns 0 on success, -1 on error (note: "buf" is intentionally not
//   freed on the early-return error paths -- this is a one-shot command
//   line utility that exits right after)
int32_t dumpSpiderdb ( char *coll,
		       int32_t startFileNum , int32_t numFiles , bool includeTree ,
		       char printStats ,
		       int32_t firstIp ) {

	if ( startFileNum < 0 ) {
		log(LOG_LOGIC,"db: Start file number is < 0. Must be >= 0.");
		return -1;
	}

	// the per-firstIp UStat table is only needed in stats mode 1
	if ( printStats == 1 ) {
		//g_conf.m_maxMem = 2000000000LL; // 2G
		//g_mem.m_maxMem = 2000000000LL; // 2G
		// 4-byte firstIp key -> UStat value, 10M slots
		if ( ! g_ut.set ( 4, sizeof(UStat), 10000000, NULL,
				  0,0,false,"utttt") )
			return -1;
	}

	//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
	//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
	//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
	//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
	g_spiderdb.init ();
	//g_collectiondb.init(true);
	g_spiderdb.getRdb()->addRdbBase1(coll );

	// scan the full spiderdb key range by default
	key128_t startKey ;
	key128_t endKey   ;
	startKey.setMin();
	endKey.setMax();

	// start based on firstip if non-zero
	if ( firstIp ) {
		startKey = g_spiderdb.makeFirstKey ( firstIp );
		endKey   = g_spiderdb.makeLastKey  ( firstIp );
	}

	//int32_t t1 = 0;
	//int32_t t2 = 0x7fffffff;

	// turn off threads so msg5.getList() below always completes
	// synchronously (never blocks waiting on a thread)
	g_threads.disableThreads();

	// get a meg at a time
	int32_t minRecSizes = 1024*1024;
	Msg5 msg5;
	RdbList list;

	// clear before calling Msg5
	g_errno = 0;

	// init stats vars
	int32_t negRecs   = 0;
	int32_t emptyRecs = 0;
	int32_t uniqDoms  = 0;

	// count urls per domain in "domTable"
	HashTable domTable;
	domTable.set ( 1024*1024 );

	// count every uniq domain per ip in ipDomTable (uses dup keys)
	HashTableX ipDomTable;
	// allow dups? true!
	ipDomTable.set ( 4,4,5000000 , NULL, 0, true ,0, "ipdomtbl");

	// count how many unique domains per ip
	HashTable ipDomCntTable;
	ipDomCntTable.set ( 1024*1024 );

	// buffer for holding the domains; each entry in "buf" is a 4-byte
	// url count followed by the NUL-terminated domain string, and the
	// hash tables store offsets into this buffer
	int32_t bufSize = 1024*1024;
	char *buf = (char *)mmalloc(bufSize,"spiderstats");
	int32_t bufOff = 0;

	int32_t count         = 0;
	int32_t countReplies  = 0;
	int32_t countRequests = 0;
	int64_t offset        = 0LL;
	int32_t now;

	// spiderdb sorts each url's SpiderReply just before its
	// SpiderRequests, so remember the last reply seen to tell whether
	// the following requests already got spidered
	static int64_t s_lastRepUh48  = 0LL;
	static int32_t s_lastErrCode  = 0;
	static int32_t s_lastErrCount = 0;

	CollectionRec *cr = g_collectiondb.getRec(coll);

 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_SPIDERDB   ,
			      cr->m_collnum  ,
			      &list          ,
			      (char *)&startKey ,
			      (char *)&endKey   ,
			      minRecSizes   ,
			      includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      startFileNum  ,
			      numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         )){// err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return -1;
	}

	// all done if empty
	if ( list.isEmpty() ) goto done;

	// this may not be in sync with host #0!!!
	now = getTimeLocal();

	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		// get it
		char *srec = list.getCurrentRec();
		// save it
		int64_t curOff = offset;
		// and advance
		offset += list.getCurrentRecSize();
		// must be a request -- for now, for stats
		if ( ! g_spiderdb.isSpiderRequest((key128_t *)srec) ) {
			// print it
			if ( ! printStats ) {
				printf( "offset=%"INT64" ",curOff);
				g_spiderdb.print ( srec );
				printf("\n");
			}
			// its a spider reply
			SpiderReply *srep = (SpiderReply *)srec;
			// store it so the requests that follow know they
			// were spidered and with what outcome
			s_lastRepUh48  = srep->getUrlHash48();
			s_lastErrCode  = srep->m_errCode;
			s_lastErrCount = srep->m_errCount;
			countReplies++;
			// get firstip
			if ( printStats == 1 ) addUStat2 ( srep , now );
			continue;
		}
		// cast it
		SpiderRequest *sreq = (SpiderRequest *)srec;
		countRequests++;
		int64_t uh48 = sreq->getUrlHash48();
		// count how many requests had replies and how many did not
		bool hadReply = ( uh48 == s_lastRepUh48 );
		// get firstip
		if ( printStats == 1 ) addUStat1 ( sreq , hadReply , now );
		// print it
		if ( ! printStats ) {
			printf( "offset=%"INT64" ",curOff);
			g_spiderdb.print ( srec );
			printf(" requestage=%"INT32"s",now-sreq->m_addedTime);
			printf(" hadReply=%"INT32"",(int32_t)hadReply);
			printf(" errcount=%"INT32"",(int32_t)s_lastErrCount);
			if ( s_lastErrCode )
				printf(" errcode=%"INT32"(%s)",(int32_t)s_lastErrCode,
				       mstrerror(s_lastErrCode));
			else
				printf(" errcode=%"INT32"",(int32_t)s_lastErrCode);
			// we haven't loaded hosts.conf so g_hostdb.m_map
			// is not set right... so this is useless
			//printf(" shard=%"INT32"\n",
			//     (int32_t)g_hostdb.getShardNum(RDB_SPIDERDB,sreq));
			printf("\n");
		}
		// print a counter
		if ( ((count++) % 100000) == 0 )
			fprintf(stderr,"Processed %"INT32" records.\n",count-1);
		// the per-domain/per-ip accounting below is stats mode 2 only
		if ( printStats != 2 ) continue;
		// skip negatives (low key bit clear = delete marker)
		if ( (sreq->m_key.n0 & 0x01) == 0x00 ) continue;
		// skip bogus shit
		if ( sreq->m_firstIp == 0 || sreq->m_firstIp==-1 ) continue;
		// shortcut
		int32_t domHash = sreq->m_domHash32;
		// . is it in the domain table?
		// . keeps count of how many urls per domain
		int32_t slot = domTable.getSlot ( domHash );
		if ( slot >= 0 ) {
			int32_t off = domTable.getValueFromSlot ( slot );
			// just inc the count for this domain
			*(int32_t *)(buf + off) = *(int32_t *)(buf + off) + 1;
			continue;
		}
		// get the domain
		int32_t domLen = 0;
		char *dom = getDomFast ( sreq->m_url , &domLen );
		// always need enough room for count + domain + NUL
		if ( bufOff + 4 + domLen + 1 >= bufSize ) {
			int32_t growth = bufSize * 2 - bufSize;
			// limit growth to 10MB each time
			if ( growth > 10*1024*1024 ) growth = 10*1024*1024;
			int32_t newBufSize = bufSize + growth;
			char *newBuf = (char *)mrealloc( buf , bufSize ,
							 newBufSize,
							 "spiderstats");
			if ( ! newBuf ) return -1;
			// re-assign
			buf     = newBuf;
			bufSize = newBufSize;
		}
		// otherwise add it, it is a new never-before-seen domain
		//char poo[999];
		//gbmemcpy ( poo , dom , domLen );
		//poo[domLen]=0;
		//fprintf(stderr,"new dom %s hash=%"INT32"\n",dom,domHash);
		// store the count of urls followed by the domain
		char *ptr = buf + bufOff;
		*(int32_t *)ptr = 1;
		ptr += 4;
		gbmemcpy ( ptr , dom , domLen );
		ptr += domLen;
		*ptr = '\0';
		// use an ip of 1 if it is 0 so it hashes right
		int32_t useip = sreq->m_firstIp; // ip;
		// can't use 1 because it all clumps up!!
		//if ( ip == 0 ) useip = domHash ;
		// this table counts how many urls per domain, as
		// well as stores the domain
		if ( ! domTable.addKey (domHash , bufOff) ) return -1;
		// . if this is the first time we've seen this domain,
		//   add it to the ipDomTable
		// . this hash table must support dups.
		// . we need to print out all the domains for each ip
		if ( ! ipDomTable.addKey ( &useip , &bufOff ) ) return -1;
		// . this table counts how many unique domains per ip
		// . it is kind of redundant since we have ipDomTable
		int32_t ipCnt = ipDomCntTable.getValue ( useip );
		if ( ipCnt < 0 ) ipCnt = 0;
		if ( ! ipDomCntTable.addKey ( useip, ipCnt+1) ) return -1;
		// advance to next empty spot
		bufOff += 4 + domLen + 1;
		// count unique domains
		uniqDoms++;
	}

	// advance the scan past the last key we read
	startKey = *(key128_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around: if the increment wrapped to zero we
	// have covered the whole key space, so fall through to "done"
	if ( startKey >= *(key128_t *)list.getLastKey() ) goto loop;

 done:
	// print out the stats
	if ( ! printStats ) return 0;

	// print UStats now (stats mode 1: one line per firstIp)
	if ( printStats == 1 ) {
		for ( int32_t i = 0 ; i < g_ut.getNumSlots();i++ ) {
			// skip empty hash table slots
			if ( g_ut.m_flags[i] == 0 ) continue;
			UStat *us = (UStat *)g_ut.getValueFromSlot(i);
			int32_t firstIp = *(int32_t *)g_ut.getKeyFromSlot(i);
			fprintf(stdout,"%s ",
				iptoa(firstIp));
			fprintf(stdout,"requests=%"INT32" ",
				us->m_numRequests);
			fprintf(stdout,"wwwroots=%"INT32" ",
				us->m_numWWWRoots);
			fprintf(stdout,"nonwwwroots=%"INT32" ",
				us->m_numNonWWWRoots);
			fprintf(stdout,"1hop=%"INT32" ",
				us->m_numHops1);
			fprintf(stdout,"2hop=%"INT32" ",
				us->m_numHops2);
			fprintf(stdout,"3hop+=%"INT32" ",
				us->m_numHops3orMore);
			fprintf(stdout,"mostrecentspider=%"INT32"s ",
				us->m_ageOfYoungestSpideredRequest);
			fprintf(stdout,"oldestunspidered=%"INT32"s ",
				us->m_ageOfOldestUnspideredRequest);
			fprintf(stdout,"oldestunspideredwwwroot=%"INT32" ",
				us->m_ageOfOldestUnspideredWWWRootRequest);
			fprintf(stdout,"spidered=%"INT32" ",
				us->m_numRequestsWithReplies);
			fprintf(stdout,"goodspiders=%"INT32" ",
				us->m_numGoodReplies);
			fprintf(stdout,"errorspiders=%"INT32"",
				us->m_numErrorReplies);
			fprintf(stdout,"\n");
		}
		return 0;
	}

	// stats mode 2: print per-ip domain lists and summary totals
	int32_t uniqIps = ipDomCntTable.getNumSlotsUsed();

	// print out all ips, and # of domains they have and list of their
	// domains
	int32_t nn = ipDomTable.getNumSlots();
	// i is the bucket to start at, must be EMPTY! (since ipDomTable
	// allows dup keys, starting the circular scan at an empty bucket
	// guarantees each dup chain is contiguous when we walk it)
	int32_t i = 0;
	// count how many buckets we visit
	int32_t visited = 0;
	// find the empty bucket
	for ( i = 0 ; i < nn ; i++ )
		if ( ipDomTable.m_flags[i] == 0 ) break;
		//if ( ipDomTable.getKey(i) == 0 ) break;
	// now we can do our scan of the ips. there can be dup ips in the
	// table so we must chain for each one we find
	for ( ; visited++ < nn ; i++ ) {
		// wrap it
		if ( i == nn ) i = 0;
		// skip empty buckets
		if ( ipDomTable.m_flags[i] == 0 ) continue;
		// get ip of the ith slot
		int32_t ip = *(int32_t *)ipDomTable.getKeyFromSlot(i);
		// get it in the ip table, if not there (or already
		// processed and removed below), skip it
		int32_t domCount = ipDomCntTable.getValue ( ip ) ;
		if ( domCount == 0 ) continue;
		// log the count
		int32_t useip = ip;
		if ( ip == 1 ) useip = 0;
		fprintf(stderr,"%s has %"INT32" domains.\n",iptoa(useip),domCount);
		// . how many domains on that ip, print em out
		// . use j for the inner loop
		int32_t j = i;
		// buf for printing ip
		char ipbuf[64];
		sprintf (ipbuf,"%s",iptoa(useip) );
	jloop:
		int32_t ip2 = *(int32_t *)ipDomTable.getKeyFromSlot ( j ) ;
		if ( ip2 == ip ) {
			// get count
			int32_t off = *(int32_t *)ipDomTable.getValueFromSlot ( j );
			char *ptr = buf + off;
			int32_t cnt = *(int32_t *)ptr;
			char *dom = buf + off + 4;
			// print: "IP Domain urlCountInDomain"
			fprintf(stderr,"%s %s %"INT32"\n",ipbuf,dom,cnt);
			// advance && wrap
			if ( ++j >= nn ) j = 0;
			// keep going
			goto jloop;
		}
		// not an empty bucket, so keep chaining
		if ( ip2 != 0 ) {
			// advance & wrap
			if ( ++j >= nn ) j = 0;
			// keep going
			goto jloop;
		}
		// ok, we are done, do not do this ip any more
		ipDomCntTable.removeKey(ip);
	}

	if ( negRecs )
		fprintf(stderr,"There are %"INT32" total negative records.\n",
			negRecs);
	// NOTE: this line previously mislabeled emptyRecs as "negative
	// records" (copy-paste of the line above)
	if ( emptyRecs )
		fprintf(stderr,"There are %"INT32" total empty records.\n",
			emptyRecs);
	//fprintf(stderr,"There are %"INT32" total urls.\n",count);
	fprintf(stderr,"There are %"INT32" total records.\n",count);
	fprintf(stderr,"There are %"INT32" total request records.\n",countRequests);
	fprintf(stderr,"There are %"INT32" total replies records.\n",countReplies);
	// end with total uniq domains
	fprintf(stderr,"There are %"INT32" unique domains.\n",uniqDoms);
	// and with total uniq ips in this priority
	fprintf(stderr,"There are %"INT32" unique IPs.\n",uniqIps);
	return 0;
}
/*
static bool makeNewTitleRecKey ( char *rec , int32_t recSize , key_t *newk ,
TitleRec *tr , int64_t *h ) ;
// how big can a compressed title record be?
#define MAX_TR_SIZE (2*1024*1024)
// returns false and sets g_errno on error
bool makeNewTitleRecKey ( char *rec , int32_t recSize , key_t *newk ,
TitleRec *tr , int64_t *h ) {
// if uncompress failed, just keep looping
if ( ! xd->set
if ( ! tr->set ( rec , MAX_TR_SIZE , false ) )
return log("db: TitleRec uncompress failed. continuing.");
// get hashes
Xml xml;
//CrashMe();
xml.set ( tr->getCharset(),tr->getContent() , tr->getContentLen() ,
false, 0, false, tr->getVersion() );
*h = g_checksumdb.getContentHash ( &xml,tr->getUrl(), tr->getLinkInfo(),
tr->getVersion(),
0);// niceness
int32_t contentHash = (int32_t)*h;
int32_t hostHash = hash32 (tr->getUrl()->getHost() ,
tr->getUrl()->getHostLen() );
// remake the key with these hashes in the low bits
*newk = g_titledb.makeTitleRecKey ( tr->getDocId() ,
false , // del key?
hostHash , contentHash ,
false , // adult bit is false
false ); // adul category is false
return true;
}
*/
/*
bool addToTfndb ( char *coll , TitleRec *tr , int32_t id2 ) {
// add to tfndb if we should
int32_t e = g_tfndb.makeExt ( tr->getUrl() );
key_t k = g_tfndb.makeKey ( tr->getDocId(), e, id2 , // tfn
false , false ); // clean? del?
// get the rdb of the tfndb
Rdb *r = g_tfndb.getRdb();
// do a non-blocking dump of tree if it's 90% full now
if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
if ( ! r->dumpTree ( 0 ) ) // niceness
return log("db: addToTfndb: dumpTree failed" );
}
// returns false and sets g_errno on error
if ( ! r->addRecord ( coll, k , NULL , 0 , 0) )
return log("db: addToTfndb: addRecord: %s",mstrerror(g_errno));
return true;
}
bool addToTfndb2 ( char *coll , SpiderRec *sr , int32_t id2 ) {
// add to tfndb if we should
int32_t e = g_tfndb.makeExtQuick ( sr->getUrl() );
int64_t d = g_titledb.getProbableDocId ( sr->getUrl() );
key_t k = g_tfndb.makeKey ( d, e, 255 , // tfn
0 , false ); // is clean?, del?
// get the rdb of the tfndb
Rdb *r = g_tfndb.getRdb();
// do a non-blocking dump of tree if it's 90% full now
if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
if ( ! r->dumpTree ( 0 ) ) // niceness
return log("db: addToTfndb2: dumpTree failed" );
}
// returns false and sets g_errno on error
if ( ! r->addRecord ( coll, k , NULL , 0 , 0) )
return log("db: addToTfndb2: addRecord: %s",mstrerror(g_errno));
return true;
}
*/
//Need these two if tr's in addtospiderdb are getting their quality from
// their root urls.
/*bool loadRootUrls ( char *filename){
File f;
f.set ( filename );
// open files
if ( ! f.open ( O_RDONLY ) ) {
log("init: Rooturls open: %s %s",filename,mstrerror(g_errno));
return 0;
}
// get file size
int32_t fileSize = f.getFileSize() ;
//init hashtable to lets say 1 mil
// store a \0 at the end
int32_t bufSize = fileSize + 1;
// make buffers to hold all
char *buf = (char *) mmalloc ( bufSize , "RootUrls" );
if ( ! buf) {
log("init: Rooturls mmalloc: %s",mstrerror(errno));
return 0;
}
//char *bufEnd = buf + bufSize;
// set m_p1
char *p = buf;
char *pend = buf + bufSize - 1;
// read em all in
if ( ! f.read ( buf , fileSize , 0 ) ) {
log("init: Rooturls read: %s %s",filename,mstrerror(g_errno));
return 0;
}
//Close the file, no need to waste mem
f.close();
// making all the \n's \0's
for (int32_t i=0; i<bufSize;i++){
if (buf[i]!='\n') continue;
buf[i]='\0';
}
char q;
int64_t h;
while (p<pend){
char *p1 = strstr(p,"q=");
if(!p1) {
p+=gbstrlen(p)+1;continue;}
p1+=2;
q=atoi(p1);
char *p2 = strstr(p,"http://");
if (!p2) {
p+=gbstrlen(p)+1;continue;}
// since these are all 'supposed' to be root urls, not
// checking for that. Even if they aren't shouldn't be a
// problem except for a bloated hashtable
h=hash64Lower(p2,gbstrlen(p2));
s_rootUrls.addKey(h,q);
// move to the next string
p+=gbstrlen(p)+1;
}
log ("init: %"INT32" Rooturls added to hashtable",
s_rootUrls.getNumSlotsUsed());
//free the buf
mfree(buf,bufSize,"RootUrls");
return true;
}*/
/*
bool addToSpiderdb ( char *coll , TitleRec *tr ) {
// try spider db now
//int32_t date = tr->getSpiderDate();
// get length
//int32_t collLen = gbstrlen(coll);
// base priority on # of path components
//unsigned char priority = tr->getSpiderPriority();
//int32_t npc = tr->getUrl()->getPathDepth();
// count ending file name as a path component
//if ( tr->getUrl()->getFilenameLen() > 0 ) npc++;
// count cgi crap as one path component
//if ( tr->getUrl()->isCgi() ) npc++;
//if ( npc <= 5 ) priority = 19 - npc;
//else priority = 0;
// spammers love to create millions of hostnames on the same domain
//if ( ! tr->getUrl()->isSuperRoot() ) npc++;
// if more than 10 linkers, make it 5
// hey, doesn't his count internal linkers, too? skip it then
// higher quality pages get higher priority, too
// MOD for GK cluster
// But don't get the quality from the titleRec. Since for gk
// the titlerecs do not have the right quality, get the
// root urls quality. For that loadRootUrls() must have already
// been called by gendbs.
//int32_t q = tr->getDocQuality();
//if ( q > 50 && priority < 13 ) priority = 13;
//if ( q > 70 && priority < 14 ) priority = 14;
//if ( q > 85 && priority < 15 ) priority = 15;
//
// BEGIN SPECIAL CODE FOR FIXING SCORING BUG
//
// only get older versions before the fix
//if ( tr->getVersion() >= 49 ) return true;
// quick estimate of words, this works fast and well!!
// see *exactly* how many words we have here
// temp filter, only add big ones because they are the ones
// that are messing us up the most
//if ( tr->getContentLen() < 40000 ) return true;
// temp hack
//priority = 6;
//
// END SPECIAL CODE FOR FIXING SCORING BUG
//
key_t k = g_spiderdb.makeSpiderRecKey ( tr->getDocId() ,
tr->getNextSpiderDate(), // date ,
tr->getNextSpiderPriority() , // priority,
0,
false, false ,
// this is now obsolete
true );
//!tr->isSpiderLinksFalse());
// sanity check
if ( getGroupId(RDB_SPIDERDB,&k) != g_hostdb.m_groupId ) {
log("spider key is wrong groupId");
char *xx = NULL; *xx = 0; }
// add to spiderdb now
SpiderRec sr;
sr.set ( tr->getUrl () ,
coll , // tr->getColl () ,
gbstrlen(coll) , // tr->getCollLen () ,
tr->getNextSpiderDate() ,
tr->getNextSpiderPriority() , // priority ,
0 , // retryNum
false , // forced?
false , // is new?
-1 , // url, not docId based
false , // forceDelete?
-1 , // ruleset
tr->getIp() , // ip
tr->getIp() , // sanityIp
tr->getDocQuality() , // docQuality
tr->getHopCount() );// hopCount
if ( sr.getStoredSize () > 2048 )
return log("db: makespiderdb: could not store %s",
tr->getUrl()->getUrl());
// serialize into buf
char buf [ 4096 ];
int32_t recSize = sr.store ( buf , 2048 );
// get the rdb of the spiderdb
Rdb *r = g_spiderdb.getRdb();
// do a non-blocking dump of tree if it's 90% full now
if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
if ( ! r->dumpTree ( 0 ) ) // niceness
return log("db: makespiderdb: dumpTree failed" );
}
// returns false and sets g_errno on error
if ( ! r->addRecord ( coll , k , buf , recSize , 0) )
return log("db: addToSpiderdb: addRecord: %s",
mstrerror(g_errno));
return true;
}
BigFile s_cf [ MAX_HOSTS ];
int64_t s_cfoff [ MAX_HOSTS ] ; // file offsets
static bool s_cdbInit = false;
bool addToChecksumdb ( char *coll , TitleRec *tr ) {
// we have to make multiple checksumdbs since we won't store
// all of them locally ourselves
if ( ! s_cdbInit ) {
// open up one checksumdb FLAT file for each group
int32_t ng = g_hostdb.getNumShards();
for ( int32_t i = 0 ; i < ng ; i++ ){
char name[64];
// . initialize our own internal rdb
// . the %"XINT32" in "g%"INT32"" is the group NUM to which the
// keys in this file belong, the "h%"INT32"" is the host
// number that generated these keys
sprintf(name,"checksumg%"INT32"h%"INT32"db",i,g_hostdb.m_hostId);
// unlink this file just in case
s_cf[i].set ( g_hostdb.m_dir , name );
s_cf[i].unlink();
if ( ! s_cf[i].open ( O_RDWR | O_CREAT ) )
return log("db: addToChecksumdb: cannot open %s",
name);
s_cfoff[i] = 0LL;
}
s_cdbInit = true;
}
//key_t k ;
int32_t cKeySize = g_conf.m_checksumdbKeySize;
char k[16];
// this fails on out of memory to set the Xml class.
//if ( ! tr->getChecksumKey(&k) )
//if ( ! tr->getChecksumKey(k) )
// return log("db: addToChecksumdb: getChecksumKey failed: %s.",
// mstrerror(g_errno));
TitleRec *otr = tr;
//
// get the checksumdb key just like we get it in Msg16.cpp!!
// TODO: store in title rec
//
Xml xml;
if ( ! xml.set ( tr->getCharset() ,
tr->getContent() ,
tr->getContentLen() ,
false ,
0,
false ,
tr->getVersion() ,
true , // setParentArgs
MAX_NICENESS) )
return log("db: addToChecksumdb: getChecksumKey failed: %s.",
mstrerror(g_errno));
// MDW: we should have the xml already parsed here!
//Xml *xml = m_oldDoc.getXmlDoc()->getXml();
int64_t h;
// get link infos
LinkInfo *linkInfo = otr->getLinkInfo ();
//LinkInfo *linkInfo2 = otr->getLinkInfo2();
h = g_checksumdb.getContentHash ( &xml ,
otr->getUrl() ,
linkInfo ,
otr->getVersion() ,
MAX_NICENESS );
// get our doc's link-adjusted quality
char newQuality = otr->getDocQuality();
// make the OLD dup key
char oldk[16];
g_checksumdb.makeDedupKey ( otr->getUrl() ,
h ,
otr->getDocId() ,
otr->getVersion () ,
false , //del
newQuality ,
oldk );
// from Msg1.cpp:55
uint32_t groupId = getGroupId ( RDB_CHECKSUMDB , &k );
int32_t dbnum = g_hostdb.makeHostId ( groupId );
log(LOG_INFO,"mila groupId= %"UINT32" hostId=%"INT32"",
groupId,dbnum);
// debug msg
//log("db: %08"XINT32" %016"XINT64" %s",k.n1,k.n0,url->getUrl());
// add to the appropriate checksumdb slice
//if ( ! s_cf[dbnum].write ( &k , sizeof(key_t), s_cfoff[dbnum] ) )
if ( ! s_cf[dbnum].write ( k , cKeySize, s_cfoff[dbnum] ) )
return log("db: addToChecksumdb: write checksumdb failed");
//s_cfoff[dbnum] += sizeof(key_t);
s_cfoff[dbnum] += cKeySize;
return true;
}
*/
/*
bool mergeChecksumFiles ( ) {
// if main checksumdb file already exists, do not do merge
BigFile f;
f.set (g_hostdb.m_dir,"checksumdb-saved.dat");
if ( f.doesExist() ) return true;
f.set (g_hostdb.m_dir,"checksumdb0001.dat");
if ( f.doesExist() ) return true;
// disable threads so everything is blocking
g_threads.disableThreads();
// open up one checksumdb FLAT file for each group
bool flag = false;
int64_t count = 0;
int32_t ng = g_hostdb.getNumShards();
for ( int32_t i = 0 ; i < ng ; i++ ) {
// . initialize our own internal rdb
// . the %"XINT32" in "g%"XINT32"" is the group id to which the keys
// in this file belong, the "h%"INT32"" is the host number that
// generated these keys
// . g_hostdb.m_hostId is also our group NUM
char name[64];
sprintf(name,"checksumg%"INT32"h%"INT32"db",g_hostdb.m_hostId,i);
f.set (g_hostdb.m_dir,name);
// if file does not exist then do not do any merging
if ( ! f.doesExist() ) continue;
// otherwise, we're doing a merge, so announce it
if ( ! flag ) {
flag = true;
log("db: *-*-*-* mergeChecksumdbs: merging "
"%s/checksumg*h*db* files",g_hostdb.m_dir );
}
// open just for reading
if ( ! f.open ( O_RDONLY ) ) {
g_threads.enableThreads();
return log("db: mergeChecksumFiles: cannot open %s",
name);
}
// mention it
log("db: mergeChecksumdbs: merging %s",name);
int64_t off = 0LL;
// now add them one at a time to our g_checksumdb
//key_t k;
int32_t cKeySize = g_conf.m_checksumdbKeySize;
char k[16];
// how big is the file?
int64_t fileSize = f.getFileSize();
loop:
//if ( ! f.read ( &k, sizeof(key_t) , off ) ) {
if ( ! f.read ( k , cKeySize , off ) ) {
g_threads.enableThreads();
return log("db: mergeChecksumFiles: %s off=%"INT64" "
"read failed", name, off );
}
//off += sizeof(key_t);
off += cKeySize;
Rdb *r = g_checksumdb.getRdb();
count++;
// do a non-blocking dump of tree if it's 90% full now
if (r->m_mem.is90PercentFull() || r->m_tree.is90PercentFull()){
if ( ! r->dumpTree ( 0 ) ) {// niceness
g_threads.enableThreads();
return log("db: mergeChecksums: dumpTree failed" );
}
}
// returns false and sets g_errno on error. finalmerge=coll
if ( ! r->addRecord ( "finalmerge", k , NULL , 0 , 0) ) {
g_threads.enableThreads();
return log("db: mergeChecksums: addRecord: %s",
mstrerror(g_errno));
}
// loop if more to go
if ( off < fileSize ) goto loop;
// otherwise, we're done with this file, do next one
f.close();
}
// save g_checksumdb
g_checksumdb.getRdb()->close ( NULL, NULL, true, false );
// announce it
log("db: *-*-*-* mergeChecksumdbs: merge complete. added %"INT64" keys to "
"checksumdb.",count);
g_threads.enableThreads();
return true;
}
*/
/*
// . returns false and sets g_errno on error, true on success
// . some temp code to convert our key format to the new key format
// . can also be used to regenerate tfndb and checksumdb
bool fixTitleRecs( char *coll ) {
RdbBase *tbase = getRdbBase ( RDB_TITLEDB , coll );
bool flag = true;
bool doChecksumdb = true ;
bool doTfndb = true ;
bool doSpiderdb = true ;
// disable threads so everything is blocking
g_threads.disableThreads();
// but if titledb has more than 1 file on disk, they need to be merged
// so we can re-write the keys without fear of encountering deletes
// for which we cannot compute the site or content hashes to make
// the new titleRec key
if ( tbase->getNumFiles() > 1 )
return log("fixTitleRecs: more than one titledb file "
"found");
collnum_t collnum = g_collectiondb.getCollnum ( coll );
key_t k;
char *rec;
int32_t recSize;
TitleRec tr;
key_t newk;
int64_t h;
bool isNegative = false;
int32_t count = 0;
// change the keys of TitleRecs in the RdbTree
RdbTree *tt = &g_titledb.getRdb()->m_tree;
// how many nodes in title rec tree?
int32_t nn = tt->getNumNodes();
// debug msg
log("db: *-*-*-* Converting %"INT32" title rec keys in tree.",nn);
if ( doChecksumdb ) log("db: *-*-*-* Generating checksumdb");
if ( doTfndb ) log("db: *-*-*-* Generating tfndb");
if ( doSpiderdb ) log("db: *-*-*-* Generating spiderdb");
// make sure tree is good
//if ( ! tt->checkTree ( true ) ) return false;
// get id2 of titledb
int32_t id2 = tbase->m_fileIds2[0];
// loop through all the nodes, go by k
for ( int32_t i = 0 ; i < nn ; i++ ) {
// skip if empty
if ( tt->m_parents[i] == -2 ) continue;
// get his key
k = *(key_t *)tt->getKey(i);
// declare these up here since we have a "goto skip"
RdbList tlist;
Msg5 msg5;
Msg5 msg5b;
key_t startKey ;
key_t endKey ;
// positives are easy
if ( (k.n0 & 0x01) == 0x01 ) {
if(!tt->getList(collnum,k,k,10,&tlist,NULL,NULL,false))
return log("getlist failed");
if ( tlist.isExhausted() ) {
log("db: getlist failed 2 "
"i=%"INT32" n1=%"XINT32" n0=%"XINT64". continuing.",
i,k.n1,k.n0);
continue;
}
tlist.resetListPtr();
goto skip;
}
// get this rec and its positive, if any
startKey = k;
endKey = k;
endKey.n0 |= 0x01;
// look it up, block
if ( ! msg5.getList ( RDB_TITLEDB ,
coll ,
&tlist ,
startKey ,
endKey ,
8000 , // minRecSizes
false , // includeTree?
false , // addToCache?
0 , // max cache age
0 , // startFileNum
-1 , // numFiles (-1 =all)
NULL , // state
NULL , // callback
2 , // niceness
false ,// error correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b ))
return log(LOG_LOGIC,"db: getList did not block.");
// . if the negative has no positive, list will NOT be empty
// . this also happens if negative key has been converted in
// the tree, but positive key on disk have not been...
if ( ! tlist.isExhausted() ) {
int64_t d = g_titledb.getDocIdFromKey ( k );
log("db: docId %"INT64" has negative but no positive",d);
continue;
}
isNegative = true;
skip:
// make tr
rec = tlist.getCurrentRec();
// get new key, skip if set failed
if ( ! makeNewTitleRecKey ( rec,MAX_TR_SIZE, &newk,&tr,&h ) ) {
log("db: tree node titleRec set failed. continuing.");
continue;
}
// if positive, save checksumdb, tfndb and spiderdb
if ( ! isNegative ) {
if ( doTfndb && ! addToTfndb (coll,&tr,id2)) return false;
if ( doSpiderdb && ! addToSpiderdb (coll,&tr)) return false;
if ( doChecksumdb && ! addToChecksumdb(coll,&tr)) return false;
// log every 100 or so
if ( count % 100 == 0 )
log("db: #%"INT32" %s",count,xd.ptr_firstUrl);
count++;
}
// if already processed, skip it!
if ( newk == k ) continue;
// make negative again
if ( isNegative ) {
newk.n0 &= 0xfffffffffffffffeLL;
isNegative = false;
((key_t *)(tt->m_keys))[i] = newk;
continue;
}
// change the key, should not affect the ordering
((key_t *)(tt->m_keys))[i] = newk;
}
// save the converted tree
log("db: *-*-*-* Saving titledb-saved.dat");
tt->fastSave ( g_hostdb.m_dir , "titledb" , false , NULL , NULL );
// open the file of TitleRecs, should only be one of them
BigFile f;
f.set ( g_hostdb.m_dir , "titledb0001.dat" );
if ( ! f.open ( O_RDWR | O_TRUNC ) )
return log("fixTitleRecs: open: %s",
mstrerror(g_errno));
f.setBlocking ( );
int64_t off = 0;
// get one rec at a time and store in this buffer
char *buf = (char *)mmalloc ( MAX_TR_SIZE , "main");
if ( ! buf ) return log("fixTitleRecs: malloc failed");
int64_t fsize = f.getFileSize();
if ( fsize <= 0 ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return log("filesize of %s is %"INT64"",
f.getFilename(),fsize);
}
int32_t ng = g_hostdb.getNumShards();
// save the old map, do not overwrite any old one
log("db: *-*-*-* Moving old titledb0001.map to titledb0001.map.old");
sprintf ( buf , "mv -i %s/titledb0001.map %s/titledb0001.map.old",
g_hostdb.m_dir,g_hostdb.m_dir);
system ( buf );
// get the old map in memory
RdbMap *m = tbase->getMaps()[0];
// make a new map for the converted titledb
//sprintf ( buf , "%s/titledb0001.map",g_hostdb.m_dir);
// this will reset it
m->set ( g_hostdb.m_dir , "titledb0001.map" , -1, false,sizeof(key_t),
GB_INDEXDB_PAGE_SIZE);
loop:
// are we done?
if ( off >= fsize ) {
log("db: *-*-*-* Reached end of title file and tree. "
"Saving data to disk");
// save titledb tree if we modified it
//g_titledb.getRdb()->close ( NULL, NULL, true, false );
// dump trees we did
for ( int32_t i = 0 ; doChecksumdb && i < ng ; i++ )
s_cf[i].close ( );
if ( doTfndb )
g_tfndb.getRdb()->close ( NULL, NULL, true, false );
if ( doSpiderdb )
g_spiderdb.getRdb()->close ( NULL, NULL, true, false );
// re-enable threads
g_threads.enableThreads();
mfree ( buf , MAX_TR_SIZE , "main" );
// return now if we did not update titledb0001.dat at all
if ( flag ) return true;
//f.set ( g_hostdb.m_dir , "titledb0001.map");
//f.unlink();
log("db: *-*-*-*- Saving new titledb0001.map");
if ( ! m->writeMap() )
return log("fixTitleRecs: could not write "
"map file.");
return true;
}
// read in info about next titleRec
if ( ! f.read ( buf , 16 , off ) ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return log("reading blocked");
}
if ( g_errno ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return log("reading size error, needed 16");
}
// get the key and recSize
k = *(key_t *) buf;
recSize = *(int32_t *)(buf+12) + 16 ;
// bitch and fix if recSize is corrupt
if ( recSize > 4*1024*1024 || recSize < 16 ) {
log("db: fixTitleRecs: bad TitleRec size of %"INT32".",recSize);
log("db: fixTitleRecs: attempting to determine correct size.");
recSize = getRecSize ( &f , off );
if ( recSize < 0 ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return log("fixTitleRecs: attempt failed.");
}
log("db: fixTitleRecs: found size to be %"INT32"",recSize);
}
if ( recSize > MAX_TR_SIZE ) {
log("db: fixTitleRecs: tr size is %"INT32". skipping.",recSize);
off += recSize ;
goto loop;
}
// read in the key_recSiez+titleRec
if ( ! f.read ( buf , recSize, off )) {
mfree ( buf , MAX_TR_SIZE , "main" );
return log("reading blocked");
}
if ( g_errno ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return log("reading size error, needed 16");
}
// set our rec ptr to what we just read
rec = buf;
// get new key, skip if set failed
bool status = makeNewTitleRecKey ( rec, MAX_TR_SIZE, &newk,&tr,&h ) ;
// add to the map
if ( ! m->addRecord ( newk , buf , recSize ) ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return log("add to new map error");
}
// deal with title rec decompress failure
if ( ! status ) {
log("db: fixTitleRecs: makeNewTitleRecKey failed. "
"off=%"INT64" recSize=%"INT32".",off,recSize);
off += recSize ;
goto loop;
}
// only write back the new key if different from the old key
if ( newk != k ) {
// if we haven't already logged this do it now
if ( flag ) {
log("db: *-*-*-* Converting keys in titledb0001.dat.");
flag = false;
}
// ovewrite the old
if ( ! f.write ( &newk , 12 , off ) ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return log("overwrite failed. that sucks.");
}
}
// if key has negative equivalent in tree, do not add it to the 3 dbs
key_t negk = newk ; negk.n0 &= 0xfffffffffffffffeLL;
if ( tt->getNode ( collnum , negk ) < 0 ) {
// add recs to the three dbs
if ( doTfndb && ! addToTfndb ( coll , &tr , id2 ) ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return false;
}
if ( doSpiderdb && ! addToSpiderdb ( coll , &tr ) ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return false;
}
if ( doChecksumdb && ! addToChecksumdb ( coll , &tr ) ) {
mfree ( buf , MAX_TR_SIZE , "main" );
return false;
}
}
else
log("db: fixTitleRecs: key is negative in tree");
// advance to point to next titleRec now
off += recSize ;
// log every 100 or so
if ( count % 100 == 0 )
log("db: #%"INT32" %s",count,xd.ptr_firstUrl);
count++;
// loop for more
goto loop;
}
// . when a titleRec has an impossible size, there was disk corruption
// . try all possible size combinations, up to 1 million
int32_t getRecSize ( BigFile *f , int64_t off ) {
char *buf = (char *) mmalloc ( 50*1024*1024 , "main" );
if ( ! buf ) return -1;
f->read ( buf , 50*1024*1024 , off );
TitleRec tr;
// loop over possible sizes
for ( int32_t i = 0 ; i < 48*1024*1024 - 32 ; i++ ) {
char *next = buf + 12 + 4 + i;
// log every 1000 or so
if ( i % 1000 == 0 ) log("db: i=%"INT32"",i);
// ensure sane size, if not try next i
int32_t size = *(int32_t *)(next + 12);
if ( size < 0 || size > 1024*1024 ) continue;
// if uncompress failed, just keep looping
if ( ! tr.set ( next , MAX_TR_SIZE , false ) )
continue;
// if it uncompressed successfully, make sure url is valid
char *u = tr.getUrl()->getUrl();
// log it
log("db: getRecSize: recSize of %"INT32" has next url of %s",i,u);
// is valid?
if ( u[0] != 'h' ) {
log("db: getRecSize: skipping since url does not start "
"with 'h'");
continue;
}
// otherwise, return it
mfree ( buf , 50*1024*1024 , "main" );
return i + 16;
}
log("getRecSize: no good recSize found");
mfree ( buf , 50*1024*1024 , "main" );
return -1;
}
*/
// . also makes checksumdb
// . g_hostdb.m_hostId should be set correctly
/*
bool genDbs ( char *coll ) {
if (!ucInit(g_hostdb.m_dir, true))
return log("build: Unicode initialization failed!");
RdbBase *base = getRdbBase ( RDB_TITLEDB , coll );
BigFile f;
// if no titledb, there is no generating
//bool hasTitledb = false;
//f.set ( g_hostdb.m_dir , "titledb-saved.dat");
//if ( f.doesExist() ) hasTitledb = true ;
//f.set ( g_hostdb.m_dir , "titledb0001.dat");
//if ( f.doesExist() ) hasTitledb = true ;
//if ( ! hasTitledb ) return true;
bool doChecksumdb = true ;
bool doTfndb = true ;
bool doSpiderdb = true ;
// build checksumdb if there not one
char tmp[256];
int32_t ng = g_hostdb.getNumShards();
int32_t gnum = g_hostdb.m_hostId % ng;
sprintf ( tmp , "checksumg%"INT32"h%"INT32"db",gnum,g_hostdb.m_hostId);
f.set ( g_hostdb.m_dir , tmp );
if ( f.doesExist() ) doChecksumdb = false;
f.set ( g_hostdb.m_dir , "checksumdb-saved.dat");
if ( f.doesExist() ) doChecksumdb = false;
f.set ( g_hostdb.m_dir , "checksumdb0001.dat");
if ( f.doesExist() ) doChecksumdb = false;
// same for tfndb
f.set ( g_hostdb.m_dir , "tfndb-saved.dat");
if ( f.doesExist() ) doTfndb = false;
f.set ( g_hostdb.m_dir , "tfndb0001.dat");
if ( f.doesExist() ) doTfndb = false;
// and spiderdb
f.set ( g_hostdb.m_dir , "spiderdb-saved.dat");
if ( f.doesExist() ) doSpiderdb = false;
f.set ( g_hostdb.m_dir , "spiderdb0001.dat");
if ( f.doesExist() ) doSpiderdb = false;
// bail if all three are already done
if ( ! doChecksumdb && ! doTfndb && ! doSpiderdb ) return true;
// disable threads so everything is blocking
g_threads.disableThreads();
if ( doChecksumdb ) log("db: *-*-*-* Generating checksumdb");
if ( doTfndb ) log("db: *-*-*-* Generating tfndb");
if ( doSpiderdb ){
log("db: *-*-*-* Generating spiderdb");
//Need this if tr's in addtospiderdb are getting their
//quality from their root urls.
// if dospiderdb, also load rooturls for MOD
}
// we only add tfn's of 0, so everybody should be in the root file,
// should be ok if in tree though!
if ( doTfndb && base->getNumFiles() > 1 ) {
log("genDbs: More than one titledb file found. "
"Can not create tfndb. Do a tight merge on "
"titledb and then try again.");
return true;
}
// get id2 of titledb
int32_t id2 = base->m_fileIds2[0];
// we have to make multiple checksumdbs since we won't store all of
// them locally ourselves
BigFile cf [ MAX_HOSTS ];
int64_t cfoff [ MAX_HOSTS ] ; // file offsets
// open up one checksumdb FLAT file for each group
for ( int32_t i = 0 ; doChecksumdb && i < ng ; i++ ){
char name[64];
// . initialize our own internal rdb
// . the %"XINT32" in "g%"INT32"" is the group NUM to which the keys
// in this file belong, the "h%"INT32"" is the host number that
// generated these keys
sprintf(name,"checksumg%"INT32"h%"INT32"db",i,g_hostdb.m_hostId);
// unlink this file just in case
cf[i].set ( g_hostdb.m_dir , name );
cf[i].unlink();
if ( ! cf[i].open ( O_RDWR | O_CREAT ) )
return log("genDbs: cannot open %s",name);
cfoff[i] = 0LL;
}
// reset some stuff
key_t nextKey; nextKey.setMin();
RdbList tlist;
tlist.reset();
int32_t minRecSizes=3*1024*1024; // 3 megs
// keep these declared before the loop so compiler stops complaining
key_t endKey;
Msg5 msg5;
Msg5 msg5b;
char *rec ;
int32_t listSize ;
TitleRec tr ;
static uint32_t count = 0;
endKey.setMax();
// now pick titleRec from old titledb
loop:
tlist.reset();
// always clear last bit of g_nextKey
nextKey.n0 &= 0xfffffffffffffffeLL;
// a niceness of 0 tells it to block until it gets results!!
if ( ! msg5.getList ( RDB_TITLEDB ,
coll ,
&tlist ,
nextKey ,
endKey , // should be maxed!
minRecSizes , // min rec sizes
true , // include tree?
false , // includeCache
false , // addToCache
0 , // startFileNum
-1 , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
true , // do error correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b ))
return log(LOG_LOGIC,"db: getList did not block.");
// close up if no titleRec
if ( tlist.isEmpty() ) {
log("db: *-*-*-* All done generating. saving files.");
// dump trees we did
for ( int32_t i = 0 ; doChecksumdb && i < ng ; i++ )
cf[i].close ( );
if ( doTfndb ) {
// force tree dump to disk
g_tfndb.getRdb()->dumpTree(0);
g_tfndb.getRdb()->close ( NULL, NULL, true, false );
}
if ( doSpiderdb ) {
// force tree dump to disk
g_spiderdb.getRdb()->dumpTree(0);
g_spiderdb.getRdb()->close ( NULL, NULL, true, false );
}
g_threads.enableThreads();
return true;
}
tlist.resetListPtr();
listLoop:
if (tlist.isExhausted() ) {
goto loop;
}
// advance g_nextKey to get next titleRec
nextKey = tlist.getCurrentKey();
nextKey += (uint32_t)1;
// get raw rec from list
rec = tlist.getCurrentRec();
listSize = tlist.getListSize ();
// set the titleRec we got
if ( ! tr.set ( rec , listSize , false ) ) { // own data?
log("db: gotList: error setting titleRec! skipping." );
tlist.skipCurrentRecord();
goto listLoop;
}
if ( doTfndb && ! addToTfndb (coll, &tr, id2)) return false;
if ( doSpiderdb && ! addToSpiderdb (coll, &tr )) return false;
if ( doChecksumdb && ! addToChecksumdb (coll, &tr )) return false;
// log the url
if ( (count % 300) == 0 )
logf(LOG_INFO,"db: %"INT32") %s %"INT32"",
count,tr.getUrl()->getUrl(),tr.getContentLen());
count++;
tlist.skipCurrentRecord();
// get another record from the list we've got
goto listLoop;
// make the compiler happy
// return true;
}
// . also makes checksumdb
// . g_hostdb.m_hostId should be set correctly
// 1. if a url is in spiderdb as old but is not really old (i.e. it does
// not exist in titledb/tfndb) then it will not make it into tfndb
// and we will get EDOCNOTOLD errors when we try to spider it, and
// and it will be deleted from spiderdb.
// 2. if a url is in spiderdb as new but also in titledb, then we add it
// to tfndb with the probable docid, but when adding to tfndb from titledb
// it gets added with the actual docid. so tfndb kinda has a double
// record. but when the spiderdb record is done as new it should remove
// the old tfndb record if the probable docid did not match the actual
// in Msg14.cpp....
// . Try seeing if there are recs with the same probable docid (convert actual
// to probable) and the same extension hash. they should not both be in tfndb
bool genTfndb ( char *coll ) {
RdbBase *base = getRdbBase ( RDB_TITLEDB , coll );
BigFile f;
// same for tfndb
f.set ( g_hostdb.m_dir , "tfndb-saved.dat");
if ( f.doesExist() ) {
log("db: %stfndb-saved.dat exists. "
"Not generating tfndb. Please move it to a tmp dir.",
g_hostdb.m_dir);
return false;
}
f.set ( g_hostdb.m_dir , "tfndb0001.dat");
if ( f.doesExist() ) {
log("db: %stfndb0001.dat exists. Not generating tfndb. "
"Please move all %stfndb* files to a tmp dir.",
g_hostdb.m_dir,g_hostdb.m_dir);
return false;
}
g_conf.m_maxMem = 2000000000LL;
g_mem.m_maxMem = 2000000000LL;
// we only add tfn's of 0, so everybody should be in the root file,
// should be ok if in tree though!
//if ( base->getNumFiles() > 1 ) {
// log("db: More than one titledb file found. "
// "Can not create tfndb. Do a tight merge on "
// "titledb and then try again.");
// exit(-1);
// return true;
//}
// disable threads so everything is blocking
g_threads.disableThreads();
log("db: Generating tfndb.");
// reset some stuff
key_t nextKey; nextKey.setMin();
RdbList tlist;
tlist.reset();
key_t endKey; endKey.setMax();
int32_t fn = 0;
int32_t id2;
int32_t local = 0;
int64_t dd;
SpiderRec sr;
static uint32_t count = 0;
Msg5 msg5;
Msg5 msg5b;
// debug stuff
//nextKey = g_titledb.makeFirstTitleRecKey ( 4949692421LL );
//goto loop2;
// add url recs for spiderdb
loop:
// always clear last bit of g_nextKey
nextKey.n0 &= 0xfffffffffffffffeLL;
// a niceness of 0 tells it to block until it gets results!!
if ( ! msg5.getList ( RDB_SPIDERDB ,
coll ,
&tlist ,
nextKey ,
endKey , // should be maxed!
200048 , // min rec sizes
true , // include tree?
false , // includeCache
false , // addToCache
0 , // startFileNum
-1 , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
true ))// do error correction?
return log(LOG_LOGIC,"db: getList did not block.");
// close up if no titleRec
if ( tlist.isEmpty() ) {
log(LOG_INFO,"db: Read %"INT32" spiderdb recs.",local);
local = 0;
log(LOG_INFO,"db: All done reading spiderdb.");
//g_tfndb.getRdb()->close ( NULL, NULL, true, false );
//g_threads.enableThreads();
// is the list from the tree in memory?
if ( fn == base->getNumFiles() ) id2 = 255;
else id2 = base->m_fileIds2[fn];
if ( id2 == 255 )
log(LOG_INFO,"db: Reading titledb tree.");
else
log(LOG_INFO,"db: Reading "
"file #%"INT32" titledb*-%03"INT32".dat*.",fn,id2);
// reset key
nextKey.setMin();
local = 0;
goto loop2;
//return true;
}
nextRec:
// advance g_nextKey to get next titleRec
nextKey = tlist.getCurrentKey();
nextKey += (uint32_t)1;
// set the titleRec we got
if ( ! sr.set ( &tlist ) ) {
log("db: gotList: error setting spiderRec! skipping." );
goto skip;
}
// . skip docid based spider recs
// . if its old, we'll take care of it below
// . no, add here even if old, it will be overridden because if it is
// old then it is using its actual docid, not just probable docid
// . if we find a spider rec is really not old and throw it into
// the new category, that is bad because it may be using its actual
// docid and not probable...
// . this logic here assumes spiderdb is 100% correct, if it isn't
// we should have a fixspiderdb command
// . if spiderdb rec in there is labelled as new but it is really old
// we will add it to tfndb here with its probable docid, but when
// finding it in the titledb we will add it again to tfndb with its
// actual docid. the two may not match and we end up with double
// tfndb entries.
// . if spider rec is labelled as old, and we say 'doc not old' and
// move it to new, then there was not a titlerec for it!! ok we need
// to regen tfndb and stop moving spider recs like that.
if ( sr.m_url.getUrlLen() > 0 &&
g_spiderdb.isSpiderRecNew ( tlist.getCurrentKey() ) )
// add url based spider recs
if ( ! addToTfndb2 (coll, &sr, 255)) return false; // id2=255
// log the url
if ( (count % 10000) == 0 ) {
if ( sr.m_url.getUrlLen() > 0 )
logf(LOG_INFO,"db: *%"INT32") %s",count,sr.getUrl()->getUrl());
else
logf(LOG_INFO,"db: *%"INT32") %"INT64"",count,sr.m_docId);
}
skip:
count++;
local++;
// try going down list
if ( tlist.skipCurrentRecord() ) goto nextRec;
// start it all over for another TitleRec
goto loop;
loop2:
// just the tree?
int32_t nf = 1;
bool includeTree = false;
if ( fn == base->getNumFiles() ) { nf = 0; includeTree = true; }
// always clear last bit of g_nextKey
nextKey.n0 &= 0xfffffffffffffffeLL;
// a niceness of 0 tells it to block until it gets results!!
if ( ! msg5.getList ( RDB_TITLEDB ,
coll ,
&tlist ,
nextKey ,
endKey , // should be maxed!
1024 , // min rec sizes
includeTree , // include tree?
false , // includeCache
false , // addToCache
fn , // startFileNum
nf , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
true , // do error correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b ))
return log(LOG_LOGIC,"db: getList did not block.");
// close up if no titleRec
if ( tlist.isEmpty() ) {
fn++;
if ( fn <= base->getNumFiles() ) {
log(LOG_INFO,"db: Scanning titledb file #%"INT32".",
fn);
nextKey.setMin();
goto loop2;
}
done:
// otherwise, wrap it up
log(LOG_INFO,
"db: Scanned %"INT32" spiderdb and titledb recs.",count);
log(LOG_INFO,
"db: All done generating tfndb. Saving files.");
// force tree dump to disk, we use more mem for tfndb than
// most gb process, so they won't be able to load the tree
g_tfndb.getRdb()->dumpTree(0);
// save our tree to disk, should be empty.
g_tfndb.getRdb()->close ( NULL, NULL, true, false );
g_threads.enableThreads();
return true;
}
nextRec2:
key_t tkey;
tkey = tlist.getCurrentKey();
dd = g_titledb.getDocIdFromKey ( tkey );
// skip if bad... CORRUPTION
if ( tkey < nextKey ) {
int32_t p1 = msg5.m_msg3.m_startpg[0] + 1;
log("db: Encountered corruption in titledb while making "
"tfndb. Page = %"INT32". "
"NextKey.n1=%"UINT32" %"UINT64". "
"Key.n1=%"UINT32" %"UINT64" "
"FirstDocId=%"UINT64".",
p1-1,nextKey.n1,nextKey.n0,tkey.n1,tkey.n0,
g_titledb.getDocIdFromKey(nextKey));
RdbMap **maps = base->getMaps();
here:
// bail if done
if ( p1 >= maps[0]->getNumPages() ) goto done;
key_t kk = *(key_t *)maps[0]->getKeyPtr ( p1 );
if ( kk <= nextKey ) { p1++; goto here; }
// otherwise, use that next key
nextKey = kk;
goto loop2;
}
// advance g_nextKey to get next titleRec
nextKey = tlist.getCurrentKey();
nextKey += (uint32_t)1;
// advance one if positive, must always start on a negative key
if ( (nextKey.n0 & 0x01) == 0x01 ) nextKey += (uint32_t)1;
// get raw rec from list
char *rec = tlist.getCurrentRec();
int32_t listSize = tlist.getListSize ();
// is the list from the tree in memory?
if ( fn == base->getNumFiles() ) id2 = 255;
else id2 = base->m_fileIds2[fn];
TitleRec tr ;
// skip if its a delete
// let's print these out
if ( (tkey.n0 & 0x01) == 0x00 ) {
static bool ff = true;
if ( ff ) {
log("GOT NEGATIVE KEY. tfndb generation will "
"contain positive tfndb keys for title recs "
"that were deleted!! bad... need to tight "
"merge titledb to fix this. better yet, "
"you should be using the Repair tool to repair "
"tfndb, that one actually works!");
ff = false;
}
goto skip2;
}
// set the titleRec we got
if ( ! tr.set ( rec , listSize , false ) ) { // own data?
int64_t d = g_titledb.getDocIdFromKey ( tkey );
log("db: gotList: Error setting titleRec. docId=%"INT64". "
"Skipping." , d );
goto loop2; // skip2;
}
if ( ! addToTfndb (coll, &tr, id2)) return false;
// log the url
if ( (count % 1000) == 0 )
logf(LOG_INFO,"db: %"INT32") %s %"INT32" %"INT64"",
count,tr.getUrl()->getUrl(),tr.getContentLen(),dd);
count++;
local++;
skip2:
// try going down list
if ( tlist.skipCurrentRecord() ) goto nextRec2;
// start it all over for another TitleRec
goto loop2;
// make the compiler happy
return true;
}
*/
// . for cleaning up indexdb
// . print out docids in indexdb but not in our titledb, if they should be
void dumpMissing ( char *coll ) {
// load tfndb, assume it is a perfect reflection of titledb
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
//g_tfndb.init ();
//g_collectiondb.init(true); // isDump?
//g_tfndb.getRdb()->addRdbBase1 ( coll );
g_titledb.init();
g_titledb.getRdb()->addRdbBase1 ( coll );
// if titledb has stuff in memory, do not do this, it needs to
// be dumped out. this way we can assume a tfn of 255 means the docid
// is probable and just in spiderdb. (see loop below)
if ( g_titledb.getRdb()->m_tree.getNumUsedNodes() ) {
logf(LOG_INFO,"db: Titledb needs to be dumped to disk before "
"we can scan tfndb. Please do ./gb ddump to do this or "
"click on \"dump to disk\" in the Master Controls.");
return;
}
// . just get the docids from tfndb...
// . this tfndb rec count is for ALL colls!! DOH!
// MDW FIX THIS RIGHT!
int64_t numRecs = 12345;//g_tfndb.getRdb()->getNumTotalRecs();
int64_t oldNumSlots = (numRecs * 100) / 80;
// make a power of 2
// make it a power of 2
//oldNumSlots *= 2;
//oldNumSlots -= 1;
//int32_t numSlots = getHighestLitBitValue((uint32_t)oldNumSlots);
int32_t numSlots = oldNumSlots;
//uint32_t mask = numSlots - 1;
// make a hash table for docids
logf(LOG_INFO,"db: Allocating %"INT32" bytes for docids.",numSlots*8);
uint64_t *slots =
(uint64_t *)mcalloc ( numSlots * 8 , "dumpMissing" );
if ( ! slots ) {
log("db: Could not alloc %"INT32" bytes to load in %"INT64" docids.",
numSlots*8,numRecs);
return;
}
// load in all tfndb recs
key_t startKey ;
key_t endKey ;
startKey.setMin();
endKey.setMax();
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 5*1024*1024;
RdbList list;
Msg5 msg5;
logf(LOG_INFO,"db: Loading tfndb for hostId %"INT32", has %"INT64" recs.",
g_hostdb.m_hostId,numRecs);
int64_t count = 0;
int32_t next = 0;
int32_t used = 0;
CollectionRec *cr = g_collectiondb.getRec(coll);
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TFNDB ,
cr->m_collnum ,
&list ,
startKey ,
endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
exit(-1);
}
// all done if empty
if ( list.isEmpty() ) goto done;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
// get the tfn
key_t k = list.getCurrentKey();
count++;
// skip if negative
if ( (k.n0 & 0x01LL) == 0x00 ) continue;
// titledb tree is empty, so this must indicate it is in
// spiderdb only
int32_t tfn = 0;//g_tfndb.getTfn(&k);
if ( tfn == 255 ) continue;
// get docid
uint64_t d = 0LL;//g_tfndb.getDocId ( &k );
// add to hash table
//int32_t n = (uint32_t)d & mask;
int32_t n = (uint32_t)d % numSlots;
// chain if not in there
while ( slots[n] )
if ( ++n >= numSlots ) n = 0;
// add it here
slots[n] = d;
// count it
if ( used >= next ) {
logf(LOG_INFO,"db: Loaded %"INT32" docids.",used);
next = used + 1000000;
}
used++;
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey >= *(key_t *)list.getLastKey() ) goto loop;
// ok now, scan indexdb and report docids in indexdb that are not
// in our tfndb when they should be.
done:
logf(LOG_INFO,"db: Scanned %"INT64" tfndb recs.",count);
logf(LOG_INFO,"db: Scanning indexdb.");
logf(LOG_INFO,"db: Tight merge indexdb to make this faster.");
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_indexdb.init ();
//g_collectiondb.init(true);
g_indexdb.getRdb()->addRdbBase1 ( coll );
startKey.setMin();
endKey.setMax();
// get a meg at a time
minRecSizes = 5*1024*1024;
Msg5 msg5b;
//uint32_t groupId = g_hostdb.m_groupId;
uint32_t shardNum = g_hostdb.getMyShardNum();
count = 0;
int32_t scanned = 0;
//HashTableT <int64_t,char> repeat;
HashTableX repeat;
if ( ! repeat.set ( 8,1,1000000,NULL,0,false,0,"rpttbl" ) ) {
log("db: Failed to init repeat hash table.");
return;
}
loop2:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_INDEXDB ,
cr->m_collnum ,
&list ,
startKey ,
endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// something to log
scanned += list.getListSize();
if ( scanned >= 100000000 ) {
count += scanned;
scanned = 0;
logf(LOG_INFO,"db: Scanned %"INT64" bytes.",count);
}
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key_t k = list.getCurrentKey();
// skip deletes
if ( (k.n0 & 0x01) == 0x00 ) continue;
// do we hold his titleRec? continue if not
if ( getShardNum ( RDB_TITLEDB , &k ) != shardNum ) continue;
// get his docid
uint64_t d = g_indexdb.getDocId(k);
// otherwise, report him if not in tfndb
//int32_t n = (uint32_t)d & mask;
int32_t n = (uint32_t)d % numSlots;
while ( slots[n] && slots[n] != d )
if ( ++n >= numSlots ) n = 0;
// if he was not in tfndb when he should have been,
// print him on stdout
if ( slots[n] == d ) continue;
// is he in the repeat table?
int32_t slot=repeat.getSlot(&d);
if (slot!=-1)
if ( *(char *)repeat.getValueFromSlot ( slot ) == 1 )
continue;
// print if this is the first time
fprintf(stderr,"missingdocid %012"UINT64"\n",d);
// put him in a table so we don't repeat him
char one = 1;
repeat.addKey ( &d , &one );
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey >= *(key_t *)list.getLastKey() ) goto loop2;
logf(LOG_INFO,"db: Done generating missing docids.");
return;
}
// . for cleaning up indexdb
// . print out docids in the same termlist multiple times
void dumpDups ( char *coll ) {
// . scan indexdb termlist-by-termlist and find docids that occur more
//   than once within the same termlist
// . each dup's keys are appended to removedDupKeys.<collnum>, its docid
//   to removedDupDocIds.<collnum>, and negative (delete) keys are added
//   back into indexdb for both the dup key and the previously-seen key
//   for that docid (see the dup branch below)
// load tfndb, assume it is a perfect reflection of titledb
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_indexdb.init ();
//g_collectiondb.init(true);
g_indexdb.getRdb()->addRdbBase1 ( coll );
// scan the full indexdb key range
key_t startKey ;
key_t endKey ;
startKey.setMin();
endKey.setMax();
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 6*1024*1024;
// open-addressed docid table; it gets carved into per-termid regions
// [offset1,offset2) further below
int32_t numSlots = 2 * 1024 * 1024;
int64_t * slots;
char * scores;
slots = (int64_t *) mmalloc ( numSlots * 8, "main-dumpDups");
scores = (char *) mmalloc ( numSlots, "main-dumpDups");
if(!slots || !scores) {
if(!slots)
log(LOG_INFO,"admin: Could not allocate %"INT64" "
"bytes for dumpDups" ,
(int64_t) numSlots * 8 );
else mfree(slots, numSlots * 8, "main-dumpDups" );
if(!scores)
log(LOG_INFO,"admin: Could not allocate %"INT32" "
"bytes for dumpDups" ,
numSlots );
else mfree(scores, numSlots , "main-dumpDups" );
return;
}
// bounds of the hash region in slots[]/scores[] used for the termid
// currently being processed
int32_t offset1 = 0;
int32_t offset2 = 0;
int64_t tempTid = -1;
int64_t lastTid = -1;
int64_t tid = -1;
int64_t indexdbCount = 0;
// saved list position for the peek-ahead termlist count below
char * tempListPtr;
char * tempListPtrHi;
key_t k;
int64_t d;
int32_t hashMod;
uint64_t n2;
int64_t endTid;
char filename[30];
char buff[100];
int32_t numParsed = 0;
int32_t collNum = g_collectiondb.getCollnum ( coll );
File f;
File f2;
Rdb *r = g_indexdb.getRdb();
RdbTree *tree = &r->m_tree;
// f records removed keys; refuse to run twice over the same output
sprintf(filename,"removedDupKeys.%"INT32"", collNum );
f.set(g_hostdb.m_dir, filename);
if(f.doesExist() ) {
log(LOG_INFO,"admin: File %s%s already exists. "
"Aborting process" , g_hostdb.m_dir, filename );
return;
}
if( !f.open(O_RDWR | O_CREAT) ) {
log( LOG_INFO, "admin: Could not create %s/%s.",
g_hostdb.m_dir, filename);
return ;
}
// f2 records just the removed docids; any previous run is overwritten
sprintf(filename,"removedDupDocIds.%"INT32"", collNum);
// g_collectiondb.getCollnum ( coll ));
f2.set(g_hostdb.m_dir, filename);
if(f2.doesExist() ) {
f2.unlink();
}
if( !f2.open(O_RDWR | O_CREAT) ) {
log( LOG_INFO, "admin: Could not create %s/%s.",
g_hostdb.m_dir, filename);
return ;
}
RdbList list;
Msg5 msg5;
Msg5 msg5b;
uint32_t count = 0;
uint32_t count2 = 0;
int64_t byteCount = 0;
uint32_t highLitBit;
char *p;
//uint32_t groupId = g_hostdb.m_groupId;
count = 0;
int64_t scanned = 0;
int64_t dups = 0LL;
// lookup[b] = bit position of the highest set bit in byte b
// (lookup[0] = 8 is a sentinel); used below to round termlist counts
// up to a power of two so (hashMod-1) works as a bit mask
char lookup[256] = { 8, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
// HashTableT <int64_t,char> repeat;
// HashTableT <int64_t,char> local;
logf(LOG_INFO,"db: Scanning indexdb for repeated docids.");
logf(LOG_INFO,"db: Tight merge indexdb to make this faster.");
logf(LOG_INFO,"db: Dumping docid termId pairs to file.");
/* if ( ! repeat.set ( 1000000 ) ) {
log("db: Failed to init repeat hash table.");
return;
}
if ( ! local.set ( 1000000 ) ) {
log("db: Failed to init repeat hash table2.");
return;
}
*/
CollectionRec *cr = g_collectiondb.getRec(coll);
// read the next chunk of indexdb starting at startKey
loop:
//int64_t startTime = gettimeofdayInMilliseconds();
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_INDEXDB ,
cr->m_collnum ,
&list ,
startKey ,
endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
//int64_t endTime = gettimeofdayInMilliseconds();
//log(LOG_INFO,"dumpdups Msg5 time = %"INT32"",(int32_t)endTime - startTime);
// all done if empty
if ( list.isEmpty() ) {
mfree ( slots, numSlots * 8, "main-dumpDups");
mfree ( scores, numSlots, "main-dumpDups");
f.close();
f2.close();
return;
}
// log progress roughly every 10MB scanned
scanned += list.getListSize();
if ( scanned >= 10000000 )
{
byteCount += scanned;
scanned = 0;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Parsed %"INT64" records"
"dups=%"INT64". ",byteCount,indexdbCount, dups);
}
tid = -1;
// first and last termids of this chunk
k = *(key_t *) list.getStartKey();
tempTid = g_indexdb.getTermId( k );
k = *(key_t *) list.getEndKey();
endTid = g_indexdb.getTermId( k );
// wipe the table for this chunk
memset(slots , 0, numSlots * 8);
memset(scores, 0, numSlots);
offset1 = 0;
offset2 = 0;
//startTime = gettimeofdayInMilliseconds();
// loop over entries in list
//startTime = gettimeofdayInMilliseconds();
//int32_t totalNumParsed = 0;
// sameTidList: the whole chunk belongs to a single termid, so the
// entire table is dedicated to it
bool sameTidList = 0;
int32_t thisDup = 0;
if(tempTid == endTid) sameTidList = 1;
//log(LOG_INFO,"sameTidList = %d",sameTidList);
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
k = list.getCurrentKey();
// skip negative (delete) keys
if ( (k.n0 & 0x01LL) == 0x00 ) continue;
tempTid = g_indexdb.getTermId(k);
d = g_indexdb.getDocId(k);
//totalNumParsed++;
numParsed++;
//change in tid, get the count
if(tempTid != tid ) {
thisDup = 0;
//is this tid the same as we process in the last run
if(tid == -1 && tempTid == lastTid) {
log(LOG_INFO,"admin: We broke termlist of "
"termid=%"INT64". Some "
"docids may be repeated in this termlist and "
"we will not know.",
tempTid);
}
//check if we hit the endTid - then reload the
// list from that point
if(tempTid == endTid && !sameTidList) {
break;
}
if(sameTidList) {
// single-termid chunk: use the whole table as one region
count = numSlots - 1;
offset1 = 0;
offset2 = numSlots;
if(tid != -1) {
memset(slots , 0, numSlots * 8);
memset(scores, 0, numSlots);
//log(LOG_INFO,"dumpDups Wish more numslots");
}
tid = tempTid;
hashMod = numSlots;
lastTid = tid;
numParsed = 1;
} else {
tid = tempTid;
// peek ahead to count how many recs share this termid,
// then restore the saved list position
tempListPtr = list.m_listPtr;
tempListPtrHi = list.m_listPtrHi;
count = 1;
for( list.skipCurrentRecord();
!list.isExhausted();
list.skipCurrentRecord() ) {
// NOTE(review): the 0x02 bit presumably marks a
// compressed (half) key that inherits the previous
// key's termid -- confirm against the indexdb key
// format before relying on this
if( *(list.m_listPtr) & 0x02 ) {
count++;
}
else {
key_t kt = list.getCurrentKey();
tempTid=g_indexdb.getTermId(kt);
if(tempTid != tid)
break;
count++;
}
}
list.m_listPtr = tempListPtr;
list.m_listPtrHi = tempListPtrHi;
// a one-record termlist cannot contain a dup
if(count == 1) {
continue;
}
// round count up to the next power of two (via the
// lookup[] highest-bit table) so it can serve as the
// hash modulus for this termid's region; clamp to
// numSlots
p = (char *) &count;
if( count*2 > (unsigned) numSlots )
count = numSlots;
else {
if(count <= 255) {
highLitBit=
lookup[(unsigned char) *p];
highLitBit++;
count2 = 1;
count2 <<= highLitBit;
if( count2/2 < count ) {
count2 <<= 1;
}
count = count2;
} else if (count <= 65535) {
p++;
highLitBit=
lookup[(unsigned char) *p];
highLitBit += 9;
count2 = 1;
count2 <<= highLitBit;
if( count2/2 < count ) {
count2 <<= 1;
}
count = count2;
} else if (count <= 16777216) {
p += 2;
highLitBit=
lookup[(unsigned char) *p];
highLitBit += 17;
count2 = 1;
count2 <<= highLitBit;
if( count2/2 < count ) {
count2 <<= 1;
if(count2 >
(unsigned) numSlots)
count2=numSlots;
}
count = count2;
} else {
p += 3;
highLitBit=
lookup[(unsigned char) *p];
highLitBit += 25;
count2 = 1;
count2 <<= highLitBit;
if( count2/2 < count ) {
count2 <<= 1;
if(count2 >
(unsigned) numSlots)
count2=numSlots;
}
count = count2;
}
}
// carve the next region [offset1,offset2) out of the
// table, wiping and restarting at 0 when it would
// overflow the end
if(offset2 + count + 1 < (unsigned) numSlots ) {
offset1 = offset2 + 1;
offset2 += count;
}
else {
memset(slots , 0, numSlots * 8);
memset(scores, 0, numSlots);
offset1 = 0;
offset2 = count;
}
hashMod = count;
lastTid = tid;
numParsed = 1;
}
}
indexdbCount ++;
// probe this termid's region for the docid (hashMod is a power
// of two, so &(hashMod-1) is the hash; wrap inside the region)
n2 = (uint64_t) d & (hashMod-1);
n2 += offset1;
while ( slots[n2] && slots[n2] != d ) {
if ( ++n2 >= (uint64_t) offset2 )
n2 = offset1;
}
if( slots[n2] != d ) {
// first sighting of this docid: remember it and its score
slots[n2] = d;
scores[n2] = (unsigned char) g_indexdb.getScore(k);
}
else {
// duplicate docid within this termlist
dups++;
//add negative keys
// log the dup key and its docid, then add a delete key
// for it to indexdb
sprintf(buff,"%08"XINT32" %016"XINT64"\n",
k.n1, k.n0);
f.write(buff,gbstrlen(buff), -1);
sprintf(buff,"%"INT64"\n",d);
f2.write(buff, gbstrlen(buff), -1);
k.n0 &= 0xfffffffffffffffeLL;
if ( ! r->addRecord ( coll, k , NULL , 0 , 0) ) {
log("admin: could not add negative key: %s",
mstrerror(g_errno));
return;
}
if ( tree->getNumAvailNodes() <= 0 ) {
// this should block
r->dumpTree(0);
}
// also delete the first-seen copy, rebuilt from the
// stored score; NOTE(review): this issues delete keys
// for BOTH occurrences of the docid -- confirm that is
// the intended cleanup semantics
key_t kt;
kt = g_indexdb.makeKey(tid,scores[n2],slots[n2],false);
sprintf(buff,"%08"XINT32" %016"XINT64"\n",
kt.n1, kt.n0);
kt.n0 &= 0xfffffffffffffffeLL;
f.write(buff,gbstrlen(buff), -1);
if ( ! r->addRecord ( coll, kt , NULL , 0 , 0) ) {
log("admin: could not add negative key: %s",
mstrerror(g_errno));
return;
}
if ( tree->getNumAvailNodes() <= 0 ) {
// this should block
r->dumpTree(0);
}
}
// table getting too full for this termlist: reset tid so the
// region is recomputed on the next record -- NOTE(review):
// presumably dups spanning the reset can be missed; confirm
if(numParsed*2 + 1 > numSlots ) {
//log(LOG_INFO,"dumpDups wished more numSlots numParsed=%"INT32"",
// numParsed);
tid = 0;
}
}
// no, use the last termid!! well this is not perfect, oh well
//endTime = gettimeofdayInMilliseconds();
//log(LOG_INFO,"dumpdups Loop time = %"INT64" notParsed=%"INT32"",
// endTime - startTime, (list.getNumRecs()-totalNumParsed));
// if the whole chunk was consumed, advance past its last key (the
// >= test falls through to done on key wrap-around); otherwise
// restart at the key we broke on at the endTid boundary
if( list.isExhausted() ) {
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
if ( startKey >= *(key_t *)list.getLastKey() ) goto loop;
}
else {
startKey = k;
goto loop;
}
// watch out for wrap around
logf(LOG_INFO,"db: Done generating missing docids. Parsed %"INT64""
" indexdb records", indexdbCount);
mfree ( slots, numSlots * 8, "main-dumpDups");
mfree ( scores, numSlots, "main-dumpDups");
f.close();
f2.close();
list.freeList();
r->close ( NULL , NULL , false , false );
return;
}
// . remove the docids in "filename" from indexdb.
// . make a hashtable of these docids
// . once each host has a list of the docids in /a/missing*, do this:
// dsh -ac 'cd /a/ ; echo -n "cat /a/missing* | grep missingdoc | awk '{print $2}' | sort > sorted." > /a/poo ; cd /a/ ; ls missing* >> /a/poo ; chmod +x /a/poo; /a/poo'
// . then each host will have a file called /a/sorted.missing* and you can
// copy them to host #0 and merge sort them with 'sort -m -t /a/tmp sorted.*'
void removeDocIds ( char *coll , char *filename ) {
int fd;
fd = open ( filename , O_RDONLY );
if ( fd <= 0 ) {
log("db: Count not open %s for reading: %s",
filename,strerror(errno));
return ;
}
int64_t dcount = 0;
int64_t offset ;
char buf [ 1024*1024*2+1 ];
int32_t readSize ;
int32_t n ;
char *p;
char *pend;
// note it
logf(LOG_INFO,"db: Counting docids in file %s.",filename);
loop1:
// read in docids and hash them
offset = 0;
readSize = 1024*1024*2;
n = read ( fd , buf , readSize );
if ( n < 0 ) {
log("db: Had error reading %s: %s",
filename,strerror(errno));
return ;
}
offset += n;
// 0 is EOF
p = buf;
pend = buf + n;
*pend = 0;
while ( *p ) {
// count it for now
dcount++;
// advance over \n
while ( *p && *p !='\n' ) p++;
// all done?
while ( *p == '\n' ) p++;
}
if ( n > 0 ) goto loop1;
// note it
logf(LOG_INFO,"db: Counted %"INT64" docids in file %s.",dcount,filename);
int64_t oldNumSlots = (dcount * 100LL) / 80LL;
oldNumSlots *= 2;
oldNumSlots -= 1;
int32_t numSlots = getHighestLitBitValue ((uint32_t)oldNumSlots);
if ( numSlots < 64 ) numSlots = 64;
int32_t need = numSlots * 8;
logf(LOG_INFO,"db: Allocating %"INT32" bytes for hash table.",need);
uint32_t mask = numSlots - 1;
uint64_t *slots =
(uint64_t *)mcalloc(need,"loaddocids");
if ( ! slots ) {
log("db: Could not allocate %"INT32" bytes to read in docids. "
"Please split this file and do multiple runs.", need);
return;
}
// now hash those docids
offset = 0;
close ( fd );
fd = open ( filename , O_RDONLY );
if ( fd <= 0 ) {
log("db: Count not open %s for reading: %s",
filename,strerror(errno));
return ;
}
// note it
logf(LOG_INFO,"db: Loading and hashing docids from file %s.",filename);
loop2:
// read in docids and hash them
n = read ( fd , buf , readSize );
if ( n < 0 ) {
log("db: Had error reading %s: %s",
filename,strerror(errno));
return ;
}
offset += n;
// 0 is EOF
p = buf;
pend = buf + n;
*pend = 0;
while ( *p ) {
// get docid
uint64_t d = atoll(p);
// hash it
int32_t n = (uint32_t)d & mask;
while ( slots[n] && slots[n] != d )
if ( ++n >= numSlots ) n = 0;
// add him
slots[n] = d;
// advance over \n
while ( *p && *p !='\n' ) p++;
// all done?
while ( *p == '\n' ) p++;
}
if ( n > 0 ) goto loop2;
// do not merge so much
//if ( g_conf.m_indexdbMinFilesToMerge < 100 )
// g_conf.m_indexdbMinFilesToMerge = 100;
//if ( g_conf.m_checksumdbMinFilesToMerge < 100 )
// g_conf.m_checksumdbMinFilesToMerge = 100;
if ( g_conf.m_clusterdbMinFilesToMerge < 100 )
g_conf.m_clusterdbMinFilesToMerge = 100;
//g_conf.m_tfndbMaxDiskPageCacheMem = 0;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
//g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_checksumdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxCacheMem = 0;
//g_tfndb.init();
g_indexdb.init ();
//g_checksumdb.init();
g_clusterdb.init();
//g_collectiondb.init(true);
//g_tfndb.getRdb()->addRdbBase1 ( coll );
g_indexdb.getRdb()->addRdbBase1 ( coll );
//g_checksumdb.getRdb()->addRdbBase1 ( coll );
g_clusterdb.getRdb()->addRdbBase1 ( coll );
// this what set to 2 on me before, triggering a huge merge
// every dump!!! very bad, i had to gdb to each process and set
// this value to 50 myself.
//CollectionRec *cr = g_collectiondb.getRec ( coll );
//if ( cr->m_indexdbMinFilesToMerge < 50 )
// cr->m_indexdbMinFilesToMerge = 50;
// note it
logf(LOG_INFO,"db: Loaded %"INT64" docids from file \"%s\".",
dcount,filename);
// now scan indexdb and remove recs with docids in this hash table
logf(LOG_INFO,"db: Scanning indexdb and removing recs.");
//logf(LOG_INFO,"db: Tight merge indexdb to make this faster.");
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
key_t startKey;
key_t endKey;
startKey.setMin();
endKey.setMax();
// compatibility with checksumdb's variable size keys
/*
int32_t cKeySize = g_conf.m_checksumdbKeySize;
char startKey2[16];
char endKey2[16];
// initialize checksumdb specific keys
if (cKeySize == 16) {
((key128_t *)startKey2)->setMin();
((key128_t *)endKey2)->setMax();
}
else {
KEYSET( startKey2, (char *)&startKey, cKeySize );
KEYSET( endKey2, (char *)&endKey, cKeySize );
}
*/
g_threads.disableThreads();
Rdb *r = g_indexdb.getRdb();
collnum_t collnum = g_collectiondb.getCollnum ( coll );
// do not start if any indexdb recs in tree or more than 1 disk file
RdbBase *base = r->getBase(collnum);
if ( base->getNumFiles() > 1 ) {
log("db: More than 1 indexdb file. Please tight merge.");
return;
}
if ( g_indexdb.getRdb()->m_tree.getNumUsedNodes() ) {
log("db: Indexdb tree not empty. Please dump.");
return;
}
// set niceness really high
if ( setpriority ( PRIO_PROCESS, getpid() , 20 ) < 0 )
log("db: Call to setpriority failed: %s.",
mstrerror(errno));
// get a meg at a time
int32_t minRecSizes = 5*1024*1024;
Msg5 msg5;
Msg5 msg5b;
RdbList list;
//
//
// SCAN INDEXDB and remove missing docids
//
//
r = g_indexdb.getRdb();
int64_t count = 0;
int32_t scanned = 0;
int64_t recs = 0;
int64_t removed = 0;
RdbTree *tree = &r->m_tree;
CollectionRec *cr = g_collectiondb.getRec(coll);
loop3:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_INDEXDB ,
cr->m_collnum ,
&list ,
startKey ,
endKey ,
minRecSizes ,
// HACK: use false for now
//false , // includeTree ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
// HACK: use 1 for now
//1 , // numFiles ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// something to log
scanned += list.getListSize();
if ( scanned >= 100000000 ) {
count += scanned;
scanned = 0;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Scanned %"INT64" records. "
"Removed %"INT64" records.",count,recs,removed);
}
// yield every 256k records
int32_t ymask = 0x40000;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
recs++;
if ( (recs & ymask) == 0x00 ) sched_yield();
key_t k = list.getCurrentKey();
// skip deletes
if ( (k.n0 & 0x01) == 0x00 ) continue;
uint64_t d = g_indexdb.getDocId(k);
// see if docid is in delete list
int32_t n = (uint32_t)d & mask;
while ( slots[n] && slots[n] != d )
if ( ++n >= numSlots ) n = 0;
// skip him if we should not delete him
if ( slots[n] != d ) continue;
// otherwise, remove him
// make him a delete, turn off his last bit (the del bit)
k.n0 &= 0xfffffffffffffffeLL;
if ( ! r->addRecord ( collnum , (char *)&k , NULL , 0 , 0) ) {
log("db: Could not delete record.");
return;
}
removed++;
// dump tree?
if ( tree->getNumAvailNodes() <= 0 ) {
// this should block
r->dumpTree(0);
}
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey >= *(key_t *)list.getLastKey() ) goto loop3;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Scanned %"INT64" records. "
"Removed %"INT64" records.",count+scanned,recs,removed);
// this should block
//r->dumpTree(0);
// save the tree man!
logf(LOG_INFO,"db: Finished removing docids from indexdb. Saving.");
r->close ( NULL , NULL , false , false );
//
//
// SCAN CHECKSUMDB and remove missing docids
//
//
/*
logf(LOG_INFO,"db: Scanning checksumdb and removing recs.");
r = g_checksumdb.getRdb();
count = 0;
scanned = 0;
recs = 0;
removed = 0;
tree = &r->m_tree;
loop4:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_CHECKSUMDB,
coll ,
&list ,
//startKey ,
//endKey ,
startKey2 ,
endKey2 ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// something to log
scanned += list.getListSize();
if ( scanned >= 100000000 ) {
count += scanned;
scanned = 0;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Scanned %"INT64" records. "
"Removed %"INT64" records.",count,recs,removed);
}
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
recs++;
if ( (recs & ymask) == 0x00 ) sched_yield();
//key_t k = list.getCurrentKey();
char k[16];
list.getCurrentKey( k );
// skip deletes
//if ( (k.n0 & 0x01) == 0x00 ) continue;
if ( (((key_t *)k)->n0 & 0x01) == 0x00 ) continue;
uint64_t d = g_checksumdb.getDocId( k );
// see if docid is in delete list
int32_t n = (uint32_t)d & mask;
while ( slots[n] && slots[n] != d )
if ( ++n >= numSlots ) n = 0;
// skip him if we should not delete him
if ( slots[n] != d ) continue;
// otherwise, remove him
// make him a delete, turn off his last bit (the del bit)
//k.n0 &= 0xfffffffffffffffeLL;
((key_t *)k)->n0 &= 0xfffffffffffffffeLL;
if ( ! r->addRecord ( collnum , k , NULL , 0 , 0) ) {
log("db: Could not delete record.");
return;
}
removed++;
// dump tree?
if ( tree->getNumAvailNodes() <= 0 ) {
// this should block
r->dumpTree(0);
}
}
//startKey = *(key_t *)list.getLastKey();
//startKey += (uint32_t) 1;
list.getLastKey( startKey2 );
if ( cKeySize == 12 )
*((key_t *)startKey2) += (uint32_t) 1;
else if ( cKeySize == 16 )
*((key128_t *)startKey2) += (uint32_t) 1;
// watch out for wrap around
//if ( startKey >= *(key_t *)list.getLastKey() ) goto loop4;
if ( KEYCMP(startKey2, list.getLastKey(), cKeySize) >= 0 )
goto loop4;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Scanned %"INT64" records. "
"Removed %"INT64" records.",count+scanned,recs,removed);
// this should block
//r->dumpTree(0);
logf(LOG_INFO,"db: Finished removing docids from checksumdb. Saving.");
r->close ( NULL , NULL , false , false );
*/
//
//
// SCAN CLUSTERDB and remove missing docids
//
//
logf(LOG_INFO,"db: Scanning clusterdb and removing recs.");
r = g_clusterdb.getRdb();
count = 0;
scanned = 0;
recs = 0;
removed = 0;
tree = &r->m_tree;
//CollectionRec *cr = g_collectiondb.getRec(coll);
loop5:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_CLUSTERDB ,
cr->m_collnum ,
&list ,
startKey ,
endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// something to log
scanned += list.getListSize();
if ( scanned >= 100000000 ) {
count += scanned;
scanned = 0;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Scanned %"INT64" records. "
"Removed %"INT64" records.",count,recs,removed);
}
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
recs++;
if ( (recs & ymask) == 0x00 ) sched_yield();
key_t k = list.getCurrentKey();
// skip deletes
if ( (k.n0 & 0x01) == 0x00 ) continue;
uint64_t d = g_clusterdb.getDocId(&k);
// see if docid is in delete list
int32_t n = (uint32_t)d & mask;
while ( slots[n] && slots[n] != d )
if ( ++n >= numSlots ) n = 0;
// skip him if we should not delete him
if ( slots[n] != d ) continue;
// otherwise, remove him
// make him a delete, turn off his last bit (the del bit)
k.n0 &= 0xfffffffffffffffeLL;
if ( ! r->addRecord ( collnum , (char *)&k , NULL , 0 , 0) ) {
log("db: Could not delete record.");
return;
}
removed++;
// dump tree?
if ( tree->getNumAvailNodes() <= 0 ) {
// this should block
r->dumpTree(0);
}
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey >= *(key_t *)list.getLastKey() ) goto loop5;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Scanned %"INT64" records. "
"Removed %"INT64" records.",count+scanned,recs,removed);
// this should block
//r->dumpTree(0);
logf(LOG_INFO,"db: Finished removing docids from clusterdb. Saving.");
r->close ( NULL , NULL , false , false );
//
//
// SCAN TFNDB and remove missing docids
// one twin might have the docid, while the other doesn't,
// so make sure to remove it from both.
//
//
logf(LOG_INFO,"db: Scanning tfndb and removing recs.");
r = 0;//g_tfndb.getRdb();
count = 0;
scanned = 0;
recs = 0;
removed = 0;
tree = &r->m_tree;
//CollectionRec *cr = g_collectiondb.getRec(coll);
loop6:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TFNDB ,
cr->m_collnum ,
&list ,
startKey ,
endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// something to log
scanned += list.getListSize();
if ( scanned >= 100000000 ) {
count += scanned;
scanned = 0;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Scanned %"INT64" records. "
"Removed %"INT64" records.",count,recs,removed);
}
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
recs++;
if ( (recs & ymask) == 0x00 ) sched_yield();
key_t k = list.getCurrentKey();
// skip deletes
if ( (k.n0 & 0x01) == 0x00 ) continue;
uint64_t d = 0;//g_tfndb.getDocId(&k);
// see if docid is in delete list
int32_t n = (uint32_t)d & mask;
while ( slots[n] && slots[n] != d )
if ( ++n >= numSlots ) n = 0;
// skip him if we should not delete him
if ( slots[n] != d ) continue;
// otherwise, remove him
// make him a delete, turn off his last bit (the del bit)
k.n0 &= 0xfffffffffffffffeLL;
if ( ! r->addRecord ( collnum , (char *)&k , NULL , 0 , 0) ) {
log("db: Could not delete record.");
return;
}
removed++;
// dump tree?
if ( tree->getNumAvailNodes() <= 0 ) {
// this should block
r->dumpTree(0);
}
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey >= *(key_t *)list.getLastKey() ) goto loop6;
logf(LOG_INFO,"db: Scanned %"INT64" bytes. Scanned %"INT64" records. "
"Removed %"INT64" records.",count+scanned,recs,removed);
logf(LOG_INFO,"db: Finished removing docids from tfndb. Saving.");
r->close ( NULL , NULL , false , false );
return;
}
/*
// . also makes checksumdb
// . g_hostdb.m_hostId should be set correctly
bool fixTfndb ( char *coll ) {
// get the list of tfns
g_titledb.init();
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
g_tfndb.init ();
g_collectiondb.init(true); // isDump?
g_tfndb.getRdb()->addRdbBase1 ( coll );
g_titledb.getRdb()->addRdbBase1 ( coll );
RdbBase *base = getRdbBase ( RDB_TITLEDB , coll );
int32_t nf = base->getNumFiles();
key_t startKey ;
key_t endKey ;
startKey.setMin();
endKey.setMax();
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
RdbList list;
BigFile *f = NULL;
RdbMap *m = NULL;
int64_t offset = 0LL;
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TFNDB ,
coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
false , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
exit(-1);
}
// all done if empty
if ( list.isEmpty() ) goto done;
// create new tfndb*.dat file to hold the negative keys
if ( ! f ) {
RdbBase *base = getRdbBase ( RDB_TFNDB , coll );
int32_t fn = base->addNewFile ( -1 ); // id2
if ( fn < 0 ) {
log("fixtfndb: Failed to create new file for "
"tfndb.");
exit(-1);
}
f = base->m_files [ fn ];
m = base->m_maps [ fn ];
f->open ( O_RDWR | O_CREAT | O_EXCL , NULL );
log(LOG_INFO,"fixtfndb: writing fixes to %s",f->getFilename());
}
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
// get the tfn
key_t k = list.getCurrentKey();
int32_t tfn = g_tfndb.getTitleFileNum ( k );
if ( tfn == 255 ) continue;
// skip if negative
if ( (k.n0 & 0x01LL) == 0x00 ) continue;
int32_t i = 0;
for ( ; i < nf ; i++ ) if ( base->m_fileIds2[i] == tfn ) break;
if ( i < nf ) continue;
// does not correspond to a tfn, remove it
int64_t docId = g_tfndb.getDocId ( k );
int32_t e = g_tfndb.getExt ( k );
int32_t clean = 0 ; if ( g_tfndb.isClean ( k ) ) clean= 1;
int32_t half = 0 ; if ( k.n0 & 0x02 ) half = 1;
char *dd = "" ; if ( (k.n0 & 0x01) == 0 ) dd =" (del)";
fprintf(stdout,
"%016"XINT64" docId=%012"INT64" "
"e=0x%08"XINT32" tfn=%03"INT32" clean=%"INT32" half=%"INT32" %s\n",
k.n0,docId,e,tfn,clean,half,dd);
// make negative
k.n0 &= 0xfffffffffffffffeLL;
f->write ( &k , sizeof(key_t) , offset );
offset += sizeof(key_t);
//m->addRecord ( k , NULL , 0 );
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey >= *(key_t *)list.getLastKey() ) goto loop;
done:
if ( ! f ) return true;
// write map
//m->writeMap();
f->close();
exit(1);
return true;
}
*/
// . diff with indexdb in sync/ dir
// . returns false if they differ, true otherwise
/*
bool syncIndexdb ( ) {
// open indexdb in sync/ dir
Indexdb idb;
// temporarily set the working dir
char newdir [ 256 ];
sprintf ( newdir , "%s/sync", g_hostdb.m_dir );
char olddir [ 256 ];
strcpy ( olddir , g_hostdb.m_dir );
strcpy ( g_hostdb.m_dir , newdir );
// init the second indexdb with this new directory
if ( ! idb.init() ) return false;
//if ( ! idb.getRdb()->addRdbBase1 ( "main" ) ) return false;
// restore working dir
strcpy ( g_hostdb.m_dir , olddir );
// count diffs
int64_t count = 0;
// always block
g_threads.disableThreads();
// reset some stuff
key_t nextKey; nextKey.setMin();
RdbList ilist1;
RdbList ilist2;
ilist1.reset();
ilist2.reset();
// now read list from sync dir, and make sure in old dir
loop:
key_t endKey; endKey.setMax();
// always clear last bit of g_nextKey
nextKey.n0 &= 0xfffffffffffffffeLL;
// announce startKey
log("db: next k.n1=%08"XINT32" n0=%016"XINT64"",nextKey.n1,nextKey.n0);
// a niceness of 0 tells it to block until it gets results!!
Msg5 msg5;
if ( ! msg5.getList ( RDB_INDEXDB ,
coll1 ,
&ilist1 ,
nextKey ,
endKey , // should be maxed!
1024*1024 , // min rec sizes
true , // include tree?
false , // includeCache
false , // addToCache
0 , // startFileNum
-1 , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
true ))// do error correction?
return log(LOG_LOGIC,"db: getList did not block.");
if ( ! msg5.getList ( RDB_INDEXDB ,
coll2 ,
&ilist2 ,
nextKey ,
endKey , // should be maxed!
1024*1024 , // min rec sizes
true , // include tree?
false , // includeCache
false , // addToCache
0 , // startFileNum
-1 , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
true ))// do error correction?
return log(LOG_LOGIC,"db: getList did not block.");
// get last keys of both
key_t last1 ;
key_t last2 ;
if ( ! ilist1.isEmpty() ) last1 = ilist1.getLastKey();
else last1.setMax();
if ( ! ilist2.isEmpty() ) last2 = ilist2.getLastKey();
else last2.setMax();
// get the min
key_t min = last1; if ( min > last2 ) min = last2;
// now compare the two lists
iloop:
key_t k1;
key_t k2;
// skip if both empty
if ( ilist1.isExhausted() && ilist2.isExhausted() ) goto done;
// if one list is exhausted before the other, dump his keys
if ( ilist1.isExhausted() ) k1.setMax();
else k1 = ilist1.getCurrentKey();
if ( ilist2.isExhausted() ) k2.setMax();
else k2 = ilist2.getCurrentKey();
// if different report it
if ( k1 < k2 ) {
log("db: sync dir has k.n1=%08"XINT32" n0=%016"XINT64"",k1.n1,k1.n0);
ilist1.skipCurrentRecord();
count++;
goto iloop;
}
else if ( k2 < k1 ) {
log("db: orig dir has k.n1=%08"XINT32" n0=%016"XINT64"",k2.n1,k2.n0);
ilist2.skipCurrentRecord();
count++;
goto iloop;
}
if ( ! ilist1.isExhausted() ) ilist1.skipCurrentRecord();
if ( ! ilist2.isExhausted() ) ilist2.skipCurrentRecord();
goto iloop;
done:
// if both lists were completely empty, we're done
if ( ilist1.isEmpty() && ilist2.isEmpty() ) {
log("db: *-*-*-* found %"INT32" discrepancies",count);
g_threads.enableThreads();
return (count==0);
}
// advance nextKey to get next pair of lists
nextKey = min;
nextKey += (uint32_t)1;
// start it all over again
goto loop;
}
*/
/*
// generates clusterdb from titledb
bool makeClusterdb ( char *coll ) {
key_t nextKey;
key_t endKey;
RdbList list;
RdbList rlist;
Msg5 msg5;
Msg5 msg5b;
int32_t minRecSizes = 1024*1024;
//int32_t minRecSizes = 32*1024;
uint32_t count = 0;
// make sure the files are clean
BigFile f;
f.set ( g_hostdb.m_dir , "clusterdb-saved.dat");
if ( f.doesExist() ) {
log("db: %sclusterdb-saved.dat exists. "
"Not generating clusterdb.",
g_hostdb.m_dir);
return false;
}
f.set ( g_hostdb.m_dir , "clusterdb0001.dat");
if ( f.doesExist() ) {
log("db: %sclusterdb0001.dat exists. Not generating clusterdb.",
g_hostdb.m_dir);
return false;
}
// turn off threads
g_threads.disableThreads();
// log the start
log("db: Generating clusterdb for Collection %s.", coll);
// how many are we processing?
log("db: makeclusterdb: processing %"INT32" urls",
g_titledb.getLocalNumDocs());
// reset some stuff
nextKey.n1 = 0;
nextKey.n0 = 0;
endKey.setMax();
rlist.set ( NULL,
0,
NULL,
0,
0,
false,
true );
loop:
list.reset();
// always clear last bit of g_nextKey
nextKey.n0 &= 0xfffffffffffffffeLL;
//int64_t startTime = gettimeofdayInMilliseconds();
// a niceness of 0 tells it to block until it gets results!!
bool status = msg5.getList (
RDB_TITLEDB ,
coll ,
&list ,
nextKey ,
endKey , // should be maxed!
minRecSizes , // get this many bytes of rec
true , // include tree?
false , // includeCache
false , // addToCache
0 , // startFileNum
-1 , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
true , // do error correction?
NULL , // cache key
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b );
if ( ! status ) {
log("db: critical error. msg5 did a non-blocking call");
exit(-1);
}
// close up if no titleRec
if ( list.isEmpty() ) {
log ( LOG_INFO, "db: Added %"INT32" files to clusterdb.", count);
log ( LOG_INFO,
"db: All done generating clusterdb. Saving files.");
// force tree dump to disk
g_clusterdb.getRdb()->dumpTree(0);
// dump trees we did
g_clusterdb.getRdb()->close ( NULL, NULL, true, false );
g_threads.enableThreads();
return true;
}
list.resetListPtr();
rlist.reset();
listLoop:
if ( list.isExhausted() ) {
// . add our list to rdb
if ( ! g_clusterdb.getRdb()->addList ( coll, &rlist ) ) {
log ( "db: clusterdb addList had error: %s",
mstrerror(g_errno) );
return false;
}
goto loop;
}
// advance g_nextKey to get next titleRec
nextKey = list.getCurrentKey();
nextKey += 1;
// advance one if positive, must always start on a negative key
if ( (nextKey.n0 & 0x01) == 0x01 ) nextKey += (uint32_t)1;
// get raw rec from list
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
// set the titleRec we got
TitleRec oldtr ;
if ( ! oldtr.set ( rec , recSize , false ) ) {// own data?
log("db: error setting titleRec! skipping." );
list.skipCurrentRecord();
goto listLoop;
}
Url *url = oldtr.getUrl();
// log the url
//if ( count % 1000 == 0 )
//log(LOG_INFO, "%"INT32") %s %"INT32"",
// count,url->getUrl(),oldtr.getContentLen());
count++;
// make a cluster rec
char crec [ CLUSTER_REC_SIZE ];
g_clusterdb.makeRecFromTitleRec ( crec ,
&oldtr,
false );
//g_clusterdb.makeRecFromTitleRecKey ( crec ,
// rec,
// false );
rlist.addRecord ( crec, 0, NULL );
int32_t nLinkTexts = oldtr.getLinkInfo()->getNumInlinks();
if ( nLinkTexts > 10 )
log ( LOG_INFO, "db: %s (%"INT32" links)",
url->getUrl(), nLinkTexts );
if ( count % 10000 == 0 )
log(LOG_INFO, "db: %"INT32") %"XINT32" %"XINT64"", count,
((key_t*)crec)->n1, ((key_t*)crec)->n0);
// set startKey, endKey
//key_t key1 = *(key_t *)crec;
//key_t key2 = key1;
// add to our g_clusterdb
//rlist.set ( crec ,
// CLUSTER_REC_SIZE ,
// crec ,
// CLUSTER_REC_SIZE ,
// key1 ,
// key2 ,
// CLUSTER_REC_SIZE - 12 ,
// false , // own data?
// true );// use half keys?
// . add our list to rdb
//if ( ! g_clusterdb.getRdb()->addList ( coll, &rlist ) ) {
// log ( "db: clusterdb addList had error: %s",
// mstrerror(g_errno) );
// return false;
//}
list.skipCurrentRecord();
goto listLoop;
//goto loop;
}
*/
// forces add the hash of the date meta tag into a range for every rec
/*
bool genDateRange ( char *coll ) {
key_t nextKey;
key_t endKey;
RdbList list;
RdbList rlist;
Msg5 msg5;
Msg5 msg5b;
Msg1 msg1;
int32_t minRecSizes = 1024*1024;
//int32_t minRecSizes = 32*1024;
uint32_t count = 0;
uint64_t addSize = 0;
// turn off threads
g_threads.disableThreads();
// log the start
log("db: Generating date range index for Collection %s.", coll);
// how many are we processing?
log("db: genDateRange: processing %"INT32" urls",
g_titledb.getLocalNumDocs());
// get site rec 16 for hashing date range ??
SiteRec sr;
sr.m_xml = g_tagdb.getSiteXml ( 16, coll, gbstrlen(coll) );
// reset some stuff
nextKey.n1 = 0;
nextKey.n0 = 0;
endKey.setMax();
rlist.set ( NULL,
0,
NULL,
0,
0,
false,
true );
loop:
list.reset();
// always clear last bit of g_nextKey
nextKey.n0 &= 0xfffffffffffffffeLL;
//int64_t startTime = gettimeofdayInMilliseconds();
// a niceness of 0 tells it to block until it gets results!!
bool status = msg5.getList (
RDB_TITLEDB ,
coll ,
&list ,
nextKey ,
endKey , // should be maxed!
minRecSizes , // get this many bytes of rec
true , // include tree?
false , // includeCache
false , // addToCache
0 , // startFileNum
-1 , // m_numFiles
NULL , // state
NULL , // callback
0 , // niceness
true , // do error correction?
NULL , // cache key
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b );
if ( ! status ) {
log("db: critical error. msg5 did a non-blocking call");
exit(-1);
}
// close up if no titleRec
if ( list.isEmpty() ) {
// FOR SMALL TEST ONLY!!
//if ( list.isEmpty() || count > 500 ) {
//log ( LOG_INFO, "db: THIS WAS ONLY A TEST OF 500 RECS!" );
log ( LOG_INFO, "db: Generated date range for %"INT32" TitleRecs.",
count);
log ( LOG_INFO,
"db: All done generating date range. Saving files. "
"(%"UINT64")", addSize );
// dump trees we did
// force tree dump to disk
g_indexdb.getRdb()->dumpTree(0);
g_indexdb.getRdb()->close ( NULL, NULL, true, false );
g_threads.enableThreads();
return true;
}
list.resetListPtr();
rlist.reset();
listLoop:
if ( list.isExhausted() ) {
goto loop;
}
// advance g_nextKey to get next titleRec
nextKey = list.getCurrentKey();
nextKey += 1;
// advance one if positive, must always start on a negative key
if ( (nextKey.n0 & 0x01) == 0x01 ) nextKey += (uint32_t)1;
// get raw rec from list
char *rec = list.getCurrentRec();
//int32_t listSize = list.getListSize ();
int32_t recSize = list.getCurrentRecSize();
// set the titleRec we got
TitleRec oldtr ;
if ( ! oldtr.set ( rec , recSize , false ) ) { // own data?
log("gotList: error setting titleRec! skipping." );
goto loop;
}
// log the url
Url *url = oldtr.getUrl();
if ( count % 10000 == 0 )
log(LOG_INFO, "%"INT32") %s %"INT32"",
count,url->getUrl(),oldtr.getContentLen());
count++;
// use XmlDoc and TermTable to hash the date range
TermTable tt;
XmlDoc xmlDoc;
xmlDoc.set(&oldtr, &sr, NULL, 0);
xmlDoc.hashDate ( &tt, &oldtr, &sr );
// dump the term table into an index list
IndexList indexList;
IndexList newDateList;
uint64_t chksum1;
indexList.set ( &tt,
oldtr.getDocId(),
NULL,
&newDateList,
0,
NULL,
&chksum1 ,
0 ); // niceness
addSize += indexList.getListSize();
// . add our list to rdb
if ( ! g_indexdb.getRdb()->addList ( coll, &indexList ) ) {
log ( "db: indexdb addList had error: %s",
mstrerror(g_errno) );
return false;
}
// go to the next titlerec
list.skipCurrentRecord();
goto listLoop;
}
*/
static int keycmp(const void *, const void *);
int keycmp ( const void *p1 , const void *p2 ) {
// returns 0 if equal, -1 if p1 < p2, +1 if p1 > p2
if ( *(key_t *)p1 < *(key_t *)p2 ) return -1;
if ( *(key_t *)p1 > *(key_t *)p2 ) return 1;
return 0;
}
/*
bool matchertest ( int argc, char* argv[] ) {
const int iterCompile = 10000;
int numTerms = -1;
// find -- separator
for (int i = 0; i < argc; i++) {
if (strcmp(argv[i], "--") == 0) {
numTerms = i;
break;
}
}
if (numTerms == -1)
return false;
MatchTerm terms[numTerms];
for (int i = 0; i < numTerms; i++) {
terms[i].m_term = (uint8_t*) argv[i];
terms[i].m_termSize = gbstrlen(argv[i]);
}
// --------------------------------------------------------------------
// do times compiles of various types
struct timeval tv;
// --------------------------------------------------------------------
gettimeofday(&tv, NULL);
uint64_t tBMMStart = tv.tv_sec * 1000000 + tv.tv_usec;
for (int i = 0; i < iterCompile; i++) {
BitMatrixMatcher matcher;
matcher.Compile(terms, numTerms, false);
if (!matcher.Ready()) {
fprintf(stderr, "BitMatrixMatcher compile\n");
return false;
}
}
gettimeofday(&tv, NULL);
uint64_t tBMMElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) - tBMMStart;
fprintf(stderr, "STAT %24s %6llduS Compile/Free\n",
"BitMatrixMatcher", tBMMElapsed / iterCompile);
// --------------------------------------------------------------------
gettimeofday(&tv, NULL);
uint64_t tSATMStart = tv.tv_sec * 1000000 + tv.tv_usec;
for (int i = 0; i < iterCompile; i++) {
SmallAsciiTrieMatcher matcher;
matcher.Compile(terms, numTerms, false);
if (!matcher.Ready()) {
fprintf(stderr, "SmallAsciiTrieMatcher compile\n");
return false;
}
}
gettimeofday(&tv, NULL);
uint64_t tSATMElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) - tSATMStart;
fprintf(stderr, "STAT %24s %6llduS Compile/Free\n",
"SmallAsciiTrieMatcher", tSATMElapsed / iterCompile);
// --------------------------------------------------------------------
gettimeofday(&tv, NULL);
uint64_t tMBTMStart = tv.tv_sec * 1000000 + tv.tv_usec;
for (int i = 0; i < iterCompile; i++) {
MediumBinaryTrieMatcher matcher;
matcher.Compile(terms, numTerms, false);
if (!matcher.Ready()) {
fprintf(stderr, "MediumBinaryTrieMatcher compile\n");
return false;
}
}
gettimeofday(&tv, NULL);
uint64_t tMBTMElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) - tMBTMStart;
fprintf(stderr, "STAT %24s %6llduS Compile/Free\n",
"MediumBinaryTrieMatcher", tMBTMElapsed / iterCompile);
// --------------------------------------------------------------------
gettimeofday(&tv, NULL);
uint64_t tMMStart = tv.tv_sec * 1000000 + tv.tv_usec;
for (int i = 0; i < iterCompile; i++) {
MatrixMatcher matcher;
matcher.Compile(terms, numTerms, false);
if (!matcher.Ready()) {
fprintf(stderr, "MatrixMatcher compile\n");
return false;
}
}
gettimeofday(&tv, NULL);
uint64_t tMMElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) - tMMStart;
fprintf(stderr, "STAT %24s %6llduS Compile/Free\n",
"MatrixMatcher", tMMElapsed / iterCompile);
// --------------------------------------------------------------------
// get contents of each file into memory
argv += (numTerms + 1);
argc -= (numTerms + 1);
int numFiles = argc;
uint8_t* content[numFiles];
uint32_t len[numFiles];
for (int i = 0; i < numFiles; i++) {
FILE *pf = fopen(argv[i], "rb");
if (pf == NULL) {
fprintf(stderr, "unable to open '%s'\n",
argv[i]);
return false;
}
struct stat sb;
if (fstat(fileno(pf), &sb) != 0) {
fprintf(stderr, "unable to stat '%s'\n", argv[i]);
return false;
}
len[i] = sb.st_size;
content[i] = (uint8_t*) mmalloc(len[i], "file");
if (content == NULL) {
fprintf(stderr, "unable to malloc '%s'\n", argv[i]);
return false;
}
if (fread(content[i], len[i], 1, pf) != 1) {
fprintf(stderr, "unable to fread '%s'\n", argv[i]);
return false;
}
fclose(pf);
}
// --------------------------------------------------------------------
// compile a matcher of each type
BitMatrixMatcher matcherBMM;
matcherBMM.Compile(terms, numTerms, false);
//matcherBMM.Dump();
SmallAsciiTrieMatcher matcherSATM;
matcherSATM.Compile(terms, numTerms, false);
//matcherSATM.Dump();
MediumBinaryTrieMatcher matcherMBTM;
matcherMBTM.Compile(terms, numTerms, false);
//matcherMBTM.Dump();
MatrixMatcher matcherMM;
matcherMM.Compile(terms, numTerms, false);
//matcherMM.Dump();
const int numMatchers = 4;
Matcher* matchers[numMatchers] = { &matcherBMM,
&matcherSATM,
&matcherMBTM,
&matcherMM,
};
char* matcherNames[numMatchers] = {
"BitMatrixMatcher",
"SmallAsciiTrieMatcher",
"MediumBinaryTrieMatcher",
"MatrixMatcher"
};
// --------------------------------------------------------------------
// perform matching on each file using each type of matcher
const int iterExec = 1000;
for (int fileix = 0; fileix < numFiles; fileix++) {
for (int matcherix = 0; matcherix < numMatchers; matcherix++) {
int hits = 0;
gettimeofday(&tv, NULL);
uint64_t tStart = tv.tv_sec * 1000000 + tv.tv_usec;
for (int iter = 0; iter < iterExec; iter++) {
hits = 0;
const uint8_t* icursor = content[fileix];
const uint8_t* iend = icursor + len[fileix];
Matcher* matcher = matchers[matcherix];
uint16_t termNum;
while (icursor < iend) {
icursor = matcher->Exec(icursor,
iend - icursor,
&termNum);
hits++;
if (icursor == NULL)
break;
//fprintf(stderr, "hit: %s\n",
// terms[termNum].m_term);
icursor += terms[termNum].m_termSize;
}
}
gettimeofday(&tv, NULL);
uint64_t tElapsed = (tv.tv_sec * 1000000 + tv.tv_usec) -
tStart;
fprintf(stderr,"STAT %24s %6llduS %4dKB %4d hits %s\n",
matcherNames[matcherix],
tElapsed / iterExec, len[fileix] / 1024,
hits - 1, argv[fileix]);
}
}
return true;
}
bool trietest ( ) {
//TrieMatcher<uint8_t,8> matcher;
MatrixMatcher matcher;
MatchTerm terms[3];
terms[0].m_term = (uint8_t*) "jackie";
terms[0].m_termSize = 6;
terms[1].m_term = (uint8_t*) "jack";
terms[1].m_termSize = 4;
terms[2].m_term = (uint8_t*) "sandi";
terms[2].m_termSize = 5;
matcher.Compile(terms, 3, false);
matcher.Dump();
uint16_t numTerm;
const uint8_t* pos;
#define STRING (uint8_t*) "this is jAck's test for Sandi's enjoyment"
for (int i = 0; i < 10; i++) {
pos = matcher.Exec(STRING, gbstrlen((char*) STRING), &numTerm);
if (pos != NULL) {
fprintf(stderr, "term[%d] '%s' -> %s\n",
numTerm, terms[numTerm].m_term, pos);
pos += gbstrlen((char*) terms[numTerm].m_term);
pos = matcher.Exec(pos, gbstrlen((char*) pos), &numTerm);
if (pos != NULL) {
fprintf(stderr, "term[%d] '%s' -> %s\n",
numTerm, terms[numTerm].m_term, pos);
pos += gbstrlen((char*) terms[numTerm].m_term);
pos = matcher.Exec(pos, gbstrlen((char*) pos),
&numTerm);
if (pos != NULL)
exit(1);
}
}
}
return false;
}
*/
/*
bool gbgzip (char *filename) {
File f;
File w;
char outfile[1024];
*(outfile + snprintf(outfile,1023,"%s.gz",filename)) = '\0';
f.set (".",filename);
w.set (".",outfile);
if ( f.doesExist() && f.open ( O_RDONLY ) &&
w.open ( O_RDWR | O_CREAT )) {}
else return log("FATAL: could not open "
"file for reading:%s",
filename);
g_conf.m_maxMem = 2000000000LL;
g_mem.m_maxMem = 2000000000LL;
int64_t fileSize = f.getFileSize();
if(g_conf.m_maxMem < fileSize)
return log("FATAL: file too large:%s",
filename);
char* srcbuf = (char*)mmalloc(fileSize,"gzip src");
int64_t dstbufSize = (int64_t)(fileSize*1.001 + 32);
char* dstbuf = (char*)mmalloc(dstbufSize,"gzip dst");
if(srcbuf == NULL || dstbuf == NULL)
return log("FATAL: file too large:%s, out of memory.",
filename);
int32_t unsigned int written = dstbufSize;
f.read ( srcbuf , fileSize , 0);
int32_t err = gbcompress( (unsigned char*)dstbuf ,
&written,
(unsigned char*)srcbuf ,
(uint32_t)fileSize ,
ET_GZIP);
if(written == 0 || err != Z_OK)
if ( err == Z_BUF_ERROR )
return log("FATAL: could not write file srclen=%"INT64", "
"dstlen=0, %s%s",
fileSize, mstrerror(g_errno),
err == Z_BUF_ERROR?", buffer too small":"");
w.write ( dstbuf , written , 0);
sync(); // f.flush ( );
return true;
}
bool gbgunzip (char *filename) {
//make the output filename:
char outfile[1024];
int32_t filenamelen = gbstrlen(filename);
int32_t outfilelen = filenamelen - 3;
if(strcmp(filename + outfilelen, ".gz") != 0)
return log("FATAL: could not open "
"file, not a .gz:%s",
filename);
gbmemcpy(outfile, filename, outfilelen);
outfile[outfilelen] = '\0';
//open our input and output files right away
File f;
File w;
f.set (filename);
w.set (outfile);
if ( f.doesExist() && f.open ( O_RDONLY ) &&
w.open ( O_RDWR | O_CREAT )) {}
else return log("FATAL: could not open "
"file for reading:%s",
filename);
g_conf.m_maxMem = 2000000000LL;
g_mem.m_maxMem = 2000000000LL;
int64_t fileSize = f.getFileSize();
if(g_conf.m_maxMem < fileSize)
return log("FATAL: file too large:%s",
filename);
char* srcbuf = (char*)mmalloc(fileSize,"gzip src");
if(srcbuf == NULL)
return log("FATAL: file too large:%s, out of memory.",
filename);
f.read ( srcbuf , fileSize , 0);
int32_t dstbufSize = getGunzippedSize(srcbuf,fileSize);
char* dstbuf = (char*)mmalloc(dstbufSize,"gzip dst");
if(dstbuf == NULL)
return log("FATAL: file too large:%s, out of memory.",
filename);
int32_t unsigned int written = dstbufSize;
int32_t err = gbuncompress( (unsigned char*)dstbuf ,
&written ,
(unsigned char*)srcbuf ,
(uint32_t)fileSize);
if(written == 0 || err != Z_OK)
if ( err == Z_BUF_ERROR )
return log("FATAL: could not write file srclen=%"INT64", "
"dstlen=0, %s%s",
fileSize, mstrerror(g_errno),
err == Z_BUF_ERROR?", buffer too small":"");
w.write ( dstbuf , written , 0);
sync(); // f.flush ( );
return true;
}
*/
// time speed of inserts into RdbTree for indexdb
// . stress/consistency test pitting RdbBuckets against RdbTree
// . two modes:
//   - dbname given: load that bucket file from disk, self-test it, and
//     attempt a repair if the self-test fails
//   - dbname NULL: generate 1M random 12-byte keys (with some planted
//     delete-markers), mirror every add/delete into both a tree and a
//     bucket set, and crash (*xx = 0) on any divergence
// . returns true on success; some failure paths return log(...) (false)
bool bucketstest ( char* dbname ) {
    g_conf.m_maxMem = 2000000000LL; // 2G
    //g_mem.m_maxMem = 2000000000LL; // 2G
    // load-and-verify mode: check an existing bucket file on disk
    if ( dbname ) {
        // indexdb uses 12-byte keys; everything else here uses 16
        char keySize = 16;
        if(strcmp(dbname, "indexdb") == 0) keySize = 12;
        RdbBuckets rdbb;
        rdbb.set (0,
                  0x7fffffff, // LONG_MAX ,
                  false ,//own data
                  "buckets-test",
                  RDB_INDEXDB,
                  false , //data in ptrs
                  "TestBuckets" ,
                  keySize ,
                  false );
        rdbb.loadBuckets ( dbname );
        if(!rdbb.selfTest(true/*testall*/, false/*core*/))
            if(!rdbb.repair())
                log("db: unrepairable buckets.");
        return 0;
    }
    // scratch key used to form the "opposite" (delete-bit flipped) key
    char oppKey[MAX_KEY_BYTES];
    RdbBuckets rdbb;
    char keySize = 12;
    rdbb.set (0,
              0x7fffffff,//LONG_MAX ,
              false ,//own data
              "buckets-test",
              RDB_INDEXDB,
              false , //data in ptrs
              "TestBuckets" , keySize , false );
    int32_t numKeys = 1000000;
    log("db: speedtest: generating %"INT32" random keys.",numKeys);
    // seed randomizer
    srand ( (int32_t)gettimeofdayInMilliseconds() );
    // make list of one million random keys (raw byte buffer, keySize each)
    char *k = (char*)mmalloc ( keySize * numKeys , "main" );
    if ( ! k ) return log("speedtest: malloc failed");
    // fill the key bytes one 32-bit word at a time
    int32_t *r = (int32_t *)k;
    int32_t ksInLongs = keySize / 4;
    for ( int32_t i = 0 ; i < numKeys * ksInLongs ; i++ ) {
        r[i] = rand();// % 2000;
    }
    // plant ~1000 delete-markers: copy a random key over another slot and
    // flip its low (delete) bit so adds/deletes will collide later
    for ( int32_t i = 0 ; i < 1000 ; i++ ) {
        int32_t j = (rand() % numKeys) * keySize;
        int32_t m = (rand() % numKeys) * keySize;
        gbmemcpy((char*)&k[j], (char*)&k[m], keySize);
        KEYXOR((char*)&k[j],0x01);
    }
    // init the tree (the reference implementation to compare against)
    RdbTree rt;
    if ( ! rt.set ( 0 , // fixedDataSize ,
                    numKeys + 1000 , // maxTreeNodes ,
                    false , // isTreeBalanced ,
                    numKeys * 32 , // maxTreeMem ,
                    false ,
                    "tree-test" ,
                    false ,
                    "TestTree" ,
                    keySize) ) // own data?
        return log("speedTest: tree init failed.");
    // add to regular tree, timing it; if the opposite-bit key is already
    // present delete it first (annihilation semantics)
    int64_t t = gettimeofdayInMilliseconds_force();
    for ( int32_t i = 0 ; i < numKeys * keySize; i += keySize ) {
        char* key = k+i;
        KEYSET(oppKey,key,keySize);
        KEYXOR(oppKey,0x01);
        int32_t n;
        n = rt.getNode ( 0, oppKey );
        if ( n >= 0 ) {
            rt.deleteNode3 ( n , true );
        }
        if ( rt.addNode ( 0, key , NULL , 0 ) < 0 )
            return log("speedTest: rdb tree addNode "
                       "failed");
    }
    // print time it took
    int64_t e = gettimeofdayInMilliseconds_force();
    log("db: added %"INT32" keys to rdb tree in %"INT64" ms, "
        "now have %"INT32" keys",numKeys,e - t, rt.getNumUsedNodes());
    // now the same adds into the buckets (buckets do their own
    // annihilation internally — no explicit delete of the opposite key)
    for ( int32_t i = 0 ; i < numKeys * keySize; i+=keySize ) {
        char* key = k+i;
        //if ( k[i].n1 == 1234567 )
        //    fprintf(stderr,"i=%"INT32"\n",i);
        if ( rdbb.addNode ( 0,key , NULL , 0 ) < 0 )
            return log("speedTest: rdb buckets addNode "
                       "failed");
    }
    rdbb.testAndRepair();
    t = gettimeofdayInMilliseconds_force();
    log("db: added %"INT32" keys to rdb buckets in %"INT64" ms, "
        "now have %"INT32" keys, mem used: %"INT32"",
        numKeys,t - e, rdbb.getNumKeys(),rdbb.getMemOccupied());
    rdbb.selfTest(true, true);
    // round-trip the buckets through disk and re-verify
    log("db: saving and loading buckets.");
    e = gettimeofdayInMilliseconds_force();
    rdbb.fastSave ( ".",
                    false,
                    NULL, NULL);
    t = gettimeofdayInMilliseconds_force();
    log("db: saved rdbbuckets in %"INT64" ms",t - e);
    //rdbb.setNeedsSave(false);
    rdbb.clear();
    e = gettimeofdayInMilliseconds_force();
    rdbb.loadBuckets ( "TestBuckets" );
    t = gettimeofdayInMilliseconds_force();
    log("db: loaded rdbbuckets in %"INT64" ms", t - e);
    rdbb.selfTest(true, true);
    //now test loading a tree, the keys will be sorted, so this
    // is the worst case performance.
    RdbBuckets rdbb2;
    rdbb2.set (0,
               10000000 ,
               false ,//own data
               "buckets-test",
               RDB_INDEXDB,
               false , //data in ptrs
               "TestBuckets" , keySize , false );
    rdbb2.addTree (&rt);
    rdbb2.selfTest(true, true);
    rdbb2.setNeedsSave(false);
    //now test finding of individual keys
    int32_t tests = numKeys * 2;
    log("db: Testing retrival of %"INT32" individual keys",tests );
    int64_t ttook = 0;
    int64_t btook = 0;
    int32_t tgot = 0;
    int32_t bgot = 0;
    int32_t found = 0;
    // NOTE(review): this loop is disabled ("i < 0" never runs) —
    // presumably left off deliberately; confirm before re-enabling
    for ( int32_t i = 0 ; i < 0; i++ ) {
        int32_t j = (rand() % numKeys) * keySize;
        e = gettimeofdayInMilliseconds_force();
        int32_t nodeNum = rt.getNode ( 0 , (char*)&k[j]);
        t = gettimeofdayInMilliseconds_force();
        ttook += t - e;
        e = gettimeofdayInMilliseconds_force();
        char* foundKey = rdbb.getKeyVal ( 0 , (char*)&k[j], NULL, NULL);
        t = gettimeofdayInMilliseconds_force();
        btook += t - e;
        // key must be found in both structures or in neither
        if(nodeNum == -1) {
            if(foundKey == NULL) {
                continue;
            }
            log("speedTest: node not found in tree, but found in buckets! "
                "looked up %016"XINT64"%08"XINT32", got %016"XINT64"%08"XINT32"",
                *(int64_t*)((char*)&k[j]+(sizeof(int32_t))),
                *(int32_t*)(char*)&k[j],
                *(int64_t*)(foundKey+(sizeof(int32_t))),
                *(int32_t*)foundKey);
            rdbb.printBuckets();
            char* xx = NULL; *xx = 0;
        }
        if(foundKey == NULL) {
            if(nodeNum == -1) {
                continue;
            }
            log("speedTest: node not found in buckets, but found in tree! "
                "%016"XINT64"%08"XINT32"",
                *(int64_t*)((char*)&k[j]+(sizeof(int32_t))),
                *(int32_t*)(char*)&k[j]);
            rdbb.printBuckets();
            char* xx = NULL; *xx = 0;
        }
        found++;
    }
    log("db: found %"INT32" keys from rdbtree in %"INT64" ms",found, ttook);
    log("db: found %"INT32" keys from rdbbuckets in %"INT64" ms",found, btook);
    // sort the list of keys
    t = gettimeofdayInMilliseconds_force();
    gbsort ( k , numKeys , sizeof(key_t) , keycmp );
    // print time it took
    e = gettimeofdayInMilliseconds_force();
    log("db: sorted %"INT32" in %"INT64" ms",numKeys,e - t);
    tests = 100;
    log("db: Testing retrival of a list of keys, %"INT32" random ranges", tests);
    RdbList treelist;
    RdbList bucketlist;
    RdbList list;
    int32_t numPosRecs;
    int32_t numNegRecs;
    char *tmpkey1;
    char *tmpkey2;
    char key1 [ MAX_KEY_BYTES ];
    char key2 [ MAX_KEY_BYTES ];
    int32_t minRecSizes = 10000000;
    //int32_t minRecSizes = -1;
    // fetch the same random range from tree and buckets and require the
    // two returned lists to match record-for-record
    for ( int32_t i = 0 ; i < tests; i++ ) {
        // int32_t startKey = rand() % numKeys;
        // int32_t endKey = (rand() % (numKeys - startKey)) + startKey;
        for ( int32_t x = 0 ; x < MAX_KEY_BYTES; x++ ) {
            key1[x] = rand();
            key2[x] = rand();
        }
        // order the two random keys into a [skey, ekey] range
        char* skey;
        char* ekey;
        if ( KEYCMP(key1,key2,keySize) < 0 ) {
            skey = key1;
            ekey = key2;
        } else {
            skey = key2;
            ekey = key1;
        }
        e = gettimeofdayInMilliseconds_force();
        rt.getList ( (collnum_t)0 ,
                     skey,
                     ekey,
                     minRecSizes, //min rec sizes
                     &treelist,
                     &numPosRecs,
                     &numNegRecs,
                     true ); //use half keys
        t = gettimeofdayInMilliseconds_force();
        ttook += t - e;
        tgot += treelist.getNumRecs();
        tmpkey1 = treelist.getStartKey();
        tmpkey2 = treelist.getEndKey();
        /*
        log(LOG_WARN, "db rdbtree found %"INT32" recs (%"INT32" pos, %"INT32" neg) "
            "between "
            "%016"XINT64"%08"XINT32" and "
            "%016"XINT64"%08"XINT32". "
            "took %"INT64" ms",
            treelist.getNumRecs(),
            numPosRecs,
            numNegRecs,
            *(int64_t*)(tmpkey1+(sizeof(int32_t))), *(int32_t*)tmpkey1,
            *(int64_t*)(tmpkey2+(sizeof(int32_t))), *(int32_t*)tmpkey2,
            t - e );
        */
        e = gettimeofdayInMilliseconds_force();
        rdbb.getList ( (collnum_t)0 ,
                       skey,
                       ekey,
                       minRecSizes, //min rec sizes
                       &bucketlist,
                       &numPosRecs,
                       &numNegRecs,
                       true ); //use half keys
        t = gettimeofdayInMilliseconds_force();
        btook += t - e;
        bgot += bucketlist.getNumRecs();
        tmpkey1 = bucketlist.getStartKey();
        tmpkey2 = bucketlist.getEndKey();
        /*
        log(LOG_WARN, "db buckets found %"INT32" recs (%"INT32" pos, %"INT32" neg) "
            "between "
            "%016"XINT64"%08"XINT32" and "
            "%016"XINT64"%08"XINT32". "
            "took %"INT64" ms",
            bucketlist.getNumRecs(),
            numPosRecs,
            numNegRecs,
            *(int64_t*)(tmpkey1+(sizeof(int32_t))), *(int32_t*)tmpkey1,
            *(int64_t*)(tmpkey2+(sizeof(int32_t))), *(int32_t*)tmpkey2,
            t - e );
        */
        //check for consistency: walk both lists in lockstep; any
        //difference (length or key) is a fatal inconsistency
        char tkey [ MAX_KEY_BYTES ];
        char bkey [ MAX_KEY_BYTES ];
        while(1) {
            if(treelist.isExhausted() ) {
                if(bucketlist.isExhausted() ) break;
                bucketlist.getCurrentKey(bkey);
                log(LOG_WARN, "db tree and buckets "
                    "inconsistency"
                    " remaining key in buckets is "
                    "%016"XINT64"%08"XINT32". ",
                    *(int64_t*)(bkey+(sizeof(int32_t))),
                    *(int32_t*)bkey);
                char* xx = NULL; *xx = 0;
            }
            else if (bucketlist.isExhausted() ) {
                treelist.getCurrentKey(tkey);
                log(LOG_WARN, "db tree and buckets "
                    "inconsistency"
                    " remaining key in tree is "
                    "%016"XINT64"%08"XINT32". ",
                    *(int64_t*)(tkey+(sizeof(int32_t))),
                    *(int32_t*)tkey);
                char* xx = NULL; *xx = 0;
            }
            treelist.getCurrentKey(tkey);
            bucketlist.getCurrentKey(bkey);
            if ( KEYCMP(tkey,bkey,keySize) != 0 ) {
                log(LOG_WARN, "db tree and buckets "
                    "inconsistency "
                    "%016"XINT64"%08"XINT32" and "
                    "%016"XINT64"%08"XINT32". ",
                    *(int64_t*)(tkey+(sizeof(int32_t))),
                    *(int32_t*)tkey,
                    *(int64_t*)(bkey+(sizeof(int32_t))),
                    *(int32_t*)bkey);
                char* xx = NULL; *xx = 0;
            }
            treelist.skipCurrentRecord();
            bucketlist.skipCurrentRecord();
        }
    }
    log("db: List retrieval successful. ");
    log("db: rdbtree took %"INT64" ms for %"INT32" recs ", ttook, tgot);
    log("db: rdbbuckets took %"INT64" ms for %"INT32" recs", btook, bgot);
    int64_t tAddTook = 0;
    int64_t bAddTook = 0;
    int64_t tgetListTook = 0;
    int64_t bgetListTook = 0;
    int64_t tdelListTook = 0;
    int64_t bdelListTook = 0;
    ttook = 0;
    btook = 0;
    tgot = 0;
    bgot = 0;
    minRecSizes = 200000;
    KEYSET(key1,KEYMIN(), keySize);
    KEYSET(key2,KEYMAX(), keySize);
    bool status = true;
    // simulate an rdb dump: repeatedly pull a chunk of keys off the front
    // of the buckets and delete it, until the buckets are empty
    log("db: simulating dump, deleting entire list of keys");
    while(rdbb.getNumKeys() > 0 && status) {
        status = rdbb.getList ( (collnum_t)0,
                                key1 ,
                                KEYMAX() ,
                                minRecSizes,
                                &list ,
                                &numPosRecs ,
                                &numNegRecs ,
                                false );
        if(!status) {char* xx = NULL; *xx = 0;}
        if ( status && list.isEmpty() ) break;
        int32_t numBefore = rdbb.getNumKeys();
        rdbb.deleteList((collnum_t)0, &list);
        // if (KEYCMP(key2,key1,keySize) < 0) break;
        log("db: buckets now has %"INT32" keys. "
            "difference of %"INT32", list size was %"INT32". "
            "%016"XINT64"%08"XINT32". ",
            rdbb.getNumKeys(), numBefore - rdbb.getNumKeys(),
            list.getNumRecs(),
            *(int64_t*)(key1+(sizeof(int32_t))),
            *(int32_t*)key1);;
        // the delete must have removed exactly the records in the list
        if(numBefore - rdbb.getNumKeys() != list.getNumRecs())
            {char* xx = NULL; *xx = 0;}
        // advance the window: next chunk starts just past the last key
        KEYSET(key2,key1,keySize);
        KEYSET(key1,list.getLastKey(),keySize);
        KEYADD(key1,1,keySize);
    }
    if(rdbb.getNumKeys() > 0) {char* xx = NULL; *xx = 0;}
    rdbb.setNeedsSave(false);
    rdbb.clear();
    // final phase: interleave adds, range reads, and list deletes across
    // 10 random collections, checking tree/bucket agreement throughout
    log("db: Testing retrival of a list of keys, %"INT32" random ranges "
        "interspersed with adds and deletes", numKeys);
    rt.clear();
    rt.m_needsSave = false;
    for ( int32_t i = 0 ; i < numKeys ; i++ ) {
        e = gettimeofdayInMilliseconds_force();
        char* key = &k[i*keySize];
        KEYSET(oppKey,key,keySize);
        KEYXOR(oppKey,0x01);
        int32_t n;
        collnum_t collnum = rand() % 10;
        n = rt.getNode ( collnum , oppKey );
        if ( n >= 0 ) rt.deleteNode3 ( n , true );
        if ( rt.addNode (collnum, key, NULL , 0 ) < 0 )
            return log("speedTest: rdb tree addNode "
                       "failed");
        t = gettimeofdayInMilliseconds_force();
        tAddTook += t - e;
        e = gettimeofdayInMilliseconds_force();
        if ( rdbb.addNode(collnum, key, NULL, 0 ) < 0 )
            return log("speedTest: rdb buckets addNode "
                       "failed");
        t = gettimeofdayInMilliseconds_force();
        bAddTook += t - e;
        // only do a range-read check every 100th insert
        if(i % 100 != 0) continue;
        char* skey;
        char* ekey;
        if(rand() % 2) { //check keys that exist
            int32_t beg = (rand() % numKeys) * keySize;
            int32_t end = (rand() % numKeys) * keySize;
            skey = (char*)&k[beg];
            ekey = (char*)&k[end];
            if(KEYCMP(skey,ekey,keySize) > 0) {
                skey = (char*)&k[end];
                ekey = (char*)&k[beg];
            }
        }
        else {//otherwise check keys that don't exist
            for ( int32_t x = 0 ; x < MAX_KEY_BYTES; x++ ) {
                key1[x] = rand();
                key2[x] = rand();
            }
            if ( KEYCMP(key1,key2,keySize) < 0 ) {
                skey = key1;
                ekey = key2;
            } else {
                skey = key2;
                ekey = key1;
            }
        }
        e = gettimeofdayInMilliseconds_force();
        rt.getList ( collnum,
                     skey,
                     ekey,
                     minRecSizes, //min rec sizes
                     &treelist,
                     &numPosRecs,
                     &numNegRecs,
                     true ); //use half keys
        t = gettimeofdayInMilliseconds_force();
        tgetListTook += t - e;
        tgot += treelist.getNumRecs();
        if(!treelist.checkList_r(false, false, RDB_INDEXDB))
            log("tree's list was bad");
        tmpkey1 = treelist.getStartKey();
        tmpkey2 = treelist.getEndKey();
        if(treelist.getNumRecs() > 0) {
            log(LOG_WARN, "db inserted %"INT32" keys", i+1);
            log(LOG_WARN, "db rdbtree found %"INT32" recs (%"INT32" pos, "
                "%"INT32" neg) between "
                "%016"XINT64"%08"XINT32" and "
                "%016"XINT64"%08"XINT32". "
                "took %"INT64" ms, %"INT64" ms so far",
                treelist.getNumRecs(),
                numPosRecs,
                numNegRecs,
                *(int64_t*)(tmpkey1+(sizeof(int32_t))), *(int32_t*)tmpkey1,
                *(int64_t*)(tmpkey2+(sizeof(int32_t))), *(int32_t*)tmpkey2,
                t - e ,tgetListTook );
        }
        e = gettimeofdayInMilliseconds_force();
        rdbb.getList ( collnum,
                       skey,
                       ekey,
                       minRecSizes, //min rec sizes
                       &bucketlist,
                       &numPosRecs,
                       &numNegRecs,
                       true ); //use half keys
        t = gettimeofdayInMilliseconds_force();
        bgetListTook += t - e;
        bgot += bucketlist.getNumRecs();
        if(!bucketlist.checkList_r(false, false, RDB_INDEXDB))
            log("bucket's list was bad");
        tmpkey1 = bucketlist.getStartKey();
        tmpkey2 = bucketlist.getEndKey();
        if(treelist.getNumRecs() > 0) {
            log(LOG_WARN, "db buckets found %"INT32" recs (%"INT32" pos, "
                "%"INT32" neg) between "
                "%016"XINT64"%08"XINT32" and "
                "%016"XINT64"%08"XINT32". "
                "took %"INT64" ms, %"INT64" ms so far.",
                bucketlist.getNumRecs(),
                numPosRecs,
                numNegRecs,
                *(int64_t*)(tmpkey1+(sizeof(int32_t))), *(int32_t*)tmpkey1,
                *(int64_t*)(tmpkey2+(sizeof(int32_t))), *(int32_t*)tmpkey2,
                t - e , bgetListTook);
        }
        //check for consistency (same lockstep walk as above)
        treelist.resetListPtr();
        bucketlist.resetListPtr();
        char tkey [ MAX_KEY_BYTES ];
        char bkey [ MAX_KEY_BYTES ];
        while(1) {
            if(treelist.isExhausted() ) {
                if(bucketlist.isExhausted() ) break;
                bucketlist.getCurrentKey(bkey);
                log(LOG_WARN, "db tree and buckets "
                    "inconsistency"
                    " remaining key in buckets is "
                    "%016"XINT64"%08"XINT32". ",
                    *(int64_t*)(bkey+(sizeof(int32_t))),
                    *(int32_t*)bkey);
                char* xx = NULL; *xx = 0;
            }
            else if (bucketlist.isExhausted() ) {
                treelist.getCurrentKey(tkey);
                log(LOG_WARN, "db tree and buckets "
                    "inconsistency"
                    " remaining key in tree is "
                    "%016"XINT64"%08"XINT32". ",
                    *(int64_t*)(tkey+(sizeof(int32_t))),
                    *(int32_t*)tkey);
                char* xx = NULL; *xx = 0;
            }
            treelist.getCurrentKey(tkey);
            bucketlist.getCurrentKey(bkey);
            if ( KEYCMP(tkey,bkey,keySize) != 0 ) {
                log(LOG_WARN, "db tree and buckets "
                    "inconsistency "
                    "%016"XINT64"%08"XINT32" and "
                    "%016"XINT64"%08"XINT32". ",
                    *(int64_t*)(tkey+(sizeof(int32_t))),
                    *(int32_t*)tkey,
                    *(int64_t*)(bkey+(sizeof(int32_t))),
                    *(int32_t*)bkey);
                char* xx = NULL; *xx = 0;
            }
            treelist.skipCurrentRecord();
            bucketlist.skipCurrentRecord();
        }
        //continue;
        // every ~100th range check also deletes the fetched lists
        if(rand() % 100 != 0) continue;
        log("db: removing %"INT32" nodes from tree. "
            "tree currently has %"INT32" keys",
            treelist.getNumRecs(), rt.getNumUsedNodes ( ));
        e = gettimeofdayInMilliseconds_force();
        rt.deleteList(collnum, &treelist, true);
        t = gettimeofdayInMilliseconds_force();
        tdelListTook += t - e;
        log("db: Now tree has %"INT32" keys", rt.getNumUsedNodes());
        log("db: removing %"INT32" nodes from buckets. "
            "buckets currently has %"INT32" keys",
            bucketlist.getNumRecs(), rdbb.getNumKeys(0));
        e = gettimeofdayInMilliseconds_force();
        rdbb.deleteList(collnum, &bucketlist);
        t = gettimeofdayInMilliseconds_force();
        bdelListTook += t - e;
        log("db: Now buckets has %"INT32" keys", rdbb.getNumKeys(0));
    }
    log("db: List retrieval successful. ");
    log("db: rdbtree Add %"INT64" ms, GetList %"INT64" ms, Delete %"INT64" "
        "for %"INT32" recs ",
        tAddTook, tgetListTook, tdelListTook, tgot);
    log("db: rdbBuckets Add %"INT64" ms, GetList %"INT64" ms, Delete %"INT64" "
        "for %"INT32" recs ",
        bAddTook, bgetListTook, bdelListTook, bgot);
#if 0
    // get the list
    key_t kk;
    kk.n0 = 0LL;
    kk.n1 = 0;
    //kk.n1 = 1234567;
    //int32_t n = rt.getNextNode ( (collnum_t)0, (char *)&kk );
    int32_t n = rt.getFirstNode();
    // loop it
    t = gettimeofdayInMilliseconds_force();
    int32_t count = 0;
    while ( n >= 0 ) {
        n = rt.getNextNode ( n );
        count++;
    }
    e = gettimeofdayInMilliseconds_force();
    log("db: getList for %"INT32" nodes in %"INT64" ms",count,e - t);
#endif
    // suppress saving of the throwaway test structures on exit
    rt.m_needsSave = false;
    rdbb.setNeedsSave(false);
    return true;
}
// time speed of inserts into RdbTree for indexdb
bool treetest ( ) {
int32_t numKeys = 500000;
log("db: speedtest: generating %"INT32" random keys.",numKeys);
// seed randomizer
srand ( (int32_t)gettimeofdayInMilliseconds_force() );
// make list of one million random keys
key_t *k = (key_t *)mmalloc ( sizeof(key_t) * numKeys , "main" );
if ( ! k ) return log("speedtest: malloc failed");
int32_t *r = (int32_t *)k;
int32_t size = 0;
int32_t first = 0;
for ( int32_t i = 0 ; i < numKeys * 3 ; i++ ) {
if ( (i % 3) == 2 && first++ < 50000 ) {
r[i] = 1234567;
size++;
}
else
r[i] = rand();
}
// init the tree
RdbTree rt;
if ( ! rt.set ( 0 , // fixedDataSize ,
numKeys + 1000 , // maxTreeNodes ,
false , // isTreeBalanced ,
numKeys * 28 , // maxTreeMem ,
false , // own data?
"tree-test" ) )
return log("speedTest: tree init failed.");
// add to regular tree
int64_t t = gettimeofdayInMilliseconds_force();
for ( int32_t i = 0 ; i < numKeys ; i++ ) {
//if ( k[i].n1 == 1234567 )
// fprintf(stderr,"i=%"INT32"\n",i);
if ( rt.addNode ( (collnum_t)0 , k[i] , NULL , 0 ) < 0 )
return log("speedTest: rdb tree addNode "
"failed");
}
// print time it took
int64_t e = gettimeofdayInMilliseconds_force();
log("db: added %"INT32" keys to rdb tree in %"INT64" ms",numKeys,e - t);
// sort the list of keys
t = gettimeofdayInMilliseconds_force();
gbsort ( k , numKeys , sizeof(key_t) , keycmp );
// print time it took
e = gettimeofdayInMilliseconds_force();
log("db: sorted %"INT32" in %"INT64" ms",numKeys,e - t);
// get the list
key_t kk;
kk.n0 = 0LL;
kk.n1 = 0;
kk.n1 = 1234567;
int32_t n = rt.getNextNode ( (collnum_t)0, (char *)&kk );
// loop it
t = gettimeofdayInMilliseconds_force();
int32_t count = 0;
while ( n >= 0 && --first >= 0 ) {
n = rt.getNextNode ( n );
count++;
}
e = gettimeofdayInMilliseconds_force();
log("db: getList for %"INT32" nodes in %"INT64" ms",count,e - t);
return true;
}
// time speed of inserts into RdbTree for indexdb
// . benchmark HashTable: time one million random-key inserts, then time
//   removing the same keys
// . returns true on success, or the (false) result of log() on failure
bool hashtest ( ) {
    // load em up
    int32_t numKeys = 1000000;
    log("db: speedtest: generating %"INT32" random keys.",numKeys);
    // seed randomizer
    srand ( (int32_t)gettimeofdayInMilliseconds_force() );
    // make list of one million random keys (note: buffer is never freed;
    // acceptable here since this test utility exits afterwards)
    key_t *k = (key_t *)mmalloc ( sizeof(key_t) * numKeys , "main" );
    if ( ! k ) return log("speedtest: malloc failed");
    // view the key buffer as raw 32-bit words and randomize them; only
    // the first word of each key (r[i] for i < numKeys) is hashed below
    int32_t *r = (int32_t *)k;
    for ( int32_t i = 0 ; i < numKeys * 3 ; i++ ) r[i] = rand();
    // init the table with ~10% headroom over numKeys
    //HashTableT<int32_t,int32_t> ht;
    HashTable ht;
    ht.set ( (int32_t)(1.1 * numKeys) );
    // timed insert pass
    int64_t t = gettimeofdayInMilliseconds_force();
    for ( int32_t i = 0 ; i < numKeys ; i++ )
        if ( ! ht.addKey ( r[i] , 1 ) )
            return log("hashtest: add key failed.");
    // print time it took
    int64_t e = gettimeofdayInMilliseconds_force();
    log("db: added %"INT32" keys in %"INT64" ms",numKeys,e - t);
    // timed delete pass
    t = gettimeofdayInMilliseconds_force();
    for ( int32_t i = 0 ; i < numKeys ; i++ )
        if ( ! ht.removeKey ( r[i] ) )
            // BUGFIX: this error previously said "add key failed."
            return log("hashtest: remove key failed.");
    // print time it took
    e = gettimeofdayInMilliseconds_force();
    log("db: deleted %"INT32" keys in %"INT64" ms",numKeys,e - t);
    return true;
}
// time speed of big write, read and the seeks
// . disk throughput test: sequential write then sequential read of
//   fileSize bytes in 30MB chunks, printing MB/s as it goes
// . if a "speedtest" or "indexdb0001.dat" file already exists in testdir
//   it skips the write phase and only reads (via goto doreadtest)
// . runs with threads disabled so all I/O is synchronous
bool thrutest ( char *testdir , int64_t fileSize ) {
    // always block
    g_threads.disableThreads();
    // a read/write buffer of 30M
    int32_t bufSize = 30000000; // 30M
    //int64_t fileSize = 4000000000LL; // 4G
    // the file uses gb's malloc-override trap macro; temporarily restore
    // the real malloc for this raw allocation
#undef malloc
    char *buf = (char *) malloc ( bufSize );
#define malloc coreme
    if ( ! buf ) return log("speedtestdisk: %s",strerror(errno));
    // store stuff in there (deterministic fill pattern)
    for ( int32_t i = 0 ; i < bufSize ; i++ ) buf[i] = (char)i;
    BigFile f;
    // try a read test from speedtest*.dat*
    f.set (testdir,"speedtest");
    if ( f.doesExist() ) {
        if ( ! f.open ( O_RDONLY ) )
            return log("speedtestdisk: cannot open %s/%s",
                       testdir,"speedtest");
        // ensure big enough
        if ( f.getFileSize() < fileSize )
            return log("speedtestdisk: File %s/%s is too small "
                       "for requested read size.",
                       testdir,"speedtest");
        log("db: reading from speedtest0001.dat");
        f.setBlocking();
        goto doreadtest;
    }
    // try a read test from indexdb*.dat*
    f.set (testdir,"indexdb0001.dat");
    if ( f.doesExist() ) {
        if ( ! f.open ( O_RDONLY ) )
            return log("speedtestdisk: cannot open %s/%s",
                       testdir,"indexdb0001.dat");
        log("db: reading from indexdb0001.dat");
        f.setBlocking();
        goto doreadtest;
    }
    // try a write test to speedtest*.dat* (at this point "speedtest"
    // cannot exist, or we would have jumped to doreadtest above)
    f.set (testdir,"speedtest");
    if ( ! f.doesExist() ) {
        if ( ! f.open ( O_RDWR | O_CREAT | O_SYNC ) )
            return log("speedtestdisk: cannot open %s/%s",
                       testdir,"speedtest");
        log("db: writing to speedtest0001.dat");
        f.setBlocking();
    }
    // write 2 gigs to the file, 1M at a time
    {
        int64_t t1 = gettimeofdayInMilliseconds_force();
        int32_t numLoops = fileSize / bufSize;
        int64_t off = 0LL;
        int32_t next = 0;
        for ( int32_t i = 0 ; i < numLoops ; i++ ) {
            f.write ( buf , bufSize , off );
            sync(); // f.flush ( );
            off += bufSize ;
            next += bufSize;
            //if ( i >= numLoops || next < 100000000 ) continue;
            // report progress roughly every 100MB (and on the last loop)
            if ( i + 1 < numLoops && next < 100000000 ) continue;
            next = 0;
            // print speed every X seconds
            int64_t t2 = gettimeofdayInMilliseconds_force();
            float mBps = (float)off / (float)(t2-t1) / 1000.0 ;
            fprintf(stderr,"wrote %"INT64" bytes in %"INT64" ms (%.1f MB/s)\n",
                    off,t2-t1,mBps);
        }
    }
 doreadtest:
    // sequential read of the same file, same chunking and reporting
    {
        int64_t t1 = gettimeofdayInMilliseconds_force();
        int32_t numLoops = fileSize / bufSize;
        int64_t off = 0LL;
        int32_t next = 0;
        for ( int32_t i = 0 ; i < numLoops ; i++ ) {
            f.read ( buf , bufSize , off );
            //sync(); // f.flush ( );
            off += bufSize ;
            next += bufSize;
            //if ( i >= numLoops || next < 100000000 ) continue;
            if ( i + 1 < numLoops && next < 100000000 ) continue;
            next = 0;
            // print speed every X seconds
            int64_t t2 = gettimeofdayInMilliseconds_force();
            float mBps = (float)off / (float)(t2-t1) / 1000.0 ;
            fprintf(stderr,"read %"INT64" bytes in %"INT64" ms (%.1f MB/s)\n",
                    off,t2-t1,mBps);
        }
    }
    return true;
}
//
// SEEK TEST
//
// NOTE(review): <sys/time.h> and <sys/types.h> are each included twice in
// this section; harmless (header guards) but could be deduplicated.
#include <sys/time.h> // gettimeofday()
#include <sys/time.h>
#include <sys/resource.h>
//#include <pthread.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
//static pthread_attr_t s_attr;
//static int startUp ( void *state ) ;
// per-thread entry point for the seek test (defined below)
static void *startUp ( void *state , ThreadEntry *t ) ;
// shared state for seektest()/startUp() (written from multiple threads
// without synchronization — acceptable only because this is a benchmark)
static int32_t s_count = 0;       // total reads completed across all threads
static int64_t s_filesize = 0;    // size of the file being seek-tested
//static int32_t s_lock = 1;
static int32_t s_launched = 0;    // number of threads that have started
//static int s_fd1 ; // , s_fd2;
static BigFile s_f;               // the file every thread reads from
static int32_t s_numThreads = 0;  // how many reader threads to launch
static int64_t s_maxReadSize = 1; // bytes per random read
static int64_t s_startTime = 0;   // test start time in ms, for seeks/sec
//#define MAX_READ_SIZE (2000000)
#include <sys/types.h>
#include <sys/wait.h>
// . disk seek benchmark: launches numThreads generic threads, each doing
//   100000 random reads of maxReadSize bytes from a test file (see
//   startUp() below), printing seeks/sec
// . reads from "filename" if given, else "speedtest", else
//   "indexdb0001.dat" in testdir; never returns normally once threads
//   launch (sleeps forever)
void seektest ( char *testdir, int32_t numThreads, int32_t maxReadSize ,
                char *filename ) {
    g_loop.init();
    g_threads.init();
    // publish parameters into the file-scope statics shared with startUp()
    s_numThreads = numThreads;
    s_maxReadSize = maxReadSize;
    if ( s_maxReadSize <= 0 ) s_maxReadSize = 1;
    //if ( s_maxReadSize > MAX_READ_SIZE ) s_maxReadSize = MAX_READ_SIZE;
    log(LOG_INIT,"admin: dir=%s threads=%"INT32" maxReadSize=%"INT32" file=%s\n",
        testdir,(int32_t)s_numThreads, (int32_t)s_maxReadSize , filename );
    // maybe its a filename in the cwd
    if ( filename ) {
        s_f.set(testdir,filename);
        if ( s_f.doesExist() ) {
            log(LOG_INIT,"admin: reading from %s.",
                s_f.getFilename());
            goto skip;
        }
        log("admin: %s does not exists. Use ./gb thrutest ... "
            "to create speedtest* files.",
            s_f.getFilename());
        return;
    }
    // check other defaults
    s_f.set ( testdir , "speedtest" );
    if ( s_f.doesExist() ) {
        log(LOG_INIT,"admin: reading from speedtest*.dat.");
        goto skip;
    }
    // try a read test from indexdb*.dat*
    s_f.set (testdir,"indexdb0001.dat");
    if ( s_f.doesExist() ) {
        log(LOG_INIT,"admin: reading from indexdb0001.dat.");
        goto skip;
    }
    log("admin: Neither speedtest* or indexdb0001.dat* "
        "exist. Use ./gb thrutest ... to create speedtest* files.");
    return;
 skip:
    s_f.open ( O_RDONLY );
    s_filesize = s_f.getFileSize();
    log ( LOG_INIT, "admin: file size = %"INT64".",s_filesize);
    // always block
    //g_threads.disableThreads();
    // seed rand
    srand(time(NULL));
    //fprintf(stderr,"disabled until Threads class is used\n");
    //return;
    //}
    // open 2 file descriptors
    //s_fd1 = open ( "/tmp/glibc-2.2.2.tar" , O_RDONLY );
    //s_fd1 = open ( filename , O_RDONLY );
    //s_fd2 = open ( "/tmp/glibc-2.2.5.tar" , O_RDONLY );
    // . set up the thread attribute we use for all threads
    // . fill up with the default values first
    //if ( pthread_attr_init( &s_attr ) )
    //    fprintf (stderr,"Threads::init: pthread_attr_init: error\n");
    // then customize
    //if ( pthread_attr_setdetachstate(&s_attr,PTHREAD_CREATE_DETACHED) )
    //    fprintf ( stderr,"Threads::init: pthread_attr_setdeatchstate:\n");
    //if ( setpriority ( PRIO_PROCESS, getpid() , 0 ) < 0 ) {
    //    fprintf(stderr,"Threads:: setpriority: failed\n");
    //    exit(-1);
    //}
    //s_lock = 1;
    //pthread_t tid1 ; //, tid2;
    // set time
    s_startTime = gettimeofdayInMilliseconds_force();
    // NOTE(review): buf is allocated but never used or freed below —
    // leftover from the clone()-based launcher that is commented out
    int32_t stksize = 1000000 ;
    int32_t bufsize = stksize * s_numThreads ;
    // temporarily restore the real malloc past gb's trap macro
#undef malloc
    char *buf = (char *)malloc ( bufsize );
#define malloc coreme
    if ( ! buf ) { log("test: malloc of %"INT32" failed.",bufsize); return; }
    g_conf.m_useThreads = true;
    //int pid;
    // launch one generic thread per requested reader; each runs startUp()
    for ( int32_t i = 0 ; i < s_numThreads ; i++ ) {
        //int err = pthread_create ( &tid1,&s_attr,startUp,(void *)i) ;
        if (!g_threads.call(GENERIC_THREAD,0,
                            (void *)(PTRTYPE)i,NULL,startUp)){
            log("test: Thread launch failed."); return; }
        //pid = clone ( startUp , buf + stksize * i ,
        //      CLONE_FS | CLONE_FILES | CLONE_VM | //CLONE_SIGHAND |
        //      SIGCHLD ,
        //      (void *)NULL );
        //if ( pid == (pid_t)-1) {log("test: error cloning"); return;}
        //log(LOG_INIT,"test:launched i=%"INT32" pid=%i",i,(int)pid);
        //log(LOG_INIT,"test:launched i=%"INT32"",i,(int)pid);
        log(LOG_INIT,"test: Launched thread #%"INT32".",i);
        //if ( err != 0 ) return ;// -1;
    }
    // unset lock
    //s_lock = 0;
    // sleep forever while the worker threads run; gb traps sleep() with a
    // crash macro, so restore the real one around this loop
#undef sleep
    while ( 1 == 1 ) sleep(1000);
#define sleep(a) { char *xx=NULL;*xx=0; }
    //int status;
    //for ( int32_t i = 0 ; i < s_numThreads ; i++ ) waitpid(pid,&status,0);
}
//int startUp ( void *state ) {
// . per-thread body of the seek test launched by seektest()
// . state carries the thread's integer id; performs 100000 random
//   s_maxReadSize-byte reads at random offsets of s_f, updating the
//   shared s_count counter and printing running seeks/sec to stderr
void *startUp ( void *state , ThreadEntry *t ) {
    int32_t id = (int32_t) (PTRTYPE)state;
    // . what this lwp's priority be?
    // . can range from -20 to +20
    // . the lower p, the more cpu time it gets
    // . this is really the niceness, not the priority
    //int p = 0;
    //if ( id == 1 ) p = 0;
    //else p = 30;
    // . set this process's priority
    // . setpriority() is only used for SCHED_OTHER threads
    //if ( setpriority ( PRIO_PROCESS, getpid() , p ) < 0 ) {
    //    fprintf(stderr,"Threads::startUp: setpriority: failed\n");
    //    exit(-1);
    //}
    // read buf (restore the real malloc past gb's trap macro)
    //char buf [ MAX_READ_SIZE ];
#undef malloc
    char *buf = (char *) malloc ( s_maxReadSize );
#define malloc coreme
    if ( ! buf ) {
        fprintf(stderr,"MALLOC FAILED in thread\n");
        return 0; // NULL;
    }
    // we got ourselves
    s_launched++;
    // msg
    fprintf(stderr,"id=%"INT32" launched. Performing 100000 reads.\n",id);
    // wait for lock to be unleashed
    //while ( s_launched != s_numThreads ) usleep(10);
    // now do a stupid loop
    //int32_t j, off , size;
    int64_t off , size;
    for ( int32_t i = 0 ; i < 100000 ; i++ ) {
        // build a 64-bit random offset from two 32-bit rand() calls,
        // kept within [0, filesize - maxReadSize)
        uint64_t r = rand();
        r <<= 32 ;
        r |= rand();
        off = r % (s_filesize - s_maxReadSize );
        // rand size
        //size = rand() % s_maxReadSize;
        size = s_maxReadSize;
        //if ( size < 32*1024 ) size = 32*1024;
        // time it
        int64_t start = gettimeofdayInMilliseconds_force();
        //fprintf(stderr,"%"INT32") i=%"INT32" start\n",id,i );
        //pread ( s_fd1 , buf , size , off );
        s_f.read ( buf , size , off );
        //fprintf(stderr,"%"INT32") i=%"INT32" done\n",id,i );
        int64_t now = gettimeofdayInMilliseconds_force();
        // yield briefly; gb traps usleep() with a crash macro, so restore
        // the real one just for this call
#undef usleep
        usleep(0);
#define usleep(a) { char *xx=NULL;*xx=0; }
        s_count++;
        // running seeks/sec across all threads since the test started
        float sps = (float)((float)s_count * 1000.0) /
            (float)(now - s_startTime);
        fprintf(stderr,"count=%"INT32" off=%012"INT64" size=%"INT32" time=%"INT32"ms "
                "(%.2f seeks/sec)\n",
                (int32_t)s_count,
                (int64_t)off,
                (int32_t)size,
                (int32_t)(now - start) ,
                sps );
    }
    // dummy return
    return 0; //NULL;
}
// . dump every sectiondb record for collection "coll" to stdout
// . reads numFiles files starting at startFileNum (plus the in-memory
//   tree when includeTree is set), one ~1MB list at a time, verifying
//   ascending key order as it goes and crashing on corruption
// . runs with threads disabled so msg5.getList() completes synchronously
void dumpSectiondb(char *coll,int32_t startFileNum,int32_t numFiles,
                   bool includeTree) {
    //g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
    g_sectiondb.init ();
    //g_collectiondb.init(true);
    g_sectiondb.getRdb()->addRdbBase1(coll );
    // scan the full 128-bit key space
    key128_t startKey ;
    key128_t endKey ;
    startKey.setMin();
    endKey.setMax();
    // turn off threads
    g_threads.disableThreads();
    // get a meg at a time
    int32_t minRecSizes = 1024*1024;
    Msg5 msg5;
    RdbList list;
    bool firstKey = true;
    CollectionRec *cr = g_collectiondb.getRec(coll);
    // guard against an unknown collection (would crash on cr->m_collnum)
    if ( ! cr ) {
        log("db: no such collection: %s",coll);
        return;
    }
 loop:
    // use msg5 to get the list, should ALWAYS block since no threads
    if ( ! msg5.getList ( RDB_SECTIONDB ,
                          cr->m_collnum ,
                          &list ,
                          (char *)&startKey ,
                          (char *)&endKey ,
                          minRecSizes ,
                          includeTree ,
                          false , // add to cache?
                          0 , // max cache age
                          startFileNum ,
                          numFiles ,
                          NULL , // state
                          NULL , // callback
                          0 , // niceness
                          false )){// err correction?
        log(LOG_LOGIC,"db: getList did not block.");
        return;
    }
    // all done if empty
    if ( list.isEmpty() ) return;
    key128_t lastk;
    // loop over entries in list
    for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
        char *rec = list.getCurrentRec();
        key128_t *k = (key128_t *)rec;
        char *data = list.getCurrentData();
        int32_t size = list.getCurrentDataSize();
        // is it a delete? (low bit of n0 clear means negative key)
        if ( (k->n0 & 0x01) == 0 ) {
            printf("k.n1=%016"XINT64" k.n0=%016"XINT64" (delete)\n",
                   k->n1 , k->n0 | 0x01 ); // fix it!
            continue;
        }
        // every positive sectiondb rec carries exactly one SectionVote
        if ( size != sizeof(SectionVote) ) { char *xx=NULL;*xx=0; }
        // sanity check: keys must arrive in strictly ascending order
        if ( ! firstKey ) {
            if ( k->n1 < lastk.n1 ) { char *xx=NULL;*xx=0; }
            if ( k->n1 == lastk.n1 && k->n0 < lastk.n0 ) {
                char *xx=NULL;*xx=0; }
        }
        // no longer a first key
        firstKey = false;
        // copy it
        gbmemcpy ( &lastk , k , sizeof(key128_t) );
        int32_t shardNum = getShardNum (RDB_SECTIONDB,k);
        //int32_t groupNum = g_hostdb.getGroupNum ( gid );
        // point to the data
        char *p = data;
        char *pend = data + size;
        // breach check
        if ( p >= pend ) {
            printf("corrupt sectiondb rec k.n0=%"UINT64"",k->n0);
            continue;
        }
        // parse it up; the key packs termid/date/score/docid datedb-style
        SectionVote *sv = (SectionVote *)data;
        int64_t termId = g_datedb.getTermId ( k );
        // score is the section type
        unsigned char score2 = g_datedb.getScore(k);
        char *stype = "unknown";
        if ( score2 == SV_CLOCK          ) stype = "clock         ";
        if ( score2 == SV_EURDATEFMT     ) stype = "eurdatefmt    ";
        if ( score2 == SV_EVENT          ) stype = "event         ";
        if ( score2 == SV_ADDRESS        ) stype = "address       ";
        if ( score2 == SV_TAGPAIRHASH    ) stype = "tagpairhash   ";
        if ( score2 == SV_TAGCONTENTHASH ) stype = "tagcontenthash";
        if ( score2 == SV_FUTURE_DATE    ) stype = "futuredate    ";
        if ( score2 == SV_PAST_DATE      ) stype = "pastdate      ";
        if ( score2 == SV_CURRENT_DATE   ) stype = "currentdate   ";
        if ( score2 == SV_SITE_VOTER     ) stype = "sitevoter     ";
        if ( score2 == SV_TURKTAGHASH    ) stype = "turktaghash   ";
        int64_t d = g_datedb.getDocId(k);
        int32_t date = g_datedb.getDate(k);
        // dump it
        printf("k=%s "
               "sh48=%"XINT64" " // sitehash is the termid
               "date=%010"UINT32" "
               "%s (%"UINT32") "
               "d=%012"UINT64" "
               "score=%f samples=%f "
               "shardnum=%"INT32""
               "\n",
               //k->n1,
               //k->n0,
               KEYSTR(k,sizeof(key128_t)),
               termId,
               date,
               stype,(uint32_t)score2,
               d,
               sv->m_score,
               sv->m_numSampled,
               shardNum);
    }
    // advance just past the last key returned and fetch the next chunk
    startKey = *(key128_t *)list.getLastKey();
    startKey += (uint32_t) 1;
    // watch out for wrap around
    if ( startKey < *(key128_t *)list.getLastKey() ){ printf("\n"); return;}
    goto loop;
}
// Dump revdb records for collection "coll" to stdout, one line per record.
// Reads numFiles files starting at file #startFileNum, plus the in-memory
// tree when includeTree is true. Used from the command line for debugging.
void dumpRevdb(char *coll,int32_t startFileNum,int32_t numFiles, bool includeTree) {
	g_revdb.init ();
	g_revdb.getRdb()->addRdbBase1(coll );
	// BUGFIX: bail out if the collection does not exist rather than
	// dereferencing a NULL CollectionRec below
	CollectionRec *cr = g_collectiondb.getRec(coll);
	if ( ! cr ) {
		log("db: dumprevdb: no such collection \"%s\"",coll);
		return;
	}
	key_t startKey ;
	key_t endKey ;
	startKey.setMin();
	endKey.setMax();
	// turn off threads so msg5.getList() below always blocks
	g_threads.disableThreads();
	// read a meg at a time
	int32_t minRecSizes = 1024*1024;
	Msg5 msg5;
	RdbList list;
	bool firstKey = true;
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_REVDB ,
			      cr->m_collnum ,
			      &list ,
			      (char *)&startKey ,
			      (char *)&endKey ,
			      minRecSizes ,
			      includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      startFileNum ,
			      numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false )){// err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	key_t lastk;
	// loop over entries in list
	for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
		char *rec = list.getCurrentRec();
		key_t *k = (key_t *)rec;
		char *data = list.getCurrentData();
		int32_t size = list.getCurrentDataSize();
		// get docid from key
		int64_t d = g_revdb.getDocId(k);
		// low bit clear means a delete (negative) key
		if ( (k->n0 & 0x01) == 0 ) {
			printf("k.n1=%08"XINT32" k.n0=%016"XINT64" d=%"UINT64" (delete)\n",
			       k->n1 , k->n0 | 0x01 , d ); // fix it!
			continue;
		}
		// sanity check: keys must be in ascending order
		if ( ! firstKey ) {
			if ( k->n1 < lastk.n1 ) { char *xx=NULL;*xx=0; }
			if ( k->n1 == lastk.n1 && k->n0 < lastk.n0 ) {
				char *xx=NULL;*xx=0; }
		}
		// no longer a first key
		firstKey = false;
		// remember this key for the ordering check above
		gbmemcpy ( &lastk , k , sizeof(key_t) );
		// point to the data
		char *p = data;
		char *pend = data + size;
		// breach check
		if ( p > pend ) {
			printf("corrupt revdb rec k.n1=0x%08"XINT32" d=%"UINT64"\n",
			       k->n1,d);
			continue;
		}
		// dump it
		printf("k.n1=%08"XINT32" k.n0=%016"XINT64" ds=%06"INT32" d=%"UINT64"\n",
		       k->n1,k->n0,size,d);
	}
	// resume the scan right after the last key we got
	startKey = *(key_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around
	if ( startKey < *(key_t *)list.getLastKey() ){ printf("\n"); return;}
	goto loop;
}
// Dump tagdb (or catdb, per rdbId) records for collection "coll" to stdout.
// "req" selects the output mode:
//   'A' - print each tag as an http add request
//   'z' - emit a sitelist: "<numinlinks> <site>" for sites with > 2 inlinks
//   otherwise - print each tag normally
// "c" is unused (kept for interface compatibility with the dump dispatcher).
// If siteArg is given, restrict the dump to that site's key range.
void dumpTagdb (char *coll,int32_t startFileNum,int32_t numFiles,
		bool includeTree,
		int32_t c , char req, int32_t rdbId ,
		char *siteArg ) {
	g_tagdb.init ();
	if ( rdbId == RDB_TAGDB ) g_tagdb.getRdb()->addRdbBase1(coll );
	if ( rdbId == RDB_CATDB ) g_catdb.init();
	key128_t startKey ;
	key128_t endKey ;
	startKey.setMin();
	endKey.setMax();
	if ( siteArg ) {
		startKey = g_tagdb.makeStartKey ( siteArg );
		endKey   = g_tagdb.makeEndKey   ( siteArg );
		log("gb: using site %s for start key",siteArg );
	}
	// turn off threads so msg5.getList() below always blocks
	g_threads.disableThreads();
	// get a meg at a time
	int32_t minRecSizes = 1024*1024;
	Msg5 msg5;
	RdbList list;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// BUGFIX: bail out if the collection does not exist rather than
	// dereferencing a NULL CollectionRec below
	if ( ! cr ) {
		log("db: dumptagdb: no such collection \"%s\"",coll);
		return;
	}
	int64_t hostHash = -1;
	int64_t lastHostHash = -2;
	char *site = NULL;
	// holds the last "site" tag's value for the 'z' sitelist mode
	char sbuf[1024*2];
	sbuf[0] = '\0';
	int32_t siteNumInlinks = -1;
	int32_t typeSite = hash64Lower_a("site",4);
	int32_t typeInlinks = hash64Lower_a("sitenuminlinks",14);
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( rdbId, //RDB_TAGDB ,
			      cr->m_collnum ,
			      &list ,
			      (char *)&startKey ,
			      (char *)&endKey ,
			      minRecSizes ,
			      includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      startFileNum ,
			      numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false )){// err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// loop over entries in list
	for(list.resetListPtr();!list.isExhausted(); list.skipCurrentRecord()){
		char *rec = list.getCurrentRec();
		key128_t k;
		list.getCurrentKey ( &k );
		char *data = list.getCurrentData();
		int32_t size = list.getCurrentDataSize();
		// low bit clear means a delete (negative) key
		if ( (k.n0 & 0x01) == 0 ) {
			if ( req == 'z' ) continue;
			printf("k.n1=%016"XINT64" "
			       "k.n0=%016"XINT64" (delete)\n",
			       k.n1 , k.n0 | 0x01 ); // fix it!
			continue;
		}
		// point to the data
		char *p = data;
		char *pend = data + size;
		// breach check
		if ( p >= pend ) {
			printf("corrupt tagdb rec k.n0=%"UINT64"",k.n0);
			continue;
		}
		// catdb?
		if ( rdbId == RDB_CATDB ) {
			// for debug!
			CatRec crec;
			crec.set ( NULL,
				   data ,
				   size ,
				   false);
			fprintf(stdout,
				"key=%s caturl=%s #catids=%"INT32" "
				"version=%"INT32"\n"
				,KEYSTR(&k,12)
				,crec.m_url
				,(int32_t)crec.m_numCatids
				,(int32_t)crec.m_version
				);
			continue;
		}
		// parse it up
		Tag *tag = (Tag *)rec;
		// print the version and site
		char tmpBuf[1024];
		SafeBuf sb(tmpBuf, 1024);
		// tags for the same host share the high key bits; detect
		// when consecutive tags belong to the same host
		bool match = false;
		hostHash = tag->m_key.n1;
		if ( hostHash == lastHostHash ) {
			match = true;
		}
		else {
			site = NULL;
			siteNumInlinks = -1;
		}
		lastHostHash = hostHash;
		// making sitelist.txt?
		if ( tag->m_type == typeSite && req == 'z' ) {
			site = tag->getTagData();
			// make it null if too many .'s
			if ( site ) {
				char *p = site;
				int count = 0;
				int alpha = 0;
				int colons = 0;
				// foo.bar.baz.com is ok
				for ( ; *p ; p++ ) {
					if ( *p == '.' ) count++;
					if ( *p == ':' ) colons++;
					if ( is_alpha_a(*p) || *p=='-' )
						alpha++;
				}
				if ( count >= 4 )
					site = NULL;
				if ( colons > 1 )
					site = NULL;
				// no ip addresses allowed, need an alpha char
				if ( alpha == 0 )
					site = NULL;
			}
			int slen = 0;
			if ( site ) slen = gbstrlen(site);
			// BUGFIX: empty tag data would make the checks below
			// read before the start of the buffer
			if ( site && slen == 0 )
				site = NULL;
			// ends in :?
			if ( site && site[slen-1] == ':' )
				site = NULL;
			// port bug (BUGFIX: require slen >= 2 before
			// touching site[slen-2])
			if ( site && slen >= 2 &&
			     site[slen-2] == ':' && site[slen-1]=='/')
				site = NULL;
			// remove heavy spammers to save space
			if ( site && strstr(site,"daily-camshow-report") )
				site = NULL;
			if ( site && strstr(site,".livejasminhd.") )
				site = NULL;
			if ( site && strstr(site,".pornlivenews.") )
				site = NULL;
			if ( site && strstr(site,".isapornblog.") )
				site = NULL;
			if ( site && strstr(site,".teen-model-24.") )
				site = NULL;
			if ( site && ! is_ascii2_a ( site, gbstrlen(site) ) ) {
				site = NULL;
				continue;
			}
			if ( match && siteNumInlinks>=0) {
				// if we ask for 1 or 2 we end up with 100M
				// entries, but with 3+ we get 27M
				if ( siteNumInlinks > 2 && site )
					printf("%i %s\n",siteNumInlinks,site);
				siteNumInlinks = -1;
				site = NULL;
			}
			// save it (BUGFIX: bounded copy; tag data is
			// untrusted and could overflow sbuf with strcpy)
			if ( site )
				snprintf ( sbuf , sizeof(sbuf) , "%s" , site );
			continue;
		}
		if ( tag->m_type == typeInlinks && req == 'z' ) {
			siteNumInlinks = atoi(tag->getTagData());
			if ( match && site ) {
				// if we ask for 1 or 2 we end up with 100M
				// entries, but with 3+ we get 27M
				if ( siteNumInlinks > 2 )
					printf("%i %s\n",siteNumInlinks,sbuf);
				siteNumInlinks = -1;
				site = NULL;
			}
			continue;
		}
		if ( req == 'z' )
			continue;
		// print as an add request or just normal
		if ( req == 'A' ) tag->printToBufAsAddRequest ( &sb );
		else              tag->printToBuf ( &sb );
		// dump it
		printf("%s\n",sb.getBufStart());
	}
	// resume the scan right after the last key we got
	startKey = *(key128_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around
	if ( startKey < *(key128_t *)list.getLastKey() ){
		printf("\n"); return;}
	goto loop;
}
// Benchmark parsing speed (Xml, Words, Pos, Bits, Dates, Sections, Phrases,
// Matches and summary generation) on the title rec of "docId" in collection
// "coll", using "query" for the match/summary tests. Each stage is run 100
// times and the average per-iteration time is logged. Returns false on error.
bool parseTest ( char *coll , int64_t docId , char *query ) {
	g_conf.m_maxMem = 2000000000LL; // 2G
	g_titledb.init ();
	g_titledb.getRdb()->addRdbBase1 ( coll );
	log(LOG_INIT, "build: Testing parse speed of html docId %"INT64".",docId);
	// get a title rec
	g_threads.disableThreads();
	RdbList tlist;
	key_t startKey = g_titledb.makeFirstKey ( docId );
	key_t endKey   = g_titledb.makeLastKey  ( docId );
	// a niceness of 0 tells it to block until it gets results!!
	Msg5 msg5;
	Msg5 msg5b;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// BUGFIX: bail out if the collection does not exist rather than
	// dereferencing a NULL CollectionRec below
	if ( ! cr )
		return log("build: no such collection \"%s\"",coll);
	if ( ! msg5.getList ( RDB_TITLEDB    ,
			      cr->m_collnum ,
			      &tlist        ,
			      startKey      ,
			      endKey        , // should be maxed!
			      9999999       , // min rec sizes
			      true          , // include tree?
			      false         , // includeCache
			      false         , // addToCache
			      0             , // startFileNum
			      -1            , // m_numFiles
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false         , // do error correction?
			      NULL          , // cache key ptr
			      0             , // retry num
			      -1            , // maxRetries
			      true          , // compensate for merge
			      -1LL          , // sync point
			      &msg5b        ))
		return log(LOG_LOGIC,"build: getList did not block.");
	// get the title rec
	if ( tlist.isEmpty() )
		return log("build: speedtestxml: "
			   "docId %"INT64" not found.",
			   docId );
	if (!ucInit(g_hostdb.m_dir, true))
		return log("Unicode initialization failed!");
	// get raw rec from list
	char *rec = tlist.getCurrentRec();
	int32_t listSize = tlist.getListSize ();
	// set the XmlDoc from the title rec we got
	XmlDoc xd;
	if ( ! xd.set2 ( rec , listSize , coll , NULL , 0 ) )
		return log("build: speedtestxml: Error setting "
			   "xml doc." );
	log("build: Doc url is %s",xd.ptr_firstUrl);
	log("build: Doc is %"INT32" bytes long.",xd.size_utf8Content-1);
	log("build: Doc charset is %s",get_charset_str(xd.m_charset));
	// time the summary/title generation code
	log("build: Using query %s",query);
	summaryTest1 ( rec , listSize , coll , docId , query );
	// speed test: title rec deserialization
	int64_t t = gettimeofdayInMilliseconds_force();
	for ( int32_t k = 0 ; k < 100 ; k++ )
		xd.set2 (rec, listSize, coll , NULL , 0 );
	int64_t e = gettimeofdayInMilliseconds_force();
	logf(LOG_DEBUG,"build: Took %.3f ms to set title rec.",
	     (float)(e-t)/100.0);
	// speed test: malloc/free baseline to calibrate the numbers above
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t k = 0 ; k < 100 ; k++ ) {
		char *mm = (char *)mmalloc ( 300*1024 , "ztest");
		mfree ( mm , 300*1024 ,"ztest");
	}
	e = gettimeofdayInMilliseconds_force();
	logf(LOG_DEBUG,"build: Took %.3f ms to do mallocs.",
	     (float)(e-t)/100.0);
	// get content
	char *content = xd.ptr_utf8Content;
	int32_t contentLen = xd.size_utf8Content-1;
	// loop parse: Xml::set() with parent links
	Xml xml;
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! xml.set ( content , contentLen ,
				 false, 0, false, xd.m_version ,
				 true , // setparents
				 0    , // niceness
				 CT_HTML ) )
			return log("build: speedtestxml: xml set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Xml::set() took %.3f ms to parse docId %"INT64".",
	    (double)(e - t)/100.0,docId);
	double bpms = contentLen/((double)(e-t)/100.0);
	log("build: %.3f bytes/msec", bpms);
	// get per char and per byte speeds
	xml.reset();
	// loop parse: Xml::set() without parent links
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! xml.set ( content , contentLen ,
				 false, 0, false, xd.m_version , false ,
				 0 , CT_HTML ) )
			return log("build: xml(setparents=false): %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Xml::set(setparents=false) took %.3f ms to "
	    "parse docId %"INT64".", (double)(e - t)/100.0,docId);
	// BUGFIX: on failure this used to "return 1" which is TRUE in a
	// bool function and wrongly reported success to the caller
	if (!ucInit(g_hostdb.m_dir, true))
		return log("Unicode initialization failed!");
	// Words::set() benchmarks in several configurations
	Words words;
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! words.set ( &xml , true , true ) )
			return log("build: speedtestxml: words set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Words::set(xml,computeIds=true) took %.3f ms for %"INT32" words"
	    " (precount=%"INT32") for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,words.m_preCount,docId);
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! words.set2 ( &xml , true , true ) )
			return log("build: speedtestxml: words set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Words::set2(xml,computeIds=true) took %.3f ms for %"INT32" "
	    "words (precount=%"INT32") for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,words.m_preCount,docId);
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! words.set ( &xml , true , false ) )
			return log("build: speedtestxml: words set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Words::set(xml,computeIds=false) "
	    "took %.3f ms for %"INT32" words"
	    " (precount=%"INT32") for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,words.m_preCount,docId);
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! words.set ( content , TITLEREC_CURRENT_VERSION,
				   true, 0 ) )
			return log("build: speedtestxml: words set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Words::set(content,computeIds=true) "
	    "took %.3f ms for %"INT32" words "
	    "for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,docId);
	// Pos::set() benchmark
	Pos pos;
	// computeWordIds from xml
	words.set ( &xml , true , true ) ;
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! pos.set ( &words , NULL ) )
			return log("build: speedtestxml: pos set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Pos::set() "
	    "took %.3f ms for %"INT32" words "
	    "for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,docId);
	// Bits::setForSummary() benchmark
	Bits bits;
	// computeWordIds from xml
	words.set ( &xml , true , true ) ;
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! bits.setForSummary ( &words ) )
			return log("build: speedtestxml: Bits set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Bits::setForSummary() "
	    "took %.3f ms for %"INT32" words "
	    "for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,docId);
	// Sections::set() benchmark needs dates, bits and phrases
	Dates dates;
	if (!dates.parseDates(&words,DF_FROM_BODY,NULL,NULL,0,NULL,CT_HTML) )
		return log("build: speedtestxml: parsedates: %s",
			   mstrerror(g_errno));
	Sections sections;
	// computeWordIds from xml
	words.set ( &xml , true , true ) ;
	bits.set ( &words ,TITLEREC_CURRENT_VERSION, 0);
	Phrases phrases;
	phrases.set ( &words,&bits,true,true,TITLEREC_CURRENT_VERSION,0);
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		// do not supply xd so it will be set from scratch
		if ( ! sections.set (&words,&phrases,&bits,NULL,0,0,
				     NULL,0,NULL,NULL,
				     0, // contenttype
				     &dates ,
				     NULL, // sectionsdata
				     false, // sectionsdatavalid
				     NULL, // sectionsdata2
				     NULL, // buf
				     0)) // bufSize
			return log("build: speedtestxml: sections set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Scores::set() "
	    "took %.3f ms for %"INT32" words "
	    "for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,docId);
	// Phrases::set() benchmark
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! phrases.set ( &words ,
				     &bits ,
				     true  , // use stop words
				     false , // use stems
				     TITLEREC_CURRENT_VERSION ,
				     0 ) ) // niceness
			return log("build: speedtestxml: Phrases set: %s",
				   mstrerror(g_errno));
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Phrases::set() "
	    "took %.3f ms for %"INT32" words "
	    "for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,docId);
	// Xml::getText() benchmark. BUGFIX: check the allocation before
	// using it, and free it on every path below (it used to leak).
	char *buf = (char *)mmalloc(contentLen*2+1,"main");
	if ( ! buf )
		return log("build: speedtestxml: malloc: %s",
			   mstrerror(g_errno));
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ )
		if ( ! xml.getText ( buf , contentLen*2+1 ,
				     0       , // startNode
				     9999999 , // endNode (the last one!)
				     false   , // includeTags?
				     true    , // visible text only?
				     true    , // convert html entities?
				     true    , // filter spaces?
				     false )) { // use <stop index> tag?
			mfree ( buf , contentLen*2+1 , "main" );
			return log("build: speedtestxml: getText: %s",
				   mstrerror(g_errno));
		}
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Xml::getText(computeIds=false) took %.3f ms for docId "
	    "%"INT64".",(double)(e - t)/100.0,docId);
	// getText() + Words::set() combined benchmark
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ ) {
		int32_t bufLen = xml.getText ( buf , contentLen*2+1 ,
					       0       , // startNode
					       9999999 , // endNode (the last one!)
					       false   , // includeTags?
					       true    , // visible text only?
					       true    , // convert html entities?
					       true    , // filter spaces?
					       false ); // use <stop index> tag?
		if ( ! bufLen ) {
			mfree ( buf , contentLen*2+1 , "main" );
			return log("build: speedtestxml: getText: %s",
				   mstrerror(g_errno));
		}
		if ( ! words.set ( buf,TITLEREC_CURRENT_VERSION,true,0) ) {
			mfree ( buf , contentLen*2+1 , "main" );
			return log("build: speedtestxml: words set: %s",
				   mstrerror(g_errno));
		}
	}
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Xml::getText(computeIds=false) w/ word::set() "
	    "took %.3f ms for docId "
	    "%"INT64".",(double)(e - t)/100.0,docId);
	// Matches::addMatches() benchmark against the supplied query
	Matches matches;
	Query q;
	q.set2 ( query , langUnknown , false );
	matches.setQuery ( &q );
	words.set ( &xml , true , 0 ) ;
	t = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < 100 ; i++ ) {
		matches.reset();
		if ( ! matches.addMatches ( &words ) ) {
			mfree ( buf , contentLen*2+1 , "main" );
			return log("build: speedtestxml: matches set: %s",
				   mstrerror(g_errno));
		}
	}
	// print time it took
	e = gettimeofdayInMilliseconds_force();
	log("build: Matches::set() took %.3f ms for %"INT32" words"
	    " (precount=%"INT32") for docId %"INT64".",
	    (double)(e - t)/100.0,words.m_numWords,words.m_preCount,docId);
	mfree ( buf , contentLen*2+1 , "main" );
	return true;
}
/*
bool carveTest ( uint32_t radius, char *fname, char* query ) {
Query q;
q.set(query, 0); // boolflag
FILE* f = fopen(fname, "rb");
if (f == NULL) {
fprintf(stderr, "unable to open: '%s' %d\n",
fname, errno);
return false;
}
char buf[128*1024];
int bytes = fread(buf, 1, sizeof(buf), f);
if (bytes < 1) {
fprintf(stderr, "unable to read: '%s' %d\n",
fname, errno);
fclose(f);
return false;
}
buf[bytes] = '\0';
log(LOG_INFO, "carve[%d]: %s", bytes, buf);
HtmlCarver carver(csISOLatin1, radius);
char out[128*1024];
int carvedbytes;
carvedbytes = carver.AsciiAndCarveNoTags(
(uint8_t*) buf, (uint32_t) bytes,
(uint8_t*) out, sizeof(out) - 1, q);
out[carvedbytes] = '\0';
fprintf(stderr, "carved[%d]: '%s'\n", carvedbytes, out);
return true;
}
*/
// Time 100 rounds of title-rec deserialization, html parsing and summary
// generation for the given title rec, then log the average ms per round and
// the throughput in bytes/msec. Always returns true.
bool summaryTest1 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
		    char *query ) {
	// start the timer
	int64_t startMs = gettimeofdayInMilliseconds_force();
	// parse the query once up front
	Query q;
	q.set2 ( query , langUnknown , false );
	char    *content    = NULL;
	int32_t  contentLen = 0;
	// run the full pipeline 100 times
	for ( int32_t round = 0 ; round < 100 ; round++ ) {
		XmlDoc xd;
		xd.set2 ( rec , listSize , coll , NULL , 0 );
		// grab the utf8 content from the doc
		content    = xd.ptr_utf8Content;
		contentLen = xd.size_utf8Content - 1;
		// parse into xhtml, then generate the summary
		Xml xml;
		xml.set ( content , contentLen ,
			  false       , // own data?
			  0           ,
			  false       ,
			  xd.m_version,
			  true        , // set parents
			  0           , // niceness
			  CT_HTML );
		xd.getSummary();
	}
	// stop the timer and report
	int64_t endMs = gettimeofdayInMilliseconds_force();
	log("build: V1 Summary/Title/Gigabits generation took %.3f ms for docId "
	    "%"INT64".",
	    (double)(endMs - startMs)/100.0,docId);
	double bpms = contentLen/((double)(endMs-startMs)/100.0);
	log("build: %.3f bytes/msec", bpms);
	return true;
}
// mostly taken from Msg20.cpp
/*
bool summaryTest2 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
char *query ) {
//int32_t collLen = gbstrlen(coll);
CollectionRec *cr = g_collectiondb.getRec ( coll );
// start the timer
int64_t t = gettimeofdayInMilliseconds_force();
int32_t titleMaxLen = cr->m_titleMaxLen;
int32_t summaryMaxLen = cr->m_summaryMaxLen;
int32_t numSummaryLines = cr->m_summaryMaxNumLines;
int32_t summaryMaxNumCharsPerLine = cr->m_summaryMaxNumCharsPerLine;
// these are arbitrary (taken from Msg24.cpp)
int32_t bigSampleRadius = 100;
int32_t bigSampleMaxLen = 4000;
bool ratInSummary = false;
Query q;
q.set ( query , 0 ); // boolFlag
char *content ;
int32_t contentLen ;
// loop parse
for ( int32_t i = 0 ; i < 100 ; i++ ) {
// 4ms
TitleRec tr;
tr.set (rec, listSize, false);
// get content
content = tr.getContent();
contentLen = tr.getContentLen();
// time it
//logf(LOG_TIMING,"query: summary step 1");
// now parse into xhtml (takes 15ms on lenny)
// 1ms
Xml xml;
xml.set ( tr.getCharset() , content, contentLen ,
false, 0, false, tr.getVersion() );
// time it
//logf(LOG_TIMING,"query: summary step 2");
// 7ms
Words ww;
ww.set ( &xml ,
true , // compute word ids?
true );// has html entities?
// time it
// 0ms
//logf(LOG_TIMING,"query: summary step 3");
//int32_t sfn = tr.getSiteFilenum();
//Xml *sx = g_tagdb.getSiteXml ( sfn , coll , collLen );
// time it
//logf(LOG_TIMING,"query: summary step 4");
// 5ms
Sections ss;
ss.set ( &ww ,NULL,0,NULL,NULL,&tr);
// time it
//logf(LOG_TIMING,"query: summary step 5");
// 3.5ms
Pos pos;
pos.set ( &ww , &ss );
// time it
//logf(LOG_TIMING,"query: summary step 6");
// .5ms
Title tt;
// use hard title? false!
tt.setTitle(&tr,&xml,&ww,&ss,&pos,titleMaxLen,0xffff, NULL);
char *tbuf = tt.getTitle();
int32_t tbufLen = tt.m_titleBytes;
// sanity check
if ( ! tbuf && tbufLen ) { char *xx = NULL; *xx = 0; }
// time it
//logf(LOG_TIMING,"query: summary step 7");
// 1ms
Bits bb;
if ( ! bb.setForSummary ( &ww ) ) return false;
// time it
//logf(LOG_TIMING,"query: summary step 8");
// 8-9ms
Summary s;
bool status;
status = s.set2 ( &xml ,
&ww ,
&bb ,
&ss ,
&pos ,
&q ,
NULL , // termFreqs
NULL , // affWeights
coll ,
collLen ,
false , // doStemming?
summaryMaxLen ,
numSummaryLines ,
summaryMaxNumCharsPerLine ,
bigSampleRadius ,
bigSampleMaxLen ,
ratInSummary ,
&tr );
// time it
//logf(LOG_TIMING,"query: summary step 9");
}
// print time it took
int64_t e = gettimeofdayInMilliseconds_force();
log("build: V2 Summary/Title/Gigabits generation took %.3f ms for "
"docId %"INT64".",
(double)(e - t)/100.0,docId);
double bpms = contentLen/((double)(e-t)/100.0);
log("build: %.3f bytes/msec", bpms);
return true;
}
bool summaryTest3 ( char *rec , int32_t listSize, char *coll , int64_t docId ,
char *query ) {
//log(LOG_DEBUG, "HTML mem %d %d %d",
// g_mem.m_used, g_mem.m_numAllocated, g_mem.m_numTotalAllocated);
//int32_t collLen = gbstrlen(coll);
CollectionRec *cr = g_collectiondb.getRec ( coll );
// start the timer
int64_t t = gettimeofdayInMilliseconds_force();
int32_t titleMaxLen = cr->m_titleMaxLen;
int32_t summaryMaxLen = cr->m_summaryMaxLen;
int32_t numSummaryLines = cr->m_summaryMaxNumLines;
int32_t summaryMaxNumCharsPerLine = cr->m_summaryMaxNumCharsPerLine;
// these are arbitrary (taken from Msg24.cpp)
int32_t bigSampleRadius = 100;
int32_t bigSampleMaxLen = 4000;
bool ratInSummary = false;
Query q;
q.set ( query , 0 ); // boolFlag
unsigned char *content ;
int32_t contentLen ;
// loop parse
for ( int32_t i = 0 ; i < 100 ; i++ ) {
// 4ms
TitleRec tr;
tr.set (rec, listSize, false);
// get content
char *html = tr.getContent();
int32_t htmlLen = tr.getContentLen();
HtmlCarver parser(tr.getCharset(), 256);
unsigned char carved[128 * 1024];
int32_t carvedMax = sizeof(carved);
// choose this one to convert to utf8 prior to carving
//int32_t carvedLen = parser.Utf8AndCarve((unsigned char*) content,
// choose this one to emulate documents that are stored in utf8
// set this to whatever makes sense for your test...
switch (2)
{
case 1:
//log(LOG_DEBUG, "HTML utf8 summary");
contentLen = parser.Utf8AndCarve(
(unsigned char*) html, htmlLen,
carved, carvedMax, q);
content = carved;
break;
case 2:
//log(LOG_DEBUG, "HTML fast ascii summary");
contentLen = parser.AsciiAndCarveNoTags(
(unsigned char*) html, htmlLen,
carved, carvedMax, q);
content = carved;
break;
case 0:
default:
//log(LOG_DEBUG, "HTML compatible summary");
content = (unsigned char*) html;
contentLen = htmlLen;
break;
}
// time it
//logf(LOG_TIMING,"query: summary step 1");
// now parse into xhtml (takes 15ms on lenny)
// 1ms
Xml xml;
xml.set ( tr.getCharset() , (char*) content, contentLen ,
false, 0, false, tr.getVersion() );
// time it
//logf(LOG_TIMING,"query: summary step 2");
// 7ms
Words ww;
ww.set ( &xml ,
true , // compute word ids?
true );// has html entities?
// time it
// 0ms
//logf(LOG_TIMING,"query: summary step 3");
//int32_t sfn = tr.getSiteFilenum();
//Xml *sx = g_tagdb.getSiteXml ( sfn , coll , collLen );
// time it
//logf(LOG_TIMING,"query: summary step 4");
// 5ms
Sections ss;
ss.set ( &ww ,NULL,0,NULL,NULL,&tr);
// time it
//logf(LOG_TIMING,"query: summary step 5");
// 3.5ms
Pos pos;
pos.set ( &ww , &ss );
// time it
//logf(LOG_TIMING,"query: summary step 6");
// .5ms
Title tt;
// use hard title? false!
tt.setTitle(&tr,&xml,&ww,&ss,&pos,titleMaxLen,0xffff,NULL);
char *tbuf = tt.getTitle();
int32_t tbufLen = tt.m_titleBytes;
// sanity check
if ( ! tbuf && tbufLen ) { char *xx = NULL; *xx = 0; }
// time it
//logf(LOG_TIMING,"query: summary step 7");
// 1ms
Bits bb;
if ( ! bb.setForSummary ( &ww ) ) return false;
// time it
//logf(LOG_TIMING,"query: summary step 8");
// 8-9ms
Summary s;
bool status;
status = s.set2 ( &xml ,
&ww ,
&bb ,
&ss ,
&pos ,
&q ,
NULL , // termFreqs
NULL , // affWeights
coll ,
collLen ,
false , // doStemming?
summaryMaxLen ,
numSummaryLines ,
summaryMaxNumCharsPerLine ,
bigSampleRadius ,
bigSampleMaxLen ,
ratInSummary ,
&tr );
// time it
//logf(LOG_TIMING,"query: summary step 9");
}
// print time it took
int64_t e = gettimeofdayInMilliseconds_force();
log("build: V3 Summary/Title/Gigabits generation took %.3f ms for "
"docId %"INT64".",
(double)(e - t)/100.0,docId);
double bpms = contentLen/((double)(e-t)/100.0);
log("build: %.3f bytes/msec", bpms);
//log(LOG_DEBUG, "HTML mem %d %d %d",
// g_mem.m_used, g_mem.m_numAllocated, g_mem.m_numTotalAllocated);
return true;
}
*/
// Raw-dump the keys in one indexdb-format data file to stdout. "fn" is the
// file number used to build the filename from prefix "ff" (or "NAME" is used
// verbatim if non-NULL), "off" is the byte offset to start at, and "ks" is
// the full key size in bytes (half keys are ks-6 bytes, bit 0x02 set).
void dumpIndexdbFile ( int32_t fn , int64_t off , char *ff , int32_t ks ,
		       char *NAME ) {
	// this is confidential data format
#ifdef _CLIENT_
	return;
#endif
#ifdef _METALINCS_
	return;
#endif
	char buf [ 1000000 ];
	int32_t bufSize = 1000000;
	char fname[64];
	// BUGFIX: bound the formatted name; "ff"/"NAME" are caller-supplied
	// and sprintf could overflow fname[64]
	snprintf ( fname , sizeof(fname) , "%s%04"INT32".dat" , ff,fn );
	if ( NAME ) snprintf ( fname , sizeof(fname) , "%s", NAME );
	BigFile f;
	fprintf(stderr,"opening ./%s\n",fname);
	f.set ( "./" , fname );
	if ( ! f.open ( O_RDONLY ) ) return;
	// init our vars. "top" holds the 6 high-order key bytes shared by
	// any half keys that follow a full key.
	bool haveTop = false;
	char top[6];
	memset ( top , 0 , 6 );
	bool warned = false;
	// how big is this guy?
	int64_t filesize = f.getFileSize();
	fprintf(stderr,"filesize=%"INT64"\n",filesize);
	fprintf(stderr,"off=%"INT64"\n",off);
	// reset error number
	g_errno = 0;
	// the big read loop
 loop:
	int64_t readSize = bufSize;
	if ( off + readSize > filesize ) readSize = filesize - off;
	// return if we're done reading the whole file
	if ( readSize <= 0 ) return;
	// read in as much as we can
	f.read ( buf , readSize , off );
	// bail on read error
	if ( g_errno ) {
		fprintf(stderr,"read of %s failed",f.getFilename());
		return;
	}
	char *p = buf;
	char *pend = buf + readSize;
 inner:
	// parse out the keys; bit 0x02 set marks a compressed half key
	int32_t size;
	if ( ((*p) & 0x02) == 0x00 ) size = ks;
	else                         size = ks-6;
	if ( p + size > pend ) {
		// skip what we read
		off += readSize ;
		// back up so we don't split a key we should not
		off -= ( pend - p );
		// read more
		goto loop;
	}
	// a full key establishes a new "top" for following half keys
	if ( size == ks ) { gbmemcpy ( top , p + (ks-6) , 6 ); haveTop = true; }
	// warning msg
	if ( ! haveTop && ! warned ) {
		warned = true;
		log("db: Warning: first key is a half key.");
	}
	// reassemble the full key from the half key and the current top
	char tmp [ MAX_KEY_BYTES ];
	gbmemcpy ( tmp , p , ks-6 );
	gbmemcpy ( tmp + ks-6 , top , 6 );
	// print the key, prefixed by its file offset
	if ( ks == 12 )
		fprintf(stdout,"%08"INT64") %08"XINT32" %016"XINT64"\n",
			off + (p - buf) ,
			*(int32_t *)(tmp+8),*(int64_t *)tmp );
	else
		fprintf(stdout,"%08"INT64") %016"XINT64" %016"XINT64"\n",
			off + (p - buf) ,
			*(int64_t *)(tmp+8),*(int64_t *)tmp );
	// go to next key
	p += size;
	// loop up
	goto inner;
}
// Dump indexdb keys for collection "coll" to stdout, one line per key,
// reading numFiles files starting at file #startFileNum (plus the in-memory
// tree when includeTree is true). If termId >= 0 the dump is restricted to
// that term's key range.
void dumpIndexdb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree,
		  int64_t termId ) {
	// this is confidential data format
#ifdef _CLIENT_
#ifndef _GLOBALSPEC_
	return;
#endif
#endif
#ifdef _METALINCS_
	return;
#endif
	g_indexdb.init ();
	g_indexdb.getRdb()->addRdbBase1(coll );
	key_t startKey ;
	key_t endKey ;
	startKey.setMin();
	endKey.setMax();
	if ( termId >= 0 ) {
		startKey = g_indexdb.makeFirstKey ( termId );
		endKey   = g_indexdb.makeLastKey  ( termId );
	}
	// turn off threads so msg5.getList() below always blocks
	g_threads.disableThreads();
	// get a meg at a time
	int32_t minRecSizes = 1024*1024;
	// bail if the requested file does not exist
	if ( g_indexdb.m_rdb.getNumFiles() <= startFileNum && numFiles > 0 ) {
		printf("Request file #%"INT32" but there are only %"INT32" "
		       "indexdb files\n",startFileNum,
		       g_indexdb.m_rdb.getNumFiles());
		return;
	}
	Msg5 msg5;
	RdbList list;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// BUGFIX: bail out if the collection does not exist rather than
	// dereferencing a NULL CollectionRec below
	if ( ! cr ) {
		log("db: dumpindexdb: no such collection \"%s\"",coll);
		return;
	}
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_INDEXDB   ,
			      cr->m_collnum ,
			      &list         ,
			      startKey      ,
			      endKey        ,
			      minRecSizes   ,
			      includeTree   ,
			      false         , // add to cache?
			      0             , // max cache age
			      startFileNum  ,
			      numFiles      ,
			      NULL          , // state
			      NULL          , // callback
			      0             , // niceness
			      false )){       // err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// low bit clear means a delete (negative) key
		char *dd = "";
		if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
		int64_t d = g_indexdb.getDocId(k);
		uint8_t dh = g_titledb.getDomHash8FromDocId(d);
		// omit the termid column when dumping a single term
		if ( termId < 0 )
			printf("k.n1=%08"XINT32" k.n0=%016"XINT64" "
			       "tid=%015"UINT64" score=%03"INT32" "
			       "docId=%012"INT64" dh=0x%02"XINT32"%s\n" ,
			       k.n1, k.n0, (int64_t)g_indexdb.getTermId(k),
			       (int32_t)g_indexdb.getScore(k) ,
			       d , (int32_t)dh, dd );
		else
			printf("k.n1=%08"XINT32" k.n0=%016"XINT64" "
			       "score=%03"INT32" docId=%012"INT64" dh=0x%02"XINT32"%s\n" ,
			       k.n1, k.n0,
			       (int32_t)g_indexdb.getScore(k) ,
			       d , (int32_t)dh, dd );
		continue;
	}
	// resume the scan right after the last key we got
	startKey = *(key_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around
	if ( startKey < *(key_t *)list.getLastKey() ) return;
	goto loop;
}
// Dump posdb records (optionally restricted to one termId) to stdout for
// debugging, flagging out-of-order keys, bad compression-bit alignment and
// out-of-range keys. If justVerify is true the records are read but not
// printed. Runs synchronously (threads off).
void dumpPosdb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree,
		int64_t termId , bool justVerify ) {
	if ( ! justVerify ) {
		g_posdb.init ();
		g_posdb.getRdb()->addRdbBase1 ( coll );
	}
	key144_t startKey ;
	key144_t endKey ;
	startKey.setMin();
	endKey.setMax();
	// restrict the key range to a single term if requested
	if ( termId >= 0 ) {
		g_posdb.makeStartKey ( &startKey, termId );
		g_posdb.makeEndKey   ( &endKey, termId );
		printf("termid=%"UINT64"\n",termId);
		printf("startkey=%s\n",KEYSTR(&startKey,sizeof(POSDBKEY)));
		printf("endkey=%s\n",KEYSTR(&endKey,sizeof(POSDBKEY)));
	}
	// turn off threads so msg5.getList() below always completes synchronously
	g_threads.disableThreads();
	// read a meg at a time
	int32_t minRecSizes = 1024*1024;
	// bail if the requested starting file does not exist
	if ( g_posdb.m_rdb.getNumFiles() <= startFileNum && numFiles > 0 ) {
		printf("Request file #%"INT32" but there are only %"INT32" "
		       "posdb files\n",startFileNum,
		       g_posdb.m_rdb.getNumFiles());
		return;
	}
	key144_t lastKey;
	lastKey.setMin();
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	// set this flag so Msg5.cpp if it does error correction does not
	// try to get the list from a twin...
	g_isDumpingRdbFromMain = 1;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// FIX: getRec() returns NULL for an unknown collection; the old code
	// dereferenced cr->m_collnum unconditionally and crashed.
	if ( ! cr ) {
		printf("Unknown collection: %s\n",coll);
		return;
	}
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_POSDB ,
			      cr->m_collnum ,
			      &list ,
			      &startKey ,
			      &endKey ,
			      minRecSizes ,
			      includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      startFileNum ,
			      numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      true )) { // to debug RdbList::removeBadData_r()
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// get last key in list and print it
	char *ek2 = list.m_endKey;
	printf("ek=%s\n",KEYSTR(ek2,list.m_ks) );
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() && ! justVerify ;
	      list.skipCurrentRecord() ) {
		key144_t k; list.getCurrentKey(&k);
		// keys must be ascending; flag out-of-order corruption
		char *err = "";
		if ( KEYCMP((char *)&k,(char *)&lastKey,sizeof(key144_t))<0 )
			err = " (out of order)";
		lastKey = k;
		// low bit clear means a delete (negative) key
		char *dd = "";
		if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
		int64_t d = g_posdb.getDocId(&k);
		uint8_t dh = g_titledb.getDomHash8FromDocId(d);
		// posdb records are 18, 12 or 6 bytes depending on the
		// compression bits in the first byte
		char *rec = list.m_listPtr;
		int32_t recSize = 18;
		if      ( rec[0] & 0x04 ) recSize = 6;
		else if ( rec[0] & 0x02 ) recSize = 12;
		// a 6-byte record should have the alignment bit set in byte 1;
		// if not, the following record's docid tells us whether order
		// was also violated
		if ( recSize == 6 && !(rec[1] & 0x02) ) {
			int64_t nd1 = g_posdb.getDocId(rec+6);
			err = " (alignerror1)";
			if ( nd1 < d ) err = " (alignordererror1)";
		}
		// same alignment-bit check for a 12-byte record
		if ( recSize == 12 && !(rec[1] & 0x02) ) {
			int64_t nd2 = g_posdb.getDocId(rec+12);
			err = " (alignerror2)";
			if ( nd2 < d ) err = " (alignorderrror2)";
		}
		// a 12-byte record must NOT have the alignment bit in byte 7
		if ( recSize == 12 && (rec[7] & 0x02)) {
			int64_t nd2 = g_posdb.getDocId(rec+12);
			err = " (alignerror3)";
			if ( nd2 < d ) err = " (alignordererror3)";
		}
		// key should fall inside [startKey, endKey of this list]
		if ( KEYCMP((char *)&k,(char *)&startKey,list.m_ks)<0 ||
		     KEYCMP((char *)&k,ek2,list.m_ks)>0){
			err = " (out of range)";
		}
		int32_t facetVal32 = g_posdb.getFacetVal32 ( &k );
		if ( termId < 0 )
			printf(
			       "k=%s "
			       "tid=%015"UINT64" "
			       "docId=%012"INT64" "
			       "siterank=%02"INT32" "
			       "langid=%02"INT32" "
			       "pos=%06"INT32" "
			       "hgrp=%02"INT32" "
			       "spamrank=%02"INT32" "
			       "divrank=%02"INT32" "
			       "syn=%01"INT32" "
			       "densrank=%02"INT32" "
			       "mult=%02"INT32" "
			       "dh=0x%02"XINT32" "
			       "rs=%"INT32"" //recSize
			       "%s" // dd
			       "%s" // err
			       "\n" ,
			       KEYSTR(&k,sizeof(key144_t)),
			       (int64_t)g_posdb.getTermId(&k),
			       d ,
			       (int32_t)g_posdb.getSiteRank(&k),
			       (int32_t)g_posdb.getLangId(&k),
			       (int32_t)g_posdb.getWordPos(&k),
			       (int32_t)g_posdb.getHashGroup(&k),
			       (int32_t)g_posdb.getWordSpamRank(&k),
			       (int32_t)g_posdb.getDiversityRank(&k),
			       (int32_t)g_posdb.getIsSynonym(&k),
			       (int32_t)g_posdb.getDensityRank(&k),
			       (int32_t)g_posdb.getMultiplier(&k),
			       (int32_t)dh,
			       recSize,
			       dd ,
			       err );
		else
			printf(
			       "k=%s "
			       "tid=%015"UINT64" "
			       "docId=%012"INT64" "
			       "siterank=%02"INT32" "
			       "langid=%02"INT32" "
			       "pos=%06"INT32" "
			       "hgrp=%02"INT32" "
			       "spamrank=%02"INT32" "
			       "divrank=%02"INT32" "
			       "syn=%01"INT32" "
			       "densrank=%02"INT32" "
			       "mult=%02"INT32" "
			       "[facetval=%"INT32"] "
			       "recSize=%"INT32" "
			       "dh=0x%02"XINT32"%s%s\n" ,
			       KEYSTR(&k,sizeof(key144_t)),
			       (int64_t)g_posdb.getTermId(&k),
			       d ,
			       (int32_t)g_posdb.getSiteRank(&k),
			       (int32_t)g_posdb.getLangId(&k),
			       (int32_t)g_posdb.getWordPos(&k),
			       (int32_t)g_posdb.getHashGroup(&k),
			       (int32_t)g_posdb.getWordSpamRank(&k),
			       (int32_t)g_posdb.getDiversityRank(&k),
			       (int32_t)g_posdb.getIsSynonym(&k),
			       (int32_t)g_posdb.getDensityRank(&k),
			       (int32_t)g_posdb.getMultiplier(&k),
			       facetVal32,
			       recSize,
			       (int32_t)dh,
			       dd ,
			       err );
		continue;
	}
	// resume the scan one key past the last one we read
	startKey = *(key144_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around at the top of the key space
	if ( startKey < *(key144_t *)list.getLastKey() ) return;
	goto loop;
}
// Dump datedb records (optionally restricted to one termId) to stdout for
// debugging. Special-cases the gbx latitude/longitude/start/end terms so
// their encoded values are decoded (lat/lon denormalized, times formatted).
// If justVerify is true the records are read but not printed.
void dumpDatedb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree,
		 int64_t termId , bool justVerify ) {
	// this is confidential data format
#ifdef _CLIENT_
	return;
#endif
#ifdef _METALINCS_
	return;
#endif
	if ( ! justVerify ) {
		g_datedb.init ();
		g_datedb.getRdb()->addRdbBase1 ( coll );
	}
	char startKey[16];
	char endKey [16];
	int64_t termId1 = 0x0000000000000000LL;
	int64_t termId2 = 0xffffffffffffffffLL;
	// restrict the key range to a single term if requested
	if ( termId >= 0 ) {
		termId1 = termId;
		termId2 = termId;
	}
	// dates are stored complemented, so start uses 0xffffffff
	key128_t kk;
	kk = g_datedb.makeStartKey ( termId1 , 0xffffffff );
	KEYSET(startKey,(char *)&kk,16);
	kk = g_datedb.makeEndKey ( termId2 , 0x00000000 );
	KEYSET(endKey,(char *)&kk,16);
	// read a meg at a time
	int32_t minRecSizes = 1024*1024;
	// warn (but do not bail) if the requested starting file is missing
	if ( g_datedb.m_rdb.getNumFiles() <= startFileNum ) {
		printf("Request file #%"INT32" but there are only %"INT32" "
		       "datedb files\n",startFileNum,
		       g_datedb.m_rdb.getNumFiles());
	}
	// turn off threads so msg5.getList() below always completes synchronously
	g_threads.disableThreads();
	Msg5 msg5;
	Msg5 msg5b;
	IndexList list;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// FIX: getRec() returns NULL for an unknown collection; the old code
	// dereferenced cr->m_collnum unconditionally and crashed.
	if ( ! cr ) {
		printf("Unknown collection: %s\n",coll);
		g_threads.enableThreads();
		return;
	}
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_DATEDB ,
			      cr->m_collnum ,
			      &list ,
			      (char *)&startKey ,
			      (char *)&endKey ,
			      minRecSizes ,
			      includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      startFileNum ,
			      numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false )){// err correction?
		g_threads.enableThreads();
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) {
		g_threads.enableThreads();
		return;
	}
	uint8_t a,b;
	// termids of the special gbx* terms we decode specially below
	int64_t lattid  = hash64n("gbxlatitude") & TERMID_MASK;
	int64_t lontid  = hash64n("gbxlongitude")& TERMID_MASK;
	int64_t lattid2 = hash64n("gbxlatitude2") & TERMID_MASK;
	int64_t lontid2 = hash64n("gbxlongitude2")& TERMID_MASK;
	int64_t starttid= hash64n("gbxstart")& TERMID_MASK;
	int64_t endtid  = hash64n("gbxend")& TERMID_MASK;
	// sanity check: datedb keys must be 16 bytes
	if ( list.m_ks != 16 ) { char *xx = NULL; *xx = 0; }
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() && ! justVerify ;
	      list.skipCurrentRecord() ) {
		uint8_t k[MAX_KEY_BYTES];
		list.getCurrentKey(k);
		// negative key means a delete
		char *dd = "";
		if ( KEYNEG((char *)k) ) dd = " (delete)";
		// get event id range (stored complemented)
		a = 255 - k[7];
		b = 255 - k[6];
		// hack flag for indexing tag terms (complemented)
		bool isTagTerm = (k[9] == 0x7f);
		int64_t tid =(int64_t)list.getTermId16((char *)k);
		// ordinary term: print raw score/eid-range form
		if ( tid &&
		     tid != lattid &&
		     tid != lontid &&
		     tid != lattid2 &&
		     tid != lontid2 &&
		     tid != starttid &&
		     tid != endtid ) {
			char *ss = "";
			if ( isTagTerm ) ss = " tagterm";
			printf("k.n1=%016"XINT64" k.n0=%016"XINT64" "
			       "tid=%015"UINT64" "
			       "eidrng=%"INT32"-%"INT32" "
			       "score=%03"INT32" docId=%012"INT64"%s%s\n" ,
			       KEY1((char *)k,16),KEY0((char *)k),
			       tid,
			       (int32_t)a,(int32_t)b,
			       (int32_t)list.getScore((char *)k),
			       list.getCurrentDocId() , ss, dd );
		}
		// event start/end time: decode the date into a timestamp string
		else if ( tid == starttid || tid == endtid ) {
			// this will uncomplement it
			uint32_t cd = (uint32_t)list.getCurrentDate();
			// FIX: was uninitialized; although the branch condition
			// guarantees one assignment below, initialize defensively
			char *desc = "";
			if      ( tid == starttid ) desc = "startTime";
			else if ( tid == endtid   ) desc = "endTime";
			// convert to date str
			time_t ts = (time_t)cd;
			struct tm *timeStruct = localtime ( &ts );
			char ppp[100];
			strftime(ppp,100,"%b-%d-%Y-%H:%M:%S",timeStruct);
			printf("k.n1=%016"XINT64" "
			       "k.n0=%016"XINT64" "
			       "tid=%015"UINT64"=%s "
			       "time=%s(%"UINT32") "
			       "eventId=%03"INT32" docId=%012"INT64"%s\n" ,
			       KEY1((char *)k,16),
			       KEY0((char *)k),
			       tid,
			       desc,
			       ppp,
			       cd,
			       (int32_t)list.getScore((char *)k),
			       list.getCurrentDocId() ,
			       dd );
		}
		// lat/lon terms: denormalize the stored value back to degrees
		else if ( tid ) {
			// this will uncomplement it
			uint32_t cd = list.getCurrentDate();
			float latlon = (float)cd;
			// denormalize (we scaled by 10M)
			latlon /= 10000000.0;
			char *desc;
			if      ( tid == lattid  ) desc = "latitude";
			else if ( tid == lontid  ) desc = "longitude";
			else if ( tid == lattid2 ) desc = "latitude2";
			else if ( tid == lontid2 ) desc = "longitude2";
			else                       desc = "unknownitude";
			printf("k.n1=%016"XINT64" "
			       "k.n0=%016"XINT64" "
			       "tid=%015"UINT64" "
			       "%s=%.06f "
			       "eventId=%03"INT32" docId=%012"INT64"%s\n" ,
			       KEY1((char *)k,16),
			       KEY0((char *)k),
			       tid,
			       desc,
			       latlon,
			       (int32_t)list.getScore((char *)k),
			       list.getCurrentDocId() ,
			       dd );
		}
		continue;
	}
	// resume the scan one key past the last one we read
	KEYSET(startKey,list.getLastKey(),16);
	KEYADD(startKey,1,16);
	// watch out for wrap around at the top of the key space
	if ( KEYCMP(startKey,list.getLastKey(),16)<0 ) {
		g_threads.enableThreads();
		return;
	}
	goto loop;
}
// Dump all clusterdb records to stdout for debugging, decoding each key's
// docid, family/adult flag, language, site hash and the shard/host it
// belongs to. Runs synchronously (threads off).
void dumpClusterdb ( char *coll,
		     int32_t startFileNum,
		     int32_t numFiles,
		     bool includeTree ) {
	// this is confidential data format
#ifdef _CLIENT_
	return;
#endif
#ifdef _METALINCS_
	return;
#endif
	g_clusterdb.init ();
	g_clusterdb.getRdb()->addRdbBase1 ( coll );
	key_t startKey ;
	key_t endKey ;
	startKey.setMin();
	endKey.setMax();
	// turn off threads so msg5.getList() below always completes synchronously
	g_threads.disableThreads();
	// read a meg at a time
	int32_t minRecSizes = 1024*1024;
	// bail if the requested starting file does not exist
	if ( g_clusterdb.getRdb()->getNumFiles() <= startFileNum ) {
		printf("Request file #%"INT32" but there are only %"INT32" "
		       "clusterdb files\n",startFileNum,
		       g_clusterdb.getRdb()->getNumFiles());
		return;
	}
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// FIX: getRec() returns NULL for an unknown collection; the old code
	// dereferenced cr->m_collnum unconditionally and crashed.
	if ( ! cr ) {
		printf("Unknown collection: %s\n",coll);
		return;
	}
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_CLUSTERDB ,
			      cr->m_collnum ,
			      &list ,
			      startKey ,
			      endKey ,
			      minRecSizes ,
			      includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      startFileNum ,
			      numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false )){// err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() )
		return;
	// loop over entries in list
	char strLanguage[256];
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// low bit clear means a delete (negative) key
		char *dd = "";
		if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
		// get the language string
		languageToString ( g_clusterdb.getLanguage((char*)&k),
				   strLanguage );
		// which shard/host owns this key?
		uint32_t shardNum = getShardNum( RDB_CLUSTERDB , &k );
		Host *grp = g_hostdb.getShard ( shardNum );
		Host *hh = &grp[0];
		// print it
		printf("k.n1=%08"XINT32" k.n0=%016"XINT64" "
		       "docId=%012"INT64" family=%"UINT32" "
		       "language=%"INT32" (%s) siteHash26=%"UINT32"%s "
		       "groupNum=%"UINT32" "
		       "shardNum=%"UINT32"\n",
		       k.n1, k.n0,
		       g_clusterdb.getDocId((char*)&k) ,
		       g_clusterdb.hasAdultContent((char*)&k) ,
		       (int32_t)g_clusterdb.getLanguage((char*)&k),
		       strLanguage,
		       g_clusterdb.getSiteHash26((char*)&k) ,
		       dd ,
		       hh->m_hostId ,
		       shardNum);
		continue;
	}
	// resume the scan one key past the last one we read
	startKey = *(key_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around at the top of the key space
	if ( startKey < *(key_t *)list.getLastKey() )
		return;
	goto loop;
}
/*
void dumpStatsdb( int32_t startFileNum, int32_t numFiles, bool includeTree,
int test ) {
// this is confidential data format
#ifdef _CLIENT_
return;
#endif
#ifdef _METALINCS_
return;
#endif
static char *coll = "stats";
// We don't want to close the previous session so we
// must not do a real init.
g_statsdb.init( );//false - Is full init?
g_collectiondb.init( true ); // Is dump?
g_statsdb.getRdb()->addRdbBase1 ( coll );
uint64_t ss_keys = 0;
uint64_t dd_keys = 0;
key96_t startKey;
key96_t endKey;
startKey.setMin();
endKey.setMax();
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
// bail if not
if ( g_statsdb.getRdb()->getNumFiles() <= startFileNum ) {
printf("Request file #%"INT32" but there are only %"INT32" "
"statsdb files\n",startFileNum,
g_statsdb.getRdb()->getNumFiles());
return;
}
Msg5 msg5;
Msg5 msg5b;
RdbList list;
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_STATSDB ,
coll ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
minRecSizes ,
includeTree ,
false , // add to cache?
0 , // max cache age
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() )
return;
// loop over entries in list
key96_t k;
time_t dateStamp;
char txtDate[32];
char *txt;
uint64_t uCRC = 0LL;
uint8_t version = 0;
int32_t dataSize = 0;
SafeBuf cBuf( 1024 );
bool dataSummaryGen = false;
bool first = true;
if ( g_mem.checkStackSize() > (int)(6*1024*1024) ) {
fprintf( stderr, "Running low on stack space, %"INT32" bytes "
"used. %s:%d\n", g_mem.checkStackSize(),
__PRETTY_FUNCTION__, __LINE__ );
return;
}
StatsV1 stats;
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
list.getCurrentKey( (char *)&k );
version = g_statsdb.getVersionFromKey( k );
// is it a delete?
char *dd = "";
if (!( k.n0 & 0x01LL)) dd = " (delete)";
dateStamp = g_statsdb.getTimestampFromKey( k );
snprintf( txtDate, 32, "%s", ctime( &dateStamp ) );
txt = txtDate;
// get rid of the newline character
while ( *txt ) {
if ( *txt == '\n' ) { *txt = 0; break; } txt++;
}
// . We extract and verify the size of the data.
// . If uCRC is zero, we failed to decompress the data.
if ( k.n1 & SUMMARY_MASK ) {
dataSummaryGen = true;
cBuf.setBuf( list.getCurrentData(),
list.getCurrentDataSize(),
list.getCurrentDataSize(),
false, //ownData
csOther);//encoding
if ( version == 1 ) {
if ( ! stats.fromCompressed( cBuf ) ) {
printf("Decompression Failed!!\n");
}
dataSize = sizeof( StatsDataV1 );
uCRC = g_statsdb.quickCRC(
(uint8_t *)stats.getData(),
dataSize );
}
}
// print it
if ( test != 3 && ! g_statsdb.getResolutionFromKey( k ) ) {
printf("[Session Header Key] "
"k.n1=%08"XINT32" k.n0=%016"XINT64" resolution=%03"UINT32" "
"session=%05d timestamp=%010"INT32" [%s] "
"hostId=%05"UINT32" version=%03"UINT32" %s\n",
k.n1 , k.n0 ,
(uint32_t)g_statsdb.getResolutionFromKey(k),
(int16_t)g_statsdb.getSessionFromKey(k) ,
(int32_t)dateStamp ,
txtDate ,
(uint32_t)g_statsdb.getHostIdFromKey(k) ,
(uint32_t)version ,
dd );
ss_keys++;
} else if ( test == 2 ){
printf("k.n1=0x%08"XINT32" k.n0=0x%016"XINT64" resolution=%03"UINT32" "
"session=%05d timestamp=%010"INT32" [%s] "
"hostId=%05"UINT32" version=%03"UINT32" "
"uLen=%010"UINT32" cLen=%010"UINT32" uCRC=%016"XINT64" %s \n",
k.n1 , k.n0 ,
(uint32_t)g_statsdb.getResolutionFromKey(k),
(int16_t)g_statsdb.getSessionFromKey(k) ,
(int32_t)dateStamp ,
txtDate ,
(uint32_t)g_statsdb.getHostIdFromKey( k ) ,
(uint32_t)version ,
(uint32_t)dataSize ,
list.getCurrentDataSize() ,
uCRC,
dd );
dd_keys++;
}
else if ( test > 2 && first ) {
StatsDataV1 &sData = *(StatsDataV1 *)stats.getData();
printf("k.n1=0x%08"XINT32" k.n0=0x%016"XINT64" resolution=%03"UINT32" "
"session=%05d timestamp=%010"INT32" [%s] "
"hostId=%05"UINT32" version=%03"UINT32" "
"uLen=%010"UINT32" cLen=%010"UINT32" uCRC=%016"XINT64" %s \n"
"allQueries %"INT64"\n"
"msg3aRecallCnt %i\n"
"cpuUsage %f\n"
"",
k.n1 , k.n0 ,
(uint32_t)g_statsdb.getResolutionFromKey(k),
(int16_t)g_statsdb.getSessionFromKey(k) ,
(int32_t)dateStamp ,
txtDate ,
(uint32_t)g_statsdb.getHostIdFromKey( k ) ,
(uint32_t)version ,
(uint32_t)dataSize ,
list.getCurrentDataSize() ,
uCRC, dd,
sData.m_allQueries,
sData.m_msg3aRecallCnt,
sData.m_cpuUsage
);
dd_keys++;
if ( test == 3 ) {
first = false;
printf( "\nPlease wait...\n\n" );
}
}
}
startKey = *(key96_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key96_t *)list.getLastKey() ) {
printf( "Session Summary Keys: %"UINT64"\n"
"Data Keys: %"UINT64"\n",
ss_keys, dd_keys );
return;
}
goto loop;
}
*/
/*
void dumpChecksumdb( char *coll,
int32_t startFileNum,
int32_t numFiles,
bool includeTree ) {
// this is confidential data format
#ifdef _CLIENT_
return;
#endif
#ifdef _METALINCS_
return;
#endif
g_checksumdb.init ();
g_collectiondb.init(true);
g_checksumdb.getRdb()->addRdbBase1 ( coll );
//key_t startKey ;
//key_t endKey ;
//startKey.setMin();
//endKey.setMax();
int32_t cKeySize = g_conf.m_checksumdbKeySize;
char startKey[16];
char endKey[16];
if ( cKeySize == 12 ) {
((key_t *)startKey)->setMin();
((key_t *)endKey)->setMax();
}
else if ( cKeySize == 16 ) {
((key128_t *)startKey)->setMin();
((key128_t *)endKey)->setMax();
}
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
//// bail if not
//if ( g_checksumdb.getRdb()->getNumFiles() <= startFileNum ) {
// printf("Request file #%"INT32" but there are only %"INT32" "
// "checksumdb files\n",startFileNum,
// g_checksumdb.getRdb()->getNumFiles());
// return;
//}
Msg5 msg5;
Msg5 msg5b;
RdbList list;
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_CHECKSUMDB ,
coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
includeTree ,
false , // add to cache?
0 , // max cache age
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false )){// err correction?
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() )
return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
uint32_t hosthash;
//key_t k = list.getCurrentKey();
char k[16];
list.getCurrentKey( k );
// is it a delete?
char *dd = "";
//if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
if ( (((key_t *)k)->n0 & 0x01) == 0x00 ) dd = " (delete)";
char kBuf[20];
//uint32_t hosthash = (k.n1 >> 8) & 0xffff;
// . check keys size before doing assignments
if ( cKeySize == 12 ) {
// get the language string
hosthash = (((key_t *)k)->n1 >> 8) & 0xffff;
sprintf( kBuf, "%08"XINT32"", ((key_t *)k)->n1);
}
else if ( cKeySize == 16 ) {
// get the language string
// . some extra manipulation needed to retrieve the
// . host hash from the 16-byte key
hosthash = ((((key128_t *)k)->n1 >> 38 ) & 0x3ff ) |
((((key128_t *)k)->n1 << 2) & 0x3fffc00 );
sprintf( kBuf, "%016"XINT64"", ((key128_t *)k)->n1);
}
// print it
printf("k.n1=%s k.n0=%016"XINT64" "
"docId=%012"INT64" quality=%d hosthash=0x%04"XINT32"%s\n",
kBuf, ((key_t *)k)->n0,
g_checksumdb.getDocId( k ) ,
(int)g_checksumdb.getDocQuality( k ),
hosthash ,
dd );
continue;
}
//startKey = *(key_t *)list.getLastKey();
KEYSET( startKey, list.getLastKey(), cKeySize );
//startKey += (uint32_t) 1;
// must check key size before assignments
if ( cKeySize == 12 )
*((key_t *)startKey) += (uint32_t) 1;
else
*((key128_t *)startKey) += (uint32_t) 1;
// watch out for wrap around
//if ( startKey < *(key_t *)list.getLastKey() ) return;
if ( KEYCMP( startKey, list.getLastKey(), cKeySize ) < 0 )
return;
goto loop;
}
*/
// Dump linkdb records to stdout for debugging. If url is non-NULL the scan
// is restricted to the key range of that url's linkers (by host hash and
// url hash). Runs synchronously (threads off).
void dumpLinkdb ( char *coll,
		  int32_t startFileNum,
		  int32_t numFiles,
		  bool includeTree ,
		  char *url ) {
	// this is confidential data format
#ifdef _CLIENT_
	return;
#endif
#ifdef _METALINCS_
	return;
#endif
	g_linkdb.init ();
	g_linkdb.getRdb()->addRdbBase1 ( coll );
	key224_t startKey ;
	key224_t endKey ;
	startKey.setMin();
	endKey.setMax();
	// restrict the key range to one linkee url if requested
	if ( url ) {
		Url u;
		u.set ( url , gbstrlen(url) , true ); // addWWW?
		uint32_t h32 = u.getHostHash32();
		int64_t uh64 = hash64n(url,0);
		startKey = g_linkdb.makeStartKey_uk ( h32 , uh64 );
		endKey   = g_linkdb.makeEndKey_uk   ( h32 , uh64 );
	}
	// turn off threads so msg5.getList() below always completes synchronously
	g_threads.disableThreads();
	// read a meg at a time
	int32_t minRecSizes = 1024*1024;
	// bail if the requested starting file does not exist
	if ( g_linkdb.getRdb()->getNumFiles() <= startFileNum && !includeTree) {
		printf("Request file #%"INT32" but there are only %"INT32" "
		       "linkdb files\n",startFileNum,
		       g_linkdb.getRdb()->getNumFiles());
		return;
	}
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// FIX: getRec() returns NULL for an unknown collection; the old code
	// dereferenced cr->m_collnum unconditionally and crashed.
	if ( ! cr ) {
		printf("Unknown collection: %s\n",coll);
		return;
	}
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_LINKDB ,
			      cr->m_collnum ,
			      &list ,
			      (char *)&startKey ,
			      (char *)&endKey ,
			      minRecSizes ,
			      includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      startFileNum ,
			      numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false )){// err correction?
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) return;
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key224_t k;
		list.getCurrentKey((char *) &k);
		// low bit clear means a delete (negative) key
		char *dd = "";
		if ( (k.n0 & 0x01) == 0x00 ) dd = " (delete)";
		int64_t docId = (int64_t)g_linkdb.getLinkerDocId_uk(&k);
		// which shard owns this key?
		int32_t shardNum = getShardNum(RDB_LINKDB,&k);
		printf("k=%s "
		       "linkeesitehash32=0x%08"XINT32" "
		       "linkeeurlhash=0x%012"XINT64" "
		       "linkspam=%"INT32" "
		       "siterank=%02"INT32" "
		       "ip32=%s "
		       "docId=%012"UINT64" "
		       "discovered=%"UINT32" "
		       "lost=%"UINT32" "
		       "sitehash32=0x%08"XINT32" "
		       "shardNum=%"UINT32" "
		       "%s\n",
		       KEYSTR(&k,sizeof(key224_t)),
		       (int32_t)g_linkdb.getLinkeeSiteHash32_uk(&k),
		       (int64_t)g_linkdb.getLinkeeUrlHash64_uk(&k),
		       (int32_t)g_linkdb.isLinkSpam_uk(&k),
		       (int32_t)g_linkdb.getLinkerSiteRank_uk(&k),
		       iptoa((int32_t)g_linkdb.getLinkerIp_uk(&k)),
		       docId,
		       (int32_t)g_linkdb.getDiscoveryDate_uk(&k),
		       (int32_t)g_linkdb.getLostDate_uk(&k),
		       (int32_t)g_linkdb.getLinkerSiteHash32_uk(&k),
		       shardNum,
		       dd );
	}
	// resume the scan one key past the last one we read
	startKey = *(key224_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around at the top of the key space
	if ( startKey < *(key224_t *)list.getLastKey() ) return;
	goto loop;
}
// UDP latency test: send 1000 debug ping datagrams to host hid's udp port
// from the given client port, ack each reply, and log counts of replies,
// losses and recoveries plus the average round-trip time. Returns true on
// completion, false on setup error (log() returns false).
bool pingTest ( int32_t hid , uint16_t clientPort ) {
	Host *h = g_hostdb.getHost ( hid );
	if ( ! h ) return log("net: pingtest: hostId %"INT32" is "
			      "invalid.",hid);
	// set up our socket
	int sock = socket ( AF_INET, SOCK_DGRAM , 0 );
	if ( sock < 0 ) return log("net: pingtest: socket: %s.",
				   strerror(errno));
	// sockaddr_in provides interface to sockaddr
	struct sockaddr_in name;
	// reset it all just to be safe
	memset((char *)&name, 0,sizeof(name));
	name.sin_family      = AF_INET;
	name.sin_addr.s_addr = 0; /*INADDR_ANY;*/
	name.sin_port        = htons(clientPort);
	// we want to re-use port it if we need to restart
	int options = 1;
	if ( setsockopt(sock, SOL_SOCKET, SO_REUSEADDR ,
			&options,sizeof(options)) < 0 )
		return log("net: pingtest: setsockopt: %s.",
			   strerror(errno));
	// bind this name to the socket
	if ( bind ( sock, (struct sockaddr *)&name, sizeof(name)) < 0) {
		close ( sock );
		return log("net: pingtest: Bind on port %hu: %s.",
			   clientPort,strerror(errno));
	}
	int fd = sock;
	int flags = fcntl ( fd , F_GETFL ) ;
	if ( flags < 0 )
		return log("net: pingtest: fcntl(F_GETFL): %s.",
			   strerror(errno));
	char dgram[1450];
	int  n;
	struct sockaddr_in to;
	sockaddr_in from;
	socklen_t fromLen;
	int64_t startTime;
	// make the dgram
	UdpProtocol *up = &g_dp;
	int32_t transId = 500000000 - 1 ;
	int32_t dnum = 0; // dgramNum
	int32_t sends     = 0;
	int32_t lost      = 0;
	int32_t recovered = 0;
	int32_t acks      = 0;
	int32_t replies   = 0;
	int32_t ip = h->m_ip;
	ip = atoip("127.0.0.1",9);
	startTime = gettimeofdayInMilliseconds_force();
	to.sin_family      = AF_INET;
	to.sin_addr.s_addr = h->m_ip;
	to.sin_port        = ntohs(h->m_port);
	memset ( &(to.sin_zero) , 0,8 );
	log("net: pingtest: Testing hostId #%"INT32" at %s:%hu from client "
	    "port %hu", hid,iptoa(h->m_ip),h->m_port,clientPort);
	// if this is higher than number of avail slots UdpServer.cpp
	// will not be able to free the slots and this will end up sticking,
	// because the slots can only be freed in destroySlot() which
	// is not async safe!
	int32_t count = 1000; // number of loops
	// FIX: was int32_t; summing int64_t millisecond deltas into a 32-bit
	// accumulator could overflow across 1000 slow replies
	int64_t avg = 0;
 sendLoop:
	if ( count-- <= 0 ) {
		log("net: pingtest: Got %"INT32" replies out of %"INT32" sent (%"INT32" lost)"
		    "(%"INT32" recovered)", replies,sends,lost,recovered);
		log("net: pingtest: Average reply time of %.03f ms.",
		    (double)avg/(double)replies);
		return true;
	}
	transId++;
	int32_t msgSize = 3; // indicates a debug ping packet to PingServer.cpp
	up->setHeader ( dgram, msgSize, 0x11, dnum, transId, true, false , 0 );
	int32_t size = up->getHeaderSize(0) + msgSize;
	int64_t start = gettimeofdayInMilliseconds_force();
	n = sendto(sock,dgram,size,0,(struct sockaddr *)&to,sizeof(to));
	if ( n != size ) return log("net: pingtest: sendto returned "
				    "%i "
				    "(should have returned %"INT32")",n,size);
	sends++;
 readLoop2:
	// FIX: recvfrom() requires *address_len to be initialized to the size
	// of the address buffer on every call (it is overwritten each time);
	// it was previously left uninitialized, which is undefined behavior.
	fromLen = sizeof(from);
	// loop until we read something
	n = recvfrom (sock,dgram,DGRAM_SIZE,0,(sockaddr *)&from, &fromLen);
	// give up on this ping after 2 seconds and count it lost
	if (gettimeofdayInMilliseconds_force() - start>2000) {lost++; goto sendLoop;}
	if ( n <= 0 ) goto readLoop2;
	// for what transId?
	int32_t tid = up->getTransId ( dgram , n );
	// -1 is error
	if ( tid < 0 ) return log("net: pingtest: Bad transId.");
	// if no match, it was recovered, keep reading
	if ( tid != transId ) {
		log("net: pingTest: Recovered tid=%"INT32", current tid=%"INT32". "
		    "Resend?",tid,transId);
		recovered++;
		goto readLoop2;
	}
	// an ack? count it and keep waiting for the actual reply
	if ( up->isAck ( dgram , n ) ) {
		acks++;
		goto readLoop2;
	}
	// mark the time
	int64_t took = gettimeofdayInMilliseconds_force()-start;
	if ( took > 1 ) log("net: pingtest: got reply #%"INT32" (tid=%"INT32") "
			    "in %"INT64" ms",replies,transId,took);
	// make average
	avg += took;
	// the reply?
	replies++;
	// send back an ack
	size = up->makeAck ( dgram, dnum, transId , true/*weinit?*/ , false );
	n = sendto(sock,dgram,size,0,(struct sockaddr *)&to,sizeof(to));
	// next ping
	goto sendLoop;
}
// Build a synthetic HTTP inject request of ~reqLen bytes filled with random
// numeric "words", write it to /tmp/inject-test, and inject that file into
// host hid's "main" collection. Returns injectFile()'s result, or negative
// on setup failure (log() returns 0).
int injectFileTest ( int32_t reqLen , int32_t hid ) {
	// make a mime
	char *req = (char *)mmalloc ( reqLen , "injecttest");
	if ( ! req ) return log("build: injecttest: malloc(%"INT32") "
				"failed", reqLen)-1;
	char *p    = req;
	char *pend = req + reqLen;
	sprintf ( p ,
		  "POST /inject HTTP/1.0\r\n"
		  "Content-Length: 000000000\r\n" // placeholder
		  "Content-Type: text/html\r\n"
		  "Connection: Close\r\n"
		  "\r\n" );
	p += gbstrlen(p);
	char *content = p;
	sprintf ( p ,
		  "u=%"UINT32".injecttest.com&c=&"
		  "deleteurl=0&ip=4.5.6.7&iplookups=0&"
		  "dedup=1&rs=7&"
		  "quick=1&hasmime=1&ucontent="
		  "HTTP 200\r\n"
		  "Last-Modified: Sun, 06 Nov 1994 08:49:37 GMT\r\n"
		  "Connection: Close\r\n"
		  "Content-Type: text/html\r\n"
		  "\r\n" ,
		  (uint32_t)time(NULL) );
	p += gbstrlen(p);
	// now store random words (just numbers of 8 digits each);
	// each word is 11 chars + NUL so require 12 bytes of room
	while ( p + 12 < pend ) {
		int32_t r ; r = rand();
		sprintf ( p , "%010"UINT32" " , r );
		p += gbstrlen ( p );
	}
	// set content length
	int32_t clen = p - content;
	char *ptr = req ;
	// find start of the 9 zeroes in the Content-Length placeholder
	while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
	// store length there
	sprintf ( ptr , "%09"UINT32"" , clen );
	// remove the \0 that sprintf just wrote
	ptr += gbstrlen(ptr); *ptr = '\r';
	// what is total request length?
	int32_t rlen = p - req;
	// generate the filename
	char *filename = "/tmp/inject-test";
	File f;
	f.set ( filename );
	f.unlink();
	if ( ! f.open ( O_RDWR | O_CREAT ) )
		return log("build: injecttest: Failed to create file "
			   "%s for testing", filename) - 1;
	if ( rlen != f.write ( req , rlen , 0 ) )
		return log("build: injecttest: Failed to write %"INT32" "
			   "bytes to %s", rlen,filename) - 1;
	f.close();
	mfree ( req , reqLen , "injecttest" );
	Host *h = g_hostdb.getHost(hid);
	// FIX: getHost() returns NULL for a bad hostId; the old code
	// dereferenced h->m_ip unconditionally and crashed.
	if ( ! h ) return log("build: injecttest: hostId %"INT32" is "
			      "invalid.",hid) - 1;
	char *ips = iptoa(h->m_ip);
	// now inject the file
	return injectFile ( filename , ips , "main");
}
// Shared state for the file-injection machinery (injectFile() and its
// callbacks below). These are file-scope statics because the inject loop is
// driven asynchronously by TCP callbacks and a registered sleep callback.
#define MAX_INJECT_SOCKETS 300
//#define MAX_INJECT_SOCKETS 1
// forward declarations for the injection driver callbacks
static void doInject     ( int fd , void *state ) ;
static void doInjectWarc ( int64_t fsize );
static void doInjectArc  ( int64_t fsize );
static void injectedWrapper ( void *state , TcpSocket *s ) ;
// TCP server used to send inject requests
static TcpServer s_tcp;
// the file being injected and our current read offset into it
static File    s_file;
static int64_t s_off = 0; // offset into file
// destination ip/port for the inject requests
static int32_t s_ip;
static int16_t s_port;
// alternate hosts table used for round-robin injection
static Hostdb  s_hosts2;
static int32_t s_rrn = 0;
static int32_t s_registered = 1;
// cap on concurrently outstanding inject sockets
static int32_t s_maxSockets  = MAX_INJECT_SOCKETS;
static int32_t s_outstanding = 0;
// injection mode flags
static bool    s_isDelete;
static int32_t s_injectTitledb;
static int32_t s_injectWarc;
static int32_t s_injectArc;
static char *s_coll = NULL;
// current titledb key when injecting from titledb
static key_t s_titledbKey;
// per-socket request buffers and the docids they carry
static char     *s_req  [MAX_INJECT_SOCKETS];
static int64_t   s_docId[MAX_INJECT_SOCKETS];
// one-time init flag; NOTE(review): declared char but assigned bool — confirm
static char s_init5 = false;
static int64_t s_endDocId;
// Inject the contents of a local file into a gigablast host or cluster via
// /inject HTTP requests. The input mode is chosen from "filename":
// - "itemlist.txt-N": treat ./itemlist.txt as a list of archive.org item
//   dirs, take every 40th line (offset N), download their *arc.gz files
//   with './ia', gunzip and inject via './gbi', then exit(0).
// - "titledb-<suffix>": scan title recs out of a local titledb dir and
//   re-inject them (sets s_injectTitledb; resumes from
//   ./lastinjectdocid.dat when present).
// - "*.warc" / "*.arc": record-by-record injection handled later by
//   doInjectWarc()/doInjectArc() (sets s_injectWarc/s_injectArc).
// - otherwise: a flat file of documents delimited by "+++URL: " lines.
// "ips" is "ip[:port]" of the target host, or a *.conf filename meaning
// round-robin over all hosts in that hosts.conf. "coll" is the target
// collection (NULL defaults to "main"). Enters g_loop.runLoop() and
// normally terminates the process with exit(0) when injection completes;
// returns negative only on setup errors.
int injectFile ( char *filename , char *ips ,
//int64_t startDocId ,
//int64_t endDocId ,
//bool isDelete ) {
char *coll ) {
// or part of an itemlist.txt-N
int flen2 = gbstrlen(filename);
if ( flen2>=14 && strncmp(filename,"itemlist.txt",12)==0 ) {
// must have -N
int split = atoi(filename+13);
log("inject: using part file of itemlist.txt of %i",split);
// open it
SafeBuf sb;
sb.load("./itemlist.txt");
// scan the lines
char *p = sb.getBufStart();
char *pend = p + sb.length();
int count = 0;
char *nextLine = NULL;
for ( ; p && p < pend ; p = nextLine ) {
nextLine = strstr(p,"\n");
if ( nextLine ) nextLine++;
// this is how many hosts we are using!!
// TODO: make this get from hosts.conf!!!
if ( count >= 40 ) count = 0;
// only process the lines assigned to our split
if ( count++ != split ) continue;
// get line
char *archiveDirName = p;
if ( nextLine ) nextLine[-1] = '\0';
// download the archive
SafeBuf cmd;
cmd.safePrintf("./ia download "
//"--format=\"Web ARChive GZ\" "
"--glob='*arc.gz' "
"%s"
,archiveDirName);
gbsystem(cmd.getBufStart());
// now inject the warc gz files in there
Dir dir;
dir.set ( p );
dir.open();
log("setting dir to %s",p);
subloop:
char *xarcFilename = dir.getNextFilename("*arc.gz");
// get next archive
if ( ! xarcFilename ) {
cmd.reset();
// remove the archive dir when done if
// no more warc.gz files in it
cmd.safePrintf("rm -rf %s",archiveDirName);
gbsystem(cmd.getBufStart());
// download the next archive using 'ia'
continue;
}
int32_t flen = gbstrlen(xarcFilename);
// ext points at the last 7 chars, e.g. "warc.gz" or "oarc.gz"
char *ext = xarcFilename + flen -7;
// gunzip to foo.warc or foo.arc depending!
char *es = "";
if ( ext[0] == 'w' ) es = "w";
// inject the warc.gz files
cmd.reset();
cmd.safePrintf("gunzip -c %s/%s > ./foo%i.%sarc"
,archiveDirName,xarcFilename,split,es);
gbsystem(cmd.getBufStart());
// now inject it
cmd.reset();
cmd.safePrintf("./gbi inject ./foo%i.%sarc hosts.conf"
,split,es);
gbsystem(cmd.getBufStart());
goto subloop;
}
exit(0);
// NOTE(review): unreachable — exit(0) above terminates the process
log("cmd: done injecting archives for split %i",split);
}
bool isDelete = false;
int64_t startDocId = 0LL;
int64_t endDocId = MAX_DOCID;
// run with a 4GB memory ceiling for the injector process
g_conf.m_maxMem = 4000000000LL;
g_mem.init ( );//4000000000LL );
// set up the loop
if ( ! g_loop.init() ) return log("build: inject: Loop init "
"failed.")-1;
// init the tcp server, client side only
// NOTE(review): this path returns false (0) instead of a negative
// value like the other error paths — confirm callers treat 0 as error
if ( ! s_tcp.init( NULL , // requestHandlerWrapper ,
getMsgSize,
NULL , // getMsgPiece ,
0 , // port, only needed for server ,
&s_maxSockets ) ) return false;
s_tcp.m_doReadRateTimeouts = false;
s_isDelete = isDelete;
// one-time init of the outstanding-request map
if ( ! s_init5 ) {
s_init5 = true;
for ( int32_t i = 0; i < MAX_INJECT_SOCKETS ; i++ )
s_req[i] = NULL;
}
// get host
//Host *h = g_hostdb.getHost ( hid );
//if ( ! h ) return log("build: inject: Hostid %"INT32" is invalid.",
// hid)-1;
// split "ip:port" in place if a port was given
char *colon = strstr(ips,":");
int32_t port = 8000;
if ( colon ) {
*colon = '\0';
port = atoi(colon+1);
}
int32_t ip = 0;
// is ip field a hosts.conf instead? that means to round robin.
if ( strstr(ips,".conf") ) {
// NOTE(review): init(-1) does not receive "ips"; presumably it
// loads the default hosts.conf — verify before relying on a
// custom hosts.conf name here
if ( ! s_hosts2.init ( -1 ) ) { // ips , 0 ) ) {
fprintf(stderr,"failed to load %s",ips);
exit(0);
}
// 0/0 signals round-robin mode to doInject()
s_ip = 0;
s_port = 0;
}
else {
ip = atoip(ips,strlen(ips));
if ( ip == 0 || ip == -1 ) {
log("provided ip \"%s\" is a bad ip. "
"exiting\n",ips);
exit(0);
}
if ( port == 0 || port == -1 ) {
log("bad port. exiting\n");
exit(0);
}
s_ip = ip;//h->m_ip;
s_port = port;//h->m_httpPort;
}
s_injectTitledb = false;
//char *coll = "main";
// titledb mode: fake up just enough of collectiondb/RdbBase so Msg5
// can read records from a titledb directory without a full startup
if ( strncmp(filename,"titledb",7) == 0 ) {
//int32_t hostId = 0;
//Host *h = g_hostdb.getHost ( hostId );
//if ( ! h ) { log("db: No host has id %"INT32".",hostId); exit(0);}
//if ( ! g_conf.init ( h->m_dir ) ) { // , h->m_hostId ) ) {
// log("db: Conf init failed." ); exit(0); }
// a new thing, titledb-gk144 or titledb-coll.main.0
// init the loop, needs g_conf
if ( ! g_loop.init() ) {
log("db: Loop init failed." ); exit(0); }
// set up the threads, might need g_conf
if ( ! g_threads.init() ) {
log("db: Threads init failed." ); exit(0); }
s_injectTitledb = true;
s_titledbKey.setMin();
// read where we left off from file if possible
char fname[256];
//sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir);
sprintf(fname,"./lastinjectdocid.dat");
SafeBuf ff;
ff.fillFromFile(fname);
if ( ff.length() > 1 ) {
int64_t ffdocId = atoll(ff.getBufStart() );
// if process got killed in the middle of write
// i guess the stored docid could be corrupted!
// so make sure its in startDocId,endDocId range
if ( ffdocId > 0 &&
ffdocId >= startDocId &&
ffdocId < endDocId )
startDocId = ffdocId;
else
log("build: saved docid %"INT64" not "
"in [%"INT64",%"INT64"]",
ffdocId,
startDocId,
endDocId );
}
if ( startDocId != 0LL )
s_titledbKey = g_titledb.makeFirstKey(startDocId);
s_endDocId = endDocId;
// so we do not try to merge files, or write any data:
g_dumpMode = true;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
//g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
//g_conf.m_urldbMaxDiskPageCacheMem = 0;
// . add a fake coll just for it.
// . make the subdir just gk144 not coll.gk144.0 so rick
// can inject the titledb bigfile
//g_collectiondb.init(true);
/*
g_collectiondb.addRec ( "poo" , // char *coll ,
NULL , // char *cpc ,
0 , // int32_t cpclen ,
false , // bool isNew ,
-1 , // collnum_t collnum ,
false , // bool isDump ,
false ); // bool saveIt
*/
// hand-build a single-collection collectiondb
CollectionRec *cr = new (CollectionRec);
SafeBuf *rb = &g_collectiondb.m_recPtrBuf;
rb->reserve(4);
g_collectiondb.m_recs = (CollectionRec **)rb->getBufStart();
g_collectiondb.m_recs[0] = cr;
// right now this is just for the main collection
char *coll = "main";
addCollToTable ( coll , (collnum_t) 0 );
// force RdbTree.cpp not to bitch about corruption
// assume we are only getting out collnum 0 recs i guess
g_collectiondb.m_numRecs = 1;
g_titledb.init ();
//g_titledb.getRdb()->addRdbBase1(coll );
// msg5::readList() requires the RdbBase for collnum 0
// which holds the array of files and the tree
Rdb *rdb = g_titledb.getRdb();
static RdbBase *s_base = new ( RdbBase );
// so getRdbBase always returns
rdb->m_collectionlessBase = s_base;
rdb->m_isCollectionLess = true;
//CollectionRec *pcr = g_collectiondb.getRec((collnum_t)0);
//pcr->m_bases[RDB_TITLEDB] = s_base;
// dir for tree loading
sprintf(g_hostdb.m_dir , "./" );
rdb->loadTree();
// titledb-
// need a suffix after "titledb-" naming the data subdir
if ( gbstrlen(filename)<=8 )
return log("build: need titledb-coll.main.0 or "
"titledb-gk144 not just 'titledb'");
char *coll2 = filename + 8;
char tmp[1024];
sprintf(tmp,"./%s",coll2);
// manually mirror the rdb's parameters into the fake base
s_base->m_dir.set(tmp);
strcpy(s_base->m_dbname,rdb->m_dbname);
s_base->m_dbnameLen = gbstrlen(rdb->m_dbname);
s_base->m_coll = "main";
s_base->m_collnum = (collnum_t)0;
s_base->m_rdb = rdb;
s_base->m_fixedDataSize = rdb->m_fixedDataSize;
s_base->m_useHalfKeys = rdb->m_useHalfKeys;
s_base->m_ks = rdb->m_ks;
s_base->m_pageSize = rdb->m_pageSize;
s_base->m_isTitledb = rdb->m_isTitledb;
// huge minToMerge so we never kick off a merge
s_base->m_minToMerge = 99999;
// try to set the file info now!
s_base->setFiles();
}
else {
// open file
s_file.set ( filename );
if ( ! s_file.open ( O_RDONLY ) )
return log("build: inject: Failed to open file %s "
"for reading.", filename) - 1;
s_off = 0;
}
// this might be a compressed warc like .warc.gz
s_injectWarc = false;
s_injectArc = false;
int flen = gbstrlen(filename);
if ( flen>5 && strcasecmp(filename+flen-5,".warc")==0 ) {
s_injectWarc = true;
}
// NOTE(review): flen>5 misses a minimal 5-char name like "x.arc";
// presumably this should be flen>4 — confirm before changing
if ( flen>5 && strcasecmp(filename+flen-4,".arc")==0 ) {
s_injectArc = true;
}
s_coll = coll;
if ( ! s_coll ) s_coll = "main";
// register sleep callback to get started
if ( ! g_loop.registerSleepCallback(1, NULL, doInject) )
return log("build: inject: Loop init failed.")-1;
// run the loop
if ( ! g_loop.runLoop() ) return log("build: inject: Loop "
"run failed.")-1;
// dummy return
return 0;
}
// Event-loop callback that pumps documents into the target host(s). On the
// first call it unregisters its own sleep callback; afterwards it is
// re-entered (via injectedWrapper) whenever an outstanding socket finishes.
// Each iteration builds one /inject HTTP POST — either from the next
// titledb record (s_injectTitledb mode) or from the next "+++URL: "
// delimited document in s_file — and sends it with s_tcp.sendMsg(),
// keeping up to MAX_INJECT_SOCKETS requests outstanding. WARC/ARC inputs
// are delegated to doInjectWarc()/doInjectArc(). Exits the process when
// the input is exhausted.
void doInject ( int fd , void *state ) {
if ( s_registered ) {
s_registered = 0;
g_loop.unregisterSleepCallback ( NULL, doInject );
}
// turn off threads so this happens right away
g_conf.m_useThreads = false;
// fsize is only needed (and only set) for the file-based modes
int64_t fsize ;
if ( ! s_injectTitledb ) fsize = s_file.getFileSize();
// just repeat the function separately. i guess we'd repeat
// some code but for simplicity i think it is worth it. and we
// should probably phase out the ++++URL: format thing.
if ( s_injectWarc ) {
doInjectWarc ( fsize );
return;
}
if ( s_injectArc ) {
doInjectArc ( fsize );
return;
}
loop:
int32_t reqLen;
int32_t reqAlloc;
char *req;
// if reading from our titledb and injecting into another cluster
if ( s_injectTitledb ) {
// turn off threads so this happens right away
g_conf.m_useThreads = false;
key_t endKey; //endKey.setMax();
endKey = g_titledb.makeFirstKey(s_endDocId);
RdbList list;
Msg5 msg5;
Msg5 msg5b;
char *coll = "main";
CollectionRec *cr = g_collectiondb.getRec(coll);
// synchronously read the next title rec(s) from disk/tree
msg5.getList ( RDB_TITLEDB ,
cr->m_collnum,
&list ,
(char *)&s_titledbKey ,
(char *)&endKey ,
100 , // minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1, // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b );
// all done if empty
if ( list.isEmpty() ) { g_loop.reset(); exit(0); }
// loop over entries in list
list.getCurrentKey((char *) &s_titledbKey);
// advance for next
s_titledbKey += 1;
// is it a delete?
char *rec = list.getCurrentRec ();
int32_t recSize = list.getCurrentRecSize();
// skip negative keys!
if ( (rec[0] & 0x01) == 0x00 ) goto loop;
// re-enable threads i guess
g_conf.m_useThreads = true;
// set and uncompress
//TitleRec tr;
XmlDoc xd;
if ( ! xd.set2 ( rec ,
recSize ,
coll ,
NULL , // safebuf
0 , // niceness
NULL ) ) { // spiderrequest
log("build: inject skipping corrupt title rec" );
goto loop;
}
// sanity!
if ( xd.size_utf8Content > 5000000 ) {
log("build: inject skipping huge title rec" );
goto loop;
}
// get the content length. uenc can be 2140 bytes! seen it!
reqAlloc = xd.size_utf8Content + 6000;
// make space for content
req = (char *)mmalloc ( reqAlloc , "maininject" );
if ( ! req ) {
log("build: inject: Could not allocate %"INT32" bytes for "
"request at offset %"INT64"",reqAlloc,s_off);
exit(0);
}
char *ipStr = iptoa(xd.m_ip);
// encode the url
char *url = xd.getFirstUrl()->getUrl();
char uenc[5000];
urlEncode ( uenc , 4000 , url , strlen(url) , true );
char *content = xd.ptr_utf8Content;
int32_t contentLen = xd.size_utf8Content;
// size_utf8Content includes the trailing \0; drop it
if ( contentLen > 0 ) contentLen--;
// temporarily null-terminate content for the sprintf below
char c = content[contentLen];
content[contentLen] = '\0';
//log("inject: %s",xd.m_firstUrl.m_url);
// form what we would read from disk
reqLen = sprintf(req,
// print as unencoded content for speed
"POST /inject HTTP/1.0\r\n"
"Content-Length: 000000000\r\n"//placeholder
"Content-Type: text/html\r\n"
"Connection: Close\r\n"
"\r\n"
// now the post cgi parms
"c=%s&"
// quick docid only reply
"quick=1&"
// url of injecting page
"u=%s&"
"ip=%s&"
//"firstip=%s&"
"firstindexed=%"UINT32"&"
"lastspidered=%"UINT32"&"
// prevent looking up firstips
// on all outlinks for speed:
"spiderlinks=0&"
"hopcount=%"INT32"&"
"newonly=2&" // only inject if new
"dontlog=1&"
"charset=%"INT32"&"
"ucontent="
// first the mime
//"HTTP 200\r\n"
//"Connection: Close\r\n"
//"Content-Type: text/html\r\n"
//"Content-Length: %"INT32"\r\n"
//"\r\n"
// then the content of the injecting page
"%s"
, coll
, uenc
, ipStr
//, ipStr
, xd.m_firstIndexedDate
, xd.m_spideredTime
, (int32_t)*xd.getHopCount()
, (int32_t)xd.m_charset
//, contentLen
, content
);
// restore the byte we clobbered above
content[contentLen] = c;
if ( reqLen >= reqAlloc ) {
log("inject: bad engineer here");
char *xx=NULL;*xx=0;
}
// set content length
char *start = strstr(req,"c=");
int32_t realContentLen = strlen(start);
char *ptr = req ;
// find start of the 9 zeroes
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
// store length there
sprintf ( ptr , "%09"UINT32"" , realContentLen );
// remove the \0
ptr += strlen(ptr); *ptr = '\r';
// map it
// record the outstanding request so injectedWrapper() can
// checkpoint the docid when the reply arrives
int32_t i; for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) {
// skip if occupied
if ( s_req[i] ) continue;
s_req [i] = req;
s_docId[i] = xd.m_docId;
break;
}
if ( i >= MAX_INJECT_SOCKETS )
log("build: could not add req to map");
}
else {
// are we done?
if ( s_off >= fsize ) {
log("inject: done parsing file");
g_loop.reset();
exit(0);
}
// read the mime
// ~1MB stack buffer; docs larger than this abort below
char buf [ 1000*1024 ];
int32_t maxToRead = 1000*1024;
int32_t toRead = maxToRead;
if ( s_off + toRead > fsize ) toRead = fsize - s_off;
int32_t bytesRead = s_file.read ( buf , toRead , s_off ) ;
if ( bytesRead != toRead ) {
log("build: inject: Read of %s failed at offset "
"%"INT64"", s_file.getFilename(), s_off);
exit(0);
}
char *fend = buf + toRead;
char *pbuf = buf;
// partap padding?
if ( pbuf[0] == '\n' ) pbuf++;
if ( pbuf[0] == '\n' ) pbuf++;
// need "++URL: "
for ( ; *pbuf && strncmp(pbuf,"+++URL: ",8) ; pbuf++ );
// none?
if ( ! *pbuf ) {
log("inject: done!");
exit(0);
}
// sometimes line starts with "URL: http://www.xxx.com/\n"
char *url = pbuf + 8; // NULL;
// skip over url
pbuf = strchr(pbuf,'\n');
// null term url
*pbuf = '\0';
// log it
log("inject: injecting url %s",url);
// debug
//if ( strstr(url,"worldexecutive.com") )
// log("poo");
// skip to next line
pbuf++;
// get offset into "buf"
int32_t len = pbuf - buf;
// subtract that from toRead so it is the available bytes left
toRead -= len;
// advance this for next read
s_off += len;
//if ( ! strncmp(pbuf,"URL: ", 5 ) ) {
// if it's not a mime header assume just a url
//if ( strncmp(pbuf,"GET /",5) &&
// strncmp(pbuf,"POST /",6) ) {
// skip "URL: "
/*
if ( strncmp(pbuf,"+++URL: ",8) == 0 )
url = pbuf + 8;
else
url = pbuf;
// find \n
pbuf = strchr(pbuf,'\n');
*pbuf = '\0';
pbuf++;
int32_t len = pbuf - buf;
toRead -= len;
s_off += len;
}
*/
// should be a mime that starts with GET or POST
//char *mimePtr = pbuf;
HttpMime m;
if ( ! m.set ( pbuf , toRead , NULL ) ) {
if ( toRead > 128 ) toRead = 128;
pbuf [ toRead ] = '\0';
log("build: inject: Failed to set mime at offset "
"%"INT64" where request=%s",s_off,buf);
exit(0);
}
// find the end of it, the next "URL: " line or
// end of file
char *p = pbuf;
char *contentPtrEnd = fend;
for ( ; p < fend ; p++ ) {
if ( p[0] == '+' &&
p[1] == '+' &&
p[2] == '+' &&
p[3] == 'U' &&
p[4] == 'R' &&
p[5] == 'L' &&
p[6] == ':' &&
p[7] == ' ' ) {
contentPtrEnd = p;
break;
}
}
// point to the content (NOW INCLUDE MIME!)
char *contentPtr = pbuf;// + m.getMimeLen();
int32_t contentPtrLen = contentPtrEnd - contentPtr;
// if the doc ran to the end of a full read, it may be
// truncated — bail rather than inject a partial doc
if ( contentPtrEnd == fend && bytesRead == maxToRead ) {
log("inject: not reading enough content to inject "
"url %s . increase maxToRead from %"INT32"",url,
maxToRead);
exit(0);
}
// get the length of content (includes the submime for
// injection)
int32_t contentLen = m.getContentLen();
if ( ! url && contentLen == -1 ) {
log("build: inject: Mime at offset %"INT64" does not "
"specify required Content-Length: XXX field.",
s_off);
exit(0);
}
// alloc space for mime and content
//reqAlloc = 5000;
//if ( ! url ) reqAlloc += m.getMimeLen() + contentLen ;
reqAlloc = contentPtrLen + 2 + 6000;
// make space for content
req = (char *)mmalloc ( reqAlloc , "maininject" );
if ( ! req ) {
log("build: inject: Could not allocate %"INT32" bytes for "
"request at offset %"INT64"",reqAlloc,s_off);
exit(0);
}
char *rp = req;
// a different format?
//if ( url ) {
char *ipStr = "1.2.3.4";
//int32_t recycle = 0;
//if ( s_isDelete ) recycle = 1;
rp += sprintf(rp,
"POST /inject HTTP/1.0\r\n"
"Content-Length: 000000000\r\n"//bookmrk
"Content-Type: text/html\r\n"
"Connection: Close\r\n"
"\r\n"
"c=main&"
// do parsing consistency testing (slower!)
//"dct=1&"
// mime is in the "&ucontent=" parm
"hasmime=1&"
// prevent looking up firstips
// on all outlinks for speed:
"spiderlinks=0&"
"quick=1&" // quick reply
"dontlog=1&"
"ip=%s&"
//"recycle=%"INT32"&"
"deleteurl=%"INT32"&"
"u=",
ipStr,
//recycle,
(int32_t)s_isDelete);
// url encode the url
rp += urlEncode ( rp , 4000 , url , gbstrlen(url) );
// finish it up
rp += sprintf(rp,"&ucontent=");
//}
if ( ! url ) {
// what is this?
// url is always non-NULL here (set above); hitting
// this branch is treated as a hard bug
char *xx=NULL;*xx=0;
/*
// stick mime in there
gbmemcpy ( rp , mimePtr , m.getMimeLen() );
// skip that
rp += m.getMimeLen();
// turn \n\n into \r\n\r\n
if ( rp[-2] == '\n' && rp[-1] == '\n' ) {
rp[-2] = '\r';
rp[ 0] = '\r';
rp[ 1] = '\n';
rp += 2;
}
// advance
s_off += m.getMimeLen();
// read from file into content
int32_t contRead = contentLen;
if ( s_off + contRead > fsize ) {
log("build: inject: Content-Length of %"INT32" "
"specified "
"for content at offset %"INT64" would breech "
"EOF",
contentLen,s_off);
exit(0);
}
if ( contRead != s_file.read ( rp ,contRead , s_off)) {
log("build: inject: Read of %s failed at "
"offset %"INT64"",
s_file.getFilename(), s_off);
exit(0);
}
// skip that
rp += contRead;
// success
s_off += contRead;
*/
}
// store the content after the &ucontent
gbmemcpy ( rp , contentPtr , contentPtrLen );
rp += contentPtrLen;
s_off += contentPtrLen;
// just for ease of display
*rp = '\0';
// set content length
char *start = strstr(req,"c=");
int32_t realContentLen = gbstrlen(start);
char *ptr = req ;
// find start of the 9 zeroes
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
// store length there
sprintf ( ptr , "%09"UINT32"" , realContentLen );
// remove the \0
ptr += strlen(ptr); *ptr = '\r';
// set this
reqLen = rp - req;
// sanity
if ( reqLen > reqAlloc ) { char *xx=NULL;*xx=0; }
}
int32_t ip = s_ip;
int32_t port = s_port;
// try hosts.conf
if ( ip == 0 ) {
// round robin over hosts in s_hosts2
if ( s_rrn >= s_hosts2.getNumHosts() ) s_rrn = 0;
Host *h = s_hosts2.getHost ( s_rrn );
ip = h->m_ip;
port = h->m_httpPort;
s_rrn++;
}
// now inject it
bool status = s_tcp.sendMsg ( ip ,
port ,
req ,
reqAlloc ,//Len ,
reqLen ,
reqLen ,
NULL ,
injectedWrapper ,
9999*60*1000 , // timeout, 60days
-1 , // maxTextDocLen
-1 );// maxOtherDocLen
// launch another if blocked
//if ( ! status ) return;
// status==false means the send is in progress (blocked); keep
// launching more until we hit the socket cap
if ( ! status ) {
//int32_t nh = g_hostdb.getNumHosts();
//nh = (nh * 15) / 10;
//if ( nh > MAX_INJECT_SOCKETS - 10 )
// nh = MAX_INJECT_SOCKETS - 10;
//if ( nh < 5 ) nh = 5;
// limit to one socket right now
//if ( ++s_outstanding < 1 ) goto loop;
if ( ++s_outstanding < MAX_INJECT_SOCKETS ) goto loop;
return;
}
if ( g_errno )
log("build: inject had error: %s.",mstrerror(g_errno));
// free if did not block, tcpserver frees on immediate error
else
mfree ( req , reqAlloc , "maininject" );
// loop if not
goto loop;
}
// 100MB per warc rec max
#define MAXWARCRECSIZE 100*1024*1024
void doInjectWarc ( int64_t fsize ) {
static char *s_buf = NULL;
static bool s_hasMoreToRead;
static char *s_pbuf = NULL;
static char *s_pbufEnd = NULL;
bool needReadMore = false;
if ( ! s_pbuf ) needReadMore = true;
readMore:
if ( needReadMore ) {
log("inject: reading %"INT64" bytes more of warc file"
,(int64_t)MAXWARCRECSIZE);
// are we done?
if ( s_off >= fsize ) {
log("inject: done parsing warc file");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
g_loop.reset();
exit(0);
}
// read 1MB of data into this buf to get the first WARC record
// it must be < 1MB or we faulter.
if ( ! s_buf ) {
int64_t need = MAXWARCRECSIZE + 1;
s_buf = (char *)mmalloc ( need ,"sibuf");
}
if ( ! s_buf ) {
log("inject: failed to alloc buf");
exit(0);
}
int32_t maxToRead = MAXWARCRECSIZE;
int32_t toRead = maxToRead;
s_hasMoreToRead = true;
if ( s_off + toRead > fsize ) {
toRead = fsize - s_off;
s_hasMoreToRead = false;
}
int32_t bytesRead = s_file.read ( s_buf , toRead , s_off ) ;
if ( bytesRead != toRead ) {
log("inject: read of %s failed at offset "
"%"INT64"", s_file.getFilename(), s_off);
exit(0);
}
// null term what we read
s_buf[bytesRead] = '\0';
// if not enough to constitute a WARC record probably just new lines
if( toRead < 20 ) {
log("inject: done processing file");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
// mark the end of what we read
//char *fend = buf + toRead;
// point to what we read
s_pbuf = s_buf;
s_pbufEnd = s_buf + bytesRead;
}
loop:
char *realStart = s_pbuf;
// need at least say 100k for warc header
if ( s_pbuf + 100000 > s_pbufEnd && s_hasMoreToRead ) {
needReadMore = true;
goto readMore;
}
// find "WARC/1.0" or whatever
char *whp = s_pbuf;
for ( ; *whp && strncmp(whp,"WARC/",5) ; whp++ );
// none?
if ( ! *whp ) {
log("inject: could not find WARC/1 header start for file=%s",
s_file.getFilename());
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
char *warcHeader = whp;
// find end of warc mime HEADER not the content
char *warcHeaderEnd = strstr(warcHeader,"\r\n\r\n");
if ( ! warcHeaderEnd ) {
log("inject: could not find end of WARC header for file=%s.",
s_file.getFilename());
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
// \0 term for strstrs below
*warcHeaderEnd = '\0';
//warcHeaderEnd += 4;
char *warcContent = warcHeaderEnd + 4;
// get WARC-Type:
// revisit (if url was already done before)
// request (making a GET or DNS request)
// response (reponse to a GET or dns request)
// warcinfo (crawling parameters, robots: obey, etc)
// metadata (fetchTimeMs: 263, hopsFromSeed:P,outlink:)
char *warcType = strstr(warcHeader,"WARC-Type:");
if ( ! warcType ) {
log("inject: could not find WARC-Type:");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
warcType += 10;
for ( ; is_wspace_a(*warcType); warcType++ );
// get Content-Type:
// application/warc-fields (fetch time, hops from seed)
// application/http; msgtype=request (the GET request)
// application/http; msgtype=response (the GET reply)
char *warcConType = strstr(warcHeader,"Content-Type:");
if ( ! warcConType ) {
log("inject: could not find Content-Type:");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
warcConType += 13;
for ( ; is_wspace_a(*warcConType); warcConType++ );
// get Content-Length: of WARC header for its content
char *warcContentLenStr = strstr(warcHeader,"Content-Length:");
if ( ! warcContentLenStr ) {
log("inject: could not find WARC "
"Content-Length:");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
warcContentLenStr += 15;
for(;is_wspace_a(*warcContentLenStr);warcContentLenStr++);
// get warc content len
int64_t warcContentLen = atoll(warcContentLenStr);
char *warcContentEnd = warcContent + warcContentLen;
uint64_t oldOff = s_off;
uint64_t recSize = (warcContentEnd - realStart);
// point to end of this warc record
s_pbuf += recSize;
// if we fall outside of the current read buf then re-read
if ( s_pbuf > s_pbufEnd ) {
if ( ! s_hasMoreToRead ) {
log("inject: warc file exceeded file length.");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
if ( recSize > MAXWARCRECSIZE ) {
log("inject: skipping warc file of %"INT64" "
"bytes which is too big",recSize);
s_off += recSize;
}
needReadMore = true;
goto readMore;
}
// advance this for next read from the file
s_off += recSize; // (warcContentEnd - realStart);//s_buf);
// if WARC-Type: is not response, skip it. so if it
// is a revisit then skip it i guess.
if ( strncmp ( warcType,"response", 8 ) ) {
// read another warc record
goto loop;
}
// warcConType needs to be
// application/http; msgtype=response
if ( strncmp(warcConType,"application/http; msgtype=response", 34) ) {
// read another warc record
goto loop;
}
char *warcDateStr = strstr(warcHeader,"WARC-Date:");
if ( warcDateStr ) warcDateStr += 10;
for(;is_wspace_a(*warcDateStr);warcDateStr++);
// convert to timestamp
int64_t warcTime = 0;
if ( warcDateStr ) warcTime = atotime ( warcDateStr );
// set the url now
char *url = strstr(warcHeader,"WARC-Target-URI:");
if ( url ) url += 16;
// skip spaces
for ( ; url && is_wspace_a(*url) ; url++ );
if ( ! url ) {
log("inject: could not find WARC-Target-URI:");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
// find end of it
char *urlEnd = url;
for (;urlEnd&&*urlEnd&&is_urlchar(*urlEnd);urlEnd++);
// null term url
//char c = *urlEnd;
*urlEnd = '\0';
char *httpReply = warcContent;
int64_t httpReplySize = warcContentLen;
// sanity check
//char *bufEnd = s_buf + MAXWARCRECSIZE;
if ( httpReply + httpReplySize >= s_pbufEnd ) {
int needMore = httpReply + httpReplySize - s_pbufEnd;
log("inject: not reading enough content to inject "
"url %s . increase MAXWARCRECSIZE by %"INT32" more",url,
needMore);
exit(0);
}
// put it back
//*urlEnd = c;
// should be a mime that starts with GET or POST
HttpMime m;
if ( ! m.set ( httpReply , httpReplySize , NULL ) ) {
// if ( httpReplySize > 128 ) httpReplySize = 128;
// httpReply [ httpReplySize ] = '\0';
// log("build: inject: Failed to set mime at offset "
// "%"INT64" where request=%s",s_off,httpReply);
log("inject: failed to set http mime at %"INT64" in file"
,oldOff);
goto loop;
// exit(0);
}
// check content type
int ct = m.getContentType();
if ( ct != CT_HTML &&
ct != CT_TEXT &&
ct != CT_XML &&
ct != CT_JSON ) {
goto loop;
}
SafeBuf req;
// a different format?
char *ipStr = "1.2.3.4";
req.safePrintf(
"POST /admin/inject HTTP/1.0\r\n"
"Content-Length: 000000000\r\n"//bookmrk
"Content-Type: text/html\r\n"
"Connection: Close\r\n"
"\r\n"
// we need this ?
"?"
"c=%s&"
// do parsing consistency testing (slower!)
//"dct=1&"
"hasmime=1&"
// prevent looking up firstips
// on all outlinks for speed:
"spiderlinks=0&"
"quick=1&" // quick reply
"dontlog=0&"
// do not do re-injects. should save a TON of time
"newonly=1&"
"lastspidered=%"INT64"&"
"firstindexed=%"INT64"&"
"deleteurl=0&"
"ip=%s&"
//"recycle=%"INT32"&"
//"delete=%"INT32"&"
"u="
,s_coll
,warcTime
,warcTime
,ipStr
//recycle,
);
// url encode the url
req.urlEncode ( url );
// finish it up
req.safePrintf("&content=");
// store the content after the &ucontent
req.urlEncode ( httpReply , httpReplySize );
req.nullTerm();
// replace 00000 with the REAL content length
char *start = strstr(req.getBufStart(),"c=");
int32_t realContentLen = gbstrlen(start);
char *ptr = req.getBufStart() ;
// find start of the 9 zeroes
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
// store length there
sprintf ( ptr , "%09"UINT32"" , realContentLen );
// remove the \0
ptr += strlen(ptr); *ptr = '\r';
int32_t ip = s_ip;
int32_t port = s_port;
// try hosts.conf
if ( ip == 0 ) {
// round robin over hosts in s_hosts2
if ( s_rrn >= s_hosts2.getNumHosts() ) s_rrn = 0;
Host *h = s_hosts2.getHost ( s_rrn );
ip = h->m_ip;
port = h->m_httpPort;
s_rrn++;
}
// log it
log("inject: injecting to %s:%i WARC url %s",iptoa(ip),(int)port,url);
// now inject it
bool status = s_tcp.sendMsg ( ip ,
port ,
req.getBufStart() ,
req.getCapacity(),
req.length(),
req.length(),
NULL ,
injectedWrapper ,
// because it seems some sockets get stuck and
// they have no reply but the host they are
// connected to no longer has the connection
// open. and the readbuf is empty, but the send
// buf has been sent and it appears the inject
// when through. just the reply was never
// sent back for some reason.
5*60*1000 , // timeout, 5 mins
-1 , // maxTextDocLen
-1 );// maxOtherDocLen
int realMax = 10;
if ( s_hosts2.getNumHosts() > 1 )
realMax = s_hosts2.getNumHosts() * 2;
// launch another if blocked
if ( ! status ) {
// let injectedWrapper() below free it
req.detachBuf();
//int32_t nh = g_hostdb.getNumHosts();
//nh = (nh * 15) / 10;
//if ( nh > MAX_INJECT_SOCKETS - 10 )
// nh = MAX_INJECT_SOCKETS - 10;
//if ( nh < 5 ) nh = 5;
// limit to one socket right now
//if ( ++s_outstanding < 1 ) goto loop;
s_outstanding++;
if ( s_outstanding < MAX_INJECT_SOCKETS &&
s_outstanding < realMax )
goto loop;
return;
}
if ( g_errno ) {
// let tcpserver.cpp free it
req.detachBuf();
log("build: inject had error: %s.",mstrerror(g_errno));
}
// loop if not
goto loop;
}
void doInjectArc ( int64_t fsize ) {
static char *s_buf = NULL;
static bool s_hasMoreToRead;
static char *s_pbuf = NULL;
static char *s_pbufEnd = NULL;
bool needReadMore = false;
if ( ! s_pbuf ) needReadMore = true;
readMore:
if ( needReadMore ) {
log("inject: reading %"INT64" bytes more of arc file"
,(int64_t)MAXWARCRECSIZE);
// are we done?
if ( s_off >= fsize ) {
log("inject: done parsing arc file");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
g_loop.reset();
exit(0);
}
// read 1MB of data into this buf to get the first WARC record
// it must be < 1MB or we faulter.
if ( ! s_buf ) {
int64_t need = MAXWARCRECSIZE + 1;
s_buf = (char *)mmalloc ( need ,"sibuf");
}
if ( ! s_buf ) {
log("inject: failed to alloc buf");
exit(0);
}
int32_t maxToRead = MAXWARCRECSIZE;
int32_t toRead = maxToRead;
s_hasMoreToRead = true;
if ( s_off + toRead > fsize ) {
toRead = fsize - s_off;
s_hasMoreToRead = false;
}
int32_t bytesRead = s_file.read ( s_buf , toRead , s_off ) ;
if ( bytesRead != toRead ) {
log("inject: read of %s failed at offset "
"%"INT64"", s_file.getFilename(), s_off);
exit(0);
}
// null term what we read
s_buf[bytesRead] = '\0';
// if not enough to constitute a ARC record probably just new
// lines
if( toRead < 20 ) {
log("inject: done processing file");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
// mark the end of what we read
//char *fend = buf + toRead;
// point to what we read
s_pbuf = s_buf;
s_pbufEnd = s_buf + bytesRead;
}
loop:
char *realStart = s_pbuf;
// need at least say 100k for arc header
if ( s_pbuf + 100000 > s_pbufEnd && s_hasMoreToRead ) {
needReadMore = true;
goto readMore;
}
// find \n\nhttp://
char *whp = s_pbuf;
for ( ; *whp ; whp++ ) {
if ( whp[0] != '\n' ) continue;
if ( strncmp(whp+1,"http://",7) ) continue;
break;
}
// none?
if ( ! *whp ) {
log("inject: could not find next \\nhttp:// in arc file");
if ( s_outstanding ) {log("inject: waiting for socks");return;}
exit(0);
}
char *arcHeader = whp;
// find end of arc header not the content
char *arcHeaderEnd = strstr(arcHeader+1,"\n");
if ( ! arcHeaderEnd ) {
log("inject: could not find end of ARC header.");
exit(0);
}
// \0 term for strstrs below
*arcHeaderEnd = '\0';
char *arcContent = arcHeaderEnd + 1;
// parse arc header line
char *url = arcHeader + 1;
char *hp = url;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {log("inject: bad arc header 1.");exit(0);}
*hp++ = '\0';
char *ipStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {log("inject: bad arc header 2.");exit(0);}
*hp++ = '\0';
char *timeStr = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {log("inject: bad arc header 3.");exit(0);}
*hp++ = '\0'; // null term timeStr
char *arcConType = hp;
for ( ; *hp && *hp != ' ' ; hp++ );
if ( ! *hp ) {log("inject: bad arc header 4.");exit(0);}
*hp++ = '\0'; // null term arcContentType
char *arcContentLenStr = hp;
// this is already \0 terminated from above!
//for ( ; *hp && *hp != '\n' ; hp++ );
//if ( ! *hp ) {log("inject: bad arc header 5.");exit(0);}
//*hp++ = '\0'; // null term lenStr
// get arc content len
int64_t arcContentLen = atoll(arcContentLenStr);
char *arcContentEnd = arcContent + arcContentLen;
//uint64_t oldOff = s_off;
uint64_t recSize = (arcContentEnd - realStart);
// point to end of this arc record
s_pbuf += recSize;
// if we fall outside of the current read buf then re-read
if ( s_pbuf > s_pbufEnd ) {
if ( ! s_hasMoreToRead ) {
log("inject: arc file exceeded file length.");
if ( s_outstanding ) {
log("inject: waiting for socks");return;}
exit(0);
}
if ( recSize > MAXWARCRECSIZE ) {
log("inject: skipping arc file of %"INT64" "
"bytes which is too big",recSize);
s_off += recSize;
}
needReadMore = true;
goto readMore;
}
// advance this for next read from the file
s_off += recSize;
// arcConType needs to indexable
int32_t ct = getContentTypeFromStr ( arcConType );
if ( ct != CT_HTML &&
ct != CT_TEXT &&
ct != CT_XML &&
ct != CT_JSON ) {
// read another arc record
goto loop;
}
// convert to timestamp
int64_t arcTime = 0;
// this time structure, once filled, will help yield a time_t
struct tm t;
// DAY OF MONTH
t.tm_mday = atol2 ( timeStr + 6 , 2 );
// MONTH
t.tm_mon = atol2 ( timeStr + 4 , 2 );
// YEAR
t.tm_year = atol2 ( timeStr , 4 ) - 1900 ; // # of years since 1900
// TIME
t.tm_hour = atol2 ( timeStr + 8 , 2 );
t.tm_min = atol2 ( timeStr + 10 , 2 );
t.tm_sec = atol2 ( timeStr + 12 , 2 );
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
arcTime = timegm ( &t );
char *httpReply = arcContent;
int64_t httpReplySize = arcContentLen;
// sanity check
if ( httpReply + httpReplySize >= s_pbufEnd ) {
int needMore = httpReply + httpReplySize - s_pbufEnd;
log("inject: not reading enough content to inject "
"url %s . increase MAXWARCRECSIZE by %"INT32" more",url,
needMore);
exit(0);
}
SafeBuf req;
// a different format?
//char *ipStr = "1.2.3.4";
req.safePrintf(
"POST /admin/inject HTTP/1.0\r\n"
"Content-Length: 000000000\r\n"//bookmrk
"Content-Type: text/html\r\n"
"Connection: Close\r\n"
"\r\n"
// we need this ?
"?"
"c=%s&"
// do parsing consistency testing (slower!)
//"dct=1&"
"hasmime=1&"
// prevent looking up firstips
// on all outlinks for speed:
"spiderlinks=0&"
"quick=1&" // quick reply
"dontlog=0&"
// do not do re-injects. should save a TON of time
"newonly=1&"
"lastspidered=%"INT64"&"
"firstindexed=%"INT64"&"
"deleteurl=0&"
"ip=%s&"
//"recycle=%"INT32"&"
//"delete=%"INT32"&"
"u="
,s_coll
,arcTime
,arcTime
,ipStr
//recycle,
);
// url encode the url
req.urlEncode ( url );
// finish it up
req.safePrintf("&content=");
// store the content after the &ucontent
req.urlEncode ( httpReply , httpReplySize );
req.nullTerm();
// replace 00000 with the REAL content length
char *start = strstr(req.getBufStart(),"c=");
int32_t realContentLen = gbstrlen(start);
char *ptr = req.getBufStart() ;
// find start of the 9 zeroes
while ( *ptr != '0' || ptr[1] !='0' ) ptr++;
// store length there
sprintf ( ptr , "%09"UINT32"" , realContentLen );
// remove the \0
ptr += strlen(ptr); *ptr = '\r';
int32_t ip = s_ip;
int32_t port = s_port;
// try hosts.conf
if ( ip == 0 ) {
// round robin over hosts in s_hosts2
if ( s_rrn >= s_hosts2.getNumHosts() ) s_rrn = 0;
Host *h = s_hosts2.getHost ( s_rrn );
ip = h->m_ip;
port = h->m_httpPort;
s_rrn++;
}
// log it
log("inject: injecting ARC %s to %s:%i contentLen=%"INT64""
,url
,iptoa(ip)
,(int)port
,arcContentLen);
// now inject it
bool status = s_tcp.sendMsg ( ip ,
port ,
req.getBufStart() ,
req.getCapacity(),
req.length(),
req.length(),
NULL ,
injectedWrapper ,
// because it seems some sockets get stuck and
// they have no reply but the host they are
// connected to no longer has the connection
// open. and the readbuf is empty, but the send
// buf has been sent and it appears the inject
// when through. just the reply was never
// sent back for some reason.
5*60*1000 , // timeout, 5 mins
-1 , // maxTextDocLen
-1 );// maxOtherDocLen
int realMax = 10;
if ( s_hosts2.getNumHosts() > 1 )
realMax = s_hosts2.getNumHosts() * 3;
// launch another if blocked
if ( ! status ) {
// let injectedWrapper() below free it
req.detachBuf();
//int32_t nh = g_hostdb.getNumHosts();
//nh = (nh * 15) / 10;
//if ( nh > MAX_INJECT_SOCKETS - 10 )
// nh = MAX_INJECT_SOCKETS - 10;
//if ( nh < 5 ) nh = 5;
// limit to one socket right now
//if ( ++s_outstanding < 1 ) goto loop;
s_outstanding++;
if ( s_outstanding < MAX_INJECT_SOCKETS &&
s_outstanding < realMax )
goto loop;
return;
}
if ( g_errno ) {
// let tcpserver.cpp free it
req.detachBuf();
log("build: inject had error: %s.",mstrerror(g_errno));
}
// loop if not
goto loop;
}
// . TcpServer callback fired when one outstanding /admin/inject request
//   (sent by doInject() above) completes or errors out
// . frees the request's send buffer, periodically checkpoints the minimum
//   outstanding docid to ./lastinjectdocid.dat so an interrupted bulk
//   inject can resume, then launches the next inject via doInject()
void injectedWrapper ( void *state , TcpSocket *s ) {
	s_outstanding--;
	// wtf is this? s_tcp is counting THIS socket so say "== 1"
	if ( s_tcp.m_numUsed == 1 && s_outstanding > 0 ) {
		log("inject: resetting s_outstanding to 0");
		s_outstanding = 0;
	}
	// debug note
	logf(LOG_DEBUG,"inject: out=%i used=%i",(int)s_outstanding,(int)s_tcp.m_numUsed);
	// on error just log it and kick off the next inject.
	// NOTE(review): this path returns without freeing s->m_sendBuf or
	// clearing our s_req[] slot -- presumably TcpServer reclaims the
	// buffer on error; confirm against TcpServer.cpp
	if ( g_errno ) {
		log("inject: Got server error: %s.",
		mstrerror(g_errno));
		doInject(0,NULL);
		return;
	}
	// free send buf (we detached it from the SafeBuf in doInject(),
	// so ownership is ours here)
	char *req = s->m_sendBuf;
	int32_t reqAlloc = s->m_sendBufSize;
	mfree ( req , reqAlloc , "maininject");
	s->m_sendBuf = NULL;
	int32_t i;
	static int32_t s_last = 0;
	int32_t now = getTimeLocal();
	// save docid every 10 seconds
	if ( now - s_last > 10 ) {
		int64_t minDocId = 0x0000ffffffffffffLL;
		// get min outstanding docid inject request
		for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) {
			// skip empty slots
			if ( ! s_req[i] ) continue;
			if ( s_docId[i] < minDocId ) minDocId = s_docId[i];
		}
		// map it
		bool saveIt = false;
		// are we the min? NOTE(review): this inner "i" shadows the
		// outer "i" declared above -- harmless but confusing
		int32_t i; for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ ) {
			// skip slots that do not hold OUR request
			if ( s_req[i] != req ) continue;
			// we got our request
			if ( s_docId[i] == minDocId ) saveIt = true;
			break;
		}
		// only the request holding the minimum outstanding docid
		// writes the checkpoint, so the saved docid is a safe
		// resume point for all outstanding injects
		if ( saveIt ) {
			s_last = now;
			SafeBuf sb;
			sb.safePrintf("%"INT64"\n",minDocId);
			char fname[256];
			//sprintf(fname,"%s/lastinjectdocid.dat",g_hostdb.m_dir
			sprintf(fname,"./lastinjectdocid.dat");
			sb.dumpToFile(fname);
		}
	}
	// remove ourselves from map
	for ( i = 0 ; i < MAX_INJECT_SOCKETS ; i++ )
		if ( s_req[i] == req ) s_req[i] = NULL;
	// get return code
	char *reply = s->m_readBuf;
	logf(LOG_INFO,"inject: reply=\"%s\"",reply);
	// launch the next inject
	doInject(0,NULL);
}
void saveRdbs ( int fd , void *state ) {
int64_t now = gettimeofdayInMilliseconds_force();
int64_t last;
Rdb *rdb ;
// . try saving every 10 minutes from time of last write to disk
// . if nothing more added to tree since then, Rdb::close() return true
//int64_t delta = 10LL*60LL*1000LL;
// . this is in MINUTES
int64_t delta = (int64_t)g_conf.m_autoSaveFrequency *60000LL;
if ( delta <= 0 ) return;
// jitter it up a bit so not all hostIds save at same time, 15 secs
delta += (int64_t)(g_hostdb.m_hostId % 10) * 15000LL + (rand()%7500);
rdb = g_tagdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_catdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_indexdb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )
// if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_posdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_datedb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )
// if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_titledb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_tfndb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )
// if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_spiderdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
//rdb = g_checksumdb.getRdb();
//last = rdb->getLastWriteTime();
//if ( now - last > delta )
// if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_clusterdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
rdb = g_statsdb.getRdb();
last = rdb->getLastWriteTime();
if ( now - last > delta )
if ( ! rdb->close(NULL,NULL,false,false)) return;
}
// JAB: warning abatement
#if 0
// . sanity-check that the first chunk of records in indexdb, titledb and
//   tfndb actually hash to this host's shard/group
// . DISABLED: this whole function is compiled out by the "#if 0" above and
//   does NOT compile as written -- each verification loop computes
//   "shardNum" but then compares an undeclared "groupId".
//   NOTE(review): presumably the comparisons were meant to be updated to
//   use shardNum when the groupId scheme was replaced by shards; confirm
//   against getShardNum()/Hostdb before ever re-enabling this.
bool checkDataParity ( ) {
	//return true;
	g_threads.disableThreads();
	// test the first collection
	char *coll = g_collectiondb.getCollName ( 0 );
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	key_t startKey;
	key_t endKey;
	startKey.setMin();
	endKey.setMax();
	//int32_t minRecSizes = 64000;
	// CHECK INDEXDB
	log ( LOG_INFO, "db: Verifying Indexdb..." );
	if ( ! msg5.getList ( RDB_INDEXDB ,
			      coll ,
			      &list ,
			      startKey ,
			      endKey ,
			      64000 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false ))// err correction?
		return log("db: HEY! it did not block");
	int32_t count = 0;
	int32_t got = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// skip negative keys
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		//uint32_t groupId = k.n1 & g_hostdb.m_groupMask;
		uint32_t shardNum = getShardNum ( RDB_INDEXDB, &k );
		// NOTE(review): "groupId" is undeclared here (see header
		// note); this was presumably meant to test shardNum
		if ( groupId == g_hostdb.m_groupId ) got++;
	}
	if ( got != count ) {
		log ("db: Out of first %"INT32" records in indexdb, only %"INT32" belong "
		     "to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) return log("db: Are you sure you have the "
					   "right "
					   "data in the right directory? "
					   "Exiting.");
		return log ( "db: Exiting due to Indexdb inconsistency." );
	}
	log ( LOG_INFO, "db: Indexdb passed verification successfully. (%"INT32")",
	      count );
	// CHECK TITLEDB
	log ( LOG_INFO, "db: Verifying Titledb..." );
	if ( ! msg5.getList ( RDB_TITLEDB ,
			      coll ,
			      &list ,
			      startKey ,
			      endKey ,
			      1024*1024 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL , // cache key ptr
			      0 , // retry num
			      -1 , // maxRetries
			      true , // compensate for merge
			      -1LL , // sync point
			      &msg5b ))
		return log("db: HEY! it did not block");
	count = 0;
	got = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// skip negative keys
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		uint32_t shardNum = getShardNum ( RDB_TITLEDB , &k );
		//int32_t groupId = k.n1 & g_hostdb.m_groupMask;
		// NOTE(review): "groupId" is undeclared here (see header
		// note); this was presumably meant to test shardNum
		if ( groupId == g_hostdb.m_groupId ) got++;
	}
	if ( got != count ) {
		log ("db: Out of first %"INT32" records in titledb, only %"INT32" belong "
		     "to our group.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( count > 10 && got == 0 )
			return log("db: Are you sure you have the right "
				   "data in the right directory? "
				   "Exiting.");
		return log ( "db: Exiting due to Titledb inconsistency." );
	}
	log ( LOG_INFO, "db: Titledb passed verification successfully. (%"INT32")",
	      count );
	// CHECK TFNDB
	log ( LOG_INFO, "db: Verifying Tfndb..." );
	if ( ! msg5.getList ( RDB_TFNDB ,
			      coll ,
			      &list ,
			      startKey ,
			      endKey ,
			      64000 , // minRecSizes ,
			      true , // includeTree ,
			      false , // add to cache?
			      0 , // max cache age
			      0 , // startFileNum ,
			      -1 , // numFiles ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false ))// err correction?
		return log("db: HEY! it did not block");
	count = 0;
	got = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		// skip negative keys
		if ( (k.n0 & 0x01) == 0x00 ) continue;
		count++;
		// verify the group
		uint32_t shardNum = getShardNum ( RDB_TFNDB , &k );
		// NOTE(review): "groupId" is undeclared here (see header
		// note); this was presumably meant to test shardNum
		if ( groupId == g_hostdb.m_groupId ) got++;
	}
	if ( got != count ) {
		log ("db: Out of first %"INT32" records in tfndb, only %"INT32" passed "
		     "verification.",count,got);
		// exit if NONE, we probably got the wrong data
		if ( got == 0 ) return log("db: Are you sure you have the "
					   "right "
					   "data in the right directory? "
					   "Exiting.");
		return log ( "db: Exiting due to Tfndb inconsistency." );
	}
	log ( LOG_INFO, "db: Tfndb passed verification successfully. (%"INT32")",
	      count );
	// DONE
	g_threads.enableThreads();
	return true;
}
#endif
// . tell an already-running gb process on localhost:port to save and exit
// . connects to it and issues "GET /master?usave=1", then waits for the
//   reply (the old process closes the socket when done)
// . returns true on success, false (via log()) on any failure
// . fixes over the original: the socket descriptor is now closed on the
//   connect-error and read-error paths (it used to leak), and the write()
//   return value is checked
bool shutdownOldGB ( int16_t port ) {
	log("db: Saving and shutting down the other gb process." );
	// now make a new socket descriptor
	int sd = socket ( AF_INET , SOCK_STREAM , 0 ) ;
	// return NULL and set g_errno on failure
	if ( sd < 0 ) {
		// copy errno to g_errno
		g_errno = errno;
		log("tcp: Failed to create new socket: %s.",
		    mstrerror(g_errno));
		return false;
	}
	struct sockaddr_in to;
	to.sin_family = AF_INET;
	// our ip's are always in network order, but ports are in host order
	to.sin_addr.s_addr = atoip("127.0.0.1",9);
	to.sin_port = htons((uint16_t)port);
	bzero ( &(to.sin_zero) , 8 );
	// note it
	log("db: Connecting to port %hu.",port);
	// connect to the socket. This should block until it does
 again:
	if ( ::connect ( sd, (sockaddr *)&to, sizeof(to) ) != 0 ) {
		// retry interrupted connects
		if ( errno == EINTR ) goto again;
		// don't leak the descriptor on failure
		close(sd);
		return log("admin: Got connect error: %s.",mstrerror(errno));
	}
	// note it
	log("db: Connected. Issuing shutdown command.");
	// send the message. the request is tiny so a partial write is
	// not expected; treat only an outright error as failure
	char *msg = "GET /master?usave=1 HTTP/1.0\r\n\r\n";
	if ( write ( sd , msg , gbstrlen(msg) ) < 0 ) {
		close(sd);
		return log("db: Got error sending command: %s.",
			   mstrerror(errno));
	}
	// wait for him to shut down the socket
	char rbuf [5000];
	int32_t n;
 readmore:
	errno = 0;
	n = read ( sd , rbuf, 5000 );
	if ( n == -1 && errno == EINTR ) goto readmore;
	if ( n == -1 ) {
		close(sd);
		return log("db: Got error reading reply: %s.",
			   mstrerror(errno));
	}
	// success...
	close(sd);
	log("db: Received reply from old gb process.");
	return true;
}
// . standalone memory test mode
// . raises the core-file limit, measures memory bus bandwidth at three
//   working-set sizes (main memory, L2, L1) via membustest(), then
//   allocates 1MB chunks until the allocator refuses, reports the total,
//   frees everything and returns true
// . fixes over the original: the cleanup loop used to sit AFTER an
//   unconditional "return true" (behind a deliberate null-deref core-dump
//   test), so it never ran and every chunk leaked; the unreachable code
//   has been removed and the frees now execute
bool memTest() {
	// let's ensure our core file can dump
	struct rlimit lim;
	lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
	if ( setrlimit(RLIMIT_CORE,&lim) )
		log("db: setrlimit: %s.", mstrerror(errno) );
	void *ptrs[4096];
	int numPtrs = 0;
	int i;
	// allow the allocator to hand out ~4GB for this test
	g_conf.m_maxMem = 0xffffffffLL;
	g_mem.init( );
	fprintf(stderr, "memtest: Testing memory bus bandwidth.\n");
	// . read in 20MB 100 times (~2GB total)
	// . tests main memory throughput
	fprintf(stderr, "memtest: Testing main memory.\n");
	membustest ( 20*1024*1024 , 100 , true );
	// . read in 1MB 2,000 times (~2GB)
	// . tests the L2 cache
	fprintf(stderr, "memtest: Testing 1MB L2 cache.\n");
	membustest ( 1024*1024 , 2000 , true );
	// . read in 8000 bytes 100,000 times (~800MB)
	// . tests the L1 cache
	fprintf(stderr, "memtest: Testing 8KB L1 cache.\n");
	membustest ( 8000 , 100000 , true );
	fprintf(stderr, "memtest: Allocating up to %"INT64" bytes\n",
		g_conf.m_maxMem);
	// grab 1MB chunks until the allocator says no (at most 4096 = 4GB)
	for ( i = 0 ; i < 4096 ; i++ ) {
		ptrs[numPtrs] = mmalloc(1024*1024, "memtest");
		if ( ! ptrs[numPtrs] ) break;
		numPtrs++;
	}
	fprintf(stderr, "memtest: Was able to allocate %"INT64" bytes of a "
		"total of "
		"%"INT64" bytes of memory attempted.\n",
		g_mem.m_used,g_conf.m_maxMem);
	// free what we grabbed (this cleanup was dead code before)
	for ( i = 0 ; i < numPtrs ; i++ )
		mfree(ptrs[i], 1024*1024, "memtest");
	return true;
}
// . read in "nb" bytes, loops times,
// . if readf is false, do write test, not read test
void membustest ( int32_t nb , int32_t loops , bool readf ) {
int32_t count = loops;
// don't exceed 50NB
if ( nb > 50*1024*1024 ) {
fprintf(stderr,"memtest: truncating to 50 Megabytes.\n");
nb = 50*1024*1024;
}
int32_t n = nb ; //* 1024 * 1024 ;
// make n divisble by 64
//int32_t rem = n % 64;
//if ( rem > 0 ) n += 64 - rem;
// get some memory, 4 megs
//#undef malloc
//register char *buf = (char *)malloc(n + 64);
//#define malloc coreme
int32_t bufSize = 50*1024*1024;
register char *buf = (char *) mmalloc ( bufSize , "main" );
if ( ! buf ) return;
char *bufStart = buf;
register char *bufEnd = buf + n;
//fprintf(stderr,"pre-reading %"INT32" NB \n",nb);
// pre-read it so sbrk() can do its thing
for ( int32_t i = 0 ; i < n ; i++ ) buf[i] = 1;
g_clockNeedsUpdate = true;
// time stamp
int64_t t = gettimeofdayInMilliseconds_force();
fprintf(stderr,"memtest: start = %"INT64"\n",t);
// . time the read loop
// . each read should only be 2 assenbly movl instructions:
// movl -52(%ebp), %eax
// movl (%eax), %eax
// movl -52(%ebp), %eax
// movl 4(%eax), %eax
// ...
loop:
register int32_t c;
if ( readf ) {
while ( buf < bufEnd ) {
// repeat 16x for efficiency.limit comparison to bufEnd
c = *(int32_t *)(buf+ 0);
c = *(int32_t *)(buf+ 4);
c = *(int32_t *)(buf+ 8);
c = *(int32_t *)(buf+12);
c = *(int32_t *)(buf+16);
c = *(int32_t *)(buf+20);
c = *(int32_t *)(buf+24);
c = *(int32_t *)(buf+28);
c = *(int32_t *)(buf+32);
c = *(int32_t *)(buf+36);
c = *(int32_t *)(buf+40);
c = *(int32_t *)(buf+44);
c = *(int32_t *)(buf+48);
c = *(int32_t *)(buf+52);
c = *(int32_t *)(buf+56);
c = *(int32_t *)(buf+60);
buf += 64;
}
}
else {
while ( buf < bufEnd ) {
// repeat 8x for efficiency. limit comparison to bufEnd
*(int32_t *)(buf+ 0) = 0;
*(int32_t *)(buf+ 4) = 1;
*(int32_t *)(buf+ 8) = 2;
*(int32_t *)(buf+12) = 3;
*(int32_t *)(buf+16) = 4;
*(int32_t *)(buf+20) = 5;
*(int32_t *)(buf+24) = 6;
*(int32_t *)(buf+28) = 7;
buf += 32;
}
}
if ( --count > 0 ) {
buf = bufStart;
goto loop;
}
g_clockNeedsUpdate = true;
// completed
int64_t now = gettimeofdayInMilliseconds_force();
fprintf(stderr,"memtest: now = %"INT64"\n",t);
// multiply by 4 since these are int32_ts
char *op = "read";
if ( ! readf ) op = "wrote";
fprintf(stderr,"memtest: %s %"INT32" bytes (x%"INT32") in"
"%"UINT64" ms.\n",
op , n , loops , now - t );
// stats
if ( now - t == 0 ) now++;
double d = (1000.0*(double)loops*(double)(n)) / ((double)(now - t));
fprintf(stderr,"memtest: we did %.2f MB/sec.\n" , d/(1024.0*1024.0));
mfree ( bufStart , bufSize , "main" );
return ;
}
bool cacheTest() {
g_conf.m_maxMem = 2000000000LL; // 2G
//g_mem.m_maxMem = 2000000000LL; // 2G
hashinit();
// use an rdb cache
RdbCache c;
// init, 50MB
int32_t maxMem = 50000000;
// . how many nodes in cache tree can we fit?
// . each rec is key (12) and ip(4)
// . overhead in cache is 56
// . that makes 56 + 4 = 60
// . not correct? stats suggest it's less than 25 bytes each
int32_t maxCacheNodes = maxMem / 25;
// set the cache
if ( ! c.init ( maxMem ,
4 , // fixed data size of rec
false , // support lists of recs?
maxCacheNodes ,
false , // use half keys?
"cachetest" , // dbname
false )) // save cache to disk?
return log("test: Cache init failed.");
int32_t numRecs = 0 * maxCacheNodes;
logf(LOG_DEBUG,"test: Adding %"INT32" recs to cache.",numRecs);
// timestamp
int32_t timestamp = 42;
// keep ring buffer of last 10 keys
key_t oldk[10];
int32_t oldip[10];
int32_t b = 0;
// fill with random recs
for ( int32_t i = 0 ; i < numRecs ; i++ ) {
if ( (i % 100000) == 0 )
logf(LOG_DEBUG,"test: Added %"INT32" recs to cache.",i);
// random key
key_t k ;
k.n1 = rand();
k.n0 = rand();
k.n0 <<= 32;
k.n0 |= rand();
int32_t ip = rand();
// keep ring buffer
oldk [b] = k;
oldip[b] = ip;
if ( ++b >= 10 ) b = 0;
// make rec,size, like dns, will be 4 byte hash and 4 byte key?
c.addRecord((collnum_t)0,k,(char *)&ip,4,timestamp);
// reset g_errno in case it had an error (we don't care)
g_errno = 0;
// get a rec too!
if ( i < 10 ) continue;
int32_t next = b + 1;
if ( next >= 10 ) next = 0;
key_t back = oldk[next];
char *rec;
int32_t recSize;
if ( ! c.getRecord ( (collnum_t)0 ,
back ,
&rec ,
&recSize ,
false , // do copy?
-1 , // maxAge ,
true , // inc count?
NULL , // *cachedTime = NULL,
true )){ // promoteRecord?
char *xx= NULL; *xx = 0; }
if ( ! rec || recSize != 4 || *(int32_t *)rec != oldip[next] ) {
char *xx= NULL; *xx = 0; }
}
// now try variable sized recs
c.reset();
logf(LOG_DEBUG,"test: Testing variably-sized recs.");
// init, 300MB
maxMem = 300000000;
// . how many nodes in cache tree can we fit?
// . each rec is key (12) and ip(4)
// . overhead in cache is 56
// . that makes 56 + 4 = 60
// . not correct? stats suggest it's less than 25 bytes each
maxCacheNodes = maxMem / 5000;
//maxCacheNodes = 1200;
// set the cache
if ( ! c.init ( maxMem ,
-1 , // fixed data size of rec
false , // support lists of recs?
maxCacheNodes ,
false , // use half keys?
"cachetest" , // dbname
false )) // save cache to disk?
return log("test: Cache init failed.");
numRecs = 30 * maxCacheNodes;
//numRecs = 2 * maxCacheNodes;
logf(LOG_DEBUG,"test: Adding %"INT32" recs to cache.",numRecs);
// timestamp
timestamp = 42;
// keep ring buffer of last 10 keys
int32_t oldrs[10];
b = 0;
//char lastp;
// rec to add
char *rec;
int32_t recSize;
int32_t maxRecSize = 40000000; // 40MB for termlists
int32_t numMisses = 0;
char *buf = (char *)mmalloc ( maxRecSize + 64 ,"cachetest" );
if ( ! buf ) return false;
//sleep(2);
// fill with random recs
for ( int32_t i = 0 ; i < numRecs ; i++ ) {
if ( (i % 100) == 0 )
logf(LOG_DEBUG,"test: Added %"INT32" recs to cache. "
"Misses=%"INT32".",i,numMisses);
// random key
key_t k ;
k.n1 = rand();
k.n0 = rand();
k.n0 <<= 32;
k.n0 |= rand();
// random size
recSize = rand()%maxRecSize;//100000;
// keep ring buffer
oldk [b] = k;
oldrs[b] = recSize;
//oldip[b] = ip;
if ( ++b >= 10 ) b = 0;
// make the rec
rec = buf;
memset ( rec , (char)k.n1, recSize );
//log("test: v0");
// make rec,size, like dns, will be 4 byte hash and 4 byte key?
if ( ! c.addRecord((collnum_t)0,k,rec,recSize,timestamp) ) {
char *xx=NULL; *xx=0; }
// do a dup add 1% of the time
if ( (i % 100) == 0 )
if(!c.addRecord((collnum_t)0,k,rec,recSize,timestamp)){
char *xx=NULL; *xx=0; }
//log("test: v1");
//c.verify();
// reset g_errno in case it had an error (we don't care)
g_errno = 0;
// get a rec too!
if ( i < 10 ) continue;
int32_t next = b + 1;
if ( next >= 10 ) next = 0;
key_t back = oldk[next];
//log("cache: get rec");
if ( ! c.getRecord ( (collnum_t)0 ,
back ,
&rec ,
&recSize ,
false , // do copy?
-1 , // maxAge ,
true , // inc count?
NULL , // *cachedTime = NULL,
true) ) {//true )){ // promoteRecord?
numMisses++;
//logf(LOG_DEBUG,"test: missed");
continue;
char *xx= NULL;
*xx = 0;
}
//log("cache: got rec");
//char *p = c.m_bufs[0] + 9210679 + 51329;
//if ( *p != lastp )
// logf(LOG_DEBUG,"test: p changed");
//lastp = *p;
if ( recSize != oldrs[next] ) {
logf(LOG_DEBUG,"test: bad rec size.");
char *xx=NULL; *xx = 0;
continue;
}
char r = (char)back.n1;
for ( int32_t j = 0 ; j < recSize ; j++ ) {
if ( rec[j] == r ) continue;
logf(LOG_DEBUG,"test: bad char in rec.");
char *xx=NULL; *xx = 0;
}
//if ( ! rec || recSize != 4 || *(int32_t *)rec != oldip[next] ) {
// char *xx= NULL; *xx = 0; }
}
c.verify();
c.reset();
return true;
}
// . quick sanity test: write 1000 bytes to the /dev/ram2 ramdisk device
// . returns false if the device cannot be opened
// . BUGFIX: the buffer was declared "char *buf[1000]" -- an array of
//   1000 POINTERS -- so the original wrote 1000 bytes of uninitialized
//   pointer garbage; use a plain, zeroed byte buffer instead
bool ramdiskTest() {
	int fd = open ("/dev/ram2",O_RDWR);
	if ( fd < 0 ) {
		fprintf(stderr,"ramdisk: failed to open /dev/ram2\n");
		return false;
	}
	char buf[1000];
	memset ( buf , 0 , sizeof(buf) );
	gbpwrite ( fd , buf , 1000, 0 );
	close ( fd);
	return true;
}
void dosOpenCB( void *state, TcpSocket *s);
bool dosOpen(int32_t targetIp, uint16_t port, int numSocks) {
TcpServer tcpClient;
if ( ! g_loop.init() ) return log("loop: Loop init "
"failed.");
// init the tcp server, client side only
if ( ! tcpClient.init( NULL , // requestHandlerWrapper ,
getMsgSize,
NULL , // getMsgPiece ,
0 // port, only needed for server
) ) {
return log("tcp: Tcp init failed.");
}
int32_t launched = 0;
char* ebuf = "";
for( int32_t i = 0; i < numSocks; i++) {
if(!tcpClient.sendMsg( targetIp ,
port ,
ebuf,
0,
0,
0,
NULL,
dosOpenCB,
600 * 60 * 24,
-1,
-1)) {
launched++;
}
}
//printf("DOS version 5.2\n RAM: 000640K\n HIMEM: 1012\n\n");
log("init: dos launched %"INT32" simultaneous requests.", launched);
if ( ! g_loop.runLoop() ) return log("tcp: inject: Loop "
"run failed.");
return true;
}
// callback for each dosOpen() connection: fires when the socket times
// out (or otherwise completes); we just note it and keep going
void dosOpenCB( void *state, TcpSocket *s) {
	log("init: dos timeout");
}
// to get some of the hosts that were added to sitesearch.gigablast.com
// but not added in May or Apr: (this adds www. to domains that need it)
// ./gb dump t main 0 -1 0 >& foo
// grep ch= foo | grep -v " May-" | grep -v " Apr-" | awk '{print $13}' | urlinfo | grep "host: " | awk '{print $2}' | sort | uniq > added
// then the sites that have been searched:
// grep "search site" log0* | awk '{print $7}' | sort | uniq | urlinfo | grep "host: " | awk '{print $2}' | sort | uniq > searched
// then to print out the hosts that have not been searched in a while and
// should be removed from the sitesearch index
// diff added searched | grep "< " | awk '{print $2}' > toban
/*
void dumpCachedRecs (char *coll,int32_t startFileNum,int32_t numFiles,bool includeTree,
int64_t docid) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
g_titledb.init ();
g_collectiondb.init(true);
g_titledb.getRdb()->addRdbBase1 ( coll );
key_t startKey ;
key_t endKey ;
key_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = g_titledb.makeFirstTitleRecKey ( docid );
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
Msg5 msg5b;
Msg5 msg5c;
RdbList list;
RdbList ulist;
g_tfndb.init ();
g_collectiondb.init(true);
g_tfndb.getRdb()->addRdbBase1 ( coll );
int64_t lastDocId = 0;
int32_t compressBufSize = 0;
char* compressBuf = NULL;
fprintf(stderr, "Dumping Records:\n");
int32_t filenum = 0;
char filename[64];
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
int FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
int32_t numDumped = 0;
uint32_t bytesDumped = 0;
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
includeTree ,
false , // add to cache?
0 , // max cache age
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b )){
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
int64_t docId = g_titledb.getDocIdFromKey ( k );
if ( k <= lastKey )
log("key out of order. "
"lastKey.n1=%"XINT32" n0=%"XINT64" "
"currKey.n1=%"XINT32" n0=%"XINT64" ",
lastKey.n1,lastKey.n0,
k.n1,k.n0);
lastKey = k;
// print deletes
// if ( (k.n0 & 0x01) == 0) {
// fprintf(stderr,"n1=%08"XINT32" n0=%016"XINT64" docId=%012"INT64" "
// "hh=%07"XINT32" ch=%08"XINT32" (del)\n",
// k.n1 , k.n0 , docId , hostHash , contentHash );
// continue;
// }
// uncompress the title rec
TitleRec tr;
if ( ! tr.set ( rec , recSize , false ) )
continue;
lastDocId = tr.getDocId();
// extract the url
Url *u = tr.getUrl();
int32_t ext = g_tfndb.makeExt ( u );
key_t uk1 ;
key_t uk2 ;
uk1 = g_tfndb.makeMinKey ( docId );
uk2 = g_tfndb.makeMaxKey ( docId );
if(! msg5c.getList ( RDB_TFNDB ,
coll ,
&ulist ,
uk1 , // startKey
uk2 , // endKey
0x7fffffff , // minRecSizes
true , // includeTree?
false , // addToCache?
0 , // max cache age
0 , // startFileNum
-1 , // numFiles (-1 =all)
NULL ,
NULL ,
0 , //nice
false )) { //error correct
log(LOG_LOGIC,"db: getList did not block.");
return;
}
if(g_errno) {
log(LOG_LOGIC,"db: tfndb getList had error: %s",
mstrerror(g_errno));
}
bool found = false;
for ( ulist.resetListPtr();
! ulist.isExhausted() ;
ulist.skipCurrentRecord() ) {
key_t k = ulist.getCurrentKey();
if ( g_tfndb.getExt ( k ) == ext ) {
found = true;
break;
}
}
if(!found) {
//fprintf(stderr, "skipping %s %"INT64"\n", u->getUrl(), docId);
continue;
}
int32_t needSize = (int32_t)(tr.getContentLen() * 1.01 + 12);
if(needSize > compressBufSize) {
char* newBuf = (char*)mrealloc(compressBuf, compressBufSize, needSize, "recDump");
if(!newBuf) {
log(LOG_WARN,"dump:couldn't dump this record:%s, no memory", u->getUrl());
continue;
}
compressBufSize = needSize;
compressBuf = newBuf;
}
uint32_t destLen = compressBufSize;
int status = compress((unsigned char*)compressBuf,
&destLen,
(unsigned char*)tr.getContent(),
(uint32_t)tr.getContentLen());
if(status != Z_OK) {
log(LOG_WARN,"dump:couldn't dump this record:"
"%s, compress failed", u->getUrl());
continue;
}
int32_t totSize = 2*sizeof(int32_t) + destLen + u->getUrlLen()+1;
int32_t conLen = tr.getContentLen();
//fprintf(stderr, "%"INT32" %s %"INT32" %"INT32"\ng", totSize, u->getUrl(), conLen, destLen);
write(FD, (char*)&totSize, sizeof(int32_t));
write(FD, u->getUrl(), u->getUrlLen() + 1);
write(FD, (char*)&conLen, sizeof(int32_t));
write(FD, (char*)&(destLen), sizeof(int32_t));
write(FD, compressBuf, destLen);
numDumped++;
bytesDumped += totSize;
// if(numDumped == 1000) {
// //change this later!!!!!!!!!!
// int32_t zero = 0;
// write(FD, &zero, sizeof(int32_t));
// return;
// }
}
fprintf(stderr, "dumped %"INT32" records (%"INT32" bytes).\n",numDumped, bytesDumped);
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key_t *)list.getLastKey() ) {
int32_t zero = 0;
write(FD, &zero, sizeof(int32_t));
return;
}
//start a new file if this one gets too big
if(bytesDumped > 1000000000) {
filenum++;
sprintf(filename, "%s-%"INT32".ddmp", coll, filenum);
close(FD);
FD = open(filename, O_CREAT|O_WRONLY, S_IROTH);
bytesDumped = 0;
fprintf(stderr, "Started new file: %s. starts at docId: %"INT64".\n",filename, lastDocId);
}
goto loop;
}
*/
// CountDomains Structures and function definitions
// per-linker-domain stats used by countdomains() below
// NOTE(review): usage is not fully visible in this chunk -- field
// meanings inferred from the names; confirm against countdomains()
struct lnk_info {
	char *dom;      // linking domain string (presumably not owned here)
	int32_t domLen; // length of dom in bytes
	int32_t pages;  // presumably: number of pages seen from this domain
};
// per-domain stats accumulated by countdomains() below
struct dom_info {
	char *dom;       // domain string (from the doc's first url)
	int32_t domLen;  // length of dom in bytes
	int32_t dHash;   // hash32(dom,domLen) -- used for table lookups
	int32_t pages;   // number of titledb records seen for this domain
	//int64_t quality;
	struct ip_info **ip_list; // ips this domain was served from
	int32_t numIp;            // entries in ip_list
	//HashTable *dht;
	int32_t *lnk_table;  // NOTE(review): per-linker table -- usage not
	int32_t tableSize;   // visible in this chunk; confirm semantics
	int32_t lnkCnt;      // against the rest of countdomains()
	int32_t lnkPages;
};
// per-ip stats accumulated by countdomains() below
struct ip_info {
	uint32_t ip;     // the ip address (from XmlDoc::m_ip)
	int32_t pages;   // number of titledb records seen on this ip
	//int64_t quality;
	struct dom_info **dom_list; // domains hosted on this ip
	int32_t numDom;             // entries in dom_list
};
// JAB: warning abatement
//static int ip_hcmp (const void *p1, const void *p2);
static int ip_fcmp (const void *p1, const void *p2);
static int ip_dcmp (const void *p1, const void *p2);
// JAB: warning abatement
//static int dom_hcmp (const void *p1, const void *p2);
static int dom_fcmp (const void *p1, const void *p2);
static int dom_lcmp (const void *p1, const void *p2);
// JAB: warning abatement
//static int lnk_hcmp (const void *p1, const void *p2);
// JAB: warning abatement
//static int lnk_fcmp (const void *p1, const void *p2);
// . standalone diagnostic: scan this collection's titledb and tally, per
//   distinct IP and per distinct domain, how many pages it owns and how
//   many distinct domains it links out to
// . writes raw tallies to <workdir>/cntdom.xml when output==9 and an HTML
//   report to <workdir>/cntdom.html, then frees everything it allocated
// . coll      - collection name to scan
// . numRecs   - max titlerecs to examine; also sizes ip_table/dom_table
// . verbosity - 10 enables extra (currently commented-out) detail
// . output    - 9 enables the raw xml dump in addition to the html
// . runs synchronously: threads are disabled so Msg5::getList always blocks
void countdomains( char* coll, int32_t numRecs, int32_t verbosity, int32_t output ) {
	// flat tables of heap-allocated per-IP / per-domain tallies;
	// membership tests below are linear scans (hash tables commented out)
	struct ip_info **ip_table;
	struct dom_info **dom_table;
	//HashTable ipHT;
	//HashTable domHT;
	//ipHT.set ( numRecs+1 );
	//domHT.set( numRecs+1 );
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// scan the full titledb key range; startKey advances per batch below
	key_t startKey;
	key_t endKey ;
	key_t lastKey ;
	startKey.setMin();
	endKey.setMax();
	lastKey.setMin();
	g_titledb.init ();
	//g_collectiondb.init(true);
	g_titledb.getRdb()->addRdbBase1(coll );
	log( LOG_INFO, "cntDm: parms: %s, %"INT32"", coll, numRecs );
	int64_t time_start = gettimeofdayInMilliseconds_force();
	// turn off threads
	g_threads.disableThreads();
	// get a meg at a time
	int32_t minRecSizes = 1024*1024;
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	// countDocs = titlerecs tallied; attempts = records visited (incl.
	// deletes and parse failures); countIp/countDom = distinct entries
	int32_t countDocs = 0;
	int32_t countIp = 0;
	int32_t countDom = 0;
	int32_t attempts = 0;
	// NOTE(review): neither table allocation is NULL-checked before the
	// clearing loop writes through it
	ip_table = (struct ip_info **)mmalloc(sizeof(struct ip_info *) * numRecs,
					      "main-dcit" );
	dom_table = (struct dom_info **)mmalloc(sizeof(struct dom_info *) * numRecs,
						"main-dcdt" );
	for( int32_t i = 0; i < numRecs; i++ ) {
		ip_table[i] = NULL;
		dom_table[i] = NULL;
	}
 loop:
	// use msg5 to get the list, should ALWAYS block since no threads
	if ( ! msg5.getList ( RDB_TITLEDB ,
			      cr->m_collnum ,
			      &list ,
			      startKey ,
			      endKey ,
			      minRecSizes ,
			      true , // Do we need to include tree?
			      false , // add to cache?
			      0 , // max cache age
			      0 ,
			      -1 ,
			      NULL , // state
			      NULL , // callback
			      0 , // niceness
			      false , // err correction?
			      NULL , // cache key ptr
			      0 , // retry num
			      -1 , // maxRetries
			      true , // compensate for merge
			      -1LL , // sync point
			      &msg5b )){
		log(LOG_LOGIC,"db: getList did not block.");
		return;
	}
	// all done if empty
	if ( list.isEmpty() ) goto freeInfo;
	// loop over entries in list
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key_t k = list.getCurrentKey();
		char *rec = list.getCurrentRec();
		int32_t recSize = list.getCurrentRecSize();
		int64_t docId = g_titledb.getDocId ( &k );
		//int32_t hostHash = g_titledb.getHostHash ( k );
		//int32_t contentHash = g_titledb.getContentHash ( k );
		attempts++;
		// keys must arrive in ascending order; log any regression
		if ( k <= lastKey )
			log("key out of order. "
			    "lastKey.n1=%"XINT32" n0=%"XINT64" "
			    "currKey.n1=%"XINT32" n0=%"XINT64" ",
			    lastKey.n1,lastKey.n0,
			    k.n1,k.n0);
		lastKey = k;
		// print deletes
		if ( (k.n0 & 0x01) == 0) {
			fprintf(stderr,"n1=%08"XINT32" n0=%016"XINT64" docId=%012"INT64" "
				//"hh=%07"XINT32" ch=%08"XINT32" (del)\n",
				"(del)\n",
				k.n1 , k.n0 , docId );
			continue;
		}
		// tables are sized to numRecs entries; bail before overflow
		if( (countIp >= numRecs) || (countDom >= numRecs) ) {
			log( LOG_INFO, "cntDm: countIp | countDom, greater than"
			     "numRecs requested, should never happen!!!!" );
			goto freeInfo;
		}
		// uncompress the title rec
		//TitleRec tr;
		//if ( ! tr.set ( rec , recSize , false ) )
		//	continue;
		XmlDoc xd;
		if ( ! xd.set2 (rec, recSize, coll,NULL,0) )
			continue;
		// extract the url
		//Url *u = tr.getUrl();
		struct ip_info *sipi ;
		struct dom_info *sdomi;
		//uint32_t hkey_ip = u->getIp();
		//uint32_t hkey_dom = hash32( u->getHost(), u->getHostLen() );
		//if( !(sipi = (struct ip_info *)ipHT.getValue( hkey_ip ))) {
		// linear scan for this doc's ip; add a fresh entry on miss
		int32_t i;
		for( i = 0; i < countIp; i++ ) {
			if( !ip_table[i] ) continue;
			sipi = (struct ip_info *)ip_table[i];
			if( sipi->ip == (uint32_t)xd.m_ip ) break;
		}
		if( i == countIp ) {
			sipi = (struct ip_info *)mmalloc(sizeof(struct ip_info),
							 "main-dcip" );
			// deliberate crash on OOM
			if( !sipi ) { char *XX=NULL; *XX=0; }
			//ipHT.addKey( hkey_ip, (int32_t)sipi, 0 );
			ip_table[countIp++] = sipi;
			sipi->ip = xd.m_ip;//u->getIp();
			sipi->pages = 1;
			sipi->numDom = 0;
			//sipi->quality = tr.getDocQuality();
		}
		else {
			sipi->pages++;
			//sipi->quality += tr.getDocQuality();
		}
		//if( !(sdomi = (struct dom_info *)domHT.getValue( hkey_dom ))) {
		// same linear lookup for the doc's domain, keyed by hash32
		char *fu = xd.ptr_firstUrl;
		int32_t dlen; char *dom = getDomFast ( fu , &dlen );
		int32_t dkey = hash32( dom , dlen );
		for( i = 0; i < countDom; i++ ) {
			if( !dom_table[i] ) continue;
			sdomi = (struct dom_info *)dom_table[i];
			/*
			  int32_t len = u->getHostLen();
			  if( sdomi->domLen < u->getHostLen() ) len=sdomi->domLen;
			  if(strncasecmp(sdomi->dom, u->getHost(), len)==0) break;
			*/
			if( sdomi->dHash == dkey ) break;
		}
		if( i == countDom ) {
			sdomi =(struct dom_info*)mmalloc(sizeof(struct dom_info),
							 "main-dcdm" );
			if( !sdomi ) { char *XX=NULL; *XX=0; }
			//domHT.addKey( hkey_dom, (int32_t)sdomi, 0 );
			dom_table[countDom++] = sdomi;
			// NOTE(review): sdomi->dom is NOT NUL-terminated;
			// every reader below copies domLen bytes and
			// terminates its own buffer
			sdomi->dom = (char *)mmalloc( dlen,"main-dcsdm" );
			strncpy( sdomi->dom, dom , dlen );
			sdomi->domLen = dlen;
			sdomi->dHash = dkey;
			sdomi->pages = 1;
			//sdomi->quality = tr.getDocQuality();
			sdomi->numIp = 0;
			//sdomi->dht = new( HashTable );
			//mnew( sdomi->dht, sizeof(HashTable), "main-dcndht" );
			//sdomi->dht->set( 1000 );
			sdomi->tableSize = 0;
			sdomi->lnkCnt = 0;
		}
		else {
			sdomi->pages++;
			//sdomi->quality += tr.getDocQuality();
		}
		Links *dlinks = xd.getLinks();
		/*
		// Parse outgoing links and count frequency
		Links dLinks;
		//Xml *sx;
		//sx = g_tagdb.getSiteXml ( tr.getSiteFilenum(), coll ,
		//			  gbstrlen( coll ) );
		Xml xml;
		if (!xml.set( tr.getCharset(), tr.getContent(), tr.getContentLen(),
			      false, 0, false, tr.getVersion() )){
			log(LOG_WARN, "countdomains: error setting Xml: %s",
			    mstrerror(g_errno));
			return;
		}
		if (!dLinks.set( true, &xml, tr.getUrl(), false, false,
				 xd.m_version,0 )){
			log(LOG_WARN, "countdomains: error setting Links: %s",
			    mstrerror(g_errno));
			return;
		}
		*/
		// grow the per-domain outlink-hash table so this doc's links fit
		// NOTE(review): mmalloc/mrealloc results are not NULL-checked here
		int32_t size = dlinks->getNumLinks();
		if( !sdomi->tableSize ) {
			sdomi->lnk_table = (int32_t *)mmalloc(size * sizeof(int32_t),
							      "main-dclt" );
			sdomi->tableSize = size;
		}
		else {
			if( size > (sdomi->tableSize - sdomi->lnkCnt) ) {
				size += sdomi->lnkCnt;
				sdomi->lnk_table = (int32_t *)
					mrealloc(sdomi->lnk_table,
						 sdomi->tableSize*sizeof(int32_t),
						 size*sizeof(int32_t),
						 "main-dcrlt" );
				sdomi->tableSize = size;
			}
		}
		// record each outlink's domain hash once per source domain
		for( int32_t i = 0; i < dlinks->getNumLinks(); i++ ) {
			//struct lnk_info *slink;
			//Url url;
			//url.set(dLinks.getLink(i), dLinks.getLinkLen(i));
			char *link = dlinks->getLink(i);
			int32_t dlen; char *dom = getDomFast ( link , &dlen );
			uint32_t lkey = hash32( dom , dlen );
			//if( (slink = (struct lnk_info *)
			//     sdomi->dht->getValue( lkey ))) {
			int32_t j;
			for( j = 0; j < sdomi->lnkCnt; j++ ) {
				//slink=(struct lnk_info *)sdomi->lnk_table[j];
				if( sdomi->lnk_table[j] == (int32_t)lkey ) break;
				//if(slink->domLen != url.getHostLen()) continue;
				//if( !strcasecmp( slink->dom, url.getHost() ) )
				//break;
			}
			// NOTE(review): lnkPages is bumped here for EVERY link
			// and again below when the domain is new, so new link
			// domains are counted twice -- confirm intended
			sdomi->lnkPages++;
			if( j != sdomi->lnkCnt ) continue;
			sdomi->lnk_table[sdomi->lnkCnt++] = lkey;
			sdomi->lnkPages++;
			//slink=(struct lnk_info *)mmalloc(sizeof(struct lnk_info),
			//				 "main-dcli" );
			//Sanity check, mallocing link_info struct
			//if( !slink ) { char *XX=NULL; *XX=0; }
			//sdomi->dht->addKey( lkey, (int32_t)slink, 0 );
			//sdomi->lnk_table[sdomi->lnkCnt++] = (int32_t)slink;
			//slink->dom = (char *)mmalloc( url.getHostLen(),
			//			      "main-dcsld" );
			//strncpy( slink->dom, url.getHost(),
			//	 url.getHostLen() );
			//slink->domLen = url.getHostLen();
			//slink->pages = 1;
		}
		// Handle lists
		// maintain the cross-references: each domain remembers the ips
		// it was served from and each ip remembers the domains it hosts
		// NOTE(review): the mrealloc old/new byte counts below mix
		// sizeof(char *) and sizeof(int32_t) for the same arrays of
		// struct pointers; affects mem accounting on 64-bit builds
		if( !sipi->numDom || !sdomi->numIp ){
			sdomi->numIp++; sipi->numDom++;
			//Add to IP list for Domain
			sdomi->ip_list = (struct ip_info **)
				mrealloc( sdomi->ip_list,
					  (sdomi->numIp-1)*sizeof(char *),
					  sdomi->numIp*sizeof(char *),
					  "main-dcldm" );
			sdomi->ip_list[sdomi->numIp-1] = sipi;
			//Add to domain list for IP
			sipi->dom_list = (struct dom_info **)
				mrealloc( sipi->dom_list,
					  (sipi->numDom-1)*sizeof(char *),
					  sipi->numDom*sizeof(char *),
					  "main-dclip" );
			sipi->dom_list[sipi->numDom-1] = sdomi;
		}
		else {
			// both lists non-empty: append only if not already present
			int32_t i;
			for( i = 0;
			     (i < sdomi->numIp)
				     && (sdomi->ip_list[i] != sipi);
			     i++ );
			if( sdomi->numIp != i ) goto updateIp;
			sdomi->numIp++;
			sdomi->ip_list = (struct ip_info **)
				mrealloc( sdomi->ip_list,
					  (sdomi->numIp-1)*sizeof(int32_t),
					  sdomi->numIp*sizeof(int32_t),
					  "main-dcldm" );
			sdomi->ip_list[sdomi->numIp-1] = sipi;
		updateIp:
			for( i = 0;
			     (i < sipi->numDom)
				     && (sipi->dom_list[i] != sdomi);
			     i++ );
			if( sipi->numDom != i ) goto endListUpdate;
			sipi->numDom++;
			sipi->dom_list = (struct dom_info **)
				mrealloc( sipi->dom_list,
					  (sipi->numDom-1)*sizeof(int32_t),
					  sipi->numDom*sizeof(int32_t),
					  "main-dclip" );
			sipi->dom_list[sipi->numDom-1] = sdomi;
		endListUpdate:
			i=0;
		}
		if( !((++countDocs) % 1000) )
			log(LOG_INFO, "cntDm: %"INT32" records searched.",countDocs);
		if( countDocs == numRecs ) goto freeInfo;
		//else countDocs++;
	}
	// advance past the last key read so the next getList resumes there
	startKey = *(key_t *)list.getLastKey();
	startKey += (uint32_t) 1;
	// watch out for wrap around
	if ( startKey < *(key_t *)list.getLastKey() ) {
		log( LOG_INFO, "cntDm: Keys wrapped around! Exiting." );
		goto freeInfo;
	}
	if ( countDocs >= numRecs ) {
		// report + cleanup section; note the label sits INSIDE this
		// if-block and is also entered via goto from the scan loop
	freeInfo:
		char buf[128];
		//int32_t value ;
		int32_t len ;
		char loop ;	// 0/1 flag: second pass of each html table
		int32_t recsDisp;
		struct ip_info *tmpipi ;
		struct dom_info *tmpdomi ;
		//struct lnk_info *tmplnk ;
		loop = 0;
		FILE *fhndl;
		char out[128];
		if( output != 9 ) goto printHtml;
		// Dump raw data to a file to parse later
		sprintf( out, "%scntdom.xml", g_hostdb.m_dir );
		// NOTE(review): fopen() returns a FILE* (NULL on failure);
		// this "< 0" test can never catch a failure -- == NULL intended
		if( (fhndl = fopen( out, "wb" )) < 0 ) {
			log( LOG_INFO, "cntDm: File Open Failed." );
			return;
		}
		gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_fcmp );
		for( int32_t i = 0; i < countDom; i++ ) {
			if( !dom_table[i] ) continue;
			tmpdomi = (struct dom_info *)dom_table[i];
			len = tmpdomi->domLen;
			if( tmpdomi->domLen > 127 ) len = 126;
			strncpy( buf, tmpdomi->dom, len );
			buf[len] = '\0';
			fprintf(fhndl,
				"<rec1>\n\t<domain>%s</domain>\n"
				"\t<pages>%"INT32"</pages>\n"
				//"\t<quality>%"INT64"</quality>\n"
				"\t<block>\n",
				buf, tmpdomi->pages
				//,(tmpdomi->quality/tmpdomi->pages)
				);
			// NOTE(review): ip_list holds struct ip_info pointers
			// but is sorted with element size sizeof(int32_t);
			// wrong on 64-bit builds
			gbsort( tmpdomi->ip_list,tmpdomi->numIp, sizeof(int32_t),
				ip_fcmp );
			for( int32_t j = 0; j < tmpdomi->numIp; j++ ) {
				if( !tmpdomi->ip_list[j] ) continue;
				tmpipi = (struct ip_info *)tmpdomi->ip_list[j];
				strcpy ( buf , iptoa( tmpipi->ip ) );
				fprintf(fhndl,"\t\t<ip>%s</ip>\n",buf);
			}
			fprintf(fhndl,
				"\t</block>\n"
				"\t<links>\n");
			/*
			  gbsort(tmpdomi->lnk_table,tmpdomi->lnkCnt,sizeof(int32_t),
			  lnk_fcmp );
			  for( int32_t j = 0; j < tmpdomi->lnkCnt; j++ ) {
			  tmplnk = (struct lnk_info *)tmpdomi->lnk_table[j];
			  len = tmplnk->domLen;
			  if( len > 127 ) len = 126;
			  strncpy( buf, tmplnk->dom, len );
			  buf[len] = '\0';
			  fprintf(fhndl,
			  "\t\t<link>\n"
			  "\t\t\t<domain>%s</domain>\n"
			  "\t\t\t<pages>%"INT32"</pages>\n"
			  "\t\t</link>\n",
			  buf, tmplnk->pages);
			  }
			  fprintf(fhndl,
			  "\t</links>\n"
			  "</rec1>\n");
			*/
		}
		gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_fcmp );
		for( int32_t i = 0; i < countIp; i++ ) {
			if( !ip_table[i] ) continue;
			tmpipi = (struct ip_info *)ip_table[i];
			strcpy ( buf , iptoa( tmpipi->ip ) );
			fprintf(fhndl,
				"<rec2>\n\t<ip>%s</ip>\n"
				"\t<pages>%"INT32"</pages>\n"
				//"\t<quality>%"INT64"</quality>\n"
				"\t<block>\n",
				buf, tmpipi->pages);
			//(tmpipi->quality/tmpipi->pages));
			for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
				tmpdomi = (struct dom_info *)tmpipi->dom_list[j];
				len = tmpdomi->domLen;
				if( tmpdomi->domLen > 127 ) len = 126;
				strncpy( buf, tmpdomi->dom, len );
				buf[len] = '\0';
				fprintf(fhndl,
					"\t\t<domain>%s</domain>\n",
					buf);
			}
			fprintf(fhndl,
				"\t</block>\n"
				"</rec2>\n");
		}
		if( fclose( fhndl ) < 0 ) {
			log( LOG_INFO, "cntDm: File Close Failed." );
			return;
		}
		fhndl = 0;
		/*
		// Terminal Output format
		for( int32_t i = 0; i < countIp; i++ ) {
		if( !ip_table[i] ) continue;
		tmpipi = (struct ip_info *)ip_table[i];
		strcpy ( buf , iptoa( tmpipi->ip ) );
		fprintf( stderr, "\t\tIP: %s \t\t\t\t\t%"INT32"\n", buf,
		tmpipi->pages );
		for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
		int32_t len;
		tmpdomi = (struct dom_info *)tmpipi->dom_list[j];
		len = tmpdomi->domLen;
		if( tmpdomi->domLen > 127 ) len = 126;
		strncpy( buf, tmpdomi->dom, len );
		buf[len] = '\0';
		fprintf( stderr, "\t\t\tDM: %s",
		buf );
		if( tmpdomi->domLen > 27 )
		fprintf( stderr, "\t\t" );
		else if( tmpdomi->domLen <= 11 )
		fprintf( stderr, "\t\t\t\t\t" );
		else if( tmpdomi->domLen >= 20 )
		fprintf( stderr, "\t\t\t" );
		else
		fprintf( stderr, "\t\t\t\t" );
		fprintf( stderr, "%"INT32"\n", tmpdomi->pages );
		if( verbosity != 10 ) continue;
		gbsort( tmpdomi->lnk_table, tmpdomi->lnkCnt,
		sizeof(int32_t), lnk_fcmp );
		for( int32_t k = 0; k < tmpdomi->lnkCnt; k++ ) {
		tmplnk = (struct lnk_info *)
		tmpdomi->lnk_table[k];
		len = tmplnk->domLen;
		if( len > 127 ) len = 126;
		strncpy( buf, tmplnk->dom, len );
		buf[len] = '\0';
		fprintf( stderr, "\t\t\t\tLD: %s",
		buf );
		if( len > 27 )
		fprintf( stderr, "\t" );
		else if( len <= 11 )
		fprintf( stderr, "\t\t\t\t" );
		else if( len >= 20 )
		fprintf( stderr, "\t\t" );
		else
		fprintf( stderr, "\t\t\t" );
		fprintf(stderr, "%"INT32"\n",
		tmplnk->pages);
		}
		}
		fprintf( stderr, "\n" );
		}
		*/
	printHtml:
		// HTML file Output
		sprintf( out, "%scntdom.html", g_hostdb.m_dir );
		// NOTE(review): same non-functional "< 0" fopen check as above
		if( (fhndl = fopen( out, "wb" )) < 0 ) {
			log( LOG_INFO, "cntDm: File Open Failed." );
			return;
		}
		int64_t total = g_titledb.getGlobalNumDocs();
		char link_ip[] = "http://www.gigablast.com/search?"
			"code=gbmonitor&q=ip%3A";
		char link_dom[] = "http://www.gigablast.com/search?"
			"code=gbmonitor&q=site%3A";
		char menu[] = "<table cellpadding=\"2\" cellspacing=\"2\">\n<tr>"
			"<th bgcolor=\"#CCCC66\"><a href=\"#pid\">"
			"Domains Sorted By Pages</a></th>"
			"<th bgcolor=\"#CCCC66\"><a href=\"#lid\">"
			"Domains Sorted By Links</a></th>"
			"<th bgcolor=\"#CCCC66\"><a href=\"#pii\">"
			"IPs Sorted By Pages</a></th>"
			"<th bgcolor=\"#CCCC66\"><a href=\"#dii\">"
			"IPs Sorted By Domains</a></th>"
			"<th bgcolor=\"#CCCC66\"><a href=\"#stats\">"
			"Stats</a></th>"
			"</tr>\n</table>\n<br>\n";
		char hdr[] = "<table cellpadding=\"5\" cellspacing=\"2\">"
			"<tr bgcolor=\"AAAAAA\">"
			"<th>Domain</th>"
			"<th>Domains Linked</th>"
			//"<th>Avg Quality</th>"
			"<th># Pages</th>"
			"<th>Extrap # Pages</th>"
			"<th>IP</th>"
			"</tr>\n";
		char hdr2[] = "<table cellpadding=\"5\" cellspacing=\"2\">"
			"<tr bgcolor=\"AAAAAA\">"
			"<th>IP</th>"
			"<th>Domain</th>"
			"<th>Domains Linked</th>"
			//"<th>Avg Quality</th>"
			"<th># Pages</th>"
			"<th>Extrap # Pages</th>"
			"</tr>\n";
		char clr1[] = "#FFFF00";//"yellow";
		char clr2[] = "#FFFF66";//"orange";
		//char clr3[] = "#0099FF";//"#66FF33";
		//char clr4[] = "#33FFCC";//"#33CC33";
		char *color;
		fprintf( fhndl,
			 "<html><head><title>Domain/IP Counter</title></head>\n"
			 "<body>"
			 "<h1>Domain/IP Counter</h1><br><br>"
			 "<a name=\"stats\">"
			 "<h2>Stats</h2>\n%s", menu );
		// Stats
		// NOTE(review): extrapolations divide by countDocs -- zero
		// records scanned would fault here
		fprintf( fhndl, "<br>\n\n<table>\n"
			 "<tr><th align=\"left\">Total Number of Domains</th>"
			 "<td>%"INT32"</td></tr>\n"
			 "<tr><th align=\"left\">Total Number of Ips</th>"
			 "<td>%"INT32"</td></tr>\n"
			 "<tr><th align=\"left\">Number of Documents Searched"
			 "</th><td>%"INT32"</td></tr>\n"
			 "<tr><th align=\"left\">Number of Failed Attempts</th>"
			 "<td>%"INT32"</td></tr><tr></tr><tr>\n"
			 "<tr><th align=\"left\">Number of Documents in Index"
			 "</th><td>%"INT64"</td></tr>\n"
			 "<tr><th align=\"left\">Estimated Domains in index</th>"
			 "<td>%"INT64"</td></tr>"
			 "</table><br><br><br>\n"
			 ,countDom,countIp,
			 countDocs, attempts-countDocs,total,
			 ((countDom*total)/countDocs) );
		fprintf( fhndl, "<a name=\"pid\">\n"
			 "<h2>Domains Sorted By Pages</h2>\n"
			 "%s", menu );
		gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_fcmp );
		// two passes over this table: by pages, then re-sorted by links
	printDomLp:
		fprintf( fhndl,"%s", hdr );
		recsDisp = countDom;
		if( countDom > 1000 ) recsDisp = 1000;
		for( int32_t i = 0; i < recsDisp; i++ ) {
			char buf[128];
			int32_t len;
			if( !dom_table[i] ) continue;
			if( i%2 ) color = clr2;
			else color = clr1;
			tmpdomi = (struct dom_info *)dom_table[i];
			len = tmpdomi->domLen;
			if( tmpdomi->domLen > 127 ) len = 126;
			strncpy( buf, tmpdomi->dom, len );
			buf[len] = '\0';
			fprintf( fhndl, "<tr bgcolor=\"%s\"><td>"
				 "<a href=\"%s%s\" target=\"_blank\">%s</a>"
				 "</td><td>%"INT32"</td>"
				 //"<td>%"INT64"</td>"
				 "<td>%"INT32"</td>"
				 "<td>%"INT64"</td><td>",
				 color, link_dom,
				 buf, buf, tmpdomi->lnkCnt,
				 //(tmpdomi->quality/tmpdomi->pages),
				 tmpdomi->pages,
				 ((tmpdomi->pages*total)/countDocs) );
			for( int32_t j = 0; j < tmpdomi->numIp; j++ ) {
				tmpipi = (struct ip_info *)tmpdomi->ip_list[j];
				strcpy ( buf , iptoa(tmpipi->ip) );
				fprintf( fhndl, "<a href=\"%s%s\""
					 "target=\"_blank\">%s</a>\n",
					 link_ip, buf, buf );
			}
			fprintf( fhndl, "</td></tr>\n" );
			/*
			  if( verbosity != 10 ) goto printDone;
			  gbsort(tmpdomi->lnk_table,tmpdomi->lnkCnt,sizeof(int32_t),
			  lnk_fcmp );
			  for( int32_t k = 0; k < tmpdomi->lnkCnt; k++ ) {
			  tmplnk = (struct lnk_info *)tmpdomi->lnk_table[k];
			  len = tmplnk->domLen;
			  if( len > 127 ) len = 126;
			  strncpy( buf, tmplnk->dom, len );
			  buf[len] = '\0';
			  fprintf( fhndl, "\t\t<tr bgcolor=\"green\"><td>"
			  "</td><td></td><td>%s</td><td></td><td>"
			  "%"INT32"</td><td>%"INT64"</td></tr>\n", buf,
			  tmplnk->pages,
			  ((tmplnk->pages*total)/countDocs) );
			  }
			  printDone:
			*/
			fprintf( fhndl, "\n" );
		}
		fprintf( fhndl, "</table>\n<br><br><br>" );
		if( loop == 0 ) {
			loop = 1;
			gbsort( dom_table, countDom, sizeof(struct dom_info *), dom_lcmp );
			fprintf( fhndl, "<a name=\"lid\">"
				 "<h2>Domains Sorted By Links</h2>\n%s", menu );
			goto printDomLp;
		}
		loop = 0;
		fprintf( fhndl, "<a name=\"pii\">"
			 "<h2>IPs Sorted By Pages</h2>\n%s", menu );
		gbsort( ip_table, countIp, sizeof(struct ip_info *), ip_fcmp );
		// same two-pass trick: by pages, then re-sorted by domain count
	printIpLp:
		fprintf( fhndl,"%s", hdr2 );
		recsDisp = countIp;
		if( countIp > 1000 ) recsDisp = 1000;
		for( int32_t i = 0; i < recsDisp; i++ ) {
			char buf[128];
			if( !ip_table[i] ) continue;
			tmpipi = (struct ip_info *)ip_table[i];
			strcpy ( buf , iptoa(tmpipi->ip) );
			if( i%2 ) color = clr2;
			else color = clr1;
			int32_t linked = 0;
			for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
				tmpdomi=(struct dom_info *)tmpipi->dom_list[j];
				linked += tmpdomi->lnkCnt;
			}
			fprintf( fhndl, "\t<tr bgcolor=\"%s\"><td>"
				 "<a href=\"%s%s\" target=\"_blank\">%s</a>"
				 "</td>"
				 "<td>%"INT32"</td>"
				 "<td>%"INT32"</td>"
				 //"<td>%"INT64"</td>"
				 "<td>%"INT32"</td>"
				 "<td>%"INT64"</td></tr>\n",
				 color,
				 link_ip, buf, buf, tmpipi->numDom, linked,
				 //(tmpipi->quality/tmpipi->pages),
				 tmpipi->pages,
				 ((tmpipi->pages*total)/countDocs) );
			/*
			  for( int32_t j = 0; j < tmpipi->numDom; j++ ) {
			  int32_t len;
			  tmpdomi=(struct dom_info *)tmpipi->dom_list[j];
			  len = tmpdomi->domLen;
			  if( tmpdomi->domLen > 127 ) len = 126;
			  strncpy( buf, tmpdomi->dom, len );
			  buf[len] = '\0';
			  if( j%2 ) color = clr4;
			  else color = clr3;
			  fprintf( fhndl, "<tr bgcolor=\"%s\"><td>"
			  "</td><td><a href=\"%s%s\">%s</a></td>"
			  "<td>%"INT32"</td><td>%"INT64""
			  "</td><td>%"INT32"</td><td> %"INT64"</td></tr>"
			  "\n", color, link_dom, buf,
			  buf, tmpdomi->lnkCnt,
			  (tmpdomi->quality/tmpdomi->pages),
			  tmpdomi->pages,
			  ((tmpdomi->pages*total)/countDocs) );
			  }
			*/
			fprintf( fhndl, "\n" );
		}
		fprintf( fhndl, "</table>\n<br><br><br>" );
		if( loop == 0 ) {
			loop = 1;
			// NOTE(review): "struct ip_table" is a typo for
			// ip_info; all data pointers are the same size so the
			// sort still works
			gbsort( ip_table, countIp, sizeof(struct ip_table *), ip_dcmp );
			fprintf( fhndl, "<a name=\"dii\">"
				 "<h2>IPs Sorted By Domains</h2>\n%s", menu );
			goto printIpLp;
		}
		if( fclose( fhndl ) < 0 ) {
			log( LOG_INFO, "cntDm: File Close Failed." );
			return;
		}
		fhndl = 0;
		// free everything; ima/dma accumulate bytes released for the
		// usage report (same sizeof(int32_t) accounting caveat as above)
		int32_t ima = 0;
		int32_t dma = 0;
		log( LOG_INFO, "cntDm: Freeing ip info struct..." );
		for( int32_t i = 0; i < countIp; i++ ) {
			if( !ip_table[i] ) continue;
			//value = ipHT.getValue( ip_table[i] );
			//if(value == 0) continue;
			tmpipi = (struct ip_info *)ip_table[i];
			mfree( tmpipi->dom_list, tmpipi->numDom*sizeof(int32_t),
			       "main-dcflip" );
			ima += tmpipi->numDom * sizeof(int32_t);
			mfree( tmpipi, sizeof(struct ip_info), "main-dcfip" );
			ima += sizeof(struct ip_info);
			tmpipi = NULL;
		}
		mfree( ip_table, numRecs * sizeof(struct ip_table *), "main-dcfit" );
		log( LOG_INFO, "cntDm: Freeing domain info struct..." );
		for( int32_t i = 0; i < countDom; i++ ) {
			if( !dom_table[i] ) continue;
			tmpdomi = (struct dom_info *)dom_table[i];
			/*
			  for( int32_t j = 0; j < tmpdomi->lnkCnt; j++ ) {
			  if( !tmpdomi->lnk_table[j] ) continue;
			  tmplnk=(struct lnk_info *)tmpdomi->lnk_table[j];
			  mfree( tmplnk->dom, tmplnk->domLen,
			  "main-dsfsld" );
			  mfree( tmplnk, sizeof(struct lnk_info),
			  "main-dsfsli" );
			  }
			*/
			mfree( tmpdomi->lnk_table,
			       tmpdomi->tableSize*sizeof(int32_t),
			       "main-dcfsdlt" );
			dma += tmpdomi->tableSize * sizeof(int32_t);
			mfree( tmpdomi->ip_list, tmpdomi->numIp*sizeof(int32_t),
			       "main-dcfldom" );
			dma += tmpdomi->numIp * sizeof(int32_t);
			mfree( tmpdomi->dom, tmpdomi->domLen, "main-dcfsdom" );
			dma += tmpdomi->domLen;
			//tmpdomi->dht.reset();
			//mdelete( tmpdomi->dht, sizeof(HashTable), "main-dcmdht" );
			//delete tmpdomi->dht;
			mfree( tmpdomi, sizeof(struct dom_info), "main-dcfdom" );
			dma+= sizeof(struct dom_info);
			tmpdomi = NULL;
		}
		mfree( dom_table, numRecs * sizeof(struct dom_info *), "main-dcfdt" );
		int64_t time_end = gettimeofdayInMilliseconds_force();
		log( LOG_INFO, "cntDm: Took %"INT64"ms to count domains in %"INT32" recs.",
		     time_end-time_start, countDocs );
		log( LOG_INFO, "cntDm: %"INT32" bytes of Total Memory Used.",
		     ima + dma + (8 * numRecs) );
		log( LOG_INFO, "cntDm: %"INT32" bytes Total for IP.", ima );
		log( LOG_INFO, "cntDm: %"INT32" bytes Total for Dom.", dma );
		// NOTE(review): divides by countIp/countDom -- faults if no
		// distinct ips or domains were seen
		log( LOG_INFO, "cntDm: %"INT32" bytes Average for IP.", ima/countIp );
		log( LOG_INFO, "cntDm: %"INT32" bytes Average for Dom.", dma/countDom );
		return;
	}
	// not done yet: fetch the next batch of titlerecs
	goto loop;
}
// JAB: warning abatement
// NOTE(review): compiled out (#if 0); legacy comparator kept for reference
// only.  It rebuilds each element pointer from just 4 bytes (32-bit only),
// writes the second IP's bytes into n2 where n4 was apparently intended
// (n4 stays 0), and returns (n4 - n3)/100 -- broken even if re-enabled.
#if 0
// Sort by IP address 9->0
int ip_hcmp (const void *p1, const void *p2) {
	int32_t n1, n2;
	struct ip_info *ii1;
	struct ip_info *ii2;
	int64_t n3 = 0;
	int64_t n4 = 0;
	*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
	*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
	*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
	*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
	*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
	*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
	*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
	*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
	ii1 = (struct ip_info *)n1;
	ii2 = (struct ip_info *)n2;
	*(((unsigned char *)(&n3))+3) = *(((char *)ii1->ip)+0);
	*(((unsigned char *)(&n3))+2) = *(((char *)ii1->ip)+1);
	*(((unsigned char *)(&n3))+1) = *(((char *)ii1->ip)+2);
	*(((unsigned char *)(&n3))+0) = *(((char *)ii1->ip)+3);
	*(((unsigned char *)(&n2))+3) = *(((char *)ii2->ip)+0);
	*(((unsigned char *)(&n2))+2) = *(((char *)ii2->ip)+1);
	*(((unsigned char *)(&n2))+1) = *(((char *)ii2->ip)+2);
	*(((unsigned char *)(&n2))+0) = *(((char *)ii2->ip)+3);
	return (n4 - n3)/100;
}
#endif
// Sort by IP frequency in pages 9->0
// gbsort()/qsort() hands the comparator pointers TO the array elements;
// the elements here are themselves struct ip_info pointers, so p1/p2 are
// really struct ip_info **.  The old code reassembled only the low 4 bytes
// of each pointer by hand (the "break this!" note), which truncates
// pointers on 64-bit builds; a plain double-pointer dereference is both
// correct and portable.
int ip_fcmp (const void *p1, const void *p2) {
	struct ip_info *ii1 = *(struct ip_info * const *)p1;
	struct ip_info *ii2 = *(struct ip_info * const *)p2;
	// higher page count sorts first (descending)
	return ii2->pages-ii1->pages;
}
// Sort by number of domains linked to IP, descending
// p1/p2 point at array elements that are struct ip_info pointers
// (struct ip_info **).  The previous byte-by-byte reassembly copied only
// 4 of the pointer's bytes -- undefined on 64-bit builds (the "break
// this!" note); dereference the element pointer directly instead.
int ip_dcmp (const void *p1, const void *p2) {
	struct ip_info *ii1 = *(struct ip_info * const *)p1;
	struct ip_info *ii2 = *(struct ip_info * const *)p2;
	// more hosted domains sorts first (descending)
	return ii2->numDom-ii1->numDom;
}
// JAB: warning abatement
// NOTE(review): compiled out (#if 0); host-name comparator kept for
// reference.  Like the live comparators it rebuilds each element pointer
// from only 4 bytes, so it is 32-bit-only as written.
#if 0
// Sort by Host name, a->z
int dom_hcmp (const void *p1, const void *p2) {
	int32_t len, n1, n2;
	struct dom_info *di1;
	struct dom_info *di2;
	*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
	*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
	*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
	*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
	*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
	*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
	*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
	*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
	di1 = (struct dom_info *)n1;
	di2 = (struct dom_info *)n2;
	// compare up to the shorter of the two names
	if( di1->domLen < di2->domLen ) len = di1->domLen;
	else len = di2->domLen;
	return strncasecmp( di1->dom, di2->dom, len );
}
#endif
// Sort by page frequency in titlerec 9->0
// p1/p2 point at array elements that are struct dom_info pointers
// (struct dom_info **).  The old code copied only 4 bytes of each pointer
// into a char* (the "break this!" note), truncating pointers on 64-bit
// builds; dereferencing the element pointer is correct on any platform.
int dom_fcmp (const void *p1, const void *p2) {
	struct dom_info *di1 = *(struct dom_info * const *)p1;
	struct dom_info *di2 = *(struct dom_info * const *)p2;
	// more pages sorts first (descending)
	return di2->pages-di1->pages;
}
// Sort by quantity of outgoing links 9-0
// p1/p2 point at array elements that are struct dom_info pointers
// (struct dom_info **).  Replaces the hand-rolled 4-byte pointer
// reassembly (broken on 64-bit, per the "break this!" note) with a plain
// double-pointer dereference.
int dom_lcmp (const void *p1, const void *p2) {
	struct dom_info *di1 = *(struct dom_info * const *)p1;
	struct dom_info *di2 = *(struct dom_info * const *)p2;
	// more distinct outlinked domains sorts first (descending)
	return di2->lnkCnt-di1->lnkCnt;
}
// JAB: warning abatement
// NOTE(review): compiled out (#if 0); link-domain name comparator kept for
// reference.  Same 4-byte pointer reassembly as the live comparators, so
// 32-bit-only as written.
#if 0
// Sort by domain name a-z
int lnk_hcmp (const void *p1, const void *p2) {
	int32_t len, n1, n2;
	struct lnk_info *li1;
	struct lnk_info *li2;
	*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
	*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
	*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
	*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
	*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
	*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
	*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
	*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
	li1 = (struct lnk_info *)n1;
	li2 = (struct lnk_info *)n2;
	// compare up to the shorter of the two names
	if( li1->domLen < li2->domLen ) len = li1->domLen;
	else len = li2->domLen;
	return strncasecmp( li1->dom, li2->dom, len );
}
#endif
// JAB: warning abatement
// NOTE(review): compiled out (#if 0); link-use frequency comparator kept
// for reference.  Shares the 4-byte pointer reassembly of the live
// comparators ("break this!" note), so 32-bit-only as written.
#if 0
// Sort by frequency of link use, 9-0
int lnk_fcmp (const void *p1, const void *p2) {
	//int32_t n1, n2;
	// break this! need to fix later MDW 11/12/14
	char *n1 ;
	char *n2 ;
	struct lnk_info *li1;
	struct lnk_info *li2;
	*(((unsigned char *)(&n1))+0) = *(((char *)p1)+0);
	*(((unsigned char *)(&n1))+1) = *(((char *)p1)+1);
	*(((unsigned char *)(&n1))+2) = *(((char *)p1)+2);
	*(((unsigned char *)(&n1))+3) = *(((char *)p1)+3);
	*(((unsigned char *)(&n2))+0) = *(((char *)p2)+0);
	*(((unsigned char *)(&n2))+1) = *(((char *)p2)+1);
	*(((unsigned char *)(&n2))+2) = *(((char *)p2)+2);
	*(((unsigned char *)(&n2))+3) = *(((char *)p2)+3);
	li1 = (struct lnk_info *)n1;
	li2 = (struct lnk_info *)n2;
	return li2->pages-li1->pages;
}
#endif
/*
static void printBits(qvec_t bits, int32_t numDigits, char *buf){
int32_t pos = 0;
for (int32_t i=0; i < numDigits ; i++){
if (i && i%4 == 0) buf[pos++] = ' ';
if (bits & (1 << (numDigits-i-1))) buf[pos++] = '1';
else buf[pos++] = '0';
}
buf[pos] = 0;
}
bool testBoolean() {
if (!queryTest()) return false;
char *testQueries [] = {
"a AND b OR c",
"a OR b AND c",
"a AND NOT b OR b AND NOT a",
//vivismo query bug
"canada suntanning OR beaches",
"canada AND suntanning OR beaches",
"canada AND (suntanning OR beaches)",
"(canada AND suntanning) OR beaches",
"a AND b OR c AND d AND e OR f",
// buzz problem query
"(a AND NOT (b OR c)) d | f",
"foo AND (bar OR boo) keywords | sortkey"
// "a AND NOT b OR c",
// "a AND NOT b OR b AND NOT a",
// "a OR b | c",
// "(a AND b OR c) | d",
};
char *truthTables [] = {
"00011111",
"01010111",
"0110",
// term 0 has implicitbits for 1 and 2
"0101011111111111",
"00011111",
"00010101",
"00011111",
// big uns
"00010001000100010001000100011111"
"11111111111111111111111111111111",
"00000000000000000000000001000000",
"00000000000000000000000000010101",
};
int numTests = 10;
// buffer for holding truth table
int32_t bufSize = 10000000;
char *bitScoresBuf = (char*) mmalloc(bufSize, "bitScoreBuf");
if (!bitScoresBuf){
log("query: unable to alloc bitScores buffer: %s",
mstrerror(g_errno) );
return false;
}
for (int i=0; i < numTests ; i++) {
Query q;
if ( ! q.set2 ( testQueries[i] , langUnknown ) ) {
log("query: unable to set query: %s",
mstrerror(g_errno) );
continue;
}
q.setBitMap();
if ( ! q.setBitScoresBoolean(bitScoresBuf, bufSize) ) {
log("query: unable to set bitScores: %s",
mstrerror(g_errno) );
mfree(bitScoresBuf, bufSize,"bitScoresBuf");
return false;
}
printf("\n");
log(LOG_INIT, "query: Test #%d: %s",
i, testQueries[i]);
// print parsed expressions
SafeBuf sbuf(1024);
Expression *e = &q.m_expressions[0];
while (e->m_parent) e = e->m_parent;
e->print(&sbuf);
log("query: %s", sbuf.getBufStart());
int32_t numCombos = 1 << q.m_numExplicitBits;
//log("query: numcombos: %d", numCombos);
// hack for duplicate terms bits so we don't need
// an unreasonably
// large test table
qvec_t bitMask = 0;
for (int j=0;j<q.m_numTerms;j++){
QueryTerm *qt = &q.m_qterms[j];
bitMask |= qt->m_explicitBit;
bitMask |= qt->m_implicitBits;
sbuf.reset();
//sbuf.utf16Encode(qt->m_term, qt->m_termLen);
sbuf.safeMemcpy(qt->m_term, qt->m_termLen);
log("query: term #%d: ebit=0x08%"XINT64" ibit=0x08%"XINT64" %s",
j,
(int64_t) q.m_qterms[j].m_explicitBit,
(int64_t) q.m_qterms[j].m_implicitBits,
sbuf.getBufStart());
}
//some problem queries give no terms, and a zero bitmask
// causes it to produce no errors
if (!bitMask) bitMask = numCombos-1;
int32_t errorCount = 0;
char bitBuf[64];
bitBuf[63] = 0;
printBits(bitMask, q.m_numExplicitBits, bitBuf);
log("query: bit mask: 0x%08llx (%s)",
(int64_t) bitMask, bitBuf);
for (int j=0;j<numCombos;j++){
qvec_t bits = j & bitMask;
char ttval = truthTables[i][bits]-'0';
// sanity check...if we go over bounds of truthTable
// array, we are in the test query array and
// weird stuff happens
if (ttval != 0 && ttval != 1){
log("query: error in truth table #%d!!!",i);
char *xx=NULL;*xx=0;
}
printBits(bits,q.m_numExplicitBits,bitBuf);
if (q.m_bitScores[bits])
log(LOG_INIT, "query: 0x%04llx: (%s) true",
(int64_t) bits, bitBuf);
if (q.m_bitScores[bits] && ttval)
continue;
if (!q.m_bitScores[bits] && !ttval)
continue;
errorCount++;
printBits(bits, q.m_numExplicitBits, bitBuf);
log("query: ERROR! 0x%04llx: %s %s",
(int64_t)bits, bitBuf,
q.m_bitScores[bits]?"true":"false");
}
if (!errorCount) log(LOG_INIT,
"query: Test #%d Passed (%"INT32" values)",
i, numCombos);
else log(LOG_WARN, "Test #%d FAILED %"INT32" of %"INT32" truth values",
i, errorCount, numCombos);
}
mfree(bitScoresBuf, bufSize,"bitScoresBuf");
return true;
}
*/
//#include "LinkText.h"
/*
void testSpamRules(char *coll,
int32_t startFileNum,
int32_t numFiles,
bool includeTree,
int64_t docid) {
//int32_t collLen = gbstrlen(coll);
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
//g_conf.m_checksumdbMaxDiskPageCacheMem = 0;
g_conf.m_spiderdbMaxDiskPageCacheMem = 0;
g_conf.m_tfndbMaxDiskPageCacheMem = 0;
g_titledb.init ();
g_collectiondb.init(true);
g_titledb.getRdb()->addRdbBase1 ( coll );
key_t startKey ;
key_t endKey ;
key_t lastKey ;
startKey.setMin();
endKey.setMax();
lastKey.setMin();
startKey = g_titledb.makeFirstTitleRecKey ( docid );
// turn off threads
g_threads.disableThreads();
// get a meg at a time
int32_t minRecSizes = 1024*1024;
Msg5 msg5;
Msg5 msg5b;
Msg5 msg5c;
RdbList list;
RdbList ulist;
if (!ucInit(g_hostdb.m_dir, true)) {
log("Unicode initialization failed!");
}
g_tfndb.init ();
g_collectiondb.init(true);
g_tfndb.getRdb()->addRdbBase1 ( coll );
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_TITLEDB ,
coll ,
&list ,
startKey ,
endKey ,
minRecSizes ,
includeTree ,
false , // add to cache?
0 , // max cache age
startFileNum ,
numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&msg5b )){
log(LOG_LOGIC,"db: getList did not block.");
return;
}
// all done if empty
if ( list.isEmpty() ) return;
// loop over entries in list
for ( list.resetListPtr() ; ! list.isExhausted() ;
list.skipCurrentRecord() ) {
key_t k = list.getCurrentKey();
char *rec = list.getCurrentRec();
int32_t recSize = list.getCurrentRecSize();
//int64_t docId = g_titledb.getDocIdFromKey ( k );
if ( k <= lastKey )
log("key out of order. "
"lastKey.n1=%"XINT32" n0=%"XINT64" "
"currKey.n1=%"XINT32" n0=%"XINT64" ",
lastKey.n1,lastKey.n0,
k.n1,k.n0);
lastKey = k;
// print deletes
// if ( (k.n0 & 0x01) == 0) {
// fprintf(stderr,"n1=%08"XINT32" n0=%016"XINT64" docId=%012"INT64" "
// "hh=%07"XINT32" ch=%08"XINT32" (del)\n",
// k.n1 , k.n0 , docId , hostHash , contentHash );
// continue;
// }
// uncompress the title rec
TitleRec tr;
if ( ! tr.set ( rec , recSize , false ) )
continue;
Xml xml;
char *s = tr.getContent();
int32_t slen = tr.getContentLen();
int16_t csEnum = tr.getCharset();
if ( ! xml.set ( csEnum, s , slen ,
false , // ownData?
0,
false,
tr.getVersion() ) )
continue;
Links links;
Url *linker = tr.getRedirUrl();
//Xml *sx = g_tagdb.getSiteXml ( tr.getSiteFilenum(),
// coll , //tr.getColl() ,
// collLen);//tr.getCollLen());
links.set ( true , &xml , linker , false, // includeLinkHashes
true, TITLEREC_CURRENT_VERSION, // true=useBaseHref?
0 );
Words words;
words.set(&xml, true, 0);
log(LOG_WARN, "looking at %s", tr.getUrl()->getUrl());
//g_siteBonus.isSerp ( tr.getUrl(), &xml, &links, &words);
g_siteBonus.getNegativeQualityWeight (tr.getUrl(),
&xml,
&links,
&words,
coll,
//NULL,//siterec
NULL,//safebuf
0); //niceness
}
startKey = *(key_t *)list.getLastKey();
startKey += (uint32_t) 1;
// watch out for wrap around
if ( startKey < *(key_t *)list.getLastKey() ) {
return;
}
goto loop;
}
// Run automated qa test showing the differences between servers located at
// s1 and s2.
// u: optional filename of list of urls to check for parse diffs
// q: optional filename of list of queries to check for result diffs
void qaTest ( char *s1, char *s2, char *u, char *q) {
QAClient qaClient;
qaClient.init(s1, s2, u, q);
//qaClient.parseUrls(urlList);
//qaClient.diffQueries(queryList);
// Crap, we need a loop
qaClient.runTests();
}
// Need a test for the diff method used in qa test
void xmlDiffTest(char *file1, char *file2, DiffOpt *opt){
diffXmlFiles(file1, file2, opt);
}
*/
// generate the copies that need to be done to scale from oldhosts.conf
// to newhosts.conf topology.
int collinject ( char *newHostsConf ) {
g_hostdb.resetPortTables();
Hostdb hdb;
//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
if ( ! hdb.init( 0/*assume we're zero*/) ) {
log("collinject failed. Could not init hostdb with %s",
newHostsConf);
return -1;
}
// ptrs to the two hostdb's
Hostdb *hdb1 = &g_hostdb;
Hostdb *hdb2 = &hdb;
if ( hdb1->m_numHosts != hdb2->m_numHosts ) {
log("collinject: num hosts differ!");
return -1;
}
// . ensure old hosts in g_hostdb are in a derivate groupId in
// newHostsConf
// . old hosts may not even be present! consider them the same host,
// though, if have same ip and working dir, because that would
// interfere with a file copy.
for ( int32_t i = 0 ; i < hdb1->m_numShards ; i++ ) {
//Host *h1 = &hdb1->getHost(i);//m_hosts[i];
//int32_t gid = hdb1->getGroupId ( i ); // groupNum
uint32_t shardNum = (uint32_t)i;
Host *h1 = hdb1->getShard ( shardNum );
Host *h2 = hdb2->getShard ( shardNum );
printf("ssh %s 'nohup /w/gbi -w /w/ inject titledb "
"%s:%"INT32" >& /w/ilog' &\n"
, h1->m_hostname
, iptoa(h2->m_ip)
//, h2->m_hostname
, (int32_t)h2->m_httpPort
);
}
return 1;
}
// . decide whether the keep-alive loop should stop restarting gb
// . scans log files in the working dir; if 5+ of them were modified in the
//   last couple of minutes and end with a startup-failure signature
//   ("sigbadhandler" or "Failed to bind") then recovery is futile
// . returns true to give up, false to keep trying
bool isRecoveryFutile ( ) {
	// scan logs in last 60 seconds
	Dir dir;
	dir.set ( g_hostdb.m_dir );
	dir.open ();
	// scan files in dir
	char *filename;
	int32_t now = getTimeLocal();
	int32_t fails = 0;
	// getNextFilename() writes into this
	char pattern[8]; strcpy ( pattern , "*"); // log*-*" );
	while ( ( filename = dir.getNextFilename ( pattern ) ) ) {
		// only consider files named like "log<hostid>-..."
		char *p = filename;
		if ( !strstr ( filename,"log") ) continue;
		// skip "log"
		p += 3;
		// skip digits for hostid
		while ( isdigit(*p) ) p++;
		// skip hyphen
		if ( *p != '-' ) continue;
		p++;
		// open file
		File ff;
		ff.set ( dir.getDir() , filename );
		// skip if empty or if ff.getFileSize() had an error (-1)
		int32_t fsize = ff.getFileSize();
		if ( fsize <= 0 ) continue;
		ff.open ( O_RDONLY );
		// get time stamp
		int32_t timestamp = ff.getLastModifiedTime ( );
		// skip if not within 2 minutes
		if ( timestamp < now - 2*60 ) continue;
		// read the tail of the log to see if it ends in a crash
		int32_t toRead = 3000;
		if ( toRead > fsize ) toRead = fsize;
		char mbuf[3002];
		// pre-clear first byte in case the read fails entirely
		mbuf[0] = '\0';
		ff.read ( mbuf , toRead , fsize - toRead );
		// NUL-terminate before scanning: ff.read() fills raw bytes
		// only, and the original code ran strstr() on an
		// unterminated buffer (undefined behavior)
		mbuf[toRead] = '\0';
		bool failedToStart = false;
		if ( strstr (mbuf,"sigbadhandler") ) failedToStart = true;
		if ( strstr (mbuf,"Failed to bind") ) failedToStart = true;
		if ( ! failedToStart ) continue;
		// count it otherwise
		fails++;
	}
	// if we had less than 5 failures to start in the recent window
	// do not consider recovery futile
	if ( fails < 5 ) return false;
	log("process: KEEP ALIVE LOOP GIVING UP. Five or more cores in "
	    "last 60 seconds.");
	// otherwise, give up!
	return true;
}
// . compute the absolute, '/'-terminated working directory of the gb
//   binary from how it was invoked (arg2 is argv[0])
// . resolves one level of symlink, collapses ".." components, and prepends
//   the current working directory when the path is relative
// . returns a pointer to a STATIC buffer (not thread-safe), NULL on error,
//   or the default install dir if no gb binary exists in the computed dir
char *getcwd2 ( char *arg2 ) {
	char argBuf[1026];
	char *arg = argBuf;
	//
	// arg2 examples:
	// ./gb
	// /bin/gb (symlink to ../../var/gigablast/data0/gb)
	// /usr/bin/gb (symlink to ../../var/gigablast/data0/gb)
	//
	// if argv[0] is a symbolic link, get the path it points to
	char tmp[1026];
	int32_t tlen = readlink ( arg2 , tmp , 1020 );
	// if we got the actual path, copy that over
	if ( tlen != -1 ) {
		// readlink() does NOT NUL-terminate its output, so do it
		// ourselves before treating tmp as a C string (fixes
		// undefined behavior in the strcpy()s below)
		tmp[tlen] = '\0';
		// if symbolic link is relative...
		if ( tmp[0]=='.' && tmp[1]=='.') {
			// store original path (/bin/gb --> ../../var/gigablast/data/gb)
			strcpy(arg,arg2); // /bin/gb
			// back up to /
			while(arg[gbstrlen(arg)-1] != '/' ) arg[gbstrlen(arg)-1] = '\0';
			int32_t len2 = gbstrlen(arg);
			strcpy(arg+len2,tmp);
		}
		else {
			strcpy(arg,tmp);
		}
	}
	else {
		strcpy(arg,arg2);
	}
 again:
	// now remove ..'s from path, restarting the scan after each splice
	char *p = arg;
	for ( ; *p ; p++ ) {
		if (p[0] != '.' || p[1] !='.' ) continue;
		// if .. is at start of string just drop it
		if ( p == arg ) {
			gbmemcpy ( arg , p+2,gbstrlen(p+2)+1);
			goto again;
		}
		// find previous /
		char *slash = p-1;
		// ".." must be preceded by '/'; deliberate hard-crash
		// idiom used throughout this codebase to catch bad state
		if ( *slash !='/' ) { char *xx=NULL;*xx=0; }
		slash--;
		for ( ; slash > arg && *slash != '/' ; slash-- );
		if ( slash<arg) slash=arg;
		// splice out the "<dir>/.." component
		gbmemcpy(slash,p+2,gbstrlen(p+2)+1);
		goto again;
	}
	// find length of the directory part of arg (through last '/')
	char *a = arg;
	int32_t alen = 0;
	for ( ; *a ; a++ ) {
		if ( *a != '/' ) continue;
		alen = a - arg + 1;
	}
	if ( alen > 512 ) {
		log("db: path is too long");
		g_errno = EBADENGINEER;
		return NULL;
	}
	// hack off the binary name but keep the trailing "/"
	arg[alen] = '\0';
	// get cwd which is only relevant to us if arg is a relative path
	static char s_cwdBuf[1025];
	// getcwd() can fail (e.g. cwd was removed); fall back to "/" so
	// the code below never indexes s_cwdBuf[-1] on an empty buffer
	if ( ! getcwd ( s_cwdBuf , 1020 ) )
		strcpy ( s_cwdBuf , "/" );
	char *end = s_cwdBuf + gbstrlen(s_cwdBuf);
	// make sure cwd ends in /
	if ( s_cwdBuf[gbstrlen(s_cwdBuf)-1] != '/' ) {
		int32_t len = gbstrlen(s_cwdBuf);
		s_cwdBuf[len] = '/';
		s_cwdBuf[len+1] = '\0';
		end++;
	}
	// if "arg" is a RELATIVE path then append it to cwd
	if ( arg && arg[0]!='/' ) {
		if ( arg[0]=='.' && arg[1]=='/' ) {
			gbmemcpy ( end , arg+2 , alen -2 );
			end += alen - 2;
		}
		else {
			gbmemcpy ( end , arg , alen );
			end += alen;
		}
		*end = '\0';
	}
	// if our path started with / then it was absolute...
	else {
		strncpy(s_cwdBuf,arg,alen);
		s_cwdBuf[alen]='\0';
	}
	// make sure it ends in / for consistency
	int32_t clen = gbstrlen(s_cwdBuf);
	if ( s_cwdBuf[clen-1] != '/' ) {
		// NOTE(review): this OVERWRITES the last char rather than
		// appending; both branches above already leave a trailing
		// '/', so this path looks unreachable — confirm before
		// changing it
		s_cwdBuf[clen-1] = '/';
		s_cwdBuf[clen] = '\0';
		clen--;
	}
	// ensure the binary exists in that dir.
	// binaryCmd is usually "gb" but derive it from argv[0] to be safe
	char *binaryCmd = arg2 + gbstrlen(arg2) - 1;
	// stop at the start of arg2 so we never read arg2[-1] (the
	// original loop could read one byte before the buffer)
	for ( ; binaryCmd > arg2 && binaryCmd[-1] != '/' ; binaryCmd-- );
	File fff;
	fff.set (s_cwdBuf,binaryCmd);
	// assume it is in the usual spot
	if ( fff.doesExist() ) return s_cwdBuf;
	// try just "gb" as binary
	fff.set(s_cwdBuf,"gb");
	if ( fff.doesExist() ) return s_cwdBuf;
	// if nothing is found resort to the default location
	return "/var/gigablast/data0/";
}
///////
//
// used to make package to install files for the package
//
///////
// . copy the gb runtime files from the current dir into dstDir
// . used when building a debian/redhat package ('make install')
// . returns 0 on success, -1 if the cp command failed
int copyFiles ( char *dstDir ) {
	char *srcDir = "./";
	// ask Process for the space-separated list of files an install needs
	SafeBuf fileListBuf;
	g_process.getFilesToCopy ( srcDir , &fileListBuf );
	// data files (e.g. wikititles2.dat) are intentionally NOT copied
	// here; gb builds them itself on first run while bound to port 8000
	SafeBuf tmp;
	tmp.safePrintf(
		       "cp -r %s %s"
		       , fileListBuf.getBufStart()
		       , dstDir
		       );
	fprintf(stderr,"\nRunning cmd: %s\n",tmp.getBufStart());
	// check the shell's exit status; the original ignored it and
	// reported success even when the copy failed
	if ( system ( tmp.getBufStart() ) != 0 ) {
		fprintf(stderr,"command failed: %s\n",tmp.getBufStart());
		return -1;
	}
	return 0;
}
// crude disk benchmark: create 100 small files of random numbers in the
// current dir, then time how long unlinking all of them takes and print
// the elapsed milliseconds to stderr
void rmTest() {
	// make the temp files (100, not "five" as an old comment claimed);
	// the unused function-scope File that shadowed the loop-local one
	// has been removed
	int32_t max = 100;
	for ( int32_t i = 0 ; i < max ; i++ ) {
		SafeBuf fn;
		fn.safePrintf("./tmpfile%"INT32"",i);
		// fill with ~100 lines of random ints so the file is nonempty
		SafeBuf sb;
		for ( int32_t j = 0 ; j < 100 ; j++ ) {
			sb.safePrintf("%"INT32"\n",(int32_t)rand());
		}
		sb.save ( fn.getBufStart() );
	}
	// now delete them, timing only the unlink phase
	fprintf(stderr,"Deleting files\n");
	int64_t now = gettimeofdayInMilliseconds_force();
	for ( int32_t i = 0 ; i < max ; i++ ) {
		SafeBuf fn;
		fn.safePrintf("./tmpfile%"INT32"",i);
		File f;
		f.set ( fn.getBufStart() );
		f.unlink();
	}
	int64_t took = gettimeofdayInMilliseconds_force() - now;
	fprintf(stderr,"Deleting files took %"INT64" ms\n",took);
}