Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Matt Wells 2014-02-28 08:23:59 -08:00
commit 11efab9862
14 changed files with 227 additions and 152 deletions

View File

@ -452,7 +452,7 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_diffbotOnlyProcessIfNewUrl = true;
// default respider to off
cr->m_collectiveRespiderFrequency = 0.0;
cr->m_restrictDomain = true;
//cr->m_restrictDomain = true;
// reset the crawl stats
// . this will core if a host was dead and then when it came
// back up host #0's parms.cpp told it to add a new coll
@ -2091,6 +2091,66 @@ bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
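
The block above frees any previously compiled expressions, expands the shortcut syntax in the crawl/process patterns, and recompiles them with POSIX regcomp(). A minimal standalone sketch of that compile step follows; the helper name is an assumption, and it reports the failure with regerror() (regcomp() does not set errno, so the mstrerror(errno) in the log call above may print an unrelated error):

#include <regex.h>
#include <cstdio>

// sketch of the compile step for m_ucr / m_upr: returns true and leaves *re
// compiled on success; on failure it logs and returns false so the caller can
// leave its m_hasucr / m_hasupr flag off and skip regfree() later
static bool compileUrlRegex ( regex_t *re , const char *pattern ) {
	if ( ! pattern || ! pattern[0] ) return false;
	// same flags as above: extended syntax, newline-sensitive matching
	int err = regcomp ( re , pattern , REG_EXTENDED | REG_NEWLINE );
	if ( err != 0 ) {
		char msg[256];
		regerror ( err , re , msg , sizeof(msg) );
		fprintf ( stderr , "coll: regcomp %s failed: %s. Ignoring.\n" ,
		          pattern , msg );
		return false;
	}
	return true;
}

A successfully compiled pattern is later matched with regexec() and must be released with regfree() before the filters are rebuilt, which is exactly what the m_hasucr / m_hasupr flags track.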
@ -2139,11 +2199,18 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// 2nd default filter
// always turn this on for now. they need to add domains they want
// to crawl as seeds so they do not spider the web.
//if ( m_restrictDomain ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
//}
// no, because FTB seeds with link pages that link to other
// domains. they just need to be sure to supply a crawl pattern
// to avoid spidering the whole web.
//
// if they did not EXPLICITLY provide a url crawl pattern or
// url crawl regex then restrict to seeds to prevent from spidering
// the entire internet
if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}
m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 15;
@ -2268,66 +2335,6 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_numRegExs8 = i;
//m_numRegExs11 = i;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}
//char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
//if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }

View File

@ -458,7 +458,7 @@ class CollectionRec {
char m_enforceNewQuotas ;
char m_doIpLookups ; // considered iff using proxy
char m_useRobotsTxt ;
char m_restrictDomain ; // say on same domain as seeds?
//char m_restrictDomain ; // say on same domain as seeds?
char m_doTuringTest ; // for addurl
char m_applyFilterToText ; // speeds us up
char m_allowHttps ; // read HTTPS using SSL

View File

@ -2315,10 +2315,10 @@ uint32_t Hostdb::getShardNum ( char rdbId,void *k ) { // ,bool split ) {
else if ( rdbId == RDB_LINKDB || rdbId == RDB2_LINKDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 26))>>3];
}
else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
unsigned long long d = g_tfndb.getDocId ( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
//else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
// unsigned long long d = g_tfndb.getDocId ( (key_t *)k );
// return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
//}
else if ( rdbId == RDB_TITLEDB || rdbId == RDB2_TITLEDB2 ) {
unsigned long long d = g_titledb.getDocId ( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
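
For docid-keyed rdbs like titledb the shard is chosen by folding bits of the docid into an index into the precomputed m_map table, as the branch above shows. A tiny standalone illustration of that fold (types and parameters simplified; they stand in for Hostdb's m_map and MAX_KSLOTS):

#include <stdint.h>

// sketch: pick the shard for a docid the same way the titledb branch does,
// by xor-folding docid bits and masking down to a slot in the map
uint32_t shardForDocId ( unsigned long long d ,
                         const uint32_t *map ,   // Hostdb's m_map[]
                         long numSlots ) {       // MAX_KSLOTS, a power of 2
	return map [ ((d >> 14) ^ (d >> 7)) & (numSlots - 1) ];
}

The tfndb branch commented out above used the same fold; it goes away because tfndb itself is removed in this commit (see the Makefile, Rdb.cpp and RdbBase.cpp changes below).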

View File

@ -633,8 +633,8 @@ static void sendReplyWrapper ( void *state ) {
// steal this buffer
char *reply1 = info->getBufStart();
long replySize = info->length();
// sanity
if ( replySize <= 0 ) { char *xx=NULL;*xx=0; }
// sanity. no, if the collrec is not found it's 0!
if ( ! saved && replySize <= 0 ) { char *xx=NULL;*xx=0; }
// get original request
Msg25Request *req = (Msg25Request *)slot2->m_readBuf;
// sanity
@ -645,7 +645,10 @@ static void sendReplyWrapper ( void *state ) {
nextLink:
UdpSlot *udpSlot = req->m_udpSlot;
// update for next udpSlot
req = req->m_next;
// just dup the reply for each one
char *reply2 = (char *)mdup(reply1,replySize,"m25repd");
@ -666,7 +669,6 @@ static void sendReplyWrapper ( void *state ) {
}
// if we had a link
req = req->m_next;
if ( req ) goto nextLink;
// the destructor
@ -684,6 +686,10 @@ void handleRequest25 ( UdpSlot *slot , long netnice ) {
// make sure this always NULL for our linked list logic
req->m_next = NULL;
// udp socket for sending back the final linkInfo in m_linkInfoBuf
// used by sendReply()
req->m_udpSlot = slot;
// set up the hashtable if our first time
if ( ! g_lineTable.isInitialized() )
g_lineTable.set ( 8,4,256,NULL,0,false,MAX_NICENESS,"lht25");
@ -735,10 +741,6 @@ void handleRequest25 ( UdpSlot *slot , long netnice ) {
// point to a real safebuf here for populating with data
m25->m_linkInfoBuf = &m25->m_realBuf;
// udp socket for sending back the final linkInfo in m_linkInfoBuf
// used by sendReply()
req->m_udpSlot = slot;
// set some new stuff. should probably be set in getLinkInfo2()
// but we are trying to leave that as unaltered as possible to
// try to reduce debugging.
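
These hunks are part of a scheme that collapses identical in-flight link-info requests: handleRequest25() keys each incoming Msg25Request in g_lineTable, chains duplicates through req->m_next, and sendReplyWrapper() later walks that chain, mdup()'ing the reply once per waiting request. Moving the req->m_udpSlot assignment up means every queued duplicate, not just the request that triggered the computation, remembers which udp slot to answer on. A rough sketch of the fan-out (Msg25Request, UdpSlot and mdup() are the real names from the diff; the declarations and sendReplyOnSlot() are simplified stand-ins, not the actual code):

class UdpSlot;
struct Msg25Request { Msg25Request *m_next; UdpSlot *m_udpSlot; };
extern void *mdup ( void *src , long size , const char *note );
extern void  sendReplyOnSlot ( UdpSlot *slot , char *reply , long size );

// walk the linked list of duplicate requests and give each one its own
// private copy of the computed link-info reply
void fanOutReply ( Msg25Request *head , char *reply1 , long replySize ) {
	for ( Msg25Request *req = head ; req ; req = req->m_next ) {
		char *reply2 = (char *)mdup ( reply1 , replySize , "m25repd" );
		if ( ! reply2 ) continue;
		sendReplyOnSlot ( req->m_udpSlot , reply2 , replySize );
	}
}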

View File

@ -2,7 +2,7 @@ SHELL = /bin/bash
CC=g++
OBJS = Tfndb.o UdpSlot.o Rebalance.o \
OBJS = UdpSlot.o Rebalance.o \
Msg13.o Mime.o IndexReadInfo.o \
PageGet.o PageHosts.o PageIndexdb.o PageLogin.o \
PageParser.o PageInject.o PagePerf.o PageReindex.o PageResults.o \

View File

@ -14,6 +14,7 @@ void Msg20::constructor () {
m_r = NULL;
m_inProgress = false;
m_launched = false;
m_i = -1;
reset();
m_mcast.constructor();
}

View File

@ -881,19 +881,20 @@ bool Msg40::reallocMsg20Buf ( ) {
return true;
}
m_buf2 = NULL;
m_bufMaxSize2 = need;
m_numMsg20s = m_msg3a.m_numDocIds;
// when streaming because we can have hundreds of thousands of
// search results we recycle a few msg20s to save mem
if ( m_si->m_streamResults ) {
long max = MAX_OUTSTANDING_MSG20S;
long max = MAX_OUTSTANDING_MSG20S * 2;
if ( m_msg3a.m_numDocIds < max ) max = m_msg3a.m_numDocIds;
need = max * (4+sizeof(Msg20));
m_numMsg20s = max;
}
m_buf2 = NULL;
m_bufMaxSize2 = need;
// do the alloc
if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }
@ -1033,6 +1034,11 @@ bool Msg40::launchMsg20s ( bool recalled ) {
//if ( m_numRequests-m_numReplies >= need ) break;
// hard limit
if ( m_numRequests-m_numReplies >= maxOut ) break;
// do not launch another until m_printi comes back because
// all summaries are bottlenecked on printing him out now
if ( m_si->m_streamResults &&
i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 )
break;
// do not double count!
//if ( i <= m_lastProcessedi ) continue;
// do not repeat for this i
@ -1238,7 +1244,8 @@ Msg20 *Msg40::getAvailMsg20 ( ) {
// m_inProgress is set to false right before it
// calls Msg20::m_callback which is gotSummaryWrapper()
// so we should be ok with this
if ( ! m_msg20[i]->m_inProgress ) return m_msg20[i];
if ( m_msg20[i]->m_launched ) continue;
return m_msg20[i];
}
// how can this happen???
char *xx=NULL;*xx=0;
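
Taken together, the Msg40 changes bound memory while streaming: the buffer holds at most 2 * MAX_OUTSTANDING_MSG20S Msg20s instead of one per docid, launchMsg20s() only issues summary requests inside a window just ahead of m_printi (the next result to be printed), and getAvailMsg20() hands out any pool slot that is not currently launched. A condensed sketch of the launch-window check (names from the diff, request setup elided):

// sketch: when streaming, launch a summary request only if it can be printed
// soon; anything past the window waits until m_printi advances
for ( long i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
	// hard limit on requests in flight
	if ( m_numRequests - m_numReplies >= maxOut ) break;
	// do not run ahead of the printer when streaming
	if ( m_si->m_streamResults &&
	     i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 ) break;
	Msg20 *m20 = getAvailMsg20();   // any pool slot with m_launched false
	// ... set up the Msg20 request for docid #i on m20 and launch it ...
}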
@ -1377,27 +1384,42 @@ bool Msg40::gotSummary ( ) {
// otherwise, get the summary for result #m_printi
//Msg20 *m20 = m_msg20[m_printi];
if ( ! m20 ) {
log("msg40: m20 NULL #%li",m_printi);
continue;
}
//if ( ! m20 ) {
// log("msg40: m20 NULL #%li",m_printi);
// continue;
//}
// if result summary #i not yet in, wait...
if ( ! m20 )
break;
// wait if no reply for it yet
//if ( m20->m_inProgress )
// break;
if ( m20->m_errno ) {
log("msg40: sum #%li error: %s",
m_printi,mstrerror(m20->m_errno));
// make it available to be reused
m20->reset();
continue;
}
// get the next reply we are waiting on so we print results in order
Msg20Reply *mr = m20->m_r;
if ( ! mr ) break;
//if ( ! mr ) { char *xx=NULL;*xx=0; }
// primitive deduping. for diffbot json urls we dedup on the
// XmlDoc::m_contentHash32.. it will be zero if invalid i guess
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
//if ( g_conf.m_logDebugQuery )
log("msg40: dup sum #%li (%lu)",m_printi,
mr->m_contentHash32);
// make it available to be reused
m20->reset();
continue;
}
@ -1418,8 +1440,12 @@ bool Msg40::gotSummary ( ) {
printSearchResult9 ( m_printi );
// now free the reply to save memory since we could be
// streaming back 1M+
m20->freeReply();
// streaming back 1M+. we call reset below, no need for this.
//m20->freeReply();
// return it so getAvailMsg20() can use it again
// this will set m_launched to false
m20->reset();
}
// set it to true on all but the last thing we send!
@ -1477,6 +1503,9 @@ bool Msg40::gotSummary ( ) {
// do a recursive stack explosion
// . this returns false if still waiting on more to come back
if ( ! launchMsg20s ( true ) ) return false;
// it won't launch now if we are bottlenecked waiting for
// m_printi's summary to come in
if ( m_si->m_streamResults ) return false;
// maybe some were cached?
//goto refilter;
// it returned true, so m_numRequests == m_numReplies and
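
The printing side is what lets the small pool recycle: gotSummary() prints results strictly in m_printi order, breaks out as soon as the next summary has not arrived, drops a result whose XmlDoc::m_contentHash32 is already in m_dedupTable, and reset()s every handled Msg20 so getAvailMsg20() can hand it out again. A condensed sketch of that loop, using getCompletedSummary() (from the PageResults.cpp change below) as the lookup; an illustration of the flow, not the literal code:

// sketch of the in-order streaming print loop in gotSummary()
for ( ; m_printi < m_msg3a.m_numDocIds ; m_printi++ ) {
	Msg20 *m20 = getCompletedSummary ( m_printi );
	if ( ! m20 ) break;                 // summary not in yet, wait
	if ( m20->m_errno ) { m20->reset(); continue; }
	Msg20Reply *mr = m20->m_r;
	if ( ! mr ) break;
	// primitive dedup on the page/json 32-bit content hash
	if ( m_si->m_doDupContentRemoval && mr->m_contentHash32 &&
	     m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
		m20->reset();               // drop the dup, recycle the slot
		continue;
	}
	printSearchResult9 ( m_printi );    // stream this result out now
	m20->reset();                       // return the slot to the pool
}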

View File

@ -784,6 +784,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// do not print "Fake First Ip"...
if ( m_prevReplyError == EFAKEFIRSTIP )
msg = "Initial crawl request";
// if the initial crawl request got a reply then that
// means the spiderrequest was added under the correct
// firstip... so skip it. i am assuming that the
// correct spiderrequest got added ok here...
if ( m_prevReplyError == EFAKEFIRSTIP )
continue;
}
if ( srep && srep->m_hadDiffbotError )
@ -1533,7 +1539,7 @@ static class HelpItem s_his[] = {
"the maxtocrawl or maxtoprocess limit, or when the crawl "
"completes."},
{"obeyRobots","Obey robots.txt files?"},
{"restrictDomain","Restrict downloaded urls to domains of seeds?"},
//{"restrictDomain","Restrict downloaded urls to domains of seeds?"},
{"urlCrawlPattern","List of || separated strings. If the url "
"contains any of these then we crawl the url, otherwise, we do not. "
@ -2365,11 +2371,11 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
// settable parms
"\"maxToCrawl\":%lli,\n"
"\"maxToProcess\":%lli,\n"
"\"restrictDomain\":%li,\n"
//"\"restrictDomain\":%li,\n"
"\"onlyProcessIfNew\":%li,\n"
, cx->m_maxToCrawl
, cx->m_maxToProcess
, (long)cx->m_restrictDomain
//, (long)cx->m_restrictDomain
, (long)cx->m_diffbotOnlyProcessIfNewUrl
);
sb.safePrintf("\"seeds\":\"");
@ -3344,13 +3350,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
urtYes = "";
urtNo = " checked";
}
/*
char *rdomYes = " checked";
char *rdomNo = "";
if ( ! cr->m_restrictDomain ) {
rdomYes = "";
rdomNo = " checked";
}
*/
char *isNewYes = "";
char *isNewNo = " checked";
@ -3541,15 +3549,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</td>"
"</tr>"
"<tr><td>"
"<b>Restrict domain to seeds?</b> "
"</td><td>"
"<input type=radio name=restrictDomain "
"value=1%s> yes &nbsp; "
"<input type=radio name=restrictDomain "
"value=0%s> no &nbsp; "
"</td>"
"</tr>"
//"<tr><td>"
//"<b>Restrict domain to seeds?</b> "
//"</td><td>"
//"<input type=radio name=restrictDomain "
//"value=1%s> yes &nbsp; "
//"<input type=radio name=restrictDomain "
//"value=0%s> no &nbsp; "
//"</td>"
//"</tr>"
//"<tr><td>"
//"Use spider proxies on AWS? "
@ -3592,8 +3600,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, urtYes
, urtNo
, rdomYes
, rdomNo
//, rdomYes
//, rdomNo
);
}

View File

@ -2057,8 +2057,15 @@ bool printResult ( State0 *st, long ix ) {
}
Msg20 *m20 = msg40->m_msg20[ix];
Msg20Reply *mr = m20->m_r;
Msg20 *m20 ;
if ( si->m_streamResults )
m20 = msg40->getCompletedSummary(ix);
else
m20 = msg40->m_msg20[ix];
// get the reply
Msg20Reply *mr = m20->m_r;
// . sometimes the msg20reply is NULL so prevent it coring
// . i think this happens if all hosts in a shard are down or timeout
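
When streaming, the Msg20 pool in Msg40 is recycled, so result #ix no longer lives at m_msg20[ix]; getCompletedSummary(ix) has to find whichever pool slot currently holds that result, which is also why Msg20 gained the m_i member (initialized to -1 in Msg20::constructor() above). A guess at what such a lookup does, purely illustrative and not the actual implementation:

// hypothetical sketch of Msg40::getCompletedSummary(): scan the small pool
// for the slot whose m_i matches the requested result index
Msg20 *Msg40::getCompletedSummary ( long ix ) {
	for ( long j = 0 ; j < m_numMsg20s ; j++ ) {
		Msg20 *m20 = m_msg20[j];
		if ( m20 && m20->m_i == ix ) return m20;
	}
	return NULL;
}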
@ -5302,6 +5309,14 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
ji = ptrs[i];
// skip if none
if ( ! ji ) continue;
// skip "html" field... too spammy for csv and > 32k causes
// libreoffice calc to truncate it and break its parsing
if ( ji->m_name &&
//! ji->m_parent &&
strcmp(ji->m_name,"html")==0)
continue;
//
// get value and print otherwise
//

View File

@ -9963,6 +9963,7 @@ void Parms::init ( ) {
m++;
// use url filters for this. this is a crawlbot parm really.
/*
m->m_title = "restrict domain";
m->m_desc = "Keep crawler on same domain as seed urls?";
m->m_cgi = "restrictDomain";
@ -9972,6 +9973,7 @@ void Parms::init ( ) {
// we need to save this it is a diffbot parm
m->m_flags = PF_HIDDEN | PF_DIFFBOT;// | PF_NOSAVE;
m++;
*/
m->m_title = "do url sporn checking";
m->m_desc = "If this is true and the spider finds "

Rdb.cpp
View File

@ -13,7 +13,7 @@
#include "Datedb.h"
#include "Titledb.h"
#include "Spider.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Sync.h"
#include "Spider.h"
#include "Repair.h"
@ -2648,7 +2648,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_SYNCDB ] = g_syncdb.getRdb();
s_table9 [ RDB_SPIDERDB ] = g_spiderdb.getRdb();
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
//s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_DATEDB ] = g_datedb.getRdb();
@ -2667,7 +2667,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB2_SECTIONDB2 ] = g_sectiondb2.getRdb();
s_table9 [ RDB2_PLACEDB2 ] = g_placedb2.getRdb();
s_table9 [ RDB2_SPIDERDB2 ] = g_spiderdb2.getRdb();
s_table9 [ RDB2_TFNDB2 ] = g_tfndb2.getRdb();
//s_table9 [ RDB2_TFNDB2 ] = g_tfndb2.getRdb();
s_table9 [ RDB2_CLUSTERDB2 ] = g_clusterdb2.getRdb();
s_table9 [ RDB2_DATEDB2 ] = g_datedb2.getRdb();
s_table9 [ RDB2_LINKDB2 ] = g_linkdb2.getRdb();
@ -2691,7 +2691,7 @@ char getIdFromRdb ( Rdb *rdb ) {
//if ( rdb == g_checksumdb.getRdb() ) return RDB_CHECKSUMDB;
if ( rdb == g_spiderdb.getRdb () ) return RDB_SPIDERDB;
if ( rdb == g_doledb.getRdb () ) return RDB_DOLEDB;
if ( rdb == g_tfndb.getRdb () ) return RDB_TFNDB;
//if ( rdb == g_tfndb.getRdb () ) return RDB_TFNDB;
if ( rdb == g_clusterdb.getRdb () ) return RDB_CLUSTERDB;
if ( rdb == g_statsdb.getRdb () ) return RDB_STATSDB;
if ( rdb == g_linkdb.getRdb () ) return RDB_LINKDB;
@ -2712,7 +2712,7 @@ char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_placedb2.getRdb () ) return RDB2_PLACEDB2;
//if ( rdb == g_checksumdb2.getRdb() ) return RDB2_CHECKSUMDB2;
if ( rdb == g_spiderdb2.getRdb () ) return RDB2_SPIDERDB2;
if ( rdb == g_tfndb2.getRdb () ) return RDB2_TFNDB2;
//if ( rdb == g_tfndb2.getRdb () ) return RDB2_TFNDB2;
if ( rdb == g_clusterdb2.getRdb () ) return RDB2_CLUSTERDB2;
//if ( rdb == g_statsdb2.getRdb () ) return RDB2_STATSDB2;
if ( rdb == g_linkdb2.getRdb () ) return RDB2_LINKDB2;

View File

@ -2,7 +2,7 @@
#include "Rdb.h"
#include "Msg35.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Hostdb.h"
@ -966,7 +966,7 @@ bool RdbBase::incorporateMerge ( ) {
// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
//if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// print out info of newly merged file
long long tp = m_maps[x]->getNumPositiveRecs();
@ -974,7 +974,7 @@ bool RdbBase::incorporateMerge ( ) {
log(LOG_INFO,
"merge: Merge succeeded. %s (#%li) has %lli positive "
"and %lli negative recs.", m_files[x]->getFilename(), x, tp, tn);
if ( m_rdb == g_posdb.getRdb() || m_rdb == g_tfndb.getRdb() )
if ( m_rdb == g_posdb.getRdb() ) // || m_rdb == g_tfndb.getRdb() )
log(LOG_INFO,"merge: Removed %lli dup keys.",
m->getDupsRemoved() );
// . bitch if bad news
@ -1470,8 +1470,8 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
// if we are tfndb and someone else is merging, do not merge unless
// we have 3 or more files
long minToMerge = m_minToMerge;
if (g_tfndb.getRdb()==m_rdb&& g_merge.isMerging() && minToMerge <=2 )
minToMerge = 3;
//if (g_tfndb.getRdb()==m_rdb&& g_merge.isMerging() && minToMerge <=2 )
// minToMerge = 3;
// do not start a tfndb merge while someone is dumping because the
// dump starves the tfndb merge and we clog up adding links. i think
// this is mainly just indexdb dumps, but we'll see.
@ -1565,7 +1565,7 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
//if ( m_mergeUrgent ) priority = 2;
//else priority = 0;
// tfndb doesn't need token, since titledb merge writes tfndb recs
if ( m_rdb != g_tfndb.getRdb() &&
if ( //m_rdb != g_tfndb.getRdb() &&
! g_msg35.getToken ( this , gotTokenForMergeWrapper, priority ) )
return ;
// bitch if we got token because there was an error somewhere
@ -1616,7 +1616,7 @@ void RdbBase::gotTokenForMerge ( ) {
}
// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
//if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// sanity check
if ( m_isMerging || m->isMerging() ) {
//if ( m_doLog )
@ -1724,8 +1724,8 @@ void RdbBase::gotTokenForMerge ( ) {
}
minToMerge = m_minToMerge;
if (m_rdb==g_tfndb.getRdb()&& g_merge.isMerging() && minToMerge <=2 )
minToMerge = 3;
//if (m_rdb==g_tfndb.getRdb()&& g_merge.isMerging() && minToMerge <=2 )
// minToMerge = 3;
// look at this merge:
// indexdb0003.dat.part1

View File

@ -2101,8 +2101,15 @@ bool XmlDoc::indexDoc ( ) {
// cr->m_localCrawlInfo.m_pageDownloadAttempts);
// this is just how many urls we tried to index
//cr->m_localCrawlInfo.m_urlsConsidered++;
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
// avoid counting if it is a fake first ip
bool countIt = true;
// pagereindex.cpp sets this as does any add url (bulk job)
if ( m_sreqValid && m_sreq.m_fakeFirstIp )
countIt = false;
if ( countIt ) {
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
}
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
@ -2358,7 +2365,8 @@ bool XmlDoc::indexDoc2 ( ) {
// return false;
// MDW: we do this in indexDoc() above why do we need it here?
/*
// even if not using diffbot, keep track of these counts
if ( ! m_isDiffbotJSONObject &&
! m_incrementedAttemptsCount ) {
@ -2374,7 +2382,7 @@ bool XmlDoc::indexDoc2 ( ) {
long long now = gettimeofdayInMillisecondsGlobal();
cr->m_diffbotCrawlEndTime = now;
}
*/
/*
// if we are being called from Spider.cpp and we met our max
// to crawl requirement, then bail out on this. this might
@ -12973,11 +12981,13 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
// because we need the anchor text to pass in to diffbot
bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
doLinkSpamCheck = false;
oneVotePerIpDom = false;
onlyNeedGoodInlinks = false;
}
// this seems to overdo it when we have a ton of linktext
// perhaps, so take this out...
//if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
// doLinkSpamCheck = false;
// oneVotePerIpDom = false;
// onlyNeedGoodInlinks = false;
//}
// call it
char *url = getFirstUrl()->getUrl();
@ -13764,7 +13774,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// we make a "fake" url for the diffbot reply when indexing it
// by appending -diffbotxyz%lu. see "fakeUrl" below.
if ( m_firstUrl.getUrlLen() + 15 >= MAX_URL_LEN ) {
if ( m_firstUrl.getUrlLen() + 24 >= MAX_URL_LEN ) {
if ( m_firstUrlValid )
log("build: diffbot url would be too long for "
"%s", m_firstUrl.getUrl() );

View File

@ -25,7 +25,7 @@
#include "Tagdb.h"
#include "Catdb.h"
#include "Users.h"
#include "Tfndb.h"
//#include "Tfndb.h"
#include "Spider.h"
//#include "Doledb.h"
//#include "Checksumdb.h"
@ -150,8 +150,8 @@ static void dumpTitledb ( char *coll,long sfn,long numFiles,bool includeTree,
long long docId , char justPrintDups ,
bool dumpSentences ,
bool dumpWords );
static void dumpTfndb ( char *coll,long sfn,long numFiles,bool includeTree,
bool verify);
//static void dumpTfndb (char *coll,long sfn,long numFiles,bool includeTree,
// bool verify);
static long dumpSpiderdb ( char *coll,long sfn,long numFiles,bool includeTree,
char printStats , long firstIp );
static void dumpSectiondb( char *coll,long sfn,long numFiles,bool includeTree);
@ -773,8 +773,8 @@ int main ( int argc , char *argv[] ) {
"\tV is z to dump statsdb all keys.\n"
"\tV is Z to dump statsdb all keys and data samples.\n"
"\tV is L to dump linkdb.\n"
"\tV is u to dump tfndb.\n"
"\tV is vu to verify tfndb.\n"
//"\tV is u to dump tfndb.\n"
//"\tV is vu to verify tfndb.\n"
"\tC is the name of the collection.\n"
"\tX is start file num. (default 0)\n"
"\tY is num files. (default -1)\n"
@ -2420,10 +2420,10 @@ int main ( int argc , char *argv[] ) {
dumpTitledb(coll,startFileNum,numFiles,includeTree,
docId,1,false,false);
}
else if ( argv[cmdarg+1][0] == 'v' && argv[cmdarg+1][1] =='u' )
dumpTfndb (coll,startFileNum,numFiles,includeTree,1);
else if ( argv[cmdarg+1][0] == 'u' )
dumpTfndb (coll,startFileNum,numFiles,includeTree,0);
//else if(argv[cmdarg+1][0] == 'v' && argv[cmdarg+1][1] =='u' )
// dumpTfndb (coll,startFileNum,numFiles,includeTree,1);
//else if ( argv[cmdarg+1][0] == 'u' )
// dumpTfndb (coll,startFileNum,numFiles,includeTree,0);
else if ( argv[cmdarg+1][0] == 'w' )
dumpWaitingTree(coll);
else if ( argv[cmdarg+1][0] == 'x' )
@ -5652,7 +5652,7 @@ void zlibtest() {
#include "Rdb.h"
#include "Xml.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Checksumdb.h"
#include "Threads.h"
@ -5988,7 +5988,7 @@ void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree,
if ( startKey < *(key_t *)list.getLastKey() ) return;
goto loop;
}
/*
void dumpTfndb (char *coll,long startFileNum,long numFiles,bool includeTree ,
bool verify) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
@ -6060,7 +6060,7 @@ void dumpTfndb (char *coll,long startFileNum,long numFiles,bool includeTree ,
if ( startKey < *(key_t *)list.getLastKey() ) return;
goto loop;
}
*/
void dumpWaitingTree (char *coll ) {
RdbTree wt;
if (!wt.set(0,-1,true,20000000,true,"waittree2",
@ -7895,9 +7895,9 @@ void dumpMissing ( char *coll ) {
g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
g_tfndb.init ();
//g_tfndb.init ();
//g_collectiondb.init(true); // isDump?
g_tfndb.getRdb()->addRdbBase1 ( coll );
//g_tfndb.getRdb()->addRdbBase1 ( coll );
g_titledb.init();
g_titledb.getRdb()->addRdbBase1 ( coll );
// if titledb has stuff in memory, do not do this, it needs to
@ -7911,7 +7911,8 @@ void dumpMissing ( char *coll ) {
}
// . just get the docids from tfndb...
// . this tfndb rec count is for ALL colls!! DOH!
long long numRecs = g_tfndb.getRdb()->getNumTotalRecs();
// MDW FIX THIS RIGHT!
long long numRecs = 12345;//g_tfndb.getRdb()->getNumTotalRecs();
long long oldNumSlots = (numRecs * 100) / 80;
// make a power of 2
// make it a power of 2
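
The table is sized at numRecs / 0.80 slots so it never runs more than about 80% full, then rounded up to a power of two so a docid can be mapped to a slot with a mask (or with the modulo used in the scan loop below). A minimal sketch of the rounding step the diff elides:

// round the slot count up to the next power of two so that either
// "d & (numSlots-1)" or "d % numSlots" lands inside the table
long long numSlots = 1;
while ( numSlots < oldNumSlots ) numSlots <<= 1;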
@ -7980,10 +7981,10 @@ void dumpMissing ( char *coll ) {
if ( (k.n0 & 0x01LL) == 0x00 ) continue;
// titledb tree is empty, so this must indicate it is in
// spiderdb only
long tfn = g_tfndb.getTfn(&k);
long tfn = 0;//g_tfndb.getTfn(&k);
if ( tfn == 255 ) continue;
// get docid
unsigned long long d = g_tfndb.getDocId ( &k );
unsigned long long d = 0LL;//g_tfndb.getDocId ( &k );
// add to hash table
//long n = (unsigned long)d & mask;
long n = (unsigned long)d % numSlots;
@ -8664,12 +8665,12 @@ void removeDocIds ( char *coll , char *filename ) {
//g_conf.m_checksumdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxCacheMem = 0;
g_tfndb.init();
//g_tfndb.init();
g_indexdb.init ();
//g_checksumdb.init();
g_clusterdb.init();
//g_collectiondb.init(true);
g_tfndb.getRdb()->addRdbBase1 ( coll );
//g_tfndb.getRdb()->addRdbBase1 ( coll );
g_indexdb.getRdb()->addRdbBase1 ( coll );
//g_checksumdb.getRdb()->addRdbBase1 ( coll );
g_clusterdb.getRdb()->addRdbBase1 ( coll );
@ -9044,7 +9045,7 @@ void removeDocIds ( char *coll , char *filename ) {
//
logf(LOG_INFO,"db: Scanning tfndb and removing recs.");
r = g_tfndb.getRdb();
r = 0;//g_tfndb.getRdb();
count = 0;
scanned = 0;
recs = 0;
@ -9089,7 +9090,7 @@ void removeDocIds ( char *coll , char *filename ) {
key_t k = list.getCurrentKey();
// skip deletes
if ( (k.n0 & 0x01) == 0x00 ) continue;
unsigned long long d = g_tfndb.getDocId(&k);
unsigned long long d = 0;//g_tfndb.getDocId(&k);
// see if docid is in delete list
long n = (unsigned long)d & mask;
while ( slots[n] && slots[n] != d )