Merge remote-tracking branch 'origin/diffbot' into diffbot-dan

2024-10-04 12:17:35 +03:00 · 2014-04-01 19:48:24 -07:00 · 2014-04-01 19:48:24 -07:00 · 0988a134d0
commit 0988a134d0
parent 4856cc4c60 9c8410767d
27 changed files with 1068 additions and 442 deletions
--- a/Json.cpp
+++ b/Json.cpp
@ -433,7 +433,7 @@ char *JsonItem::getValueAsString ( long *valueLen ) {

 	// numbers...
 	static char s_numBuf[64];
-	if ( m_valueLong == (long)m_valueDouble ) {
+	if ( (float)m_valueLong == m_valueDouble ) {
 		*valueLen = sprintf ( s_numBuf,"%li", m_valueLong );
 		return s_numBuf;
 	}
--- a/Msg13.cpp
+++ b/Msg13.cpp
@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
 			"(compatible; MSIE 6.0; Windows 98; "
 			"Win 9x 4.90)" ;

+	// for bulk jobs avoid actual downloads of the page for efficiency
+	if ( r->m_isCustomCrawl == 2 ) {
+		char *s = 
+			"HTTP/1.0 200 (OK)\r\n"
+			"Content-Length: 0\r\n"
+			"Connection: Close\r\n"
+			"Content-Type: text/html\r\n\r\n";
+		long slen = gbstrlen(s);
+		long fakeBufSize = slen + 1;
+		char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
+		gotHttpReply2 ( r , 
+				fakeBuf,
+				fakeBufSize, // include \0
+				fakeBufSize, // allocsize
+				NULL ); // tcpsock
+		return;
+	}
+
+
 	// download it
 	if ( ! g_httpServer.getDoc ( r->m_url             ,
 				     r->m_urlIp           ,
--- a/Msg13.h
+++ b/Msg13.h
@ -32,6 +32,8 @@ public:
 	// if doing spider compression, compute contentHash32 of document
 	// downloaded, and if it matches this then send back EDOCUNCHANGED
 	long  m_contentHash32;
+	// copy of CollectionRec::m_customCrawl: 0, 1 for crawls, or 2 for bulks
+	char m_isCustomCrawl;
 	// send back error ENOGOODDATE if it does not have one. but if
 	// harvestLinks is true, just send back a filtered list of links
 	long  m_requireGoodDate:1;
--- a/Msg39.cpp
+++ b/Msg39.cpp
@ -543,7 +543,7 @@ bool Msg39::getLists () {
 			     "component=%li "
 			     "otermLen=%li "
 			     "isSynonym=%li "
-			     "querylangid=%li ",
+			     "querylangid=%li " ,
 			     (long)this ,
 			     i          ,
 			     qt->m_term,//bb ,
@ -569,7 +569,7 @@ bool Msg39::getLists () {
 			     (long)m_tmpq.m_componentCodes[i],
 			     (long)m_tmpq.getTermLen(i) ,
 			     isSynonym,
-			     (long)m_tmpq.m_langId); // ,tt
+			     (long)m_tmpq.m_langId ); // ,tt
 			// put it back
 			*tpc = tmp;
 			if ( st ) {
@ -661,6 +661,7 @@ void gotListsWrapper ( void *state ) {
 	Msg39 *THIS = (Msg39 *) state;
 	// . hash the lists into our index table
 	// . this will send back a reply or recycle and read more list data
+
 	if ( ! THIS->gotLists ( true ) ) return;

 	// . if he did not block and there was an errno we send reply
@ -671,6 +672,12 @@ void gotListsWrapper ( void *state ) {
 		log("msg39: sending back error reply = %s",mstrerror(g_errno));
 		sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
 	}
+
+	// did not block? then call the docid split loop
+	//if ( numDocIdSplits <= 1 ) return;
+
+	// if we got the lists and processed them without blocking, repeat!
+	THIS->doDocIdSplitLoop();
 }

 // . now come here when we got the necessary index lists
@ -753,10 +760,25 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
 	// . now we must call this separately here, not in allocTopTree()
 	// . we have to re-set the QueryTermInfos with each docid range split
 	//   since it will set the list ptrs from the msg2 lists
-	if ( m_r->m_useNewAlgo && ! m_posdbTable.setQueryTermInfo () ) {
-		return true;
+	if ( ! m_posdbTable.setQueryTermInfo () ) return true;
+
+	// print query term bit numbers here
+	for ( long i = 0 ; 
+	      m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
+		QueryTerm *qt = &m_tmpq.m_qterms[i];
+		//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
+		char *tpc = qt->m_term + qt->m_termLen;
+		char  tmp = *tpc;
+		*tpc = '\0';
+		SafeBuf sb;
+		sb.safePrintf("query: msg39: BITNUM query term #%li \"%s\" "
+			      "bitnum=%li ", i , qt->m_term, qt->m_bitNum );
+		// put it back
+		*tpc = tmp;
+		logf(LOG_DEBUG,"%s",sb.getBufStart());
 	}

+
 	// timestamp log
 	if ( m_debug ) {
 		log(LOG_DEBUG,"query: msg39: [%lu] Preparing to intersect "
@ -817,6 +839,7 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
 	// time it
 	diff = gettimeofdayInMilliseconds() - start;
 	if ( diff > 10 ) log("query: Took %lli ms for intersection",diff);
+
 	// returns false if blocked, true otherwise
 	return addedLists ();
 }
--- a/Msg5.cpp
+++ b/Msg5.cpp
@ -859,9 +859,9 @@ bool Msg5::needsRecall ( ) {
 	if ( m_round == 0 ) logIt = false;
 	if ( logIt )
 		logf(LOG_DEBUG,"db: Reading %li again from %s (need %li total "
-		     "got %li) this=0x%lx round=%li.", 
+		     "got %li) cn=%li this=0x%lx round=%li.", 
 		     m_newMinRecSizes , base->m_dbname , m_minRecSizes, 
-		     m_list->m_listSize, (long)this , m_round );
+		     m_list->m_listSize, (long)m_collnum,(long)this, m_round );
 	m_round++;
 	// record how many screw ups we had so we know if it hurts performance
 	base->m_rdb->didReSeek ( );
--- a/PageAddUrl.cpp
+++ b/PageAddUrl.cpp
@ -85,7 +85,9 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
 	if ( url ) {
 		// normalize and add www. if it needs it
 		Url uu;
-		uu.set ( url , gbstrlen(url) , true );
+		// do not convert xyz.com to www.xyz.com because sometimes
+		// people want xyz.com exactly
+		uu.set ( url , gbstrlen(url) , false ); // true );
 		// remove >'s i guess and store in st1->m_url[] buffer
 		st1->m_urlLen=cleanInput ( st1->m_url,
 					   MAX_URL_LEN, 
--- a/PageBasic.cpp
+++ b/PageBasic.cpp
@ -623,6 +623,7 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
 		      //"</td>"
 		      //"</tr>"

+		      /*
 		      // local subdir match
 		      "<tr>"
 		      "<td>file://C/mydir/mysubdir/"
@ -637,6 +638,7 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
 		      "Do not spider files in this subdirectory."
 		      "</td>"
 		      "</tr>"
+		      */

 		      // connect to a device and index it as a stream
 		      //"<tr>"
--- a/PageHosts.cpp
+++ b/PageHosts.cpp
@ -582,6 +582,7 @@ skipReplaceHost:
 	sb.safePrintf ( "</table><br>\n" );

 	
+	/*
 	// print spare hosts table
 	sb.safePrintf ( 
 		  "<table %s>"
@ -646,7 +647,9 @@ skipReplaceHost:
 			  h->m_note );
 	}
 	sb.safePrintf ( "</table><br>" );
+	*/

+	/*
 	// print proxy hosts table
 	sb.safePrintf ( 
 		  "<table %s>"
@ -754,6 +757,7 @@ skipReplaceHost:
 			  h->m_note );
 	}
 	sb.safePrintf ( "</table><br><br>" );
+	*/

 	sb.safePrintf(
 		      "<style>"
@ -812,7 +816,6 @@ skipReplaceHost:
 		  "<td>The UDP port used to send and receive dns traffic with."
 		  "</td>"
 		  "</tr>\n"
-		  */

 		  "<tr class=poo>"
 		  "<td>http port</td>"
@ -820,7 +823,6 @@ skipReplaceHost:
 		  "</td>"
 		  "</tr>\n"

-		  /*
 		  "<tr class=poo>"
 		  "<td>best switch id</td>"
 		  "<td>The host prefers to be on this switch because it "
@ -886,6 +888,43 @@ skipReplaceHost:
 		  "</td>"
 		  "</tr>\n"

+		  "<tr class=poo>"
+		  "<td>avg split time</td>"
+		  "<td>Average time this host took to compute the docids "
+		  "for a query. Useful for guaging the slowness of a host "
+		  "compare to other hosts."
+		  "</td>"
+		  "</tr>\n"
+
+		  "<tr class=poo>"
+		  "<td>splits done</td>"
+		  "<td>Number of queries this host completed. Used in "
+		  "computation of the <i>avg split time</i>."
+		  "</td>"
+		  "</tr>\n"
+
+		  "<tr class=poo>"
+		  "<td>status</td>"
+		  "<td>Status flags for the host. See key below."
+		  "</td>"
+		  "</tr>\n"
+
+		  "<tr class=poo>"
+		  "<td>slow reads</td>"
+		  "<td>Number of slow disk reads the host has had. "
+		  "When this is big compared to other hosts it is a good "
+		  "indicator its drives are relatively slow."
+		  "</td>"
+		  "</tr>\n"
+
+		  "<tr class=poo>"
+		  "<td>docs indexed</td>"
+		  "<td>Number of documents this host has indexed over all "
+		  "collections. All hosts should have close to the same "
+		  "number in a well-sharded situation."
+		  "</td>"
+		  "</tr>\n"
+
 		  //"<tr class=poo>"
 		  //"<td>loadavg</td>"
 		  //"<td>1-minute sliding-window load average from "
@ -895,13 +934,26 @@ skipReplaceHost:

 		  "<tr class=poo>"
 		  "<td>mem used</td>"
-		  "<td>percentage of memory currently used."
+		  "<td>Percentage of memory currently used."
 		  "</td>"
 		  "</tr>\n"

 		  "<tr class=poo>"
 		  "<td>cpu usage</td>"
-		  "<td>percentage of cpu resources in use by the gb process."
+		  "<td>Percentage of cpu resources in use by the gb process."
+		  "</td>"
+		  "</tr>\n"
+
+		  "<tr class=poo>"
+		  "<td>disk usage</td>"
+		  "<td>Percentage of disk in use. When this gets close to "
+		  "100%% you need to do something."
+		  "</td>"
+		  "</tr>\n"
+
+		  "<tr class=poo>"
+		  "<td>max ping1</td>"
+		  "<td>The worst ping latency from host to host."
 		  "</td>"
 		  "</tr>\n"

@ -918,6 +970,7 @@ skipReplaceHost:
 		  "</td>"
 		  "</tr>\n"

+		  /*
 		  "<tr class=poo>"
 		  "<td>ping2</td>"
 		  "<td>Ping time to this host on the seconday/shotgun "
@ -925,6 +978,7 @@ skipReplaceHost:
 		  "network is not enabled in the master controls."
 		  "</td>"
 		  "</tr>\n"
+		  */

 		  "<tr class=poo>"
 		  "<td>M (status flag)</td>"
@ -950,6 +1004,27 @@ skipReplaceHost:
 		  "</td>"
 		  "</tr>\n"

+		  "<tr class=poo>"
+		  "<td>R (status flag)</td>"
+		  "<td>Indicates host is performing a rebalance operation."
+		  "</td>"
+		  "</tr>\n"
+
+		  "<tr class=poo>"
+		  "<td>F (status flag)</td>"
+		  "<td>Indicates host has foreign records and requires "
+		  "a rebalance operation."
+		  "</td>"
+		  "</tr>\n"
+
+		  "<tr class=poo>"
+		  "<td>x (status flag)</td>"
+		  "<td>Indicates host has abruptly exited due to a fatal "
+		  "error (cored) and "
+		  "restarted itself."
+		  "</td>"
+		  "</tr>\n"
+

 		  ,
 		  TABLE_STYLE
--- a/PageInject.cpp
+++ b/PageInject.cpp
@ -233,13 +233,13 @@ bool sendReply ( void *state ) {

 		  "By default, injected urls "
 		  "take precedence over the \"insitelist\" directive in the "
-		  "<a href=/admin/scheduler>spider scheduler</a> "
+		  "<a href=/admin/filters>url filters</a> "
 		  "so injected urls need not match the "
 		  "<a href=/admin/sites>spider sites</a> patterns. You can "
-		  "change that behavior in the <a href=/scheduler>spider "
-		  "scheduler</a> if you want. "
+		  "change that behavior in the <a href=/admin/filters>url "
+		  "filters</a> if you want. "
 		  "Injected urls will have a "
-		  "<a href=/admin/scheduler#hopcount>hopcount</a> of 0. "
+		  "<a href=/admin/filters#hopcount>hopcount</a> of 0. "
 		  "The injection api is described on the "
 		  "<a href=/admin/api>api</a> page."

--- a/PageParser.cpp
+++ b/PageParser.cpp
@ -1,8 +1,8 @@
 #include "gb-include.h"

 #include "PageParser.h"
-#include "IndexTable.h"
-#include "IndexTable2.h"
+//#include "IndexTable.h"
+//#include "IndexTable2.h"
 //#include "XmlDoc.h" // addCheckboxSpan()

 bool g_inPageParser = false;
@ -101,7 +101,7 @@ bool sendPageParser2 ( TcpSocket   *s ,
 	st->m_termFreqs       = termFreqs;
 	st->m_termFreqWeights = termFreqWeights;
 	st->m_affWeights      = affWeights;
-	st->m_total           = (score_t)-1;
+	//st->m_total           = (score_t)-1;
 	st->m_indexCode       = 0;
 	st->m_blocked         = false;
 	st->m_didRootDom      = false;
@ -654,7 +654,7 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
 	//st->m_termFreqs       = termFreqs;
 	//st->m_termFreqWeights = termFreqWeights;
 	//st->m_affWeights      = affWeights;
-	st->m_total           = (score_t)-1;
+	//st->m_total           = (score_t)-1;
 	st->m_indexCode       = 0;
 	st->m_blocked         = false;
 	st->m_didRootDom      = false;
--- a/PageParser.h
+++ b/PageParser.h
@ -80,7 +80,7 @@ public:
 	long long *m_termFreqs;
 	float     *m_termFreqWeights;
 	float     *m_affWeights;
-	score_t    m_total;
+	//score_t    m_total;
 	bool       m_freeIt;
 	bool       m_blocked;

--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -1324,7 +1324,7 @@ bool printSearchResultsHeader ( State0 *st ) {
 	if ( isAdmin ) {
 		sb->safePrintf(" &nbsp; "
 			      "<font color=red><b>"
-			      "<a href=\"/admin/basic?c=%s\">"
+			      "<a href=\"/admin/settings?c=%s\">"
 			      "[admin]"
 			      "</a></b></font>",coll);
 		// print reindex link
@ -2141,7 +2141,9 @@ bool printResult ( State0 *st, long ix ) {
 		     *end == '}' ) {
 			// replace trailing } with spidertime}
 			sb->incrementLength(-1);
-			sb->safePrintf(",\"docId\":%lli\n", mr->m_docId);
+			sb->safePrintf(",\"docId\":%lli", mr->m_docId);
+			// for deduping
+			//sb->safePrintf(",\"crc\":%lu",mr->m_contentHash32);
 			// crap, we lose resolution storing as a float
 			// so fix that shit here...
 			//float f = mr->m_lastSpidered;
--- a/PageRoot.cpp
+++ b/PageRoot.cpp
@ -78,7 +78,7 @@ bool printNav ( SafeBuf &sb , HttpRequest *r ) {
 			      //" &nbsp; &nbsp; <a href=/logout>Logout</a>"
 			      );

-	if ( r->isLocal() )
+	//if ( r->isLocal() )
 	    sb.safePrintf("&nbsp; &nbsp; [<a href=\"/admin/settings\">"
 			  "<font color=red>Admin</font></a>]");
 	sb.safePrintf("</p></b></center></body></html>");
--- a/Pages.cpp
+++ b/Pages.cpp
@ -233,9 +233,9 @@ static WebPage s_pages[] = {
 	  "what sites can be spidered",
 	  sendPageGeneric , 0 } , // sendPageBasicSettings

-	{ PAGE_FILTERS   , "admin/scheduler", 0 , "spider scheduler" ,  1 , 1,
+	{ PAGE_FILTERS   , "admin/filters", 0 , "url filters" ,  1 , 1,
 	  //USER_ADMIN | USER_MASTER   , 
-	  "schedule urls to be spidered",
+	  "prioritize urls for spidering",
 	  sendPageGeneric  , 0 } ,

 	{ PAGE_INJECT    , "admin/inject"   , 0 , "inject url" ,  0 , 1 ,
@ -1353,7 +1353,7 @@ bool Pages::printAdminBottom ( SafeBuf *sb, HttpRequest *r ) {
 bool Pages::printSubmit ( SafeBuf *sb ) {
 	// update button
 	return sb->safePrintf ( 
-				"<br>"
+			       //"<br>"
 				"<center>"
 				"<input type=submit name=action value=submit>"
 				"</center>"
@ -1764,7 +1764,9 @@ bool  Pages::printAdminLinks ( SafeBuf *sb,
 	//if ( g_users.hasPermission(username,PAGE_ADMIN ) ) 
 	//	sprintf(buf,"&master=0");

-	//sb->safePrintf("<div style=max-width:1000px;>");
+	// unfortunately width:100% is percent of the virtual window, not the
+	// visible window... so just cap the width at a fixed 800px max
+	sb->safePrintf("<div style=max-width:800px;>");

 	//long matt1 = atoip ( MATTIP1 , gbstrlen(MATTIP1) );
 	//long matt2 = atoip ( MATTIP2 , gbstrlen(MATTIP2) );
@ -1904,7 +1906,7 @@ bool  Pages::printAdminLinks ( SafeBuf *sb,
 			       "href=/developer.html>"
 			       "<b>dev guide</b></a>" );
 	
-	//sb->safePrintf("</div>");
+	sb->safePrintf("</div>");

 	//sb->safePrintf("</center>" );
 	//sb->safePrintf("<br/>" );
--- a/Parms.cpp
+++ b/Parms.cpp
@ -929,7 +929,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
 	long  fromIp   = s->m_ip;

 	char fmt = r->getReplyFormat();
-
+	/*
 	if ( fmt == FORMAT_HTML )
 		sb->safePrintf (  
 				"<script type=\"text/javascript\">"
@ -959,7 +959,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
 				"    }\n"
 				"}\n"
 				"</script>");
-	
+	*/
 	// print the start of the table
 	char *tt = "None";
 	if ( page == PAGE_LOG        ) tt = "Log Controls";
@ -969,7 +969,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
 	if ( page == PAGE_SPIDER     ) tt = "Spider Controls";
 	if ( page == PAGE_SEARCH     ) tt = "Search Controls";
 	if ( page == PAGE_ACCESS     ) tt = "Access Controls";
-	if ( page == PAGE_FILTERS    ) tt = "Spider Scheduler";
+	if ( page == PAGE_FILTERS    ) tt = "Url Filters";
 	if ( page == PAGE_BASIC_SETTINGS ) tt = "Settings";
 	if ( page == PAGE_BASIC_SECURITY ) tt = "Security";
 	if ( page == PAGE_SITES ) tt = "Site List";
@ -1049,11 +1049,12 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
 	//p= g_parms.printParms (p, pend, page, user, THIS, coll, pwd, nc, pd);
 	g_parms.printParms ( sb , s , r );

-	if ( fmt == FORMAT_HTML ) sb->safePrintf ( "<br><br>\n" );
-
 	// end the table
 	if ( fmt == FORMAT_HTML ) sb->safePrintf ( "</table>\n" );

+	// this must be outside of table, submit button follows
+	if ( fmt == FORMAT_HTML ) sb->safePrintf ( "<br>\n" );
+
 	// url filter page has a test table
 	if ( page == PAGE_FILTERS && fmt == FORMAT_HTML ) {
 		// wrap up the form, print a submit button
@ -1698,10 +1699,12 @@ bool Parms::printParm ( SafeBuf* sb,
 			// and default value if it exists
 			if ( m->m_def && m->m_def[0] && t != TYPE_CMD ) {
 				char *d = m->m_def;
-				if ( t == TYPE_BOOL ) {
+				if ( t == TYPE_BOOL || t == TYPE_CHECKBOX ) {
 					if ( d[0]=='0' ) d = "NO";
 					else             d = "YES";
-					sb->safePrintf ( " Default: %s.",d);
+					sb->safePrintf ( " <nobr>"
+							 "Default: %s."
+							 "</nobr>",d);
 				} 
 				else {
 					sb->safePrintf (" Default: ");
@ -1782,7 +1785,8 @@ bool Parms::printParm ( SafeBuf* sb,
 			}
 		}
 		else {
-			sb->safePrintf("<center><nobr>");
+			//sb->safePrintf("<center><nobr>");
+			sb->safePrintf("<nobr>");
 			// this is part of the "HACK" fix below. you have to
 			// specify the cgi parm in the POST request, and 
 			// unchecked checkboxes are not included in the POST 
@ -1829,7 +1833,9 @@ bool Parms::printParm ( SafeBuf* sb,
 			//	sb->safePrintf("value=0 name=%s%s>",
 			//		       cgi,ddd2);
 			//}
-			sb->safePrintf("</nobr></center>");
+			sb->safePrintf("</nobr>"
+				       //"</center>"
+				       );
 		}
 	}
 	else if ( t == TYPE_CHAR )
@ -5778,11 +5784,19 @@ void Parms::init ( ) {
 	m++;

 	m->m_title = "email server 1";
-	m->m_desc  = "Connects to this server directly when sending email 1 ";
+	m->m_desc  = "Connects to this IP or hostname "
+		"directly when sending email 1. "
+		"Use <i>apt-get install sendmail</i> to install sendmail "
+		"on that IP or hostname. Add <i>From:10.5 RELAY</i> to "
+		"/etc/mail/access to allow sendmail to forward email it "
+		"receives from gigablast if gigablast hosts are on the "
+		"10.5.*.* IPs. Then run <i>/etc/init.d/sendmail restart</i> "
+		"as root to pick up those changes so sendmail will forward "
+		"Gigablast's mail to the address you give below.";
 	m->m_cgi   = "esrvone";
 	m->m_off   = (char *)&g_conf.m_email1MX - g;
 	m->m_type  = TYPE_STRING;
-	m->m_def   = "10.5.54.47";
+	m->m_def   = "127.0.0.1";
 	m->m_size  = MAX_MX_LEN;
 	m->m_priv  = 2;
 	m->m_group = 0;
@ -7487,7 +7501,7 @@ void Parms::init ( ) {
 		"If your url does not index as you expect you "
 		"can check it's history. " // (spiderdb lookup)
 		"Added urls will have a "
-		"<a href=/admin/scheduler#hopcount>hopcount</a> of 0. "
+		"<a href=/admin/filters#hopcount>hopcount</a> of 0. "
 		"The add url api is described on the "
 		"<a href=/admin/api>api</a> page.";
 	m->m_cgi   = "urls";
@ -7509,7 +7523,7 @@ void Parms::init ( ) {
 	m++;

 	m->m_title = "strip sessionids";
-	m->m_desc  = "strip added urls of their session ids.";
+	m->m_desc  = "Strip added urls of their session ids.";
 	m->m_cgi   = "strip";
 	m->m_page  = PAGE_ADDURL2;
 	m->m_obj   = OBJ_NONE;
@ -7518,7 +7532,7 @@ void Parms::init ( ) {
 	m++;

 	m->m_title = "harvest links";
-	m->m_desc  = "harvest links of added urls so we can spider them?.";
+	m->m_desc  = "Harvest links of added urls so we can spider them?.";
 	m->m_cgi   = "spiderLinks";
 	m->m_page  = PAGE_ADDURL2;
 	m->m_obj   = OBJ_NONE;
@ -7557,17 +7571,17 @@ void Parms::init ( ) {
 	m->m_xml   = "siteList";
 	m->m_desc  = "List of sites to spider, one per line. "
 		"Gigablast uses the "
-		"<a href=/admin/scheduler#insitelist>insitelist</a> "
+		"<a href=/admin/filters#insitelist>insitelist</a> "
 		"directive on "
-		"the <a href=/admin/scheduler>spider scheduler</a> "
+		"the <a href=/admin/filters>url filters</a> "
 		"page to make sure that the spider only indexes urls "
 		"that match the site patterns you specify here, other than "
 		"urls you add individually via the add urls or inject url "
 		"tools. "
 		"See <a href=#examples>example site list</a> below. "
 		"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
-		"to add then consider using the <a href=/admin/addurl>addurl"
-		"</a> interface.";
+		"to add then consider using the <a href=/admin/addurl>add "
+		"urls</a> interface.";
 	m->m_cgi   = "sitelist";
 	m->m_off   = (char *)&cr.m_siteListBuf - x;
 	m->m_page  = PAGE_BASIC_SETTINGS;
@ -7625,9 +7639,9 @@ void Parms::init ( ) {
 	m->m_xml   = "siteList";
 	m->m_desc  = "List of sites to spider, one per line. "
 		"Gigablast uses the "
-		"<a href=/admin/scheduler#insitelist>insitelist</a> "
+		"<a href=/admin/filters#insitelist>insitelist</a> "
 		"directive on "
-		"the <a href=/admin/scheduler>spider scheduler</a> "
+		"the <a href=/admin/filters>url filters</a> "
 		"page to make sure that the spider only indexes urls "
 		"that match the site patterns you specify here, other than "
 		"urls you add individually via the add urls or inject url "
@ -10691,7 +10705,7 @@ void Parms::init ( ) {
 	m->m_off   = (char *)&cr.m_siteClusterByDefault - x;
 	m->m_soff  = (char *)&si.m_doSiteClustering - y;
 	m->m_type  = TYPE_BOOL;
-	m->m_def   = "1";
+	m->m_def   = "0";
 	m->m_sparm = 1;
 	m->m_scgi  = "sc";
 	m++;
@ -18523,9 +18537,9 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
 			  "the list of sites on the <a href=/admin/sites>"
 			  "site list</a> page. That site list is useful for "
 			  "adding a large number of sites that can not be "
-			  "accomodated by the spider scheduler table. Plus "
+			  "accomodated by the url fitlers table. Plus "
 			  "it is higher performance and easier to use, but "
-			  "lacks the spider scheduler's "
+			  "lacks the url filter table's "
 			  "fine level of control."
 			  "</td></tr>"

--- a/Posdb.cpp
+++ b/Posdb.cpp
@ -1263,7 +1263,6 @@ char *getHashGroupString ( unsigned char hg ) {
 //
 ////////////////

-#define MAX_SUBLISTS 50
 /*
 // . these lists[] are 1-1 with q->m_qterms
 void PosdbTable::intersectLists9_r ( ) {
@ -4075,38 +4074,6 @@ float PosdbTable::getTermPairScoreForAny ( long i, long j,
 //


-// . each QueryTerm has this attached additional info now:
-// . these should be 1-1 with query terms, Query::m_qterms[]
-class QueryTermInfo {
-public:
-	// the required lists for this query term, synonym lists, etc.
-	RdbList  *m_subLists        [MAX_SUBLISTS];
-	// flags to indicate if bigram list should be scored higher
-	char      m_bigramFlags     [MAX_SUBLISTS];
-	// shrinkSubLists() set this:
-	long      m_newSubListSize  [MAX_SUBLISTS];
-	char     *m_newSubListStart [MAX_SUBLISTS];
-	char     *m_newSubListEnd   [MAX_SUBLISTS];
-	char     *m_cursor          [MAX_SUBLISTS];
-	char     *m_savedCursor     [MAX_SUBLISTS];
-	long      m_numNewSubLists;
-	// how many are valid?
-	long      m_numSubLists;
-	// size of all m_subLists in bytes
-	long long m_totalSubListsSize;
-	// the term freq weight for this term
-	float     m_termFreqWeight;
-	// what query term # do we correspond to in Query.h
-	long      m_qtermNum;
-	// the word position of this query term in the Words.h class
-	long      m_qpos;
-	// the wikipedia phrase id if we start one
-	long      m_wikiPhraseId;
-	// phrase id term or bigram is in
-	long      m_quotedStartId;
-};
-
-
 // returns false and sets g_errno on error
 bool PosdbTable::setQueryTermInfo ( ) {

@ -4215,6 +4182,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
 			qti->m_bigramFlags[nn] = BF_HALFSTOPWIKIBIGRAM;
 			// before a pipe operator?
 			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
+			// add list of member terms as well
+			//qti->m_qtermList[nn] = &m_q->m_qterms[left];
+			m_q->m_qterms[left].m_bitNum = nrg;
 			// only really add if useful
 			if ( list && list->m_listSize ) nn++;

@ -4231,6 +4201,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
 				qti->m_bigramFlags[nn] |= BF_SYNONYM;
 				if (qt->m_piped)
 					qti->m_bigramFlags[nn]|=BF_PIPED;
+				// add list of member terms as well
+				//qti->m_qtermList[nn] = bt;
+				bt->m_bitNum = nrg;
 				if ( list && list->m_listSize ) nn++;
 			}

@ -4252,6 +4225,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
 			qti->m_bigramFlags[nn] = BF_HALFSTOPWIKIBIGRAM;
 			// before a pipe operator?
 			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
+			// add list of member terms as well
+			//qti->m_qtermList[nn] = &m_q->m_qterms[right];
+			m_q->m_qterms[right].m_bitNum = nrg;
 			// only really add if useful
 			if ( list && list->m_listSize ) nn++;

@ -4268,6 +4244,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
 				qti->m_bigramFlags[nn] |= BF_SYNONYM;
 				if (qt->m_piped)
 					qti->m_bigramFlags[nn]|=BF_PIPED;
+				// add list of member terms as well
+				//qti->m_qtermList[nn] = bt;
+				bt->m_bitNum = nrg;
 				if ( list && list->m_listSize ) nn++;
 			}

@ -4312,6 +4291,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
 		if (qt->m_fieldCode == FIELD_GBNUMBERMAXINT )
 			qti->m_bigramFlags[nn]|=BF_NUMBER;

+		// add list of member terms
+		//qti->m_qtermList[nn] = qt;
+		qt->m_bitNum = nrg;
+
 		// only really add if useful
 		// no, because when inserting NEW (related) terms that are
 		// not currently in the document, this list may initially
@ -4334,6 +4317,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
 			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
 			// call it a synonym i guess
 			qti->m_bigramFlags[nn] |= BF_BIGRAM;
+			// add list of member terms
+			//qti->m_qtermList[nn] = &m_q->m_qterms[left];
+			m_q->m_qterms[left].m_bitNum = nrg;
 			// only really add if useful
 			if ( list && list->m_listSize ) nn++;

@ -4349,6 +4335,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
 				qti->m_bigramFlags[nn] = BF_SYNONYM;
 				if (qt->m_piped)
 					qti->m_bigramFlags[nn]|=BF_PIPED;
+				// add list of member terms
+				//qti->m_qtermList[nn] = bt;
+				bt->m_bitNum = nrg;
 				if ( list && list->m_listSize ) nn++;
 			}

@ -4370,6 +4359,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
 			qti->m_bigramFlags[nn] |= BF_BIGRAM;
 			// before a pipe operator?
 			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
+			// add list of query terms too that are in this group
+			//qti->m_qtermList[nn] = &m_q->m_qterms[right];
+			m_q->m_qterms[right].m_bitNum = nrg;
 			// only really add if useful
 			if ( list && list->m_listSize ) nn++;

@ -4385,6 +4377,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
 				qti->m_bigramFlags[nn] = BF_SYNONYM;
 				if (qt->m_piped)
 					qti->m_bigramFlags[nn]|=BF_PIPED;
+				// add list of member terms
+				//qti->m_qtermList[nn] = bt;
+				bt->m_bitNum = nrg;
 				if ( list && list->m_listSize ) nn++;
 			}

@ -4408,6 +4403,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
 			qti->m_bigramFlags[nn] = BF_SYNONYM;
 			// before a pipe operator?
 			if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
+			// add list of member terms as well
+			//qti->m_qtermList[nn] = qt2;
+			// set bitnum here i guess
+			qt2->m_bitNum = nrg;
 			// only really add if useful
 			if ( list && list->m_listSize ) nn++;
 		}
@ -4448,11 +4447,34 @@ bool PosdbTable::setQueryTermInfo ( ) {
 		nrg++;
 	}

+	//
+	// now set QueryTerm::m_bitNum for use by Expression::isTruth()
+	// in Query.cpp for boolean queries, so we can get the bit vector
+	// of a docid that is 1-1 with the queryterminfos and see which
+	// query words in the boolean expression it contains.
+	// used by matchesBoolQuery() which we call below.
+	//
+	/*
+	for ( long i = 0 ; i < nrg ; i++ ) {
+		// get one
+		QueryTermInfo *qti = &qip[i];
+		// how many query terms are in this group?
+		for ( long j = 0 ; j < qti->m_numSubLists ; j++ ) {
+			// get the query term
+			QueryTerm *qt = qti->m_qtermList[j];
+			// set the bit num member
+			qt->m_bitNum = i;
+		}
+	}
+	*/
+
+
 	//
 	// get the query term with the least data in posdb including syns
 	//
 	m_minListSize = 0;
 	m_minListi    = -1;
+	long long grand = 0LL;
 	// hopefully no more than 100 sublists per term
 	//char *listEnds  [ MAX_QUERY_TERMS ][ MAX_SUBLISTS ];
 	// set ptrs now i guess
@ -4465,6 +4487,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
 		if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
 		// add to it
 		total = qti->m_totalSubListsSize;
+		// add up this now
+		grand += total;
 		// get min
 		if ( total < m_minListSize || m_minListi == -1 ) {
 			m_minListSize = total;
@ -4485,9 +4509,40 @@ bool PosdbTable::setQueryTermInfo ( ) {
 	long maxDocIds = m_minListSize / 12;
 	// store all interesected docids in here for new algo plus 1 byte vote
 	long need = maxDocIds * 6;
+
+	// they could all be OR'd together!
+	if ( m_q->m_isBoolean ) need = grand;
+
+	// so we can always cast a long long from a ptr in there
+	// for setting m_docId when m_booleanQuery is true below
+	need += 8;
+
 	// get max # of docids we got in an intersection from all the lists
 	if ( ! m_docIdVoteBuf.reserve ( need,"divbuf" ) ) return false;

+	// for a boolean query also set up the hashtable that maps each
+	// docid to its boolean bit vector, where each bit stands for an
+	// operand, so we can quickly evaluate the bit vector in a truth
+	// table
+	long maxSlots = maxDocIds * 2;
+	// get total operands we used
+	//long numOperands = m_q->m_numWords;//Operands;
+	// a quoted phrase counts as a single operand
+	// . QueryTerm::m_bitNum <== m_numQueryTermInfos
+	// . each queryTermInfo class corresponds to one bit in our bit vec
+	// . essentially each queryTermInfo is a query term, but it has
+	//   all the synonym and word forms for that query, etc.
+	m_vecSize = m_numQueryTermInfos;//numOperands / 8 ;
+	// allow an extra byte for remainders
+	if ( m_numQueryTermInfos % 8 ) m_vecSize++;
+	// now preallocate the hashtable. 0 niceness.
+	if ( m_q->m_isBoolean && 
+	     ! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl"))
+		return false;
+	if ( m_q->m_isBoolean && 
+	     ! m_ct.set (8,1,maxSlots,NULL,0,false,0,
+			 "booltbl"))
+		return false;

 	return true;
 }
@ -5110,7 +5165,7 @@ void PosdbTable::intersectLists10_r ( ) {

 	// . if smallest required list is empty, 0 results
 	// . also set in setQueryTermInfo
-	if ( m_minListSize == 0 ) return;
+	if ( m_minListSize == 0 && ! m_q->m_isBoolean ) return;

 	/*
 	for ( long k = 0 ; seoHack && k < m_q->m_numTerms ; k++ ) {
@ -5165,6 +5220,20 @@ void PosdbTable::intersectLists10_r ( ) {
 	//if ( ! m_msg2 ) goto seoHackSkip;


+	// for boolean queries we scan every docid in all termlists,
+	// then we see what query terms it has, and make a bit vector for it.
+	// then use a hashtable to map that bit vector to a true or false
+	// as to whether we should include it in the results or not.
+	// we use Query::getBitScore(qvec_t ebits) to evaluate a docid's
+	// query term explicit term bit vector.
+	if ( m_q->m_isBoolean ) {
+		// keeping the docids sorted is the challenge here...
+		makeDocIdVoteBufForBoolQuery_r();
+		goto skip3;
+	}
+
+
+
 	// . create "m_docIdVoteBuf" filled with just the docids from the
 	//   smallest group of sublists 
 	// . m_minListi is the queryterminfo that had the smallest total
@ -5238,6 +5307,8 @@ void PosdbTable::intersectLists10_r ( ) {
 	}
 	*/

+ skip3:
+
 	if ( m_debug ) {
 		now = gettimeofdayInMilliseconds();
 		took = now - lastTime;
@ -5662,6 +5733,16 @@ void PosdbTable::intersectLists10_r ( ) {
 		}
 	}

+	if ( m_q->m_isBoolean ) {
+		minScore = 1.0;
+		// since we are jumping, we need to set m_docId here
+		m_docId = *(unsigned long *)(docIdPtr+1);
+		m_docId <<= 8;
+		m_docId |= (unsigned char)docIdPtr[0];
+		m_docId >>= 2;
+		goto boolJump;
+	}
+
 	// TODO: consider skipping this pre-filter if it sucks, as it does
 	// for 'time enough for love'. it might save time!

@ -6512,6 +6593,8 @@ void PosdbTable::intersectLists10_r ( ) {
 		goto advance;


+ boolJump:
+
 	// try dividing it by 3! (or multiply by .33333 faster)
 	score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);

@ -6670,6 +6753,8 @@ void PosdbTable::intersectLists10_r ( ) {
 		// set the score and docid ptr
 		t->m_score = score;
 		t->m_docId = m_docId;
+		// sanity
+		if ( m_docId == 0 ) { char *xx=NULL;*xx=0; }
 		// use an integer score like lastSpidered timestamp?
 		if ( m_sortByTermNumInt >= 0 ) {
 			t->m_intScore = intScore;
@ -6961,4 +7046,213 @@ void printTermList ( long i, char *list, long listSize ) {
 	}
 }

+// . qsort() comparator for the 6-byte docid keys in m_docIdVoteBuf
+// . sorts in ascending order: the 4 bytes at offset 2 are compared first,
+//   then the 2 bytes at offset 0
+// . NOTE(review): the upper part is read through "unsigned long", so this
+//   presumably assumes a 4-byte long -- confirm for 64-bit builds
+int dcmp6 ( const void *h1 , const void *h2 ) {
+	unsigned long hi1 = *(unsigned long *)((char *)h1+2);
+	unsigned long hi2 = *(unsigned long *)((char *)h2+2);
+	if ( hi1 < hi2 ) return -1;
+	if ( hi1 > hi2 ) return  1;
+	unsigned short lo1 = *(unsigned short *)((char *)h1);
+	unsigned short lo2 = *(unsigned short *)((char *)h2);
+	if ( lo1 < lo2 ) return -1;
+	if ( lo1 > lo2 ) return  1;
+	// there shouldn't be any dups in there, but qsort() needs equal keys
+	// to compare as equal no matter which one is passed first
+	return 0;
+}

+// . build m_docIdVoteBuf for a boolean query
+// . scans every docid in every termlist and records which query terms each
+//   docid has as a bit vector in m_bt (docid -> bit vector)
+// . then evaluates each bit vector with Query::matchesBoolQuery(), caching
+//   the verdict per distinct vector in m_ct (hash of vector -> 1/0)
+// . docids that pass are stored as 6-byte keys in m_docIdVoteBuf, which is
+//   then sorted with dcmp6()
+// . always returns true
+// . NOTE(review): nothing in here checks the capacity of m_docIdVoteBuf;
+//   presumably the caller pre-sized it since we can't alloc in a thread --
+//   confirm
+// TODO: do this in docid range phases to save memory and be much faster
+// since we could contain to the L1 cache for hashing
+bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
+
+	// . make a hashtable of all the docids from all the termlists
+	// . the value slot is the bit vector of query terms the docid has
+	// . the vector needs one bit per query term bit number
+	// . m_ct maps the hash of a vector to whether it passes the boolean
+	//   query so we avoid re-evaluating the same vector
+	char bitVec[MAX_OVEC_SIZE];
+	if ( m_vecSize > MAX_OVEC_SIZE ) m_vecSize = MAX_OVEC_SIZE;
+
+	QueryTermInfo *qip = (QueryTermInfo *)m_qiBuf.getBufStart();
+
+	// . scan each list of docids to a get a new docid, keep a dedup
+	//   table to avoid re-processing the same docid.
+	// . each posdb list we read corresponds to a query term,
+	//   or a synonym of a query term, or bigram of a query term, etc.
+	//   and each list maps to the base query term, whose bit number
+	//   is the bit we set for every docid in that list.
+	for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
+
+		// get it
+		QueryTermInfo *qti = &qip[i];
+
+		QueryTerm *qt = &m_q->m_qterms[qti->m_qtermNum];
+
+		// . make it consistent with Query::isTruth()
+		// . m_bitNum is set above to the QueryTermInfo #
+		long bitNum = qt->m_bitNum;
+
+		// sanity. the bit must fit in the vector or we would write
+		// past the end of bitVec[] and of the m_bt value slot
+		if ( bitNum < 0 || bitNum / 8 >= m_vecSize ) {
+			char *xx=NULL;*xx=0; }
+
+		// do not consider for adding if negative ('my house -home')
+		//if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
+
+		// set all to zeroes
+		memset ( bitVec , 0 , m_vecSize );
+		// set bitvec for him
+		long byte = bitNum / 8;
+		unsigned char mask = 1<<(bitNum % 8);
+		bitVec[byte] |= mask;
+
+		// each query term can have synonym lists etc. scan those
+		for ( long j = 0 ; j < qti->m_numSubLists ; j++ ) {
+
+			// scan all docids in this list
+			char *p = qti->m_subLists[j]->getList();
+			char *pend = qti->m_subLists[j]->getListEnd();
+
+			for ( ; p < pend ; ) {
+				// docid of the key we are pointing at
+				long long docId = g_posdb.getDocId(p);
+
+				// this was the first key for this docid for 
+				// this termid and possible the first key for 
+				// this termid, so skip it, either 12 or 18 
+				// bytes
+				if ( (((char *)p)[0])&0x02 ) p += 12;
+				// the first key for this termid?
+				else p += 18;
+
+				// then only 6 byte keys would follow from the
+				// same docid, so skip those as well. check
+				// pend so we never read past the list's end.
+				while ( p < pend && ((((char *)p)[0])&0x04) )
+					p += 6;
+
+				// store this docid though. treat as long long
+				// but we mask with keymask
+				long slot = m_bt.getSlot ( &docId );
+				if ( slot < 0 ) {
+					// we can't alloc in a thread, careful
+					if ( ! m_bt.addKey(&docId,bitVec) ) {
+						char *xx=NULL;*xx=0; }
+					continue;
+				}
+				// or the bit in otherwise
+				char *bv = (char *)m_bt.getValueFromSlot(slot);
+				bv[byte] |= mask;
+			}
+		}
+	}
+
+
+	char *dst = m_docIdVoteBuf.getBufStart();
+
+	// . now our hash table is filled with all the docids
+	// . evaluate each bit vector
+	for ( long i = 0 ; i < m_bt.m_numSlots ; i++ ) {
+		// skip if empty
+		if ( ! m_bt.m_flags[i] ) continue;
+		// get the bit vector
+		unsigned char *vec = (unsigned char *)m_bt.getValueFromSlot(i);
+		// hash the vector
+		long long h64 = 0LL;
+		for ( long k = 0 ; k < m_vecSize ; k++ )
+		       h64^=g_hashtab[(unsigned char)vec[k]][(unsigned char)k];
+		// check in hash table
+		char *val = (char *)m_ct.getValue ( &h64 );
+
+		// log what we are about to evaluate
+		if ( m_debug ) {
+			long long docId =*(long long *)m_bt.getKeyFromSlot(i);
+			log("query: eval d=%llu vec[0]=%lx h64=%lli",
+			    docId,(long)vec[0],h64);
+		}
+
+		// . did we already evaluate this exact vector?
+		// . use the cached verdict whether it was a pass or a fail
+		//   so failing vectors are not re-evaluated for every docid
+		if ( val ) {
+			// cached fail
+			if ( ! *val ) continue;
+			// it passes, add the docid
+			long long docId =*(long long *)m_bt.getKeyFromSlot(i);
+			if ( m_debug ) {
+				log("query: adding d=%llu vec[0]=%lx",
+				    docId,(long)vec[0]);
+			}
+			// shift up
+			docId <<= 2;
+			// a 6 byte key means you pass
+			memcpy ( dst , &docId , 6 );
+			dst += 6;
+			continue;
+		}
+		// evaluate the vector
+		char include = m_q->matchesBoolQuery ( (unsigned char *)vec ,
+						        m_vecSize );
+		if ( include ) {
+			// it passes, add the docid
+			long long docId =*(long long *)m_bt.getKeyFromSlot(i);
+			if ( m_debug ) {
+				log("query: adding d=%llu vec[0]=0x%lx",
+				    docId,(long)vec[0]);
+			}
+			// shift up
+			docId <<= 2;
+			// a 6 byte key means you pass
+			memcpy ( dst , &docId , 6 );
+			// test it. decode the key the same way
+			// intersectLists10_r() does and make sure we get
+			// the docid back
+			long long d2;
+			d2 = *(unsigned long *)(dst+1);
+			d2 <<= 8;
+			d2 |= (unsigned char)dst[0];
+			d2 >>= 2;
+			docId >>= 2;
+			if ( d2 != docId ) { char *xx=NULL;*xx=0; }
+			// end test
+			dst += 6;
+		}
+		// store verdict in hash table, pass or fail
+		m_ct.addKey ( &h64  , &include );
+	}
+
+	// update SafeBuf::m_length
+	m_docIdVoteBuf.setLength ( dst - m_docIdVoteBuf.getBufStart() );
+
+	// now sort the docids. TODO: break makeDocIdVoteBufForBoolQuery_r()
+	// up into docid ranges so we have like 1/100th the # of docids to 
+	// sort. that should make this part a lot faster.
+	// i.e. 1000*log(1000) > 1000*(10*log(10))) --> 3000 > 1000
+	// i.e. it's faster to break it down into 1000 pieces
+	// i.e. for log base 2 maybe it's like 10x faster...
+	qsort ( m_docIdVoteBuf.getBufStart() ,
+		m_docIdVoteBuf.length() / 6 ,
+		6 ,
+		dcmp6 );
+
+	return true;
+}
--- a/Posdb.h
+++ b/Posdb.h
@ -395,6 +395,42 @@ class Posdb {
 	DiskPageCache m_pc;
 };

+// max # of sublists one QueryTermInfo can hold. sizes the arrays below.
+#define MAX_SUBLISTS 50
+
+// . each QueryTerm has this attached additional info now:
+// . these should be 1-1 with query terms, Query::m_qterms[]
+// . holds the termlists (the term's own list plus synonym lists, etc.)
+//   that have to be read for one query term
+class QueryTermInfo {
+public:
+	// the required lists for this query term, synonym lists, etc.
+	RdbList  *m_subLists        [MAX_SUBLISTS];
+	// flags to indicate if bigram list should be scored higher
+	char      m_bigramFlags     [MAX_SUBLISTS];
+	// shrinkSubLists() set this:
+	long      m_newSubListSize  [MAX_SUBLISTS];
+	char     *m_newSubListStart [MAX_SUBLISTS];
+	char     *m_newSubListEnd   [MAX_SUBLISTS];
+	// NOTE(review): presumably per-sublist read positions used while
+	// intersecting -- confirm against PosdbTable
+	char     *m_cursor          [MAX_SUBLISTS];
+	char     *m_savedCursor     [MAX_SUBLISTS];
+	// the corresponding QueryTerm for this sublist
+	//class QueryTerm *m_qtermList [MAX_SUBLISTS];
+	// NOTE(review): presumably how many m_newSubList* entries are valid
+	long      m_numNewSubLists;
+	// how many are valid?
+	long      m_numSubLists;
+	// size of all m_subLists in bytes
+	long long m_totalSubListsSize;
+	// the term freq weight for this term
+	float     m_termFreqWeight;
+	// what query term # do we correspond to in Query.h
+	long      m_qtermNum;
+	// the word position of this query term in the Words.h class
+	long      m_qpos;
+	// the wikipedia phrase id if we start one
+	long      m_wikiPhraseId;
+	// phrase id term or bigram is in
+	long      m_quotedStartId;
+};
+
+
 /*
 #include "RdbList.h"

@ -523,6 +559,8 @@ class PosdbTable {
 					 char *endi, char *endj,
 					 class DocIdScore *pdcs );

+	bool makeDocIdVoteBufForBoolQuery_r ( ) ;
+
 	// some generic stuff
 	PosdbTable();
 	~PosdbTable();
@ -670,6 +708,13 @@ class PosdbTable {
 	long                 m_minListi;
 	// intersect docids from each QueryTermInfo into here
 	SafeBuf              m_docIdVoteBuf;
+
+	// boolean truth table for boolean queries
+	HashTableX m_bt;
+	HashTableX m_ct;
+	// size of the data slot in m_bt
+	long m_vecSize;
+
 	// are all positive query terms in same wikipedia phrase like
 	// 'time enough for love'?
 	bool m_allInSameWikiPhrase;
--- a/Query.cpp
+++ b/Query.cpp
@ -24,11 +24,11 @@ void Query::constructor ( ) {
 	//m_bmap      = NULL;
 	m_bitScores = NULL;
 	m_qwords      = NULL;
-	m_expressions = NULL;
+	//m_expressions = NULL;
 	m_qwordsAllocSize      = 0;
-	m_expressionsAllocSize = 0;
+	//m_expressionsAllocSize = 0;
 	m_qwords               = NULL;
-	m_expressions          = NULL;
+	//m_expressions          = NULL;
 	reset ( );
 }

@ -46,7 +46,7 @@ void Query::reset ( ) {
 	m_bufLen      = 0;
 	m_origLen     = 0;
 	m_numWords    = 0;
-	m_numOperands = 0;
+	//m_numOperands = 0;
 	m_numTerms    = 0;
 	m_synTerm     = 0;
 	//m_numIgnored  = 0;
@ -60,14 +60,14 @@ void Query::reset ( ) {
 	m_bitScores = NULL;
 	//m_bmapSize      = 0;
 	m_bitScoresSize = 0;
-	if ( m_expressionsAllocSize )
-		mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
+	//if ( m_expressionsAllocSize )
+	//	mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
 	if ( m_qwordsAllocSize )
 		mfree ( m_qwords      , m_qwordsAllocSize      , "Query4" );
-	m_expressionsAllocSize = 0;
+	//m_expressionsAllocSize = 0;
 	m_qwordsAllocSize      = 0;
 	m_qwords               = NULL;
-	m_expressions          = NULL;
+	//m_expressions          = NULL;
 	m_numExpressions       = 0;
 	m_gnext                = m_gbuf;
 	m_hasUOR               = false;
@ -149,7 +149,7 @@ bool Query::set2 ( char *query        ,

 	char *q = query;
 	// see if it should be boolean...
-	for ( long i = 0 ; boolFlag && i < queryLen ; i++ ) {
+	for ( long i = 0 ; i < queryLen ; i++ ) {
 		if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' &&
 		     (q[i+3]==' ' || q[i+3]=='(') )
 			boolFlag = 1;
@ -343,8 +343,8 @@ bool Query::set2 ( char *query        ,

 	// set m_expressions[] and m_operands[] arrays and m_numOperands 
 	// for boolean queries
-	if ( m_isBoolean )
-		if ( ! setBooleanOperands() ) return false;
+	//if ( m_isBoolean )
+	//	if ( ! setBooleanOperands() ) return false;

 	// disable stuff for site:, ip: and url: queries
 	for ( long i = 0 ; i < m_numWords ; i++ ) {
@ -386,6 +386,17 @@ bool Query::set2 ( char *query        ,
 		break;
 	}

+	// . keep it simple for now
+	// . we limit to MAX_EXPRESSIONS expressions, like 10 now i guess
+	if ( m_isBoolean ) {
+		m_numExpressions = 1;
+		m_expressions[0].add ( 0 , 
+				       m_numWords ,
+				       this , // Query
+				       0 ); // level
+	}
+
+
 	// . if it is not truncated, no need to use hard counts
 	// . comment this line and the next one out for testing hard counts
 	if ( ! m_truncated ) return true;
@ -450,16 +461,16 @@ bool Query::set2 ( char *query        ,
 	//    "(nt=%li)",
 	//     m_numExplicitBits,m_numTerms-m_numExplicitBits,m_numTerms);

-	if ( ! m_isBoolean ) return true;
+	//if ( ! m_isBoolean ) return true;

 	// free cuz it was already set
-	if ( m_expressionsAllocSize ) 
-		mfree(m_expressions,m_expressionsAllocSize , "Query" );
-	m_expressionsAllocSize = 0;
-	m_expressions = NULL;
+	//if ( m_expressionsAllocSize ) 
+	//	mfree(m_expressions,m_expressionsAllocSize , "Query" );
+	//m_expressionsAllocSize = 0;
+	//m_expressions = NULL;

 	// also set the boolean stuff again too!
-	if ( ! setBooleanOperands() ) return false;
+	//if ( ! setBooleanOperands() ) return false;

 	return true;
 }
@ -498,7 +509,6 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 	long max = (long)MAX_EXPLICIT_BITS;
 	if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
 	//char u8Buf[256]; 
-
 	for ( long i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
 		// break out if no more explicit bits!
 		/*
@ -617,7 +627,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 		// doh! gotta reset to 0
 		qt->m_implicitBits = 0;
 		// assume not under a NOT bool op
-		qt->m_underNOT = false;
+		//qt->m_underNOT = false;
 		// assign score weight, we're a phrase here
 		qt->m_userWeight = qw->m_userWeightPhrase ;
 		qt->m_userType   = qw->m_userTypePhrase   ;
@ -819,7 +829,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 // 			break;
 // 		}
 		// assume not under a NOT bool op
-		qt->m_underNOT = false;
+		//qt->m_underNOT = false;
 		// assign score weight, we're a phrase here
 		qt->m_userWeight = qw->m_userWeight ;
 		qt->m_userType   = qw->m_userType   ;
@ -1162,7 +1172,8 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 		if ( qw->m_wordSign == '+' ) continue;
 		// no url: stuff, maybe only title
 		if ( qw->m_fieldCode &&
-		     qw->m_fieldCode != FIELD_TITLE )
+		     qw->m_fieldCode != FIELD_TITLE &&
+		     qw->m_fieldCode != FIELD_GENERIC )
 			continue;
 		// skip if ignored like a stopword (stop to->too)
 		//if ( qw->m_ignoreWord ) continue;
@ -1232,8 +1243,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 			// stop word? no, we're a phrase term
 			qt->m_isQueryStopWord = qw->m_isQueryStopWord;
 			// change in both places
-			qt->m_termId    = syn.m_aids[j] & TERMID_MASK;
-			m_termIds[n]    = syn.m_aids[j] & TERMID_MASK;
+			long long wid = syn.m_aids[j];
+			// might be in a title: field or something
+			if ( qw->m_prefixHash ) {
+				long long ph = qw->m_prefixHash;
+				wid= hash64h(wid,ph);
+			}
+			qt->m_termId    = wid & TERMID_MASK;
+			m_termIds[n]    = wid & TERMID_MASK;
 			qt->m_rawTermId = syn.m_aids[j];
 			// assume explicit bit is 0
 			qt->m_explicitBit = 0;
@ -1265,7 +1282,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
 			// reset our implicit bits to 0
 			qt->m_implicitBits = 0;
 			// assume not under a NOT bool op
-			qt->m_underNOT = false;
+			//qt->m_underNOT = false;
 			// assign score weight, we're a phrase here
 			qt->m_userWeight = qw->m_userWeight ;
 			qt->m_userType   = qw->m_userType   ;
@ -1902,7 +1919,7 @@ bool Query::setQWords ( char boolFlag ,
 		// assume QueryWord is ignored by default
 		qw->m_ignoreWord   = IGNORE_DEFAULT;
 		qw->m_ignorePhrase = IGNORE_DEFAULT;
-
+		qw->m_wordNum = i;
 		// get word as a string
 		//char *w    = words.getWord(i);
 		//long  wlen = words.getWordLen(i);
@ -3308,24 +3325,24 @@ void Query::printQueryTerms(){
 //////////   ONLY BOOLEAN STUFF BELOW HERE  /////////////
 ////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////
+// . evaluate the boolean query against the bit vector "bits", which is
+//   "vecSize" bytes long, using m_expressions[0]
+// . returns false if this is not a boolean query
-bool  Query::testBoolean(qvec_t bits, qvec_t bitmask){
+bool  Query::testBoolean( unsigned char *bits ,long vecSize){//qvec_t bitmask){
 	if (!m_isBoolean) return false;
 	Expression *e = &m_expressions [ 0 ];
 	// find top-level expression
-	while (e->m_parent && e != e->m_parent) e = e->m_parent;
-	return e->isTruth(bits, bitmask);
+	//while (e->m_parent && e != e->m_parent) e = e->m_parent;
+	return e->isTruth(bits,vecSize);//, bitmask);
 	
 }
+// . currently a no-op: the code that printed the expression tree to the
+//   debug log is commented out below
 void  Query::printBooleanTree(){
 	if (!m_isBoolean) return;
-	Expression *e = &m_expressions [ 0 ];
+	//Expression *e = &m_expressions [ 0 ];
 	// find top-level expression
-	while (e->m_parent && e != e->m_parent) e = e->m_parent;
-	SafeBuf sbuf(1024);
-	e->print(&sbuf);
-	logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());	
+	//while (e->m_parent && e != e->m_parent) e = e->m_parent;
+	//SafeBuf sbuf(1024,"botree");
+	//e->print(&sbuf);
+	//logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
 }
-
+/*
 // . also sets the m_underNOT member of each QueryTerm, too!!
 // . returns false and sets g_errno on error, true otherwise
 bool Query::setBooleanOperands ( ) {
@ -3338,6 +3355,20 @@ bool Query::setBooleanOperands ( ) {
 			   "exceeded (%ld).",m_numTerms);
 	}

+	// set the QueryWord::m_opBit member of each query word.
+	// so if  you have a query like 'A B OR C' then you need
+	// to have both A and B if you don't have C. so every word
+	// unless its an operator needs its own bit. quoted phrases
+	// may present a problem down the road we'll have to deal with.
+	long opNum = 0;
+	for ( long i = 0 ; i < m_numWords ; i++ ) {
+		// skip if field, opcode, punct. etc.
+		if ( m_qwords[i].m_ignoreWord ) continue;
+		// assign it a # i guess
+		m_qwords[i].m_opNum = opNum++;
+	}
+	
+
 	// alloc the mem if we need to (mdw left off here) 
 	//long need = (m_numWords/3) * sizeof(Expression);
 	// illegitmate bool expressions breech the buffer
@ -3367,14 +3398,11 @@ bool Query::setBooleanOperands ( ) {
 	// . set the expression recursively
 	// . just setting this will not set the m_hasNOT members of each 
 	//   QueryTerm
-	long status = e->set ( 0           , // first word #
-				m_numWords  , // last  word #
-				0           , // parser position
-				this        , // array of QueryWords
-				0              ,// level
-				NULL, NULL,  // parent, leftchild
-			       false ,  // has NOT?
-			       false ); // under NOT?
+	long status = e->add ( 0           , // first word #
+			       m_numWords  , // last  word #
+			       this        , // array of QueryWords
+			       0              ,// level
+			       false );  // has NOT?
 	if ( status < 0 ) {
 		g_errno = ETOOMANYOPERANDS;
 		return log("query: Maximum number of bool operands "
@ -3399,6 +3427,8 @@ bool Query::setBooleanOperands ( ) {

 	// . get all the terms that are UNDER a NOT operator in some fashion
 	// . these bits are 1-1 with m_qterms[]
+	*/
+	/*
 	qvec_t notBits = e->getNOTBits( false );
 	for ( long i = 0 ; i < m_numTerms ; i++ ) {
 		if ( m_qterms[i].m_explicitBit & notBits )
@ -3406,15 +3436,20 @@ bool Query::setBooleanOperands ( ) {
 		else
 			m_qterms[i].m_underNOT = false;
 	}
+	*/
+/*
 	return true;
 }
-
+*/
+/*
 // . returns -1 on bad query error
 // . returns word AFTER the last word in our operand
 long Operand::set ( long a , long b , QueryWord *qwords , long level ,
 		    bool underNOT ) {
 	// clear these
-	m_termBits         = 0;
+	//m_termBits         = 0;
+	memset(m_opBits,0,MAX_OVEC_SIZE);
+
 	m_hasNOT           = false;

 	//m_hardRequiredBits = 0;
@ -3429,7 +3464,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
 		// set the parenthetical level of the word
 		qw->m_level = level;
 		// set this
-		qw->m_underNOT = underNOT;
+		//qw->m_underNOT = underNOT;
 		// skip punct
 		if ( ! qw->isAlphaWord() ) {
 			// if it is a parens, bail!
@ -3459,9 +3494,12 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
 		//   query is too long
 		if ( qw->m_phraseId && qw->m_queryPhraseTerm &&
 		     qw->m_phraseSign ) {
-			qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
+			//qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
 			//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
-			m_termBits |= e;
+			//m_termBits |= e;
+			long byte = qw->m_opNum / 8;
+			long mask = 1<<(qw->m_opNum % 8);
+			if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
 		}
 		// why would it be ignored? oh... if like cd-rom or in quotes
 		if ( qw->m_ignoreWord ) continue;
@ -3469,13 +3507,17 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
 		// . might be a word that's not a QueryTerm because
 		//   query is too long
 		if ( qw->m_queryWordTerm ) {
-			qvec_t e = qw->m_queryWordTerm->m_explicitBit;
+			//qvec_t e = qw->m_queryWordTerm->m_explicitBit;
 			//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
-			m_termBits |= e;
+			//m_termBits |= e;
+			long byte = qw->m_opNum / 8;
+			long mask = 1<<(qw->m_opNum % 8);
+			if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
 		}
 	}
 	return b;
 }
+*/

 // . returns -1 on bad query error
 // . returns next word to parse (after expression) on success
@ -3485,6 +3527,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
 // . new: organize query into sum of products normal form, ie:
 // . (a) OR (b AND c AND d) OR (e AND f)

+/*
 unsigned char precedence[] = {
 	0, // term
 	4, // OR
@ -3495,238 +3538,214 @@ unsigned char precedence[] = {
 	3, // UOR
 	5, // PIPE
 }; 
+*/

-long Expression::set (long start, 
-		       long end, 
-		       long pos, // current parsing position
-		       class Query      *q,
-		       long              level, 
-		       class Expression *parent, 
-		       class Expression *leftChild,
-		       bool hasNOT ,
-		       bool underNOT ) {
-	m_start = start;
-	m_end = end;
-	m_opcode = 0;
-	m_operand = NULL;
-	m_numChildren = 0;
-	m_hasNOT = hasNOT;
-	m_parent = parent;
-	uint8_t curOp = 0;
+//#define TYPE_OPERAND 1
+//#define TYPE_OPCODE 2
+//#define TYPE_EXPRESSION 3

-	QueryWord  *qwords        =  q->m_qwords;
-	Expression *o_expressions =  q->m_expressions;
-	Operand    *o_operands    =  q->m_operands;
-	long       *o_numOperands = &q->m_numOperands;
-	long       *o_numExpressions = &q->m_numExpressions;
-	long maxExpressions       =  q->m_numWords;
-		

-	// Lets really try to catch this
-	if (m_parent == this) {
-		//log(LOG_WARN, "query: Warning, setting expression "
-		//    "parent to self");
-		char *xx = NULL; *xx = 0;
-	}
+// . parse query words [start,end) into this expression
+// . each '(' found gets its own Expression from Query::m_expressions[],
+//   added recursively, and the '(' word's m_expressionPtr points to it
+// . parsing stops at our closing ')' or at "end"
+// . sets m_numWordsInExpression, which includes the closing ')' if we hit one
+// . returns false and sets g_errno on error, true otherwise
+bool Expression::add (long start, 
+		      long end, 
+		      class Query      *q,
+		      long              level
+		      ) {
+
+	// assume empty until we finish parsing, in case we bail on error
+	m_numWordsInExpression = 0;
+
+	if ( level >= MAX_EXPRESSIONS ) { g_errno = EBADENGINEER; return false;}
+
+	// the # of the first alnumpunct word in the expression
+	m_expressionStartWord = start;
+	// and the last one
+	//m_end = end;
+	//m_hasNOT = hasNOT;
+	m_q = q;
+
+	//m_cc = 0;
+
+	long i = m_expressionStartWord;
+
+	// "i" is the current alnumpunct word we are parsing out
+	for ( ; i<end ; i++ ) {
+
+		QueryWord *qwords = q->m_qwords;

-	//set initial args
-	if (leftChild) {
-		leftChild->m_parent = this;
-		m_children[0] = leftChild;
-		m_numChildren = 1;
-	}
-	hasNOT = false;
-	for ( long i=pos ; i<end ; i++ ){
 		QueryWord * qw = &qwords[i];
 		// set this
-		qw->m_underNOT = underNOT;
-		// set leaf node
-		if (!qw->m_opcode && qw->isAlphaWord()){
-			if (i > m_start) goto setChildExpr;
-			// if we maxxed out, error out
-			if ( *o_numOperands >= MAX_OPERANDS ) return -1;
-			Operand *op = &o_operands [ *o_numOperands ];
-			*o_numOperands = *o_numOperands + 1;
-			// . return ptr to next word for us to parse
-			// . subtract once since for loop will inc it
-			i = op->set ( i , end , qwords , level , underNOT );
-			if ( i < 0 ) return -1;
-			m_operand = op;
-			goto endExpr;
+		//qw->m_underNOT = underNOT;
+
+		// set leaf node if not an opcode like "AND" and not punct.
+		if ( ! qw->m_opcode && qw->isAlphaWord()){
+			//m_opSlots[m_cc] = i;
+			//m_opTypes[m_cc] = TYPE_OPERAND;
+			//qw->m_opBitNum = m_cc;
+			continue;//goto endExpr; mdw
 		}
 		if (qw->m_opcode == OP_NOT){
-			hasNOT = !hasNOT;
-			underNOT = hasNOT;
+			//hasNOT = !hasNOT;
+			//underNOT = hasNOT;
 			continue;
 		}
 		else if (qw->m_opcode == OP_LEFTPAREN){
-			if (i == m_start) i++;
-			goto setChildExpr;			
+			// this is expression
+			// . it should advance "i" to end of expression
+			// . do not run off the end of Query::m_expressions[]
+			// . NOTE(review): assumes that array holds
+			//   MAX_EXPRESSIONS entries -- confirm in Query.h
+			if ( q->m_numExpressions >= MAX_EXPRESSIONS ) {
+				g_errno = EBADENGINEER; return false; }
+			// point to next...
+			q->m_numExpressions++;
+			// make a new one:
+			Expression *e=&q->m_expressions[q->m_numExpressions-1];
+			// now set it. bail if that failed since its word
+			// count would not be usable
+			if ( ! e->add ( i+1, // skip over (
+					end ,
+					q ,
+					level + 1) )
+				return false;
+			// skip over it. pt to ')'
+			i += e->m_numWordsInExpression;
+			qw->m_expressionPtr = e;
+			//m_opSlots[m_cc] = (long)e;
+			//m_opTypes[m_cc] = TYPE_EXPRESSION;
+			//qw->m_opBitNum = m_cc;
 		}
 		else if (qw->m_opcode == OP_RIGHTPAREN){
-			goto endExpr;
-		}
-		else if (qw->m_opcode) {
-			int delta = 0;
-			curOp = qw->m_opcode;
-			if (m_numChildren == 1)
-				m_opcode = curOp;
-
-			if (m_numChildren > 1 && curOp != m_opcode) {
-
-			  delta = (int)precedence[curOp] -
-					(int)precedence[m_opcode];
-			}
-			
-			if (delta > 0){
-				goto endExpr;
-			}
-		        if (delta < 0){
-				// set a subexpression conataining the 
-				// last operand we found as the first 
-				goto setChildExpr2;
-			}
-		}
-		continue;
-	endExpr:
-		//log(LOG_DEBUG, "query: set Expr [%ld, %ld), opcode: %d",
-		//    a, i, curOp);
-		// if we've matched parens, go to next word
-		// but if we have an extra right paren, don't crash
-		if (qw->m_opcode == OP_RIGHTPAREN &&
-		    (qwords[m_start].m_opcode == OP_LEFTPAREN ||
-		     m_start == 0)) 
-			i++;
-
-		m_end = i;
-		// We have an extra open paren
-		if (qwords[m_start].m_opcode == OP_LEFTPAREN &&
-		    qw->m_opcode != OP_RIGHTPAREN) 
-			goto setParentExpr;
-		// we are top-level expr, but there is more to parse
-		if (!m_parent && i < end-1) 
-			goto setParentExpr;
-		// just return
-		return i;
-		// add a parent expression with this one as the left child 
-	setParentExpr:
-		{
-			if ( *o_numExpressions >= maxExpressions ) return -1;
-			//if (qw->m_opcode == OP_RIGHTPAREN) i++;
-			Expression *e = &o_expressions[*o_numExpressions];
-			*o_numExpressions = *o_numExpressions + 1;
-			i = e->set ( m_start , end ,i, q , 
-				     level+1, 
-				     m_parent,
-				     this, 
-				     false ,
-				     underNOT ) ;
+			// we are done. our size includes the ')'
+			m_numWordsInExpression = i - m_expressionStartWord+1;
-			return i;
+			return true;
 		}
+		else if (qw->m_opcode) {
+			// add that mdw
+			//m_opSlots[m_cc] = qw->m_opcode;
+			//m_opTypes[m_cc] = TYPE_OPCODE;
+			//qw->m_opBitNum = m_cc;
+			//m_cc++;
+			continue;
+		}
+		// white space?
+		continue;
+	}

-		// add a child expression
-	setChildExpr:
-		{
-			if ( *o_numExpressions >= maxExpressions ) return -1;

-			Expression *e = &o_expressions[*o_numExpressions];
-			*o_numExpressions = *o_numExpressions + 1;
-			i = e->set ( i , end , i, q , 
-				     level+1, 
-				     this, NULL, hasNOT , 
-				     underNOT ) -1;
-			if ( i < 0 ) return -1;
-			
-			// trim needless parens 
-			while (e->m_numChildren == 1) {
-				hasNOT = e->m_hasNOT;
-				e = e->m_children[0];
-				if (hasNOT) e->m_hasNOT = ! e->m_hasNOT;
+	// hit "end" without a closing ')'
+	m_numWordsInExpression = i - m_expressionStartWord;
+
+	return true;
+}
+
+// . does a docid whose query term bit vector is "bitVec" ("vecSize" bytes)
+//   satisfy this boolean query?
+// . each bit is 1-1 with the explicit terms in the boolean query
+bool Query::matchesBoolQuery ( unsigned char *bitVec , long vecSize ) {
+	// the first expression is the one that spans the whole query
+	Expression *top = &m_expressions[0];
+	return top->isTruth ( bitVec , vecSize );
+}
+
+
+// . is bit # "opBitNum" set in the bit vector "bitVec" of "vecSize" bytes?
+// . bit # 0 is the lowest bit of bitVec[0]
+// . a negative bit # is never set
+// . cores if the bit # is beyond the end of the vector
+bool isBitNumSet ( long opBitNum, unsigned char *bitVec, long vecSize ) {
+	// a negative # would shift by a negative count and index before
+	// the start of the vector
+	if ( opBitNum < 0 ) return false;
+	long byte = opBitNum / 8;
+	long mask = 1<<(opBitNum % 8);
+	// sanity
+	if ( byte >= vecSize ) { char *xx=NULL;*xx=0; }
+	return bitVec[byte] & mask;
+}
+
+// . evaluate this expression against a docid's query term bit vector
+// . bit # QueryTerm::m_bitNum of "bitVec" is set if the docid has that term
+// . operands are words or parenthetical sub-expressions. they are folded
+//   left to right into one running result using the last AND or OR
+//   operator seen before the operand
+// . a NOT flips the operand that follows it
+// . returns true if the expression has no operands at all
+// . NOTE(review): an operand seen before any operator, or after an operator
+//   other than AND/OR, is not folded into the result (except the very
+//   first operand, which seeds it) -- confirm how implicit and other
+//   operators should be handled
+bool Expression::isTruth ( unsigned char *bitVec ,long vecSize ) {
+
+	//
+	// operand1 operand2 operator1 operand3 operator2 ....
+	//
+
+	// running result: -1 means we have not seen an operand yet
+	long result = -1;
+
+	// last operator word we saw. 0 means none yet.
+	char prevOpCode = 0;
+	// result of current operand
+	long opResult = -1;
+
+	long i    =     m_expressionStartWord;
+	long iend = i + m_numWordsInExpression;
+
+	bool hasNot = false;
+
+	for ( ; i < iend ; i++ ) {
+
+		QueryWord *qw = &m_q->m_qwords[i];
+
+		if ( qw->m_opcode == OP_NOT ) {
+			hasNot = true;
+			continue;
+		}
+
+
+		// so operands are expressions as well
+		Expression *e = (Expression *)qw->m_expressionPtr;
+		if ( e ) {
+			// evaluate the parenthetical sub-expression
+			opResult = e->isTruth ( bitVec , vecSize );
+			// skip over that expression. point to ')'
+			i += e->m_numWordsInExpression;
+			// flip?
+			if ( hasNot ) {
+				if ( opResult == 1 ) opResult = 0;
+				else                 opResult = 1;
+				hasNot = false;
 			}
-			hasNOT = false;
-			//cull empty expressions
-			if (e->m_numChildren < 1 &&
-			    e->m_operand == NULL) continue;
+		}

-			if (m_numChildren >= MAX_OPERANDS) return -1;
-			// add good expressions
-			m_children [ m_numChildren] = e;
-			m_numChildren++;
-			if (m_numChildren > 1 && m_opcode == 0)
-				m_opcode = OP_AND; // default AND
+		if ( qw->m_opcode && ! e ) {
+			prevOpCode = qw->m_opcode;//m_opSlots[i];
 			continue;
 		}

-		// we need to make the last operand we passed 
-		// be the first operand of a subexpression
-	setChildExpr2:
-		{
-			// remove the last expression from our list
-			Expression *ce = m_children[m_numChildren-1];
+		// simple operand
+		if ( ! qw->m_opcode && ! e ) {
+			// for regular word operands
+			// ignore it like a space?
+			if ( qw->m_ignoreWord ) continue;
+			// convert word to term #
+			QueryTerm *qt = qw->m_queryWordTerm;
+			if ( ! qt ) continue;
+			// . m_bitNum is set in Posdb.cpp when it sets its
+			//   QueryTermInfo array
+			// . it is basically the query term #
+			// . see iff that bit is set in this docid's vec
+			opResult = isBitNumSet ( qt->m_bitNum,bitVec,vecSize );
+			// flip?
+			if ( hasNot ) {
+				if ( opResult == 1 ) opResult = 0;
+				else                 opResult = 1;
+				hasNot = false;
+			}
+		}

-			m_numChildren--;
+		// the first operand seeds the running result
+		if ( result == -1 ) { result = opResult; continue; }

-
-			if ( *o_numExpressions >= maxExpressions ) return -1;
-
-			Expression *e = &o_expressions[*o_numExpressions];
-			*o_numExpressions = *o_numExpressions + 1;
-			i = e->set ( ce->m_start , end , i, q , 
-				     level+1, 
-				     this, ce, 
-				     false , 
-				     underNOT ) -1;
-			ce->m_parent = e;
-			if ( i < 0 ) return -1;
-
-			if (m_numChildren >= MAX_OPERANDS) return -1;
-			m_children [ m_numChildren ] = e;
-
-			hasNOT = false;
-			m_numChildren++;
-			continue;
+		// . fold this operand into the running result
+		// . an AND can only turn the result off
+		// . an OR can only turn the result on
+		if ( prevOpCode == OP_AND ) {
+			if ( ! opResult ) result = false;
+		}
+		else if ( prevOpCode == OP_OR ) {
+			if (   opResult ) result = true;
 		}
 	}
-	return end;
-}
-
-
-// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
-bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
-	//bool op1 = false ; // set to false so compiler shuts up
-	//bool op2 ;
-	//bool accumulator = false;
-	//bool hadOR       = false;
-	bool result = false;
-
-	// leaf node
-	if (m_operand){
-		result = m_operand->isTruth(bits, mask);
-		// handle masked terms better.. don't apply NOT operator
-		if (!(m_operand->m_termBits & mask)) return true;
-	}
-	else if (m_numChildren == 1){
-		result = m_children[0]->isTruth(bits, mask);
-	}
-	else if (m_opcode == OP_OR || m_opcode == OP_UOR) {
-		for ( long i=0 ; i<m_numChildren ; i++ ) {
-			result = result || m_children[i]->isTruth(bits, mask);
-			if (result) goto done;
-		}
-	}
-	else if (m_opcode == OP_AND || m_opcode == OP_PIPE){
-		result = true;
-		for (long i = 0 ; i < m_numChildren ; i++ ) {
-			result = result && m_children[i]->isTruth(bits, mask);
-			if (!result) goto done;
-		}
-	}
-
-done :
-	if (m_hasNOT) return !result;
-	else return result;
+
+	// no operands at all? then there is nothing to fail
+	if ( result == -1 ) return true;
+	if ( result ==  0 ) return false;
+	return true;
 }

+/*
 // . "bits" are 1-1 with the query terms in Query::m_qterms[] array
 // . hasNOT is true if there's a NOT just to the left of this WHOLE expressions
 //   ourside the parens
@ -3744,9 +3763,11 @@ qvec_t Expression::getNOTBits ( bool hasNOT ) {
 	// success, all operand pairs were true
 	return notBits;
 }
+*/

 // print boolean expression for debug purposes
 void Expression::print(SafeBuf *sbuf) {
+	/*
 	if (m_hasNOT) sbuf->safePrintf("NOT ");
 	if (m_operand){
 		m_operand->print(sbuf);
@ -3765,16 +3786,18 @@ void Expression::print(SafeBuf *sbuf) {
 		}
 	}
 	sbuf->safePrintf(")");
-
+	*/
 }

+/*
 void Operand::print(SafeBuf *sbuf) {
 // 	long shift = 0;
 // 	while (m_termBits >> shift) shift++;
 // 	sbuf->safePrintf("%i", 1<<(shift-1));
-	if (m_hasNOT) sbuf->safePrintf("NOT 0x%lx", (long)m_termBits);
-	else sbuf->safePrintf("0x%lx", (long)m_termBits);
+	if (m_hasNOT) sbuf->safePrintf("NOT 0x%llx",*(long long *)m_opBits);
+	else sbuf->safePrintf("0x%llx", *(long long *)m_opBits);
 }
+*/

 // if any one query term is split, msg3a has to split the query
 bool Query::isSplit() {
--- a/Query.h
+++ b/Query.h
@ -49,6 +49,8 @@ typedef unsigned long long qvec_t;

 #define MAX_EXPLICIT_BITS (sizeof(qvec_t)*8)

+#define MAX_OVEC_SIZE 256
+
 // only can use 16-bit since have to make a 64k truth table!
 #define MAX_EXPLICIT_BITS_BOOLEAN (16*8)

@ -166,6 +168,7 @@ extern struct QueryField g_fields[];
 ////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////

+/*
 // . creating a QueryBoolean class was unnecessary since it was only functional
 //   and had nothing new it would store that the Query class doesn't store
 // . the entry point is the Query::setBitScoresBoolean() function below
@ -181,76 +184,46 @@ public:
 	long set ( long a , long b , class QueryWord *qwords , long level ,
 		   bool underNOT ) ;
 	// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
-	// . Operand::m_termBits is the required bits for operand to be true
+	// . Operand::m_opBits is the required bits for operand to be true
 	// . does not include signless phrases
-	bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
+	//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
+	bool isTruth ( unsigned char *bitVec , long vecSize ) {
 		// must always satisfy hard required terms (+ sign)
 		//if ( (bits & m_forcedBits) != m_forcedBits )
 		//	return false;
-		if (m_hasNOT) return (bits & m_termBits & mask) == 0;
-		return ( (bits & m_termBits & mask) == (m_termBits & mask)); 
+		//if (m_hasNOT) return (bits & m_opBits & mask) == 0;
+		//return ( (bits & m_opBits & mask) == (m_opBits & mask)); 
+		if ( m_hasNOT ) {
+			for ( long i = 0 ; i < vecSize ; i++ )
+				if ( m_opBits[i] & bitVec[i] ) return false;
+			return true;
+		}
+		for ( long i = 0 ; i < vecSize ; i++ )
+			if ( m_opBits[i] & bitVec[i] ) return true;
+		return false;
 		// . we are now back to good ol' default OR
-		// . m_termBits should have been masked with
+		// . m_opBits should have been masked with
 		//   m_requiredBits so as not to include signless phrases
-		//return ( (bits & m_termBits) != 0 ); 
+		//return ( (bits & m_opBits) != 0 ); 
 	};
 	void print (SafeBuf *sbuf);
 	// we are a sequence of QueryWords
 	//long m_startWordNum;
 	//long m_lastWordNum;
-	// . we treat the required term bits of those words as one unit (ANDed)
-	// . unsigned phrases are not included in these term bits
 	// . doc just needs one of these bits for this op to be considered true
-	qvec_t m_termBits;
+	// . terms under the same QueryTermInfo class should have the same
+	//   termbit here
+	unsigned char m_opBits[MAX_OVEC_SIZE];
+	//long m_vecSize;
+	// does the word NOT precede the operand?
 	bool   m_hasNOT;
-	class Expression *m_parent;
+	//class Expression *m_parent;

 	// we MUST have these for this OPERAND to be true
 	//unsigned short m_forcedBits;
 };
+*/

-// operand1 AND operand2 OR  ...
-// operand1 OR  operand2 AND ...
-class Expression {
-public:
-	long set (long start, 
-		   long end, 
-		   long pos, // current parsing position
-		   class Query      *q,
-		   long              level, 
-		   class Expression *parent, 
-		   class Expression *leftChild,
-		   bool hasNOT ,
-		  bool underNOT );
-
-	bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1  ) ;
-	// . what QueryTerms are UNDER the influence of the NOT opcode?
-	// . we read in the WHOLE termlist of those that are (like '-' sign)
-	// . returned bit vector is 1-1 with m_qterms in Query class
-	qvec_t getNOTBits ( bool hasNOT );
-	void print (SafeBuf *sbuf);
-	// . a list of operands separated by op codes (a AND b OR c ...)
-	// . sometimes and operand is another expression: a AND (b OR c)
-	// . use NULL in m_operands slot if we got an expression and vice versa
-	// . m_opcodes[i] is the opcode after operand #i
-	class Expression *m_parent;
-	//class Operand    *m_operands    [ MAX_OPERANDS ];
-	class Expression *m_children [ MAX_OPERANDS ];
-	//char              m_opcodes     [ MAX_OPERANDS ];
-	//long              m_numOperands;
-	// now expressions can have either child expressions or 1 operand
-	long              m_numChildren;
-	// do we have a NOT operator before operand #i?
-	//bool              m_hasNOT      [ MAX_OPERANDS ];
-	// only one opcode, operand, hasNOT per expression now
-	uint8_t           m_opcode;
-	class Operand    *m_operand;
-	bool              m_hasNOT;
-	// needed for nesting
-	long              m_start;
-	long              m_end;
-
-};

 ////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////
@ -292,7 +265,7 @@ class QueryWord {
 	long long   m_phraseId;
 	// hash of field name then collection, used to hash termId
 	long long   m_prefixHash;
-
+	long        m_wordNum;
 	// are we in a phrase in a wikipedia title?
 	long        m_wikiPhraseId;
 	long        m_wikiPhraseStart;
@ -373,6 +346,10 @@ class QueryWord {
 	float m_float;
 	// for gbminint:99 etc. uses integers instead of floats for better res
 	long  m_int;
+	// what operand bit # is it for doing boolean queries?
+	//long  m_opBitNum;
+	// when an operand is an expression...
+	class Expression *m_expressionPtr;
 };

 // . we filter the QueryWords and turn them into QueryTerms
@ -415,6 +392,13 @@ class QueryTerm {
 	// expressions) and just use a hardCount to see how many hard required
 	// terms are contained by a document. see IndexTable.cpp "hardCount"
 	char       m_hardCount;
+
+	// the "number" of the query term used for evaluating boolean
+	// expressions in Expression::isTruth(). Basically just the
+	// QueryTermInfo to which this query term belongs. each QueryTermInfo
+	// is like a single query term and all its synonyms, etc.
+	long       m_bitNum;
+
 	// point to term, either m_word or m_phrase
 	char      *m_term;
 	long       m_termLen;
@ -485,6 +469,14 @@ class QueryTerm {
 	// we can be in? uses -1 to indicate none.
 	long  m_leftPhraseTermNum;
 	long  m_rightPhraseTermNum;
+	// . what operand # are we a part of in a boolean query?
+	// . like for (x AND y) x would have an opNum of 0 and y an
+	//   opNum of 1 for instance.
+	// . for things like (x1 OR x2 OR x3 ... ) we try to give all
+	//   those query terms the same m_opNum for efficiency since
+	//   they all have the same effect
+	//long  m_opNum;
+	
 	// same as above basically
 	class QueryTerm *m_leftPhraseTerm;
 	class QueryTerm *m_rightPhraseTerm;
@ -501,6 +493,41 @@ class QueryTerm {

 };

+//#define MAX_OPSLOTS 256
+
+#define MAX_EXPRESSIONS 10
+
+// operand1 AND operand2 OR  ...
+// operand1 OR  operand2 AND ...
+class Expression {
+public:
+	bool add (long start, 
+		  long end, 
+		  class Query      *q,
+		  long    level );
+	bool isTruth ( unsigned char *bitVec , long vecSize );
+	// . what QueryTerms are UNDER the influence of the NOT opcode?
+	// . we read in the WHOLE termlist of those that are (like '-' sign)
+	// . returned bit vector is 1-1 with m_qterms in Query class
+	void print (SafeBuf *sbuf);
+	// . a list of operands separated by op codes (a AND b OR c ...)
+	// . sometimes an operand is another expression: a AND (b OR c)
+	// . use NULL in m_operands slot if we got an expression and vice versa
+	// . m_opcodes[i] is the opcode after operand #i
+	//class Expression *m_parent;
+	//bool              m_hasNOT;
+	//long              m_start;
+	//long              m_end;
+	long m_expressionStartWord;
+	long m_numWordsInExpression;
+	Query *m_q;
+	// . opSlots can be operands operators or expressions
+	// . m_opTypes tells which of the 3 they are
+	//long m_opSlots[MAX_OPSLOTS];
+	//char m_opTypes[MAX_OPSLOTS];
+	//long m_cc;
+};
+
 // . this is the main class for representing a query
 // . it contains array of QueryWords (m_qwords[]) and QueryTerms (m_qterms[])
 class Query {
@ -589,11 +616,17 @@ class Query {

 	// sets m_bmap[][] so getImplicits() works
 	void setBitMap ( );
-	bool testBoolean(qvec_t bits, qvec_t bitmask=(qvec_t)-1);
+	bool testBoolean(unsigned char *bits,long vecSize);
 	// print to log
 	void printBooleanTree();
 	void printQueryTerms();

+	// the new way as of 3/12/2014. just determine if matches the bool
+	// query or not. let's try to offload the scoring logic to other places
+	// if possible.
+	// bitVec is all the QueryWord::m_opBits some docid contains, so
+	// does it match our boolean query or not?
+	bool matchesBoolQuery ( unsigned char *bitVec , long vecSize ) ;


 	// . call this before calling getBitScore() to set m_bitScores[] table
@ -613,6 +646,7 @@ class Query {
 	//   through the phrase
 	// . the greater the number of IMplicit SINGLE words a doc has the 
 	//   bigger its bit score
+	/*
 	uint8_t getBitScore ( qvec_t ebits ) {
 		// get implicit bits from explicit bits
 		qvec_t ibits = getImplicits ( ebits );
@ -661,6 +695,7 @@ class Query {
 		if (ibits                     == m_requiredBits ) bscore|=0x20;
 		return bscore;
 	};
+	*/

 	// return an implicit vector from an explicit which contains the explic
 	qvec_t getImplicits ( qvec_t ebits ) {
@ -716,7 +751,7 @@ class Query {
 	bool        isConnection ( char *s , long len ) ;

 	// set the QueryTerm::m_hasNOT members
-	void setHasNOTs();
+	//void setHasNOTs();

 	// . used by IndexTable.cpp to make a ptr map of the query terms
 	//   to make intersecting the termlists one at a time efficient
@ -874,11 +909,12 @@ class Query {
 	
 	// . we now contain the parsing components for boolean queries
 	// . m_expressions points into m_gbuf or is allocated
-	class Expression *m_expressions; // [ MAX_OPERANDS ];
-	long              m_expressionsAllocSize;
+	//class Expression *m_expressions; // [ MAX_OPERANDS ];
+	//long              m_expressionsAllocSize;
+	Expression        m_expressions[MAX_EXPRESSIONS];
 	long              m_numExpressions;
-	class Operand     m_operands    [ MAX_OPERANDS ];
-	long              m_numOperands ;
+	//class Operand     m_operands    [ MAX_OPERANDS ];
+	//long              m_numOperands ;

 	// does query contain the pipe operator
 	bool m_piped;
--- a/Spider.cpp
+++ b/Spider.cpp
@ -9920,7 +9920,8 @@ long getUrlFilterNum2 ( SpiderRequest *sreq       ,

 	char *row;
 	bool checkedRow = false;
-	SpiderColl *sc = cr->m_spiderColl;
+	//SpiderColl *sc = cr->m_spiderColl;
+	SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);

 	//if ( strstr(url,"http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2" ))
 	//	log("hey");
--- a/Title.cpp
+++ b/Title.cpp
@ -71,6 +71,7 @@ void Title::reset() {
 		mfree ( m_title , m_titleAllocSize , "Title" );
 	m_title = NULL;
 	m_titleBytes = 0;
+	m_titleAllocSize = 0;
 	m_query = NULL;
 	m_titleTagStart = -1;
 	m_titleTagEnd   = -1;
@ -113,7 +114,7 @@ bool Title::setTitle ( XmlDoc   *xd            ,
 	char *val = NULL;
 	// look for the "title:" field in json then use that
 	SafeBuf jsonTitle;
-	long vlen;
+	long vlen = 0;
 	if ( xd->m_contentType == CT_JSON ) {
 		char *jt;
 		jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
@ -124,7 +125,6 @@ bool Title::setTitle ( XmlDoc   *xd            ,
 			val = jsonTitle.getBufStart();
 			vlen = jsonTitle.length();
 		}
-		
 	}
 	// if we had a title: field in the json...
 	if ( val && vlen > 0 ) {
@ -135,6 +135,7 @@ bool Title::setTitle ( XmlDoc   *xd            ,
 		else {
 			dst = (char *)mmalloc ( m_titleBytes+1,"titdst" );
 			if ( ! dst ) return false;
+			m_titleAllocSize = m_titleBytes+1;
 		}
 		m_title = dst;
 		memcpy ( dst , val , m_titleBytes );
@ -142,6 +143,13 @@ bool Title::setTitle ( XmlDoc   *xd            ,
 		return true;
 	}

+	// json content that has no explicit title field simply has no title
+	if ( xd->m_contentType == CT_JSON ) {
+		m_localBuf[0] = '\0';
+		m_title = m_localBuf;
+		m_titleBytes = 0;
+		return true;
+	}

 	bool status = setTitle4 ( xd ,
 				  xml ,
--- a/TopTree.h
+++ b/TopTree.h
@ -9,7 +9,7 @@
 #define _TOPTREE_H_

 #include "Clusterdb.h"   // SAMPLE_VECTOR_SIZE, 48 bytes for now
-#include "IndexTable2.h" // score_t definition
+//#include "IndexTable2.h" // score_t definition
 #include "RdbTree.h"

 class TopNode {
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -14474,6 +14474,7 @@ char **XmlDoc::getHttpReply2 ( ) {
 	// turn off
 	r->m_useCompressionProxy = false;
 	r->m_compressReply       = false;
+	r->m_isCustomCrawl       = cr->m_isCustomCrawl;

 	// set it for this too
 	if ( g_conf.m_useCompressionProxy &&
@ -17199,12 +17200,16 @@ long *XmlDoc::getContentHashJson32 ( ) {
 	JsonItem *ji = jp->getFirstItem();
 	long totalHash32 = 0;

+	//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
+
 	for ( ; ji ; ji = ji->m_next ) {
 		QUICKPOLL(m_niceness);
 		// skip if not number or string
 		if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
 			continue;

+		char *topName = NULL;
+
 		// what name level are we?
 		long numNames = 1;
 		JsonItem *pi = ji->m_parent;
@ -17212,6 +17217,7 @@ long *XmlDoc::getContentHashJson32 ( ) {
 			// empty name?
 			if ( ! pi->m_name ) continue;
 			if ( ! pi->m_name[0] ) continue;
+			topName = pi->m_name;
 			numNames++;
 		}

@ -17230,6 +17236,22 @@ long *XmlDoc::getContentHashJson32 ( ) {
 		     strcmp(ji->m_name,"resolved_url") == 0 )
 			continue;

+		if ( topName && strcmp(topName,"stats") == 0 )
+			continue;
+
+		if ( topName && strcmp(topName,"queryString") == 0 )
+			continue;
+
+		if ( topName && strcmp(topName,"nextPages") == 0 )
+			continue;
+
+		if ( topName && strcmp(topName,"textAnalysis") == 0 )
+			continue;
+
+		if ( topName && strcmp(topName,"links") == 0 )
+			continue;
+
+
 		// hash the fully compound name
 		long nameHash32 = 0;
 		JsonItem *p = ji;
@ -17275,6 +17297,11 @@ long *XmlDoc::getContentHashJson32 ( ) {
 		long combined32 = hash32h ( nameHash32 , vh32 );
 		// accumulate field/val pairs order independently
 		totalHash32 ^= combined32;
+		// debug note
+		//logf(LOG_DEBUG,"ch32: field=%s nh32=%lu vallen=%li",
+		//     ji->m_name,
+		//     nameHash32,
+		//     vlen);
 	}

 	m_contentHash32 = totalHash32;
@ -29753,7 +29780,10 @@ bool XmlDoc::hashWords3 ( //long        wordStart ,
 	long plen = 0;
 	if ( hi->m_prefix ) plen = gbstrlen ( hi->m_prefix );
 	if ( hi->m_prefix && plen ) {
-		prefixHash = hash64 ( hi->m_prefix , plen );
+		// we gotta make this case insensitive, and skip spaces
+		// because if it is 'focal length' we can't search
+		// 'focal length:10' because that comes across as TWO terms.
+		prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
 		// . sanity test, make sure it is in supported list
 		// . hashing diffbot json output of course fails this so
 		//   skip in that case if diffbot
@ -30287,6 +30317,9 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
 	// . this now allows for commas in numbers like "1,500.62"
 	float f = atof2 ( p , bufEnd - p );

+	// debug
+	//log("build: hashing %s %f",hi->m_prefix,f);
+
 	if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
 		return false;

@ -30324,7 +30357,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
 	long nameLen = 0;
 	if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
 	if ( hi->m_prefix && nameLen ) 
-		nameHash = hash64Lower_utf8 ( hi->m_prefix , nameLen );
+		nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
 	// need a prefix for hashing numbers... for now
 	else { char *xx=NULL; *xx=0; }
 		
@ -30429,7 +30462,7 @@ bool XmlDoc::hashNumber3 ( long n , HashInfo *hi , char *sortByStr ) {
 	long nameLen = 0;
 	if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
 	if ( hi->m_prefix && nameLen ) 
-		nameHash = hash64Lower_utf8 ( hi->m_prefix , nameLen );
+		nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
 	// need a prefix for hashing numbers... for now
 	else { char *xx=NULL; *xx=0; }
 		
--- a/coll.main.0/coll.conf
+++ b/coll.main.0/coll.conf
@ -1,8 +1,8 @@
 # List of sites to spider, one per line. Gigablast uses the <a
-# href=/admin/scheduler#insitelist>insitelist</a> directive on the <a
-# href=/admin/scheduler>spider scheduler</a> page to make sure that the spider
-# only indexes urls that match the site patterns you specify here, other than
-# urls you add individually via the add urls or inject url tools. See <a
+# href=/admin/filters#insitelist>insitelist</a> directive on the <a
+# href=/admin/filters>url filters</a> page to make sure that the spider only
+# indexes urls that match the site patterns you specify here, other than urls
+# you add individually via the add urls or inject url tools. See <a
 # href=#examples>example site list</a> below. Limit list to 300MB. If you have
 # a lot of INDIVIDUAL URLS to add then consider using the <a
 # href=/admin/addurl>addurl</a> interface.
@ -12,7 +12,7 @@
 # must be represented as &lt;, &gt;, &#34; and &#035; respectively.

 # Controls just the spiders for this collection.
-<spideringEnabled>0</>
+<spideringEnabled>1</>

 # What is the maximum number of web pages the spider is allowed to download
 # simultaneously PER HOST for THIS collection?
--- a/gb.conf
+++ b/gb.conf
@ -51,7 +51,7 @@
 <readOnlyMode>0</>

 # Controls all spidering for all collections
-<spideringEnabled>0</>
+<spideringEnabled>1</>

 # What is the maximum number of web pages the spider is allowed to download
 # simultaneously for ALL collections PER HOST?
@ -144,7 +144,13 @@
 # Sends to email address 1 through email server 1 if any parm is changed.
 <sendParmChangeEmailAlertsToEmail1>0</>

-# Connects to this server directly when sending email 1 
+# Connects to this IP or hostname directly when sending email 1. Use
+# <i>apt-get install sendmail</i> to install sendmail on that IP or hostname.
+# Add <i>From:10.5 RELAY</i> to /etc/mail/access to allow sendmail to forward
+# email it receives from gigablast if gigablast hosts are on the 10.5.*.* IPs.
+# Then run <i>/etc/init.d/sendmail restart</i> as root to pick up those
+# changes so sendmail will forward Gigablast's mail to the address you give
+# below.
 <emailServer1><![CDATA[10.5.54.47]]></>

 # Sends to this address when sending email 1 
--- a/html/admin.html
+++ b/html/admin.html
@ -138,6 +138,7 @@ rather your current working directory, where the 'gb' binary resides.
 <li> Indexes JSON and XML natively. Provides ability to search individual structured fields.
 <li> Sorting. Sort the search results by meta tags or JSON fields that contain numbers, simply by adding something like gbsortby:price or gbrevsortby:price as a query term, assuming you have meta price tags.
 <li>Easy Scaling. Add new servers to the hosts.conf file then click 'rebalance shards' to automatically rebalance the sharded data.
+<li>Using &stream=1 can stream back millions of search results for a query without running out of memory.
 </ul>

 <br>
--- a/main.cpp
+++ b/main.cpp
@ -411,8 +411,8 @@ int main ( int argc , char *argv[] ) {

 			"-h\tprint this help.\n\n"
 			"-v\tprint version and exit.\n\n"
-			"-o\tprint the overview documentation in HTML. "
-			"Contains the format of hosts.conf.\n\n"
+			//"-o\tprint the overview documentation in HTML. "
+			//"Contains the format of hosts.conf.\n\n"
 			"-r\tindicates recovery mode, "
 			"sends email to addresses "
 			"specified in Conf.h upon startup.\n\n"
@ -440,6 +440,7 @@ int main ( int argc , char *argv[] ) {
 			"\ttwo hostids with a hyphen in between indicates a "
 			"range.\n\n"

+			/*
 			"tmpstart [hostId]\n"
 			"\tstart the gb process on all hosts or just on "
 			"[hostId] if specified, but "
@ -456,6 +457,7 @@ int main ( int argc , char *argv[] ) {
 			"\tsaves and exits for all gb hosts or "
 			"just on [hostId] if specified, for the "
 			"tmpstart command.\n\n"
+			*/

 			"spidersoff [hostId]\n"
 			"\tdisables spidering for all gb hosts or "
@ -465,6 +467,7 @@ int main ( int argc , char *argv[] ) {
 			"\tensables spidering for all gb hosts or "
 			"just on [hostId] if specified.\n\n"

+			/*
 			"cacheoff [hostId]\n"
 			"\tdisables all disk PAGE caches on all hosts or "
 			"just on [hostId] if specified.\n\n"
@ -472,11 +475,17 @@ int main ( int argc , char *argv[] ) {
 			"freecache [maxShmid]\n"
 			"\tfinds and frees all shared memory up to shmid "
 			"maxShmid, default is 3000000.\n\n"
+			*/

+			/*
 			"ddump [hostId]\n"
-			"\tdisk dump in memory trees to binary files "
-			"just on [hostId] if specified.\n\n"
+			"\tdump all b-trees in memory to sorted files on "
+			"disk. "
+			"Will likely trigger merges on files on disk. "
+			"Restrict to just host [hostId] if given.\n\n"
+			*/

+			/*
 			"pmerge [hostId|hostId1-hostId2]\n"
 			"\tforce merge of posdb files "
 			"just on [hostId] if specified.\n\n"
@ -492,16 +501,19 @@ int main ( int argc , char *argv[] ) {
 			"merge [hostId|hostId1-hostId2]\n"
 			"\tforce merge of all rdb files "
 			"just on [hostId] if specified.\n\n"
+			*/

 			"dsh <CMD>\n"
 			"\trun this command on the primary IPs of "
 			"all active hosts in hosts.conf. Example: "
 			"gb dsh 'ps auxw; uptime'\n\n"

+			/*
 			"dsh2 <CMD>\n"
 			"\trun this command on the secondary IPs of "
 			"all active hosts in hosts.conf. Example: "
 			"gb dsh2 'ps auxw; uptime'\n\n"
+			*/

 			"install [hostId]\n"
 			"\tinstall all required files for gb from "
@ -509,13 +521,16 @@ int main ( int argc , char *argv[] ) {
 			"to [hostId]. If no [hostId] is specified install "
 			"to ALL hosts.\n\n"

+			/*
 			"install2 [hostId]\n"
 			"\tlike above, but use the secondary IPs in the "
 			"hosts.conf.\n\n"
+			*/

 			"installgb [hostId]\n"
 			"\tlike above, but install just the gb executable.\n\n"

+			/*
 			"installgb2 [hostId]\n"
 			"\tlike above, but use the secondary IPs in the "
 			"hosts.conf.\n\n"
@ -592,7 +607,9 @@ int main ( int argc , char *argv[] ) {
 			"search for them on server2. If you do not want to"
 			" use the proxy server "
 			"on gk10, use -p\n\n"
+			*/

+			/*
 			"blaster [-l|-u|-i] <file> <maxNumThreads> <wait>\n"
 			"\tget documents from the urls given in file. The "
 			"-l argument is to "
@ -606,7 +623,9 @@ int main ( int argc , char *argv[] ) {
 			"\tmaxNumThreads is the"
 			" number of concurrent threads at one time and wait "
 			" is the time to wait between threads.\n\n"
+			*/

+			/*
 			"scale <newHosts.conf>\n"
 			"\tGenerate a script to be called to migrate the "
 			"data to the new places. Remaining hosts will "
@ -647,7 +666,9 @@ int main ( int argc , char *argv[] ) {
 			"ping <hostId> [clientport]\n"
 			"\tperforms pings to <hostId>. [clientport] defaults "
 			"to 2050.\n\n"
+			*/

+			/*
 			"spellcheck <file>\n"
 			"\tspellchecks the the queries in <file>.\n\n"

@ -701,7 +722,9 @@ int main ( int argc , char *argv[] ) {

 			"parsetest <docIdToTest> [coll] [query]\n\t"
 			"parser speed tests\n\n"
+			*/

+			/*
 			"thrutest [dir] [fileSize]\n\tdisk write/read speed "
 			"test\n\n"

@ -711,6 +734,9 @@ int main ( int argc , char *argv[] ) {
 			
 			"memtest\n"
 			"\t Test how much memory we can use\n\n"
+			*/
+
+			/*
 			// Quality Tests
 			"countdomains <coll> <X>\n"
 			"\tCounts the domains and IPs in collection coll and "
@ -738,33 +764,38 @@ int main ( int argc , char *argv[] ) {

 			"dump es <coll> <UTCtimestamp>\n\tdump stats for "
 			"all events as if the time is UTCtimestamp.\n\n"
+			*/

+			/*
 #ifdef _CLIENT_
 			//there was <hostId> in this command but it 
 			// wasn't used in the program, so deleting it from 
 			// here
 			"dump <V> [C [X [Y [Z]]]]\n\tdump a db in "
 #else
+			*/
+
+			/*
 			"dump <V> [C [X [Y [Z [T]]]]]\n\tdump a db in "
-#endif
+			//#endif
 			"working directory.\n"
-#ifndef _CLIENT_
-#ifndef _METALINCS_
+			//#ifndef _CLIENT_
+			//#ifndef _METALINCS_
 			//"\tV is u to dump tfndb.\n"
 			"\tV is d to dump datedb.\n"
-#endif
-#endif
+			//#endif
+			//#endif
 			"\tV is s to dump spiderdb. set [T] to 1 to print "
 			"new stats. 2 to print old stats. T is ip of firstip."
 			"\n"
 			"\tV is t to dump titledb.\n"
-			"\tV is ts to dump sentences from events.\n"
-			"\tV is tw to dump words from events.\n"
+			//"\tV is ts to dump sentences from events.\n"
+			//"\tV is tw to dump words from events.\n"
 			"\tV is D to dump duplicate docids in titledb.\n"
 			"\tV is c to dump checksumdb.\n"
 			"\tV is S to dump tagdb.\n"
 			"\tV is W to dump tagdb for wget.\n"
-			"\tV is V to dump revdb.\n"
+			//"\tV is V to dump revdb.\n"
 			"\tV is x to dump doledb.\n"
 			"\tV is w to dump waiting tree.\n"
 			"\tV is B to dump sectiondb.\n"
@ -779,13 +810,13 @@ int main ( int argc , char *argv[] ) {
 			"\tX is start file num.    (default  0)\n"
 			"\tY is num files.         (default -1)\n"
 			"\tZ is 1 to include tree. (default  1)\n"
-#ifndef _CLIENT_
-#ifndef _METALINCS_
-#ifndef _GLOBALSPEC_
+			//#ifndef _CLIENT_
+			//#ifndef _METALINCS_
+			//#ifndef _GLOBALSPEC_
 			"\tT is the termid to dump. Applies only to indexdb.\n"
-#endif
-#endif
-#endif
+			//#endif
+			//#endif
+			//#endif
 			"\tT is the first docId to dump. Applies only to "
 			"titledb. "
 			//"(default none)\n\n"
@ -806,22 +837,27 @@ int main ( int argc , char *argv[] ) {
 			//"\tB is -1 to dump all priorities\n"
 			"\tC is 1 to just show the stats.  (default  0)\n"
 			"\n"
+			*/
+
+
 			//"dump i X Y Z t\n\tdump indexdb termId t in working "
 			//"directory.\n"
 			//"\tX is start file num.     (default  0)\n"
 			//"\tY is num files.          (default -1)\n"
 			//"\tZ is 1 to include tree.  (default  1)\n"
 			//"\tt is the termid to dump. (default none)\n\n"
-#ifndef _CLIENT_
-#ifndef _METALINCS_
+			//#ifndef _CLIENT_
+			//#ifndef _METALINCS_
+			/*
 			"dump I [X [V]]\n\tdump indexdb in working "
 			"directory at "
 			"an offset.\n"
-#endif
-#endif
+			//#endif
+			//#endif
 			"\tX is the file NAME.      (default  NULL)\n"
 			"\tV is the start offset.   (default  0)\n"
-
+			*/
+			/*
 			"\n"
 			"dumpmissing <coll> [hostId]\n\t"
 			"dump the docIds in indexdb but not "
@ -867,6 +903,7 @@ int main ( int argc , char *argv[] ) {
 			"in the current gb. Use synchost2 to use secondary "
 			"IPs.\n"
 			"\n"
+			*/
 			//#endif
 			);
 		SafeBuf sb2;
@ -894,6 +931,7 @@ int main ( int argc , char *argv[] ) {
 	if ( strcmp ( cmd , "-h" ) == 0 ) goto printHelp;
 	// version
 	if ( strcmp ( cmd , "-v" ) == 0 ) {
+		fprintf(stdout,"Gigablast March-2014\n");
 	//	fprintf(stderr,"Gigablast %s\nMD5KEY: %s\n"
 	//		"TAG: %s\nPATH:   %s\n",
 	//		GBVersion, GBCommitID, GBTag, GBBuildPath); 
@ -901,10 +939,10 @@ int main ( int argc , char *argv[] ) {
 	}

 	// print overview
-	if ( strcmp ( cmd , "-o" ) == 0 ) {
-		//printOverview ( );
-		return 0;
-	}
+	//if ( strcmp ( cmd , "-o" ) == 0 ) {
+	//	//printOverview ( );
+	//	return 0;
+	//}

 	bool hadHostId = false;