Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

2024-10-04 20:27:43 +03:00 · 2013-11-13 13:27:45 -08:00 · 2013-11-13 13:27:45 -08:00 · 9e77f1b2f6
commit 9e77f1b2f6
parent a31b13ad61 6cc4e6d980
18 changed files with 601 additions and 44 deletions
--- a/Json.cpp
+++ b/Json.cpp
@ -330,4 +330,60 @@ void Json::test ( ) {

 	return;
 }
-	
+
+// . build this item's dotted compound name, like "meta.twitter.title",
+//   into nameBuf by walking up the m_parent chain
+// . items with a NULL or empty m_name are skipped, and so is an item whose
+//   m_name pointer equals the one just collected (happens with arrays)
+// . every ':' in the result is turned into '.' because ':' is reserved
+// . returns false if a SafeBuf append or null-terminate call fails
+bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
+
+	// start over with an empty buffer
+	nameBuf.reset();
+
+	// collect the names, innermost first
+	char *names[20];
+	long  count    = 0;
+	char *prevName = NULL;
+	for ( JsonItem *cur = this ; cur ; cur = cur->m_parent ) {
+		char *name = cur->m_name;
+		// unnamed items contribute nothing
+		if ( ! name || ! name[0] ) continue;
+		// same name pointer as the item below it? e.g. an object
+		// in "dupname":[{"a":"b"},{"c":"d"}] shares the array's name
+		if ( name == prevName ) continue;
+		prevName = name;
+		names[count++] = name;
+		// stop well before the array fills up
+		if ( count >= 15 ) {
+			log("build: too many names in json tag");
+			break;
+		}
+	}
+
+	// emit outermost name first, each one followed by a period
+	for ( long k = count - 1 ; k >= 0 ; k-- ) {
+		if ( ! nameBuf.safeStrcpy ( names[k] ) ) return false;
+		if ( ! nameBuf.pushChar('.') ) return false;
+	}
+	// drop the trailing period
+	nameBuf.removeLastChar('.');
+	// and null terminate
+	if ( ! nameBuf.nullTerm() ) return false;
+
+	// ':' is reserved so map it to '.'
+	for ( char *c = nameBuf.getBufStart() ; *c ; c++ )
+		if ( *c == ':' ) *c = '.';
+
+	return true;
+}
+
+// . is this json item in an array of json items?
+// . returns true if this item, or any item up its m_parent chain, has
+//   m_type JT_ARRAY
+bool JsonItem::isInArray ( ) {
+	JsonItem *cur = this;
+	while ( cur ) {
+		if ( cur->m_type == JT_ARRAY ) return true;
+		cur = cur->m_parent;
+	}
+	return false;
+}
--- a/Json.h
+++ b/Json.h
@ -51,6 +51,10 @@ class JsonItem {
 		return (char *)this + sizeof(JsonItem);
 	};

+	// like acme.product.offerPrice if "acme:{product:{offerprice:1.23}}"
+	bool getCompoundName ( SafeBuf &nameBuf ) ;
+
+	bool isInArray ( );
 };


--- a/PageCrawlBot.cpp
+++ b/PageCrawlBot.cpp
@ -738,6 +738,7 @@ public:
 				 char **lastKeyPtr ) ;
 	void printTitledbList ( RdbList *list , SafeBuf *sb ,
 				char **lastKeyPtr );
+	bool printJsonItemInCsv ( char *json , SafeBuf *sb ) ;

 	char m_fmt;
 	Msg4 m_msg4;
@ -751,6 +752,8 @@ public:
 	bool m_printedEndingBracket;
 	bool m_printedItem;

+	bool m_needHeaderRow;
+
 	bool m_needsMime;
 	char m_rdbId;
 	bool m_downloadJSON;
@ -810,10 +813,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
 		fmt = FMT_JSON;
 		downloadJSON = true;
 	}
-	else if ( ( xx = strstr ( path , "_data.xml" ) ) ) {
+	else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
 		rdbId = RDB_TITLEDB;
 		downloadJSON = true;
-		fmt = FMT_XML;
+		fmt = FMT_CSV;
 	}
 	else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
 		rdbId = RDB_SPIDERDB;
@ -886,6 +889,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
 	st->m_printedItem = false;
 	st->m_printedEndingBracket = false;

+	// for csv...
+	st->m_needHeaderRow = true;
+
 	// debug
 	//log("mnew1: st=%lx",(long)st);

@ -1027,6 +1033,8 @@ bool StateCD::sendList ( ) {
 		ct = "text/xml";
 	if ( m_fmt == FMT_TXT )
 		ct = "text/plain";
+	if ( m_fmt == FMT_CSV )
+		ct = "text/csv";

 	// . if we haven't yet sent an http mime back to the user
 	//   then do so here, the content-length will not be in there
@ -1557,22 +1565,34 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
 		// get the json content
 		char *json = xd.ptr_utf8Content;
 		
-		if ( m_printedItem )
-			sb->safePrintf("\n,\n");
+		// empty?
+		if ( xd.size_utf8Content <= 1 )
+			continue;

-		m_printedItem = true;
+		// if not json, just print the json item out in csv
+		// moved into PageResults.cpp...
+		//if ( m_fmt == FMT_CSV ) {
+		//	printJsonItemInCsv ( json , sb );
+		//	continue;
+		//}

 		// just print that out. encode \n's and \r's back to \\n \\r
 		// and backslash to a \\ ...
 		// but if they originally had a \u<backslash> encoding and
 		// we made into utf8, do not put that back into the \u
 		// encoding because it is not necessary.
+
+		// print in json
+		if ( m_printedItem )
+			sb->safePrintf("\n,\n");
+
+		m_printedItem = true;
+
 		if ( ! sb->safeStrcpyPrettyJSON ( json ) ) 
 			log("diffbot: error printing json in dump");

 		// separate each JSON object with \n i guess
 		//sb->pushChar('\n');
-
 	}
 }

@ -3498,20 +3518,77 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 			      "</td><td>"
 			      "<a href=/crawlbot/download/%s_data.json>"
 			      "json</a>"
-			      "&nbsp; "
-			      "<a href=/crawlbot/download/%s_data.xml>"
+			      "</td>"
+			      "</tr>"
+
+
+			      "<tr>"
+			      "<td><b>Download Products:</b> "
+			      "</td><td>"
+			      // make it search.csv so excel opens it
+			      "<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
+			      "c=%s&n=10000000&rand=%llu&id=1&"
+			      "q=gbrevsortby%%3AofferPrice&"
+			      "prepend=type%%3Ajson"
+			      //"+type%%3Aproduct%%7C"
+			      ">"
+			      "csv</a>"
+			      " &nbsp; "
+			      "<a href=/search?icc=1&format=html&sc=0&dr=0&"
+			      "c=%s&n=10000000&rand=%llu&id=1&"
+			      "q=gbrevsortby%%3AofferPrice&"
+			      "prepend=type%%3Ajson"
+			      ">"
+			      "html</a>"

-			      "xml</a>"
 			      "</td>"
 			      "</tr>"

 			      "<tr>"
 			      "<td><b>Download Urls:</b> "
 			      "</td><td>"
-
 			      "<a href=/crawlbot/download/%s_urls.csv>"
 			      "csv</a>"
-			      //
+			      "</td>"
+			      "</tr>"
+
+
+			      "<tr>"
+			      "<td><b>Latest Objects:</b> "
+			      "</td><td>"
+			      "<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
+			      "c=%s&n=10&rand=%llu&id=1&"
+			      "q=gbsortby%%3Agbspiderdate&"
+			      "prepend=type%%3Ajson"
+			      ">"
+			      "csv</a>"
+			      " &nbsp; "
+			      "<a href=/search?icc=1&format=html&sc=0&dr=0&"
+			      "c=%s&n=10&rand=%llu&id=1&"
+			      "q=gbsortby%%3Agbspiderdate&"
+			      "prepend=type%%3Ajson"
+			      ">"
+			      "html</a>"
+			      "</td>"
+			      "</tr>"
+
+			      "<tr>"
+			      "<td><b>Latest Products:</b> "
+			      "</td><td>"
+			      "<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
+			      "c=%s&n=10&rand=%llu&id=1&"
+			      "q=gbsortby%%3Agbspiderdate&"
+			      "prepend=type%%3Ajson+type%%3Aproduct"
+			      ">"
+			      "csv</a>"
+			      " &nbsp; "
+			      "<a href=/search?icc=1&format=html&sc=0&dr=0&"
+			      "c=%s&n=10&rand=%llu&id=1&"
+			      "q=gbsortby%%3Agbspiderdate&"
+			      "prepend=type%%3Ajson+type%%3Aproduct"
+			      ">"
+			      "html</a>"
+
 			      "</td>"
 			      "</tr>"

@ -3648,11 +3725,38 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,


 			      , cr->m_coll
+
 			      , cr->m_coll
+			      , rand64
+
+			      // download products html
+			      , cr->m_coll
+			      , rand64
+
+			      //, cr->m_coll
 			      //, cr->m_coll
 			      //, cr->m_coll

 			      , cr->m_coll
+
+			      // latest objects in csv
+			      , cr->m_coll
+			      , rand64
+
+			      // latest objects in html
+			      , cr->m_coll
+			      , rand64
+
+
+			      // latest products in csv
+			      , cr->m_coll
+			      , rand64
+
+			      // latest products in html
+			      , cr->m_coll
+			      , rand64
+
+
 			      , cr->m_coll

 			      , cr->m_collectiveRespiderFrequency
@ -3878,6 +3982,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 	char *ct = "text/html";
 	if ( fmt == FMT_JSON ) ct = "application/json";
 	if ( fmt == FMT_XML ) ct = "text/xml";
+	if ( fmt == FMT_CSV ) ct = "text/csv";

 	// this could be in html json or xml
 	return g_httpServer.sendDynamicPage ( socket, 
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -52,6 +52,7 @@ public:
        long         m_numDocIds;
 	long long    m_took; // how long it took to get the results
 	HttpRequest  m_hr;
+	bool         m_printedHeaderRow;
 };

 static int printResult ( SafeBuf &sb,
@ -60,6 +61,8 @@ static int printResult ( SafeBuf &sb,
 			 CollectionRec *cr ,
 			 char *qe ) ;

+bool printJsonItemInCsv ( char *json , SafeBuf *sb , bool *printedHeaderRow ) ;
+
 bool printPairScore ( SafeBuf &sb , SearchInput *si , PairScore *ps ,
 		      Msg20Reply *mr , Msg40 *msg40 , bool first ) ;

@ -80,6 +83,7 @@ bool sendReply ( State0 *st , char *reply ) {
 	char *ct = "text/html";
 	if ( si && si->m_format == FORMAT_XML ) ct = "text/xml"; 
 	if ( si && si->m_format == FORMAT_JSON ) ct = "application/json";
+	if ( si && si->m_format == FORMAT_CSV ) ct = "text/csv";
 	char *charset = "utf-8";

 	// . filter anything < 0x20 to 0x20 to keep XML legal
@ -466,7 +470,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {

 	// limit here
 	long maxpp = cr->m_maxSearchResultsPerQuery ;
-	if ( si->m_docsWanted > maxpp ) si->m_docsWanted = maxpp;
+	if ( si->m_docsWanted > maxpp &&
+	     // disable serp max per page for custom crawls
+	     ! cr->m_isCustomCrawl )
+		si->m_docsWanted = maxpp;

        st->m_numDocIds = si->m_docsWanted;

@ -492,6 +499,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
 	st->m_gotAds     = false;
 	st->m_gotSpell   = false;

+	// reset
+	st->m_printedHeaderRow = false;
+
 	long ip = s->m_ip;
 	long uipLen;
 	char *uip = hr->getString("uip", &uipLen, NULL);
@ -1720,6 +1730,19 @@ static int printResult ( SafeBuf &sb,
 	Msg20      *m20 = msg40->m_msg20[ix];
 	Msg20Reply *mr  = m20->m_r;

+
+	if ( si->m_format == FORMAT_CSV &&
+	     mr->ptr_content &&
+	     mr->m_contentType == CT_JSON ) {
+		// parse it up
+		char *json = mr->ptr_content;
+		// only print header row once, so pass in that flag
+		printJsonItemInCsv ( json , &sb , &st->m_printedHeaderRow );
+		return true;
+	}
+
+
+
 	// just print cached web page?
 	if ( mr->ptr_content ) {
 		sb.safeStrcpy ( mr->ptr_content );
@ -4680,3 +4703,108 @@ bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) {
 	return true;
 }
 */
+
+#include "Json.h"
+
+// . print the flat fields of the json document "json" into "sb" as one
+//   csv row
+// . only JT_NUMBER and JT_STRING items that are not inside an array become
+//   columns. csv is not well suited for arrays, especially arrays whose
+//   elements are unflat json objects.
+// . if *printedHeaderRow is false then a header row of compound field
+//   names (like "product.offerprice") is printed first and the flag is set
+//   to true, so passing the same flag for every document prints the header
+//   only once
+// . header names are wrapped in double quotes, just like string values, so
+//   that a name containing a comma or a double quote can not break the row
+// . always returns true
+bool printJsonItemInCsv ( char *json , SafeBuf *sb , bool *printedHeaderRow ) {
+
+	// parse the json
+	Json jp;
+	jp.parseJsonStringIntoJsonItems ( json );
+
+	// . TODO: index individual "Products":[...] as each an
+	//   individual title rec.
+
+	SafeBuf nameBuf;
+	bool firstOne = true;
+
+	JsonItem *ji;
+
+	////
+	//
+	// print header row in csv, but only once per result set
+	//
+	////
+	if ( ! *printedHeaderRow ) {
+
+		for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) {
+
+			// skip if not number or string
+			if ( ji->m_type != JT_NUMBER &&
+			     ji->m_type != JT_STRING )
+				continue;
+
+			// skip if in an array (see comment above function)
+			if ( ji->isInArray() ) continue;
+
+			if ( ! firstOne ) sb->pushChar(',');
+
+			firstOne = false;
+
+			// like product.offerprice
+			ji->getCompoundName ( nameBuf );
+
+			// csvEncode() only doubles the double quotes, so
+			// we must supply the enclosing quotes ourselves
+			sb->pushChar('\"');
+			sb->csvEncode ( nameBuf.getBufStart() ,
+					nameBuf.getLength() );
+			sb->pushChar('\"');
+		}
+
+		sb->pushChar('\n');
+		sb->nullTerm();
+		*printedHeaderRow = true;
+	}
+
+
+	firstOne = true;
+
+	///////
+	//
+	// print json item in csv
+	//
+	///////
+	for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) {
+
+		// skip if not number or string
+		if ( ji->m_type != JT_NUMBER &&
+		     ji->m_type != JT_STRING )
+			continue;
+
+		// skip if not well suited for csv (see comment above function)
+		if ( ji->isInArray() ) continue;
+
+		if ( ! firstOne ) sb->pushChar(',');
+
+		firstOne = false;
+
+		if ( ji->m_type == JT_NUMBER ) {
+			// print numbers without double quotes. use the
+			// integer form when the double and long values agree.
+			if ( ji->m_valueDouble *10000000.0 ==
+			     (double)ji->m_valueLong * 10000000.0 )
+				sb->safePrintf("%li",ji->m_valueLong);
+			else
+				sb->safePrintf("%f",ji->m_valueDouble);
+			continue;
+		}
+
+		// print the string value in double quotes
+		sb->pushChar('\"');
+		sb->csvEncode ( ji->getValue() , ji->getValueLen() );
+		sb->pushChar('\"');
+	}
+
+	// end the row only if we printed at least one field
+	if ( ! firstOne )
+		sb->pushChar('\n');
+
+	sb->nullTerm();
+
+	return true;
+}
+
--- a/Pages.cpp
+++ b/Pages.cpp
@ -344,6 +344,8 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
 		path = "admin/inject"; pathLen = gbstrlen(path); }
 	if ( pathLen == 9 && strncmp ( path , "index.php" , 9 ) == 0 ) {
 		path = "search"; pathLen = gbstrlen(path); }
+	if ( pathLen == 10 && strncmp ( path , "search.csv" , 10 ) == 0 ) {
+		path = "search"; pathLen = gbstrlen(path); }

 	// if it is like /GA/Atlanta then call sendPageResults
 	// and that should be smart enough to set the m_where in
--- a/Parms.cpp
+++ b/Parms.cpp
@ -14964,18 +14964,19 @@ void Parms::init ( ) {
 	m->m_sprpp = 0;
 	m++;

+	/*
 	m->m_title = "format of the returned search results";
 	m->m_desc  = "X is 0 to get back results in regular html, 1 to "
 		"get back results in XML, 2 for JSON.";
 	m->m_def   = "0";
-	m->m_soff  = (char *)&si.m_format - y;
-	m->m_type  = TYPE_CHAR;
+	m->m_soff  = (char *)&si.m_formatStr - y;
+	m->m_type  = TYPE_STRING;//CHAR;
 	m->m_sparm = 1;
 	m->m_scgi  = "format";
 	m->m_smin  = 0;
 	m->m_smax  = 12;
 	m++;
-
+	*/

 	m->m_title = "highlight query terms in summaries.";
 	m->m_desc  = "Use to disable or enable "
--- a/Posdb.cpp
+++ b/Posdb.cpp
@ -4115,7 +4115,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
 		qti->m_wikiPhraseId  = qw->m_wikiPhraseId;
 		qti->m_quotedStartId = qw->m_quoteStart;
 		// is it gbsortby:?
-		if ( qt->m_fieldCode == FIELD_GBSORTBY )
+		if ( qt->m_fieldCode == FIELD_GBSORTBY ||
+		     qt->m_fieldCode == FIELD_GBREVSORTBY )
 			m_sortByTermNum = i;
 		// count
 		long nn = 0;
@ -4237,6 +4238,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
 		// they have a float stored there for sorting etc.
 		if (qt->m_fieldCode == FIELD_GBSORTBY )
 			qti->m_bigramFlags[nn]|=BF_NUMBER;
+		if (qt->m_fieldCode == FIELD_GBREVSORTBY )
+			qti->m_bigramFlags[nn]|=BF_NUMBER;
 		if (qt->m_fieldCode == FIELD_GBNUMBERMIN )
 			qti->m_bigramFlags[nn]|=BF_NUMBER;
 		if (qt->m_fieldCode == FIELD_GBNUMBERMAX )
--- a/Posdb.h
+++ b/Posdb.h
@ -206,6 +206,15 @@ class Posdb {
 	float getFloat ( void *vkp ) {
 		return *(float *)(((char *)vkp) + 2); };

+	// set (val is non-zero) or clear (val is zero) the 0x02 bit in the
+	// second byte of the key at vkp
+	void setAlignmentBit ( void *vkp , char val ) {
+		char *keyBytes = (char *)vkp;
+		if ( val ) keyBytes[1] |= 0x02;
+		else       keyBytes[1] &= 0xfd;
+	};
+
+	// true if the 0x02 bit in the second byte of the key at vkp is off
+	bool isAlignmentBitClear ( void *vkp ) {
+		char *keyBytes = (char *)vkp;
+		return ! ( keyBytes[1] & 0x02 );
+	};

 	void makeStartKey ( void *kp, long long termId , 
 			    long long docId=0LL){
@ -436,7 +445,7 @@ class PosdbList : public RdbList {
 #include "Query.h"         // MAX_QUERY_TERMS, qvec_t

 // max # search results that can be viewed without using TopTree
-#define MAX_RESULTS 1000
+//#define MAX_RESULTS 1000

 class PosdbTable {

--- a/Query.cpp
+++ b/Query.cpp
@ -2200,6 +2200,7 @@ bool Query::setQWords ( char boolFlag ,
 		     fieldCode == FIELD_ISCLEAN ||
 		     fieldCode == FIELD_QUOTA ||
 		     fieldCode == FIELD_GBSORTBY ||
+		     fieldCode == FIELD_GBREVSORTBY ||
 		     fieldCode == FIELD_GBNUMBERMIN ||
 		     fieldCode == FIELD_GBNUMBERMAX ||
 		     fieldCode == FIELD_GBAD  ) {
@ -2217,6 +2218,7 @@ bool Query::setQWords ( char boolFlag ,
 			// i've decided not to make 
 			// gbsortby:products.offerPrice case sensitive
 			if ( fieldCode == FIELD_GBSORTBY ||
+			     fieldCode == FIELD_GBREVSORTBY ||
 			     fieldCode == FIELD_GBNUMBERMIN ||
 			     fieldCode == FIELD_GBNUMBERMAX )
 				wid = hash64Lower_utf8 ( w , wlen , 0LL );
@ -3044,6 +3046,7 @@ struct QueryField g_fields[] = {
 	{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
 	{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
 	{"gbsortby", FIELD_GBSORTBY, false,""},
+	{"gbrevsortby", FIELD_GBREVSORTBY, false,""},

 	{"gbnumbermin", FIELD_GBNUMBERMIN, false,""},
 	{"gbnumbermax", FIELD_GBNUMBERMAX, false,""},
--- a/Query.h
+++ b/Query.h
@ -105,8 +105,9 @@ typedef unsigned long long qvec_t;
 #define FIELD_GBDOCID          52
 #define FIELD_GBCONTENTHASH    53 // for deduping at spider time
 #define FIELD_GBSORTBY         54 // i.e. sortby:price -> numeric termlist
-#define FIELD_GBNUMBERMIN      55
-#define FIELD_GBNUMBERMAX      56
+#define FIELD_GBREVSORTBY      55 // i.e. sortby:price -> low to high
+#define FIELD_GBNUMBERMIN      56
+#define FIELD_GBNUMBERMAX      57

 #define FIELD_GBOTHER 92

--- a/SafeBuf.cpp
+++ b/SafeBuf.cpp
@ -3169,3 +3169,49 @@ bool SafeBuf::htmlDecode ( char *src,
 	// good to go
 	return true;
 }
+
+// overwrite every "src" byte in the first m_length bytes of m_buf with "dst"
+void SafeBuf::replaceChar ( char src , char dst ) {
+	for ( long i = 0 ; i < m_length ; i++ ) {
+		if ( m_buf[i] != src ) continue;
+		m_buf[i] = dst;
+	}
+}
+
+
+// . append "len" bytes of "s" to this buffer, writing each double quote
+//   char as two double quote chars
+// . does not write the enclosing double quotes
+// . a NULL "s" appends nothing and returns true
+// . returns false if reserve() fails
+bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
+
+	// nothing to encode
+	if ( ! s ) return true;
+
+	// worst case is that every char is a double quote, plus one more
+	if ( ! reserve ( len * 2 + 1 ) ) return false;
+
+	// write cursor starts at the current end of the buffer
+	char *out = m_buf + m_length;
+
+	for ( long i = 0 ; i < len ; i++ ) {
+		// breathe
+		QUICKPOLL ( niceness );
+		char c = s[i];
+		// a double quote gets written twice
+		if ( c == '\"' ) *out++ = '\"';
+		*out++ = c;
+	}
+
+	// account for what we wrote
+	m_length = out - m_buf;
+
+	nullTerm();
+
+	return true;
+}
--- a/SafeBuf.h
+++ b/SafeBuf.h
@ -97,6 +97,9 @@ struct SafeBuf {
 	bool  safeStrcpy ( char *s ) ;
 	bool  safeStrcpyPrettyJSON ( char *decodedJson ) ;
 	bool  safeUtf8ToJSON ( char *utf8 ) ;
+
+	bool  csvEncode ( char *s , long len , long niceness = 0 );
+
 	//bool  pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
 	bool  cat(SafeBuf& c);
 	// . only cat the sections/tag that start with "tagFilter"
@ -144,6 +147,7 @@ struct SafeBuf {
 			     char *t , long tlen ,
 			     long niceness ,
 			     long startOff = 0 );
+	void replaceChar ( char src , char dst );
 	bool  copyToken(char* s);;
 	//output encoding
 	bool  setEncoding(short cs);
--- a/SearchInput.cpp
+++ b/SearchInput.cpp
@ -354,8 +354,16 @@ m	if (! cr->hasSearchPermission ( sock, encapIp ) ) {
 		return log("query: unable to strcpy whitelist");
 	

+	char format = FORMAT_HTML;
+
 	// what format should search results be in? default is html
-	long format = r->getLong("format", FORMAT_HTML );
+	char *formatStr = r->getString("format", NULL );
+
+	if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
+	if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
+	if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
+	if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
+

 	// support old api &xml=1 to mean &format=1
 	if ( r->getLong("xml",0) ) {
@ -367,6 +375,10 @@ m	if (! cr->hasSearchPermission ( sock, encapIp ) ) {
 		format = FORMAT_JSON;
 	}

+	if ( r->getLong("csv",0) ) {
+		format = FORMAT_CSV;
+	}
+

 	// now override automatic defaults for special cases
 	if ( format != FORMAT_HTML ) {
--- a/SearchInput.h
+++ b/SearchInput.h
@ -46,6 +46,7 @@ class TopicGroup {
 #define FORMAT_HTML 0
 #define FORMAT_XML  1
 #define FORMAT_JSON 2
+#define FORMAT_CSV  3

 class SearchInput {

@ -217,6 +218,10 @@ class SearchInput {
 	// so can search results
 	//long   m_xml;                        // msg40
 	// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON
+	//long  m_formatStrLen;
+	//char *m_formatStr;
+
+	// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
 	char m_format;

 	// this should be part of the key because it will affect the results!
--- a/Wiktionary.cpp
+++ b/Wiktionary.cpp
@ -47,6 +47,11 @@ Wiktionary::~Wiktionary () {


 bool Wiktionary::test ( ) {
+
+	// test words parsing here
+	//Words w;
+	//w.set9 ("get $4,500.00 now",0);
+
 	// test it out!
 	char *str = "love";//pie"; //forsake";
 	//long long wid = hash64Lower_utf8(str);
--- a/Words.cpp
+++ b/Words.cpp
@ -419,10 +419,11 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {

 	// comma is ok if like ,ddd!d
 	if ( s[i]==',' && 
-	     j-i <= 3 &&
-	     is_digit(s[i-1]) &&
-	     (j==i-1 || is_digit(s[i-2]) ) &&
-	     (j==i-2 || is_digit(s[i-3]) ) ) {
+	     i-j <= 3 &&
+	     is_digit(s[i-1]) ) {
+		// if word so far is 2 or 3 chars, make sure digits
+		if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
+		if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
 		// scan forward
 	subloop:
 		if ( s[i] == ',' &&
@ -445,6 +446,8 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
 		while ( is_digit(s[i]) ) i++;
 	}
 	
+ nogo:
+
 	// allow for words like we're dave's and i'm
 	if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
 		i++;
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -99,7 +99,11 @@ static bool getWordPosVec ( Words *words ,

 static void getMetaListWrapper ( void *state ) ;

-char *getNextJSONObject ( char *p ) ;
+char *getFirstJSONObject ( char *p , 
+			   long niceness ,
+			   bool *isProduct , 
+			   bool *isImage ) ;
+char *getNextJSONObject ( char *p , long niceness ) ;

 XmlDoc::XmlDoc() { 
 	for ( long i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
@ -19286,10 +19290,18 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 			}
 			mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
 			// init cursor to first json object
-			m_diffbotObj       = m_diffbotReply.getBufStart();
+			//m_diffbotObj       = m_diffbotReply.getBufStart();
+			char *rp = m_diffbotReply.getBufStart();
+			// we now parse the array of products out of the
+			// diffbot reply. each product is an item/object.
+			m_diffbotObj = getFirstJSONObject ( rp , 
+							    m_niceness ,
+							    &m_isJsonProduct , 
+							    &m_isJsonImage );
 			m_diffbotJSONCount = 0;
 			// set end of it
-			m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj );
+			m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj,
+							      m_niceness);
 			// temp null it
 			m_diffbotSavedChar = *m_diffbotObjEnd;
 			*m_diffbotObjEnd = '\0';
@ -19322,6 +19334,33 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 			sreq.m_hopCountValid = 1;
 			sreq.m_fakeFirstIp   = 1;
 			sreq.m_firstIp       = firstIp;
+
+			// copy the content
+			m_tmpBuf.reset();
+			// how much
+			long clen = m_diffbotObjEnd - m_diffbotObj;
+			// include \0
+			long need = clen + 1;
+			// insert ,"type":"product" or
+			// possibly ,"type":"image" to make it kosher
+			need += 32;
+			// reserve the mem
+			if ( ! m_tmpBuf.reserve ( need ) ) 
+				return NULL;
+			// sanity
+			if ( m_diffbotObj[0] != '{' ) { char *xx=NULL;*xx=0;}
+			// copy first '{'
+			m_tmpBuf.pushChar(m_diffbotObj[0]);
+			// HACK: insert the type: thing here
+			if ( m_isJsonProduct )
+				m_tmpBuf.safePrintf("\"type\":\"product\",");
+			else if ( m_isJsonImage )
+				m_tmpBuf.safePrintf("\"type\":\"image\",");
+			// do the copy of the rest, title, etc.
+			m_tmpBuf.safeMemcpy ( m_diffbotObj+1 , clen-1 );
+			// null term
+			m_tmpBuf.nullTerm();
+
 			// set this
 			if (!m_dx->set4 ( &sreq       ,
 					  NULL        ,
@ -19332,7 +19371,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 					  // niceness of 0!!!!
 					  m_niceness, // 1 , 
 					  // inject this content
-					  m_diffbotObj, // content ,
+					  m_tmpBuf.getBufStart(), // content ,
 					  false, // deleteFromIndex ,
 					  0, // forcedIp ,
 					  CT_JSON, // contentType ,
@ -19347,6 +19386,13 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 			m_dx->m_usePlacedb    = false;
 			m_dx->m_useLinkdb     = false;
 			m_dx->m_isChildDoc    = true;
+			// we like to sort json objects using
+			// 'gbsortby:spiderdate' query to get the most
+			// recent json objects, so this must be valid
+			if ( m_spideredTimeValid ) {
+				m_dx->m_spideredTimeValid = true;
+				m_dx->m_spideredTime = m_spideredTime;
+			}

 			m_dx->m_isDiffbotJSONObject = true;
 		}
@ -19377,7 +19423,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 		// we successfully index the json object, skip to next one
 		m_diffbotObj = m_diffbotObjEnd;
 		// point to next json object again
-		m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj );
+		m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj ,
+						      m_niceness );
 		// re-save
 		m_diffbotSavedChar = *m_diffbotObjEnd;
 		// but gotta set this crap back
@ -22065,10 +22112,14 @@ bool XmlDoc::addTable144 ( HashTableX *tt1 , bool nosplit ) {
 		// store it as is
 		memcpy ( m_p , kp , sizeof(key144_t) );
 		// sanity check
-		//long long final = 202176590884090LL;
-		//final &= TERMID_MASK;
-		//if ( g_posdb.getTermId(kp) == final ) 
-		//	log("hey");
+		//long long final = hash64n("products.offerprice",0);
+		//long long prefix = hash64n("gbsortby",0);
+		//long long h64 = hash64 ( final , prefix);
+		//h64 &= TERMID_MASK;
+		//if ( g_posdb.getTermId(kp) == h64 ) {
+		//	log("hey: docid=%lli float=%f",m_docId,
+		//	    g_posdb.getFloat(kp) );
+		//}
 		/*
 		// get the score
 		long score = tt1->getScoreFromSlot ( i ) ;
@ -22091,10 +22142,25 @@ bool XmlDoc::addTable144 ( HashTableX *tt1 , bool nosplit ) {
 		*/
 		// this was zero when we added these keys to zero, so fix it
 		g_posdb.setDocIdBits ( m_p , m_docId );
-		// this too
-		g_posdb.setSiteRankBits ( m_p , siteRank );
-		// set language here too
-		g_posdb.setLangIdBits ( m_p , m_langId );
+		// if this is a numeric field we do not want to set
+		// the siterank or langid bits because it will mess up
+		// sorting by the float which is basically in the position
+		// of the word position bits.
+		if ( g_posdb.isAlignmentBitClear ( m_p ) ) {
+			// make sure it is set again. it was just cleared
+			// to indicate that this key contains a float
+			// like a price or something, and we should not
+			// set siterank or langid so that its termlist
+			// remains sorted just by that float
+			g_posdb.setAlignmentBit ( m_p , 1 );
+		}
+		// otherwise, set the siterank and langid
+		else {
+			// this too
+			g_posdb.setSiteRankBits ( m_p , siteRank );
+			// set language here too
+			g_posdb.setLangIdBits ( m_p , m_langId );
+		}
 		// advance over it
 		m_p += sizeof(key144_t);
 	}
@ -22839,6 +22905,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
 		// country?
 		if ( ! hashCountry       ( table ) ) return NULL;
 		if ( ! hashTagRec        ( table ) ) return NULL;
+		// hash for gbsortby:gbspiderdate
+		if ( ! hashDateNumbers   ( table ) ) return NULL;
 		// and the json itself
 		return hashJSON ( table ); 
 	}
@ -22880,6 +22948,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
 	if ( ! hashLinks         ( table ) ) return NULL;
 	if ( ! hashContentType   ( table ) ) return NULL;
 	if ( ! hashUrl           ( table ) ) return NULL;
+	if ( ! hashDateNumbers   ( table ) ) return NULL;
 	if ( ! hashMetaTags      ( table ) ) return NULL;
 	if ( ! hashMetaZip       ( table ) ) return NULL;
 	if ( ! hashDMOZCategories( table ) ) return NULL;
@ -23071,6 +23140,31 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
 	return true;
 }

+// . hash dates for sorting by using gbsortby: and gbrevsortby:
+// . do 'gbsortby:gbspiderdate' as your query to see this in action
+// . for now only the spidered time is hashed, under the "gbspiderdate"
+//   prefix
+// . returns true without hashing anything if m_spideredTime is not valid
+// . returns false if hashNumber() fails
+bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {
+
+	// nothing to hash until the spidered time is valid
+	if ( ! m_spideredTimeValid ) return true;
+
+	// hashNumber() takes the number as text
+	char timeStr[64];
+	long timeStrLen = sprintf ( timeStr , "%lu", m_spideredTime );
+
+	// describe the numeric field
+	HashInfo hi;
+	hi.m_tt        = tt;
+	hi.m_prefix    = "gbspiderdate";
+	hi.m_desc      = "last spidered date";
+	// this doesn't matter, it's a numeric field
+	hi.m_hashGroup = 0;
+
+	return hashNumber ( timeStr , timeStr , timeStrLen , &hi );
+}
+
 bool XmlDoc::hashMetaZip ( HashTableX *tt ) {

 	setStatus ( "hashing meta zip" );
@ -23760,6 +23854,9 @@ bool XmlDoc::hashUrl ( HashTableX *tt ) {
 	sprintf(buf2,"%llu",(m_docId) );
 	if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;

+	// hash
+
+
 	return true;
 }

@ -28506,10 +28603,19 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
 	// . this now allows for commas in numbers like "1,500.62"
 	float f = atof2 ( p , bufEnd - p );

-	return hashNumber2 ( f , hi );
+	if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
+		return false;
+
+	// also hash in reverse order for sorting from low to high
+	f = -1.0 * f;
+
+	if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
+		return false;
+
+	return true;
 }

-bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
+bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {

 	// prefix is something like price. like the meta "name" or
 	// the json name with dots in it like "product.info.price" or something
@ -28523,7 +28629,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
 		
 	// combine prefix hash with a special hash to make it unique to avoid
 	// collisions. this is the "TRUE" prefix.
-	long long truePrefix64 = hash64n ( "gbsortby");
+	long long truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
 	// hash with the "TRUE" prefix
 	long long ph2 = hash64 ( nameHash , truePrefix64 );

@ -28534,7 +28640,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
 	key144_t k;
 	g_posdb.makeKey ( &k ,
 			  ph2 ,
-			  0LL,//docid
+			  0,//docid
 			  0,// word pos #
 			  0,// densityRank , // 0-15
 			  0 , // MAXDIVERSITYRANK
@ -28554,9 +28660,25 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
 			  false, // syn?
 			  false ); // delkey?

+	//long long final = hash64n("products.offerprice",0);
+	//long long prefix = hash64n("gbsortby",0);
+	//long long h64 = hash64 ( final , prefix);
+	//if ( ph2 == h64 )
+	//	log("hey: got offer price");
+
 	// now set the float in that key
 	g_posdb.setFloat ( &k , f );

+	// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
+	// so that we can b-step into a posdb list and make sure
+	// we are aligned on a 6 byte or 12 byte key, since they come
+	// in both sizes. but for this, hack it off to tell
+	// addTable144() that we are a special posdb key, a "numeric"
+	// key that has a float stored in it. then it will NOT
+	// set the siterank and langid bits which throw our sorting
+	// off!!
+	g_posdb.setAlignmentBit ( &k , 0 );
+
 	// sanity
 	float t = g_posdb.getFloat ( &k );
 	if ( t != f ) { char *xx=NULL;*xx=0; }
@ -43553,12 +43675,49 @@ SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) {
 //void XmlDoc::getGigabitExcerpts ( ) {
 //}

+
+// . the products and image types are listed as arrays in the json object,
+//   so go to those first if they are there
+// . if "p" contains ,"products":[ then set *isProduct and return a ptr to
+//   the char right after that '['
+// . otherwise, if "p" contains ,"images":[ then set *isImage and return a
+//   ptr to the char right after that '['
+// . otherwise return "p" itself with both flags false
+char *getFirstJSONObject ( char *p , 
+			   long niceness ,
+			   bool *isProduct ,
+			   bool *isImage ) {
+
+	*isProduct = false;
+	*isImage   = false;
+
+	// a "products": array is checked first
+	char *productsKey = ",\"products\":[";
+	char *hit = strstr ( p , productsKey );
+	if ( hit ) {
+		*isProduct = true;
+		return hit + gbstrlen(productsKey);
+	}
+
+	QUICKPOLL ( niceness );
+
+	// then an "images": array
+	char *imagesKey = ",\"images\":[";
+	hit = strstr ( p , imagesKey );
+	if ( hit ) {
+		*isImage = true;
+		return hit + gbstrlen(imagesKey);
+	}
+
+	// neither array is there, so default to just that json
+	return p;
+}
+
+
 // . advance p to skip over the json object it is pointing to and return 
 //   ptr to the following json object
 // . deal with nested {}'s
 // . basically skips over current json object in a list of json objects to
 //   point to the next brother object
-char *getNextJSONObject ( char *p ) {
+char *getNextJSONObject ( char *p , long niceness ) {
 	// otherwise, *p must be {
 	for ( ; *p && *p != '{' ; p++ );
 	// empty?
@ -43571,6 +43730,8 @@ char *getNextJSONObject ( char *p ) {
 	bool inQuotes = false;
 	// scan
 	for ( ; *p ; p++ ) {
+		// breathe
+		QUICKPOLL ( niceness );
 		// escaping a quote? ignore quote then.
 		if ( *p == '\\' && p[1] == '\"' ) {
 			// skip two bytes then..
--- a/XmlDoc.h
+++ b/XmlDoc.h
@ -752,6 +752,7 @@ class XmlDoc {
 	bool hashDMOZCategories ( class HashTableX *table ) ;
 	bool hashLinks ( class HashTableX *table ) ;
 	bool hashUrl ( class HashTableX *table ) ;
+	bool hashDateNumbers ( class HashTableX *tt ) ;
 	bool hashSections ( class HashTableX *table ) ;
 	bool hashIncomingLinkText ( class HashTableX *table            ,
 				    bool       hashAnomalies    ,
@ -854,7 +855,9 @@ class XmlDoc {
 			  long bufLen , 
 			  class HashInfo *hi ) ;

-	bool hashNumber2 ( float f , class HashInfo *hi ) ;
+	bool hashNumber2 ( float f , 
+			   class HashInfo *hi ,
+			   char *gbsortByStr ) ;

 	// print out for PageTitledb.cpp and PageParser.cpp
 	bool printDoc ( class SafeBuf *pbuf );
@ -1487,6 +1490,12 @@ class XmlDoc {
 	char m_isWWWDup;
 	char m_calledMsg0b;
 	Url  m_tmpUrl;
+
+	// hack stuff:
+	SafeBuf m_tmpBuf;
+	bool m_isJsonProduct;
+	bool m_isJsonImage;
+	
 	SafeBuf m_tmpsb1;
 	SafeBuf m_tmpsb2;
 	SafeBuf m_turkBuf;