diff --git a/Json.cpp b/Json.cpp index b0126ea3..22c4e92b 100644 --- a/Json.cpp +++ b/Json.cpp @@ -330,4 +330,60 @@ void Json::test ( ) { return; } - + +bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) { + + // reset, but don't free mem etc. just set m_length to 0 + nameBuf.reset(); + // get its full compound name like "meta.twitter.title" + JsonItem *p = this;//ji; + char *lastName = NULL; + char *nameArray[20]; + long numNames = 0; + for ( ; p ; p = p->m_parent ) { + // empty name? + if ( ! p->m_name ) continue; + if ( ! p->m_name[0] ) continue; + // dup? can happen with arrays. parent of string + // in object, has same name as his parent, the + // name of the array. "dupname":[{"a":"b"},{"c":"d"}] + if ( p->m_name == lastName ) continue; + // update + lastName = p->m_name; + // add it up + nameArray[numNames++] = p->m_name; + // breach? + if ( numNames < 15 ) continue; + log("build: too many names in json tag"); + break; + } + // assemble the names in reverse order which is correct order + for ( long i = 1 ; i <= numNames ; i++ ) { + // copy into our safebuf + if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) ) + return false; + // separate names with periods + if ( ! nameBuf.pushChar('.') ) return false; + } + // remove last period + nameBuf.removeLastChar('.'); + // and null terminate + if ( ! nameBuf.nullTerm() ) return false; + // change all :'s in names to .'s since : is reserved! + char *px = nameBuf.getBufStart(); + for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.'; + + return true; +} + +// is this json item in an array of json items? +bool JsonItem::isInArray ( ) { + JsonItem *p = this;//ji; + for ( ; p ; p = p->m_parent ) { + // empty name? it's just a "value item" then, i guess. + //if ( ! p->m_name ) continue; + //if ( ! 
p->m_name[0] ) continue; + if ( p->m_type == JT_ARRAY ) return true; + } + return false; +} diff --git a/Json.h b/Json.h index ac53516e..b0250062 100644 --- a/Json.h +++ b/Json.h @@ -51,6 +51,10 @@ class JsonItem { return (char *)this + sizeof(JsonItem); }; + // like acme.product.offerPrice if "acme:{product:{offerprice:1.23}}" + bool getCompoundName ( SafeBuf &nameBuf ) ; + + bool isInArray ( ); }; diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp index ace4d3b7..d6695991 100644 --- a/PageCrawlBot.cpp +++ b/PageCrawlBot.cpp @@ -738,6 +738,7 @@ public: char **lastKeyPtr ) ; void printTitledbList ( RdbList *list , SafeBuf *sb , char **lastKeyPtr ); + bool printJsonItemInCsv ( char *json , SafeBuf *sb ) ; char m_fmt; Msg4 m_msg4; @@ -751,6 +752,8 @@ public: bool m_printedEndingBracket; bool m_printedItem; + bool m_needHeaderRow; + bool m_needsMime; char m_rdbId; bool m_downloadJSON; @@ -810,10 +813,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) { fmt = FMT_JSON; downloadJSON = true; } - else if ( ( xx = strstr ( path , "_data.xml" ) ) ) { + else if ( ( xx = strstr ( path , "_data.csv" ) ) ) { rdbId = RDB_TITLEDB; downloadJSON = true; - fmt = FMT_XML; + fmt = FMT_CSV; } else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) { rdbId = RDB_SPIDERDB; @@ -886,6 +889,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) { st->m_printedItem = false; st->m_printedEndingBracket = false; + // for csv... + st->m_needHeaderRow = true; + // debug //log("mnew1: st=%lx",(long)st); @@ -1027,6 +1033,8 @@ bool StateCD::sendList ( ) { ct = "text/xml"; if ( m_fmt == FMT_TXT ) ct = "text/plain"; + if ( m_fmt == FMT_CSV ) + ct = "text/csv"; // . if we haven't yet sent an http mime back to the user // then do so here, the content-length will not be in there @@ -1557,22 +1565,34 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ // get the json content char *json = xd.ptr_utf8Content; - if ( m_printedItem ) - sb->safePrintf("\n,\n"); + // empty? 
+ if ( xd.size_utf8Content <= 1 ) + continue; - m_printedItem = true; + // if not json, just print the json item out in csv + // moved into PageResults.cpp... + //if ( m_fmt == FMT_CSV ) { + // printJsonItemInCsv ( json , sb ); + // continue; + //} // just print that out. encode \n's and \r's back to \\n \\r // and backslash to a \\ ... // but if they originally had a \u encoding and // we made into utf8, do not put that back into the \u // encoding because it is not necessary. + + // print in json + if ( m_printedItem ) + sb->safePrintf("\n,\n"); + + m_printedItem = true; + if ( ! sb->safeStrcpyPrettyJSON ( json ) ) log("diffbot: error printing json in dump"); // separate each JSON object with \n i guess //sb->pushChar('\n'); - } } @@ -3498,20 +3518,77 @@ bool printCrawlBotPage2 ( TcpSocket *socket , "" "" "json" - "  " - "" + "" + "" + + + "" + "Download Products: " + "" + // make it search.csv so excel opens it + "" + "csv" + "   " + "" + "html" - "xml" "" "" "" "Download Urls: " "" - "" "csv" - // + "" + "" + + + "" + "Latest Objects: " + "" + "" + "csv" + "   " + "" + "html" + "" + "" + + "" + "Latest Products: " + "" + "" + "csv" + "   " + "" + "html" + "" "" @@ -3648,11 +3725,38 @@ bool printCrawlBotPage2 ( TcpSocket *socket , , cr->m_coll + , cr->m_coll + , rand64 + + // download products html + , cr->m_coll + , rand64 + + //, cr->m_coll //, cr->m_coll //, cr->m_coll , cr->m_coll + + // latest objects in html + , cr->m_coll + , rand64 + + // latest objects in csv + , cr->m_coll + , rand64 + + + // latest products in html + , cr->m_coll + , rand64 + + // latest products in csv + , cr->m_coll + , rand64 + + , cr->m_coll , cr->m_collectiveRespiderFrequency @@ -3878,6 +3982,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , char *ct = "text/html"; if ( fmt == FMT_JSON ) ct = "application/json"; if ( fmt == FMT_XML ) ct = "text/xml"; + if ( fmt == FMT_CSV ) ct = "text/csv"; // this could be in html json or xml return g_httpServer.sendDynamicPage ( socket, diff 
--git a/PageResults.cpp b/PageResults.cpp index 255aec81..c2b199de 100644 --- a/PageResults.cpp +++ b/PageResults.cpp @@ -52,6 +52,7 @@ public: long m_numDocIds; long long m_took; // how long it took to get the results HttpRequest m_hr; + bool m_printedHeaderRow; }; static int printResult ( SafeBuf &sb, @@ -60,6 +61,8 @@ static int printResult ( SafeBuf &sb, CollectionRec *cr , char *qe ) ; +bool printJsonItemInCsv ( char *json , SafeBuf *sb , bool *printedHeaderRow ) ; + bool printPairScore ( SafeBuf &sb , SearchInput *si , PairScore *ps , Msg20Reply *mr , Msg40 *msg40 , bool first ) ; @@ -80,6 +83,7 @@ bool sendReply ( State0 *st , char *reply ) { char *ct = "text/html"; if ( si && si->m_format == FORMAT_XML ) ct = "text/xml"; if ( si && si->m_format == FORMAT_JSON ) ct = "application/json"; + if ( si && si->m_format == FORMAT_CSV ) ct = "text/csv"; char *charset = "utf-8"; // . filter anything < 0x20 to 0x20 to keep XML legal @@ -466,7 +470,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) { // limit here long maxpp = cr->m_maxSearchResultsPerQuery ; - if ( si->m_docsWanted > maxpp ) si->m_docsWanted = maxpp; + if ( si->m_docsWanted > maxpp && + // disable serp max per page for custom crawls + ! 
cr->m_isCustomCrawl ) + si->m_docsWanted = maxpp; st->m_numDocIds = si->m_docsWanted; @@ -492,6 +499,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) { st->m_gotAds = false; st->m_gotSpell = false; + // reset + st->m_printedHeaderRow = false; + long ip = s->m_ip; long uipLen; char *uip = hr->getString("uip", &uipLen, NULL); @@ -1720,6 +1730,19 @@ static int printResult ( SafeBuf &sb, Msg20 *m20 = msg40->m_msg20[ix]; Msg20Reply *mr = m20->m_r; + + if ( si->m_format == FORMAT_CSV && + mr->ptr_content && + mr->m_contentType == CT_JSON ) { + // parse it up + char *json = mr->ptr_content; + // only print header row once, so pass in that flag + printJsonItemInCsv ( json , &sb , &st->m_printedHeaderRow ); + return true; + } + + + // just print cached web page? if ( mr->ptr_content ) { sb.safeStrcpy ( mr->ptr_content ); @@ -4680,3 +4703,108 @@ bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) { return true; } */ + +#include "Json.h" + +bool printJsonItemInCsv ( char *json , SafeBuf *sb , bool *printedHeaderRow ) { + + // parse the json + Json jp; + jp.parseJsonStringIntoJsonItems ( json ); + + // . TODO: index individual "Products":[...] as each an + // individual title rec. + + SafeBuf nameBuf; + bool firstOne = true; + + JsonItem *ji; + + //// + // + // print header row in csv + // + //// + for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) { + + if ( *printedHeaderRow ) + break; + + // skip if not number or string + if ( ji->m_type != JT_NUMBER && + ji->m_type != JT_STRING ) + continue; + + // if in an array, do not print! csv is not + // good for arrays... like "media":[....] . that + // one might be ok, but if the elements in the + // array are not simple types, like, if they are + // unflat json objects then it is not well suited + // for csv. + if ( ji->isInArray() ) continue; + + if ( ! 
firstOne ) sb->pushChar(','); + + firstOne = false; + + ji->getCompoundName ( nameBuf ); + + // + // product.offerprice + // + sb->csvEncode ( nameBuf.getBufStart() , nameBuf.getLength() ); + } + + if ( ! *printedHeaderRow ) { + sb->pushChar('\n'); + sb->nullTerm(); + *printedHeaderRow = true; + } + + + firstOne = true; + + /////// + // + // print json item in csv + // + /////// + for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) { + + // skip if not number or string + if ( ji->m_type != JT_NUMBER && + ji->m_type != JT_STRING ) + continue; + + // skip if not well suited for csv (see above comment) + if ( ji->isInArray() ) continue; + + + if ( ! firstOne ) sb->pushChar(','); + + firstOne = false; + + if ( ji->m_type == JT_NUMBER ) { + // print numbers without double quotes + if ( ji->m_valueDouble *10000000.0 == + (double)ji->m_valueLong * 10000000.0 ) + sb->safePrintf("%li",ji->m_valueLong); + else + sb->safePrintf("%f",ji->m_valueDouble); + continue; + } + + // print the value + sb->pushChar('\"'); + sb->csvEncode ( ji->getValue() , ji->getValueLen() ); + sb->pushChar('\"'); + } + + if ( ! 
firstOne ) + sb->pushChar('\n'); + + sb->nullTerm(); + + return true; +} + diff --git a/Pages.cpp b/Pages.cpp index 12f34bef..77d5ea30 100644 --- a/Pages.cpp +++ b/Pages.cpp @@ -344,6 +344,8 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) { path = "admin/inject"; pathLen = gbstrlen(path); } if ( pathLen == 9 && strncmp ( path , "index.php" , 9 ) == 0 ) { path = "search"; pathLen = gbstrlen(path); } + if ( pathLen == 10 && strncmp ( path , "search.csv" , 10 ) == 0 ) { + path = "search"; pathLen = gbstrlen(path); } // if it is like /GA/Atlanta then call sendPageResults // and that should be smart enough to set the m_where in diff --git a/Parms.cpp b/Parms.cpp index 04609d15..83717e5d 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -14964,18 +14964,19 @@ void Parms::init ( ) { m->m_sprpp = 0; m++; + /* m->m_title = "format of the returned search results"; m->m_desc = "X is 0 to get back results in regular html, 1 to " "get back results in XML, 2 for JSON."; m->m_def = "0"; - m->m_soff = (char *)&si.m_format - y; - m->m_type = TYPE_CHAR; + m->m_soff = (char *)&si.m_formatStr - y; + m->m_type = TYPE_STRING;//CHAR; m->m_sparm = 1; m->m_scgi = "format"; m->m_smin = 0; m->m_smax = 12; m++; - + */ m->m_title = "highlight query terms in summaries."; m->m_desc = "Use to disable or enable " diff --git a/Posdb.cpp b/Posdb.cpp index eea4398a..234b5493 100644 --- a/Posdb.cpp +++ b/Posdb.cpp @@ -4115,7 +4115,8 @@ bool PosdbTable::setQueryTermInfo ( ) { qti->m_wikiPhraseId = qw->m_wikiPhraseId; qti->m_quotedStartId = qw->m_quoteStart; // is it gbsortby:? - if ( qt->m_fieldCode == FIELD_GBSORTBY ) + if ( qt->m_fieldCode == FIELD_GBSORTBY || + qt->m_fieldCode == FIELD_GBREVSORTBY ) m_sortByTermNum = i; // count long nn = 0; @@ -4237,6 +4238,8 @@ bool PosdbTable::setQueryTermInfo ( ) { // they have a float stored there for sorting etc. 
if (qt->m_fieldCode == FIELD_GBSORTBY ) qti->m_bigramFlags[nn]|=BF_NUMBER; + if (qt->m_fieldCode == FIELD_GBREVSORTBY ) + qti->m_bigramFlags[nn]|=BF_NUMBER; if (qt->m_fieldCode == FIELD_GBNUMBERMIN ) qti->m_bigramFlags[nn]|=BF_NUMBER; if (qt->m_fieldCode == FIELD_GBNUMBERMAX ) diff --git a/Posdb.h b/Posdb.h index 2dc13070..caa87922 100644 --- a/Posdb.h +++ b/Posdb.h @@ -206,6 +206,15 @@ class Posdb { float getFloat ( void *vkp ) { return *(float *)(((char *)vkp) + 2); }; + void setAlignmentBit ( void *vkp , char val ) { + char *p = (char *)vkp; + if ( val ) p[1] = p[1] | 0x02; + else p[1] = p[1] & 0xfd; + }; + + bool isAlignmentBitClear ( void *vkp ) { + return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 ); + }; void makeStartKey ( void *kp, long long termId , long long docId=0LL){ @@ -436,7 +445,7 @@ class PosdbList : public RdbList { #include "Query.h" // MAX_QUERY_TERMS, qvec_t // max # search results that can be viewed without using TopTree -#define MAX_RESULTS 1000 +//#define MAX_RESULTS 1000 class PosdbTable { diff --git a/Query.cpp b/Query.cpp index e209d5bb..347814cd 100644 --- a/Query.cpp +++ b/Query.cpp @@ -2200,6 +2200,7 @@ bool Query::setQWords ( char boolFlag , fieldCode == FIELD_ISCLEAN || fieldCode == FIELD_QUOTA || fieldCode == FIELD_GBSORTBY || + fieldCode == FIELD_GBREVSORTBY || fieldCode == FIELD_GBNUMBERMIN || fieldCode == FIELD_GBNUMBERMAX || fieldCode == FIELD_GBAD ) { @@ -2217,6 +2218,7 @@ bool Query::setQWords ( char boolFlag , // i've decided not to make // gbsortby:products.offerPrice case sensitive if ( fieldCode == FIELD_GBSORTBY || + fieldCode == FIELD_GBREVSORTBY || fieldCode == FIELD_GBNUMBERMIN || fieldCode == FIELD_GBNUMBERMAX ) wid = hash64Lower_utf8 ( w , wlen , 0LL ); @@ -3044,6 +3046,7 @@ struct QueryField g_fields[] = { {"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""}, {"gbcontenthash", FIELD_GBCONTENTHASH, false,""}, {"gbsortby", FIELD_GBSORTBY, false,""}, + {"gbrevsortby", FIELD_GBREVSORTBY, false,""}, {"gbnumbermin", 
FIELD_GBNUMBERMIN, false,""}, {"gbnumbermax", FIELD_GBNUMBERMAX, false,""}, diff --git a/Query.h b/Query.h index a4ed0cc1..877effcd 100644 --- a/Query.h +++ b/Query.h @@ -105,8 +105,9 @@ typedef unsigned long long qvec_t; #define FIELD_GBDOCID 52 #define FIELD_GBCONTENTHASH 53 // for deduping at spider time #define FIELD_GBSORTBY 54 // i.e. sortby:price -> numeric termlist -#define FIELD_GBNUMBERMIN 55 -#define FIELD_GBNUMBERMAX 56 +#define FIELD_GBREVSORTBY 55 // i.e. sortby:price -> low to high +#define FIELD_GBNUMBERMIN 56 +#define FIELD_GBNUMBERMAX 57 #define FIELD_GBOTHER 92 diff --git a/SafeBuf.cpp b/SafeBuf.cpp index 8802f0cd..446cbd7b 100644 --- a/SafeBuf.cpp +++ b/SafeBuf.cpp @@ -3169,3 +3169,49 @@ bool SafeBuf::htmlDecode ( char *src, // good to go return true; } + +void SafeBuf::replaceChar ( char src , char dst ) { + char *px = m_buf; + char *pxEnd = m_buf + m_length; + for ( ; px < pxEnd ; px++ ) if ( *px == src ) *px = dst; +} + + +// encode a double quote char to two double quote chars +bool SafeBuf::csvEncode ( char *s , long len , long niceness ) { + + if ( ! s ) return true; + + // assume all chars are double quotes and will have to be encoded + long need = len * 2 + 1; + if ( ! reserve ( need ) ) return false; + + // tmp vars + char *dst = m_buf + m_length; + //char *dstEnd = m_buf + m_capacity; + + // scan through all + char *send = s + len; + for ( ; s < send ; s++ ) { + // breathe + QUICKPOLL ( niceness ); + // convert it? 
+ if ( *s == '\"' ) { + *dst++ = '\"'; + *dst++ = '\"'; + continue; + } + //if ( *s == '\\' ) { + // *dst++ = '\\'; + // *dst++ = '\\'; + // continue; + //} + *dst++ = *s; + } + + m_length += dst - (m_buf + m_length); + + nullTerm(); + + return true; +} diff --git a/SafeBuf.h b/SafeBuf.h index 20155991..3ea92231 100644 --- a/SafeBuf.h +++ b/SafeBuf.h @@ -97,6 +97,9 @@ struct SafeBuf { bool safeStrcpy ( char *s ) ; bool safeStrcpyPrettyJSON ( char *decodedJson ) ; bool safeUtf8ToJSON ( char *utf8 ) ; + + bool csvEncode ( char *s , long len , long niceness = 0 ); + //bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); } bool cat(SafeBuf& c); // . only cat the sections/tag that start with "tagFilter" @@ -144,6 +147,7 @@ struct SafeBuf { char *t , long tlen , long niceness , long startOff = 0 ); + void replaceChar ( char src , char dst ); bool copyToken(char* s);; //output encoding bool setEncoding(short cs); diff --git a/SearchInput.cpp b/SearchInput.cpp index 93f60403..ed441d73 100644 --- a/SearchInput.cpp +++ b/SearchInput.cpp @@ -354,8 +354,16 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) { return log("query: unable to strcpy whitelist"); + char format = FORMAT_HTML; + // what format should search results be in? default is html - long format = r->getLong("format", FORMAT_HTML ); + char *formatStr = r->getString("format", NULL ); + + if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML; + if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON; + if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML; + if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV; + // support old api &xml=1 to mean &format=1 if ( r->getLong("xml",0) ) { @@ -367,6 +375,10 @@ m if (! 
cr->hasSearchPermission ( sock, encapIp ) ) { format = FORMAT_JSON; } + if ( r->getLong("csv",0) ) { + format = FORMAT_CSV; + } + // now override automatic defaults for special cases if ( format != FORMAT_HTML ) { diff --git a/SearchInput.h b/SearchInput.h index 420d9ba6..39e8c6c7 100644 --- a/SearchInput.h +++ b/SearchInput.h @@ -46,6 +46,7 @@ class TopicGroup { #define FORMAT_HTML 0 #define FORMAT_XML 1 #define FORMAT_JSON 2 +#define FORMAT_CSV 3 class SearchInput { @@ -217,6 +218,10 @@ class SearchInput { // so can search results //long m_xml; // msg40 // can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON + //long m_formatStrLen; + //char *m_formatStr; + + // can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv char m_format; // this should be part of the key because it will affect the results! diff --git a/Wiktionary.cpp b/Wiktionary.cpp index 570d04ed..1e1c374f 100644 --- a/Wiktionary.cpp +++ b/Wiktionary.cpp @@ -47,6 +47,11 @@ Wiktionary::~Wiktionary () { bool Wiktionary::test ( ) { + + // test words parsing here + //Words w; + //w.set9 ("get $4,500.00 now",0); + // test it out! char *str = "love";//pie"; //forsake"; //long long wid = hash64Lower_utf8(str); diff --git a/Words.cpp b/Words.cpp index 15e99d51..146607e8 100644 --- a/Words.cpp +++ b/Words.cpp @@ -419,10 +419,11 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) { // comma is ok if like ,ddd!d if ( s[i]==',' && - j-i <= 3 && - is_digit(s[i-1]) && - (j==i-1 || is_digit(s[i-2]) ) && - (j==i-2 || is_digit(s[i-3]) ) ) { + i-j <= 3 && + is_digit(s[i-1]) ) { + // if word so far is 2 or 3 chars, make sure digits + if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo; + if ( i-j >= 3 && ! 
is_digit(s[i-3]) ) goto nogo; // scan forward subloop: if ( s[i] == ',' && @@ -445,6 +446,8 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) { while ( is_digit(s[i]) ) i++; } + nogo: + // allow for words like we're dave's and i'm if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){ i++; diff --git a/XmlDoc.cpp b/XmlDoc.cpp index 0082d207..a84a9633 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -99,7 +99,11 @@ static bool getWordPosVec ( Words *words , static void getMetaListWrapper ( void *state ) ; -char *getNextJSONObject ( char *p ) ; +char *getFirstJSONObject ( char *p , + long niceness , + bool *isProduct , + bool *isImage ) ; +char *getNextJSONObject ( char *p , long niceness ) ; XmlDoc::XmlDoc() { for ( long i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL; @@ -19286,10 +19290,18 @@ char *XmlDoc::getMetaList ( bool forDelete ) { } mnew ( m_dx , sizeof(XmlDoc),"xmldocdx"); // init cursor to first json object - m_diffbotObj = m_diffbotReply.getBufStart(); + //m_diffbotObj = m_diffbotReply.getBufStart(); + char *rp = m_diffbotReply.getBufStart(); + // we now parse the array of products out of the + // diffbot reply. each product is an item/object. + m_diffbotObj = getFirstJSONObject ( rp , + m_niceness , + &m_isJsonProduct , + &m_isJsonImage ); m_diffbotJSONCount = 0; // set end of it - m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj ); + m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj, + m_niceness); // temp null it m_diffbotSavedChar = *m_diffbotObjEnd; *m_diffbotObjEnd = '\0'; @@ -19322,6 +19334,33 @@ char *XmlDoc::getMetaList ( bool forDelete ) { sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; + + // copy the content + m_tmpBuf.reset(); + // how much + long clen = m_diffbotObjEnd - m_diffbotObj; + // include \0 + long need = clen + 1; + // insert ,"type":"product" or + // possibly ,"type":"image" to make it kosher + need += 32; + // reserve the mem + if ( ! 
m_tmpBuf.reserve ( need ) ) + return NULL; + // sanity + if ( m_diffbotObj[0] != '{' ) { char *xx=NULL;*xx=0;} + // copy first '{' + m_tmpBuf.pushChar(m_diffbotObj[0]); + // HACK: insert the type: thing here + if ( m_isJsonProduct ) + m_tmpBuf.safePrintf("\"type\":\"product\","); + else if ( m_isJsonImage ) + m_tmpBuf.safePrintf("\"type\":\"image\","); + // do the copy of the rest, title, etc. + m_tmpBuf.safeMemcpy ( m_diffbotObj+1 , clen-1 ); + // null term + m_tmpBuf.nullTerm(); + // set this if (!m_dx->set4 ( &sreq , NULL , @@ -19332,7 +19371,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) { // niceness of 0!!!! m_niceness, // 1 , // inject this content - m_diffbotObj, // content , + m_tmpBuf.getBufStart(), // content , false, // deleteFromIndex , 0, // forcedIp , CT_JSON, // contentType , @@ -19347,6 +19386,13 @@ char *XmlDoc::getMetaList ( bool forDelete ) { m_dx->m_usePlacedb = false; m_dx->m_useLinkdb = false; m_dx->m_isChildDoc = true; + // we like to sort json objects using + // 'gbsortby:spiderdate' query to get the most + // recent json objects, so this must be valid + if ( m_spideredTimeValid ) { + m_dx->m_spideredTimeValid = true; + m_dx->m_spideredTime = m_spideredTime; + } m_dx->m_isDiffbotJSONObject = true; } @@ -19377,7 +19423,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) { // we successfully index the json object, skip to next one m_diffbotObj = m_diffbotObjEnd; // point to next json object again - m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj ); + m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj , + m_niceness ); // re-save m_diffbotSavedChar = *m_diffbotObjEnd; // but gotta set this crap back @@ -22065,10 +22112,14 @@ bool XmlDoc::addTable144 ( HashTableX *tt1 , bool nosplit ) { // store it as is memcpy ( m_p , kp , sizeof(key144_t) ); // sanity check - //long long final = 202176590884090LL; - //final &= TERMID_MASK; - //if ( g_posdb.getTermId(kp) == final ) - // log("hey"); + //long long final = hash64n("products.offerprice",0); + 
//long long prefix = hash64n("gbsortby",0); + //long long h64 = hash64 ( final , prefix); + //h64 &= TERMID_MASK; + //if ( g_posdb.getTermId(kp) == h64 ) { + // log("hey: docid=%lli float=%f",m_docId, + // g_posdb.getFloat(kp) ); + //} /* // get the score long score = tt1->getScoreFromSlot ( i ) ; @@ -22091,10 +22142,25 @@ bool XmlDoc::addTable144 ( HashTableX *tt1 , bool nosplit ) { */ // this was zero when we added these keys to zero, so fix it g_posdb.setDocIdBits ( m_p , m_docId ); - // this too - g_posdb.setSiteRankBits ( m_p , siteRank ); - // set language here too - g_posdb.setLangIdBits ( m_p , m_langId ); + // if this is a numeric field we do not want to set + // the siterank or langid bits because it will mess up + // sorting by the float which is basically in the position + // of the word position bits. + if ( g_posdb.isAlignmentBitClear ( m_p ) ) { + // make sure it is set again. it was just cleared + // to indicate that this key contains a float + // like a price or something, and we should not + // set siterank or langid so that its termlist + // remains sorted just by that float + g_posdb.setAlignmentBit ( m_p , 1 ); + } + // otherwise, set the siterank and langid + else { + // this too + g_posdb.setSiteRankBits ( m_p , siteRank ); + // set language here too + g_posdb.setLangIdBits ( m_p , m_langId ); + } // advance over it m_p += sizeof(key144_t); } @@ -22839,6 +22905,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) { // country? if ( ! hashCountry ( table ) ) return NULL; if ( ! hashTagRec ( table ) ) return NULL; + // hash for gbsortby:gbspiderdate + if ( ! hashDateNumbers ( table ) ) return NULL; // and the json itself return hashJSON ( table ); } @@ -22880,6 +22948,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) { if ( ! hashLinks ( table ) ) return NULL; if ( ! hashContentType ( table ) ) return NULL; if ( ! hashUrl ( table ) ) return NULL; + if ( ! hashDateNumbers ( table ) ) return NULL; if ( ! hashMetaTags ( table ) ) return NULL; if ( ! 
hashMetaZip ( table ) ) return NULL; if ( ! hashDMOZCategories( table ) ) return NULL; @@ -23071,6 +23140,31 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) { return true; } +// . hash dates for sorting by using gbsortby: and gbrevsortby: +// . do 'gbsortby:gbspiderdate' as your query to see this in action +bool XmlDoc::hashDateNumbers ( HashTableX *tt ) { + + // nothing to hash if the spidered time is not valid + if ( ! m_spideredTimeValid ) return true; + + + // first the last spidered date + HashInfo hi; + hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field + hi.m_tt = tt; + hi.m_desc = "last spidered date"; + hi.m_prefix = "gbspiderdate"; + + char buf[64]; + long bufLen = sprintf ( buf , "%lu", m_spideredTime ); + + if ( ! hashNumber ( buf , buf , bufLen , &hi ) ) + return false; + + // all done + return true; +} + bool XmlDoc::hashMetaZip ( HashTableX *tt ) { setStatus ( "hashing meta zip" ); @@ -23760,6 +23854,9 @@ bool XmlDoc::hashUrl ( HashTableX *tt ) { sprintf(buf2,"%llu",(m_docId) ); if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false; + // hash + + return true; } @@ -28506,10 +28603,19 @@ bool XmlDoc::hashNumber ( char *beginBuf , // . this now allows for commas in numbers like "1,500.62" float f = atof2 ( p , bufEnd - p ); - return hashNumber2 ( f , hi ); + if ( ! hashNumber2 ( f , hi , "gbsortby" ) ) + return false; + + // also hash in reverse order for sorting from low to high + f = -1.0 * f; + + if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) ) + return false; + + return true; } -bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) { +bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) { // prefix is something like price. like the meta "name" or // the json name with dots in it like "product.info.price" or something @@ -28523,7 +28629,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) { // combine prefix hash with a special hash to make it unique to avoid // collisions. this is the "TRUE" prefix. 
- long long truePrefix64 = hash64n ( "gbsortby"); + long long truePrefix64 = hash64n ( sortByStr ); // "gbsortby"); // hash with the "TRUE" prefix long long ph2 = hash64 ( nameHash , truePrefix64 ); @@ -28534,7 +28640,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) { key144_t k; g_posdb.makeKey ( &k , ph2 , - 0LL,//docid + 0,//docid 0,// word pos # 0,// densityRank , // 0-15 0 , // MAXDIVERSITYRANK @@ -28554,9 +28660,25 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) { false, // syn? false ); // delkey? + //long long final = hash64n("products.offerprice",0); + //long long prefix = hash64n("gbsortby",0); + //long long h64 = hash64 ( final , prefix); + //if ( ph2 == h64 ) + // log("hey: got offer price"); + // now set the float in that key g_posdb.setFloat ( &k , f ); + // HACK: this bit is ALWAYS set by Posdb::makeKey() to 1 + // so that we can b-step into a posdb list and make sure + // we are aligned on a 6 byte or 12 byte key, since they come + // in both sizes. but for this, hack it off to tell + // addTable144() that we are a special posdb key, a "numeric" + // key that has a float stored in it. then it will NOT + // set the siterank and langid bits which throw our sorting + // off!! + g_posdb.setAlignmentBit ( &k , 0 ); + // sanity float t = g_posdb.getFloat ( &k ); if ( t != f ) { char *xx=NULL;*xx=0; } @@ -43553,12 +43675,49 @@ SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) { //void XmlDoc::getGigabitExcerpts ( ) { //} + +// . the products and image types are listed as arrays in the json object. +// . so go to those first if there... +char *getFirstJSONObject ( char *p , + long niceness , + bool *isProduct , + bool *isImage ) { + + // do we have a "products": array? 
+ char *needle = ",\"products\":["; + char *s = strstr(p,needle); + + *isProduct = false; + *isImage = false; + + // return ptr to first product if there + if ( s ) { + *isProduct = true; + return s + gbstrlen(needle); + } + + QUICKPOLL ( niceness ); + + // images? + needle = ",\"images\":["; + s = strstr(p,needle); + // return ptr to first image if there + if ( s ) { + *isImage = true; + return s + gbstrlen(needle); + } + + // default to just that json otherwise + return p; +} + + // . advance p to skip over the json object it is pointing to and return // ptr to the following json object // . deal with nested {}'s // . basically skips over current json object in a list of json objects to // point to the next brother object -char *getNextJSONObject ( char *p ) { +char *getNextJSONObject ( char *p , long niceness ) { // otherwise, *p must be { for ( ; *p && *p != '{' ; p++ ); // empty? @@ -43571,6 +43730,8 @@ char *getNextJSONObject ( char *p ) { bool inQuotes = false; // scan for ( ; *p ; p++ ) { + // breathe + QUICKPOLL ( niceness ); // escaping a quote? ignore quote then. if ( *p == '\\' && p[1] == '\"' ) { // skip two bytes then.. 
diff --git a/XmlDoc.h b/XmlDoc.h index 56388584..808b786c 100644 --- a/XmlDoc.h +++ b/XmlDoc.h @@ -752,6 +752,7 @@ class XmlDoc { bool hashDMOZCategories ( class HashTableX *table ) ; bool hashLinks ( class HashTableX *table ) ; bool hashUrl ( class HashTableX *table ) ; + bool hashDateNumbers ( class HashTableX *tt ) ; bool hashSections ( class HashTableX *table ) ; bool hashIncomingLinkText ( class HashTableX *table , bool hashAnomalies , @@ -854,7 +855,9 @@ class XmlDoc { long bufLen , class HashInfo *hi ) ; - bool hashNumber2 ( float f , class HashInfo *hi ) ; + bool hashNumber2 ( float f , + class HashInfo *hi , + char *gbsortByStr ) ; // print out for PageTitledb.cpp and PageParser.cpp bool printDoc ( class SafeBuf *pbuf ); @@ -1487,6 +1490,12 @@ class XmlDoc { char m_isWWWDup; char m_calledMsg0b; Url m_tmpUrl; + + // hack stuff: + SafeBuf m_tmpBuf; + bool m_isJsonProduct; + bool m_isJsonImage; + SafeBuf m_tmpsb1; SafeBuf m_tmpsb2; SafeBuf m_turkBuf;