Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

This commit is contained in:
Matt Wells 2013-11-13 13:27:45 -08:00
commit 9e77f1b2f6
18 changed files with 601 additions and 44 deletions

View File

@ -331,3 +331,59 @@ void Json::test ( ) {
return;
}
// . build this item's full dotted name, like "meta.twitter.title", in nameBuf
// . names are gathered leaf-first by climbing m_parent, then written out
//   root-first and joined with periods
// . any ':' in the final name is turned into '.' since ':' is reserved
// . returns false if a write into nameBuf fails, true otherwise
bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
	// start from an empty buffer
	nameBuf.reset();
	// the names we collect on the way up, leaf first
	char *names[20];
	long  count    = 0;
	char *prevName = NULL;
	JsonItem *node = this;
	while ( node ) {
		char *nm = node->m_name;
		// step to the parent now so every "continue" below advances
		node = node->m_parent;
		// items with no name or an empty name contribute nothing
		if ( ! nm || ! nm[0] ) continue;
		// same name pointer as the one just recorded? that happens
		// with arrays, where an element carries the array's name:
		// "dupname":[{"a":"b"},{"c":"d"}]
		if ( nm == prevName ) continue;
		prevName       = nm;
		names[count++] = nm;
		// give up climbing once we have collected too many
		if ( count >= 15 ) {
			log("build: too many names in json tag");
			break;
		}
	}
	// emit outermost name first, a period after each one
	for ( long k = count - 1 ; k >= 0 ; k-- ) {
		if ( ! nameBuf.safeStrcpy ( names[k] ) ) return false;
		if ( ! nameBuf.pushChar('.') ) return false;
	}
	// drop the trailing period
	nameBuf.removeLastChar('.');
	// terminate the string
	if ( ! nameBuf.nullTerm() ) return false;
	// ':' is reserved, so rewrite each one as '.'
	char *c = nameBuf.getBufStart();
	while ( *c ) {
		if ( *c == ':' ) *c = '.';
		c++;
	}
	return true;
}
// is this json item in an array of json items?
bool JsonItem::isInArray ( ) {
JsonItem *p = this;//ji;
for ( ; p ; p = p->m_parent ) {
// empty name? it's just a "value item" then, i guess.
//if ( ! p->m_name ) continue;
//if ( ! p->m_name[0] ) continue;
if ( p->m_type == JT_ARRAY ) return true;
}
return false;
}

4
Json.h
View File

@ -51,6 +51,10 @@ class JsonItem {
return (char *)this + sizeof(JsonItem);
};
// like acme.product.offerPrice if "acme:{product:{offerprice:1.23}}"
bool getCompoundName ( SafeBuf &nameBuf ) ;
bool isInArray ( );
};

View File

@ -738,6 +738,7 @@ public:
char **lastKeyPtr ) ;
void printTitledbList ( RdbList *list , SafeBuf *sb ,
char **lastKeyPtr );
bool printJsonItemInCsv ( char *json , SafeBuf *sb ) ;
char m_fmt;
Msg4 m_msg4;
@ -751,6 +752,8 @@ public:
bool m_printedEndingBracket;
bool m_printedItem;
bool m_needHeaderRow;
bool m_needsMime;
char m_rdbId;
bool m_downloadJSON;
@ -810,10 +813,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
fmt = FMT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.xml" ) ) ) {
else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_XML;
fmt = FMT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
@ -886,6 +889,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
st->m_printedItem = false;
st->m_printedEndingBracket = false;
// for csv...
st->m_needHeaderRow = true;
// debug
//log("mnew1: st=%lx",(long)st);
@ -1027,6 +1033,8 @@ bool StateCD::sendList ( ) {
ct = "text/xml";
if ( m_fmt == FMT_TXT )
ct = "text/plain";
if ( m_fmt == FMT_CSV )
ct = "text/csv";
// . if we haven't yet sent an http mime back to the user
// then do so here, the content-length will not be in there
@ -1557,22 +1565,34 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// get the json content
char *json = xd.ptr_utf8Content;
if ( m_printedItem )
sb->safePrintf("\n,\n");
// empty?
if ( xd.size_utf8Content <= 1 )
continue;
m_printedItem = true;
// if not json, just print the json item out in csv
// moved into PageResults.cpp...
//if ( m_fmt == FMT_CSV ) {
// printJsonItemInCsv ( json , sb );
// continue;
//}
// just print that out. encode \n's and \r's back to \\n \\r
// and backslash to a \\ ...
// but if they originally had a \u<backslash> encoding and
// we made into utf8, do not put that back into the \u
// encoding because it is not necessary.
// print in json
if ( m_printedItem )
sb->safePrintf("\n,\n");
m_printedItem = true;
if ( ! sb->safeStrcpyPrettyJSON ( json ) )
log("diffbot: error printing json in dump");
// separate each JSON object with \n i guess
//sb->pushChar('\n');
}
}
@ -3498,20 +3518,77 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</td><td>"
"<a href=/crawlbot/download/%s_data.json>"
"json</a>"
"&nbsp; "
"<a href=/crawlbot/download/%s_data.xml>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Download Products:</b> "
"</td><td>"
// make it search.csv so excel opens it
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
"c=%s&n=10000000&rand=%llu&id=1&"
"q=gbrevsortby%%3AofferPrice&"
"prepend=type%%3Ajson"
//"+type%%3Aproduct%%7C"
">"
"csv</a>"
" &nbsp; "
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
"c=%s&n=10000000&rand=%llu&id=1&"
"q=gbrevsortby%%3AofferPrice&"
"prepend=type%%3Ajson"
">"
"html</a>"
"xml</a>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Download Urls:</b> "
"</td><td>"
"<a href=/crawlbot/download/%s_urls.csv>"
"csv</a>"
//
"</td>"
"</tr>"
"<tr>"
"<td><b>Latest Objects:</b> "
"</td><td>"
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
"c=%s&n=10&rand=%llu&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
">"
"csv</a>"
" &nbsp; "
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
"c=%s&n=10rand=%llu&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
">"
"html</a>"
"</td>"
"</tr>"
"<tr>"
"<td><b>Latest Products:</b> "
"</td><td>"
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
"c=%s&n=10&rand=%llu&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson+type%%3Aproduct"
">"
"csv</a>"
" &nbsp; "
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
"c=%s&n=10&rand=%llu&id=1&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson+type%%3Aproduct"
">"
"html</a>"
"</td>"
"</tr>"
@ -3648,11 +3725,38 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cr->m_coll
, cr->m_coll
, rand64
// download products html
, cr->m_coll
, rand64
//, cr->m_coll
//, cr->m_coll
//, cr->m_coll
, cr->m_coll
// latest objects in html
, cr->m_coll
, rand64
// latest objects in csv
, cr->m_coll
, rand64
// latest products in html
, cr->m_coll
, rand64
// latest products in csv
, cr->m_coll
, rand64
, cr->m_coll
, cr->m_collectiveRespiderFrequency
@ -3878,6 +3982,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
char *ct = "text/html";
if ( fmt == FMT_JSON ) ct = "application/json";
if ( fmt == FMT_XML ) ct = "text/xml";
if ( fmt == FMT_CSV ) ct = "text/csv";
// this could be in html json or xml
return g_httpServer.sendDynamicPage ( socket,

View File

@ -52,6 +52,7 @@ public:
long m_numDocIds;
long long m_took; // how long it took to get the results
HttpRequest m_hr;
bool m_printedHeaderRow;
};
static int printResult ( SafeBuf &sb,
@ -60,6 +61,8 @@ static int printResult ( SafeBuf &sb,
CollectionRec *cr ,
char *qe ) ;
bool printJsonItemInCsv ( char *json , SafeBuf *sb , bool *printedHeaderRow ) ;
bool printPairScore ( SafeBuf &sb , SearchInput *si , PairScore *ps ,
Msg20Reply *mr , Msg40 *msg40 , bool first ) ;
@ -80,6 +83,7 @@ bool sendReply ( State0 *st , char *reply ) {
char *ct = "text/html";
if ( si && si->m_format == FORMAT_XML ) ct = "text/xml";
if ( si && si->m_format == FORMAT_JSON ) ct = "application/json";
if ( si && si->m_format == FORMAT_CSV ) ct = "text/csv";
char *charset = "utf-8";
// . filter anything < 0x20 to 0x20 to keep XML legal
@ -466,7 +470,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// limit here
long maxpp = cr->m_maxSearchResultsPerQuery ;
if ( si->m_docsWanted > maxpp ) si->m_docsWanted = maxpp;
if ( si->m_docsWanted > maxpp &&
// disable serp max per page for custom crawls
! cr->m_isCustomCrawl )
si->m_docsWanted = maxpp;
st->m_numDocIds = si->m_docsWanted;
@ -492,6 +499,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
st->m_gotAds = false;
st->m_gotSpell = false;
// reset
st->m_printedHeaderRow = false;
long ip = s->m_ip;
long uipLen;
char *uip = hr->getString("uip", &uipLen, NULL);
@ -1720,6 +1730,19 @@ static int printResult ( SafeBuf &sb,
Msg20 *m20 = msg40->m_msg20[ix];
Msg20Reply *mr = m20->m_r;
if ( si->m_format == FORMAT_CSV &&
mr->ptr_content &&
mr->m_contentType == CT_JSON ) {
// parse it up
char *json = mr->ptr_content;
// only print header row once, so pass in that flag
printJsonItemInCsv ( json , &sb , &st->m_printedHeaderRow );
return true;
}
// just print cached web page?
if ( mr->ptr_content ) {
sb.safeStrcpy ( mr->ptr_content );
@ -4680,3 +4703,108 @@ bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) {
return true;
}
*/
#include "Json.h"
bool printJsonItemInCsv ( char *json , SafeBuf *sb , bool *printedHeaderRow ) {
// parse the json
Json jp;
jp.parseJsonStringIntoJsonItems ( json );
// . TODO: index individual "Products":[...] as each an
// individual title rec.
SafeBuf nameBuf;
bool firstOne = true;
JsonItem *ji;
////
//
// print header row in csv
//
////
for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) {
if ( *printedHeaderRow )
break;
// skip if not number or string
if ( ji->m_type != JT_NUMBER &&
ji->m_type != JT_STRING )
continue;
// if in an array, do not print! csv is not
// good for arrays... like "media":[....] . that
// one might be ok, but if the elements in the
// array are not simple types, like, if they are
// unflat json objects then it is not well suited
// for csv.
if ( ji->isInArray() ) continue;
if ( ! firstOne ) sb->pushChar(',');
firstOne = false;
ji->getCompoundName ( nameBuf );
//
// product.offerprice
//
sb->csvEncode ( nameBuf.getBufStart() , nameBuf.getLength() );
}
if ( ! *printedHeaderRow ) {
sb->pushChar('\n');
sb->nullTerm();
*printedHeaderRow = true;
}
firstOne = true;
///////
//
// print json item in csv
//
///////
for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) {
// skip if not number or string
if ( ji->m_type != JT_NUMBER &&
ji->m_type != JT_STRING )
continue;
// skip if not well suited for csv (see above comment)
if ( ji->isInArray() ) continue;
if ( ! firstOne ) sb->pushChar(',');
firstOne = false;
if ( ji->m_type == JT_NUMBER ) {
// print numbers without double quotes
if ( ji->m_valueDouble *10000000.0 ==
(double)ji->m_valueLong * 10000000.0 )
sb->safePrintf("%li",ji->m_valueLong);
else
sb->safePrintf("%f",ji->m_valueDouble);
continue;
}
// print the value
sb->pushChar('\"');
sb->csvEncode ( ji->getValue() , ji->getValueLen() );
sb->pushChar('\"');
}
if ( ! firstOne )
sb->pushChar('\n');
sb->nullTerm();
return true;
}

View File

@ -344,6 +344,8 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
path = "admin/inject"; pathLen = gbstrlen(path); }
if ( pathLen == 9 && strncmp ( path , "index.php" , 9 ) == 0 ) {
path = "search"; pathLen = gbstrlen(path); }
if ( pathLen == 10 && strncmp ( path , "search.csv" , 10 ) == 0 ) {
path = "search"; pathLen = gbstrlen(path); }
// if it is like /GA/Atlanta then call sendPageResults
// and that should be smart enough to set the m_where in

View File

@ -14964,18 +14964,19 @@ void Parms::init ( ) {
m->m_sprpp = 0;
m++;
/*
m->m_title = "format of the returned search results";
m->m_desc = "X is 0 to get back results in regular html, 1 to "
"get back results in XML, 2 for JSON.";
m->m_def = "0";
m->m_soff = (char *)&si.m_format - y;
m->m_type = TYPE_CHAR;
m->m_soff = (char *)&si.m_formatStr - y;
m->m_type = TYPE_STRING;//CHAR;
m->m_sparm = 1;
m->m_scgi = "format";
m->m_smin = 0;
m->m_smax = 12;
m++;
*/
m->m_title = "highlight query terms in summaries.";
m->m_desc = "Use to disable or enable "

View File

@ -4115,7 +4115,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_wikiPhraseId = qw->m_wikiPhraseId;
qti->m_quotedStartId = qw->m_quoteStart;
// is it gbsortby:?
if ( qt->m_fieldCode == FIELD_GBSORTBY )
if ( qt->m_fieldCode == FIELD_GBSORTBY ||
qt->m_fieldCode == FIELD_GBREVSORTBY )
m_sortByTermNum = i;
// count
long nn = 0;
@ -4237,6 +4238,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
// they have a float stored there for sorting etc.
if (qt->m_fieldCode == FIELD_GBSORTBY )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBREVSORTBY )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMIN )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMAX )

11
Posdb.h
View File

@ -206,6 +206,15 @@ class Posdb {
// read the float that is stored in the key starting 2 bytes past vkp
float getFloat ( void *vkp ) {
	char  *bytes = (char *)vkp;
	float *slot  = (float *)( bytes + 2 );
	return *slot;
}
// . set (val non-zero) or clear (val zero) the 0x02 bit in the second
//   byte of the key at vkp
// . all other bits of that byte are left as they were
void setAlignmentBit ( void *vkp , char val ) {
	char &flags = ((char *)vkp)[1];
	if ( val ) flags = flags | 0x02;
	else       flags = flags & 0xfd;
}
// true when the 0x02 bit in the second byte of the key at vkp is zero
bool isAlignmentBitClear ( void *vkp ) {
	char flags = ((char *)vkp)[1];
	return ! ( flags & 0x02 );
}
void makeStartKey ( void *kp, long long termId ,
long long docId=0LL){
@ -436,7 +445,7 @@ class PosdbList : public RdbList {
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
// max # search results that can be viewed without using TopTree
#define MAX_RESULTS 1000
//#define MAX_RESULTS 1000
class PosdbTable {

View File

@ -2200,6 +2200,7 @@ bool Query::setQWords ( char boolFlag ,
fieldCode == FIELD_ISCLEAN ||
fieldCode == FIELD_QUOTA ||
fieldCode == FIELD_GBSORTBY ||
fieldCode == FIELD_GBREVSORTBY ||
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ||
fieldCode == FIELD_GBAD ) {
@ -2217,6 +2218,7 @@ bool Query::setQWords ( char boolFlag ,
// i've decided not to make
// gbsortby:products.offerPrice case sensitive
if ( fieldCode == FIELD_GBSORTBY ||
fieldCode == FIELD_GBREVSORTBY ||
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX )
wid = hash64Lower_utf8 ( w , wlen , 0LL );
@ -3044,6 +3046,7 @@ struct QueryField g_fields[] = {
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
{"gbsortby", FIELD_GBSORTBY, false,""},
{"gbrevsortby", FIELD_GBREVSORTBY, false,""},
{"gbnumbermin", FIELD_GBNUMBERMIN, false,""},
{"gbnumbermax", FIELD_GBNUMBERMAX, false,""},

View File

@ -105,8 +105,9 @@ typedef unsigned long long qvec_t;
#define FIELD_GBDOCID 52
#define FIELD_GBCONTENTHASH 53 // for deduping at spider time
#define FIELD_GBSORTBY 54 // i.e. sortby:price -> numeric termlist
#define FIELD_GBNUMBERMIN 55
#define FIELD_GBNUMBERMAX 56
#define FIELD_GBREVSORTBY 55 // i.e. sortby:price -> low to high
#define FIELD_GBNUMBERMIN 56
#define FIELD_GBNUMBERMAX 57
#define FIELD_GBOTHER 92

View File

@ -3169,3 +3169,49 @@ bool SafeBuf::htmlDecode ( char *src,
// good to go
return true;
}
// overwrite every byte equal to "src" in the first m_length bytes of the
// buffer with "dst"
void SafeBuf::replaceChar ( char src , char dst ) {
	char *cur  = m_buf;
	char *stop = m_buf + m_length;
	while ( cur < stop ) {
		if ( *cur == src ) *cur = dst;
		cur++;
	}
}
// encode a double quote char to two double quote chars
bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
if ( ! s ) return true;
// assume all chars are double quotes and will have to be encoded
long need = len * 2 + 1;
if ( ! reserve ( need ) ) return false;
// tmp vars
char *dst = m_buf + m_length;
//char *dstEnd = m_buf + m_capacity;
// scan through all
char *send = s + len;
for ( ; s < send ; s++ ) {
// breathe
QUICKPOLL ( niceness );
// convert it?
if ( *s == '\"' ) {
*dst++ = '\"';
*dst++ = '\"';
continue;
}
//if ( *s == '\\' ) {
// *dst++ = '\\';
// *dst++ = '\\';
// continue;
//}
*dst++ = *s;
}
m_length += dst - (m_buf + m_length);
nullTerm();
return true;
}

View File

@ -97,6 +97,9 @@ struct SafeBuf {
bool safeStrcpy ( char *s ) ;
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;
bool csvEncode ( char *s , long len , long niceness = 0 );
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"
@ -144,6 +147,7 @@ struct SafeBuf {
char *t , long tlen ,
long niceness ,
long startOff = 0 );
void replaceChar ( char src , char dst );
bool copyToken(char* s);;
//output encoding
bool setEncoding(short cs);

View File

@ -354,8 +354,16 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
return log("query: unable to strcpy whitelist");
char format = FORMAT_HTML;
// what format should search results be in? default is html
long format = r->getLong("format", FORMAT_HTML );
char *formatStr = r->getString("format", NULL );
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
// support old api &xml=1 to mean &format=1
if ( r->getLong("xml",0) ) {
@ -367,6 +375,10 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
format = FORMAT_JSON;
}
if ( r->getLong("csv",0) ) {
format = FORMAT_CSV;
}
// now override automatic defaults for special cases
if ( format != FORMAT_HTML ) {

View File

@ -46,6 +46,7 @@ class TopicGroup {
#define FORMAT_HTML 0
#define FORMAT_XML 1
#define FORMAT_JSON 2
#define FORMAT_CSV 3
class SearchInput {
@ -217,6 +218,10 @@ class SearchInput {
// so can search results
//long m_xml; // msg40
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON
//long m_formatStrLen;
//char *m_formatStr;
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
char m_format;
// this should be part of the key because it will affect the results!

View File

@ -47,6 +47,11 @@ Wiktionary::~Wiktionary () {
bool Wiktionary::test ( ) {
// test words parsing here
//Words w;
//w.set9 ("get $4,500.00 now",0);
// test it out!
char *str = "love";//pie"; //forsake";
//long long wid = hash64Lower_utf8(str);

View File

@ -419,10 +419,11 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
// comma is ok if like ,ddd!d
if ( s[i]==',' &&
j-i <= 3 &&
is_digit(s[i-1]) &&
(j==i-1 || is_digit(s[i-2]) ) &&
(j==i-2 || is_digit(s[i-3]) ) ) {
i-j <= 3 &&
is_digit(s[i-1]) ) {
// if word so far is 2 or 3 chars, make sure digits
if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
// scan forward
subloop:
if ( s[i] == ',' &&
@ -445,6 +446,8 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
while ( is_digit(s[i]) ) i++;
}
nogo:
// allow for words like we're dave's and i'm
if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
i++;

View File

@ -99,7 +99,11 @@ static bool getWordPosVec ( Words *words ,
static void getMetaListWrapper ( void *state ) ;
char *getNextJSONObject ( char *p ) ;
char *getFirstJSONObject ( char *p ,
long niceness ,
bool *isProduct ,
bool *isImage ) ;
char *getNextJSONObject ( char *p , long niceness ) ;
XmlDoc::XmlDoc() {
for ( long i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
@ -19286,10 +19290,18 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
}
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
// init cursor to first json object
m_diffbotObj = m_diffbotReply.getBufStart();
//m_diffbotObj = m_diffbotReply.getBufStart();
char *rp = m_diffbotReply.getBufStart();
// we now parse the array of products out of the
// diffbot reply. each product is an item/object.
m_diffbotObj = getFirstJSONObject ( rp ,
m_niceness ,
&m_isJsonProduct ,
&m_isJsonImage );
m_diffbotJSONCount = 0;
// set end of it
m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj );
m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj,
m_niceness);
// temp null it
m_diffbotSavedChar = *m_diffbotObjEnd;
*m_diffbotObjEnd = '\0';
@ -19322,6 +19334,33 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// copy the content
m_tmpBuf.reset();
// how much
long clen = m_diffbotObjEnd - m_diffbotObj;
// include \0
long need = clen + 1;
// insert ,"type":"product" or
// possibly ,"type":"image" to make it kosher
need += 32;
// reserve the mem
if ( ! m_tmpBuf.reserve ( need ) )
return NULL;
// sanity
if ( m_diffbotObj[0] != '{' ) { char *xx=NULL;*xx=0;}
// copy first '{'
m_tmpBuf.pushChar(m_diffbotObj[0]);
// HACK: insert the type: thing here
if ( m_isJsonProduct )
m_tmpBuf.safePrintf("\"type\":\"product\",");
else if ( m_isJsonImage )
m_tmpBuf.safePrintf("\"type\":\"image\",");
// do the copy of the rest, title, etc.
m_tmpBuf.safeMemcpy ( m_diffbotObj+1 , clen-1 );
// null term
m_tmpBuf.nullTerm();
// set this
if (!m_dx->set4 ( &sreq ,
NULL ,
@ -19332,7 +19371,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// niceness of 0!!!!
m_niceness, // 1 ,
// inject this content
m_diffbotObj, // content ,
m_tmpBuf.getBufStart(), // content ,
false, // deleteFromIndex ,
0, // forcedIp ,
CT_JSON, // contentType ,
@ -19347,6 +19386,13 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
m_dx->m_usePlacedb = false;
m_dx->m_useLinkdb = false;
m_dx->m_isChildDoc = true;
// we like to sort json objects using
// 'gbsortby:spiderdate' query to get the most
// recent json objects, so this must be valid
if ( m_spideredTimeValid ) {
m_dx->m_spideredTimeValid = true;
m_dx->m_spideredTime = m_spideredTime;
}
m_dx->m_isDiffbotJSONObject = true;
}
@ -19377,7 +19423,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// we successfully index the json object, skip to next one
m_diffbotObj = m_diffbotObjEnd;
// point to next json object again
m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj );
m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj ,
m_niceness );
// re-save
m_diffbotSavedChar = *m_diffbotObjEnd;
// but gotta set this crap back
@ -22065,10 +22112,14 @@ bool XmlDoc::addTable144 ( HashTableX *tt1 , bool nosplit ) {
// store it as is
memcpy ( m_p , kp , sizeof(key144_t) );
// sanity check
//long long final = 202176590884090LL;
//final &= TERMID_MASK;
//if ( g_posdb.getTermId(kp) == final )
// log("hey");
//long long final = hash64n("products.offerprice",0);
//long long prefix = hash64n("gbsortby",0);
//long long h64 = hash64 ( final , prefix);
//h64 &= TERMID_MASK;
//if ( g_posdb.getTermId(kp) == h64 ) {
// log("hey: docid=%lli float=%f",m_docId,
// g_posdb.getFloat(kp) );
//}
/*
// get the score
long score = tt1->getScoreFromSlot ( i ) ;
@ -22091,10 +22142,25 @@ bool XmlDoc::addTable144 ( HashTableX *tt1 , bool nosplit ) {
*/
// this was zero when we added these keys to zero, so fix it
g_posdb.setDocIdBits ( m_p , m_docId );
// if this is a numeric field we do not want to set
// the siterank or langid bits because it will mess up
// sorting by the float which is basically in the position
// of the word position bits.
if ( g_posdb.isAlignmentBitClear ( m_p ) ) {
// make sure it is set again. it was just cleared
// to indicate that this key contains a float
// like a price or something, and we should not
// set siterank or langid so that its termlist
// remains sorted just by that float
g_posdb.setAlignmentBit ( m_p , 1 );
}
// otherwise, set the siterank and langid
else {
// this too
g_posdb.setSiteRankBits ( m_p , siteRank );
// set language here too
g_posdb.setLangIdBits ( m_p , m_langId );
}
// advance over it
m_p += sizeof(key144_t);
}
@ -22839,6 +22905,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// country?
if ( ! hashCountry ( table ) ) return NULL;
if ( ! hashTagRec ( table ) ) return NULL;
// hash for gbsortby:gbspiderdate
if ( ! hashDateNumbers ( table ) ) return NULL;
// and the json itself
return hashJSON ( table );
}
@ -22880,6 +22948,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if ( ! hashLinks ( table ) ) return NULL;
if ( ! hashContentType ( table ) ) return NULL;
if ( ! hashUrl ( table ) ) return NULL;
if ( ! hashDateNumbers ( table ) ) return NULL;
if ( ! hashMetaTags ( table ) ) return NULL;
if ( ! hashMetaZip ( table ) ) return NULL;
if ( ! hashDMOZCategories( table ) ) return NULL;
@ -23071,6 +23140,31 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
return true;
}
// . hash the spidered time as a number under the "gbspiderdate" prefix so
//   it can be used with gbsortby: and gbrevsortby:
// . try 'gbsortby:gbspiderdate' as your query to see this in action
// . returns true without hashing anything if the spidered time is not valid
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {

	// nothing to hash unless we have a valid spidered time
	if ( ! m_spideredTimeValid ) return true;

	// describe the term we are adding
	HashInfo hi;
	hi.m_tt        = tt;
	hi.m_prefix    = "gbspiderdate";
	hi.m_desc      = "last spidered date";
	// this doesn't matter, it's a numeric field
	hi.m_hashGroup = 0;

	// hashNumber() takes the number as text
	char numBuf[64];
	long numLen = sprintf ( numBuf , "%lu", m_spideredTime );

	return hashNumber ( numBuf , numBuf , numLen , &hi );
}
bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
setStatus ( "hashing meta zip" );
@ -23760,6 +23854,9 @@ bool XmlDoc::hashUrl ( HashTableX *tt ) {
sprintf(buf2,"%llu",(m_docId) );
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
// hash
return true;
}
@ -28506,10 +28603,19 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
// . this now allows for commas in numbers like "1,500.62"
float f = atof2 ( p , bufEnd - p );
return hashNumber2 ( f , hi );
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
return false;
// also hash in reverse order for sorting from low to high
f = -1.0 * f;
if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
return false;
return true;
}
bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
// prefix is something like price. like the meta "name" or
// the json name with dots in it like "product.info.price" or something
@ -28523,7 +28629,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
// combine prefix hash with a special hash to make it unique to avoid
// collisions. this is the "TRUE" prefix.
long long truePrefix64 = hash64n ( "gbsortby");
long long truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
// hash with the "TRUE" prefix
long long ph2 = hash64 ( nameHash , truePrefix64 );
@ -28534,7 +28640,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
key144_t k;
g_posdb.makeKey ( &k ,
ph2 ,
0LL,//docid
0,//docid
0,// word pos #
0,// densityRank , // 0-15
0 , // MAXDIVERSITYRANK
@ -28554,9 +28660,25 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
false, // syn?
false ); // delkey?
//long long final = hash64n("products.offerprice",0);
//long long prefix = hash64n("gbsortby",0);
//long long h64 = hash64 ( final , prefix);
//if ( ph2 == h64 )
// log("hey: got offer price");
// now set the float in that key
g_posdb.setFloat ( &k , f );
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
// so that we can b-step into a posdb list and make sure
// we are aligned on a 6 byte or 12 byte key, since they come
// in both sizes. but for this, hack it off to tell
// addTable144() that we are a special posdb key, a "numeric"
// key that has a float stored in it. then it will NOT
// set the siterank and langid bits which throw our sorting
// off!!
g_posdb.setAlignmentBit ( &k , 0 );
// sanity
float t = g_posdb.getFloat ( &k );
if ( t != f ) { char *xx=NULL;*xx=0; }
@ -43553,12 +43675,49 @@ SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) {
//void XmlDoc::getGigabitExcerpts ( ) {
//}
// . products and images are listed as arrays inside the json reply, so
//   look for one of those arrays first
// . returns a ptr just past the opening '[' of the ,"products":[ array if
//   present (and sets *isProduct), else just past the '[' of the
//   ,"images":[ array if present (and sets *isImage)
// . if neither is found, returns "p" itself with both flags false
char *getFirstJSONObject ( char *p ,
			   long niceness ,
			   bool *isProduct ,
			   bool *isImage ) {
	// assume neither until we find one
	*isProduct = false;
	*isImage   = false;

	// a "products": array takes priority
	char *productsKey = ",\"products\":[";
	char *hit = strstr ( p , productsKey );
	if ( hit ) {
		*isProduct = true;
		return hit + gbstrlen(productsKey);
	}

	QUICKPOLL ( niceness );

	// otherwise try an "images": array
	char *imagesKey = ",\"images\":[";
	hit = strstr ( p , imagesKey );
	if ( hit ) {
		*isImage = true;
		return hit + gbstrlen(imagesKey);
	}

	// default to the json as given
	return p;
}
// . advance p to skip over the json object it is pointing to and return
// ptr to the following json object
// . deal with nested {}'s
// . basically skips over current json object in a list of json objects to
// point to the next brother object
char *getNextJSONObject ( char *p ) {
char *getNextJSONObject ( char *p , long niceness ) {
// otherwise, *p must be {
for ( ; *p && *p != '{' ; p++ );
// empty?
@ -43571,6 +43730,8 @@ char *getNextJSONObject ( char *p ) {
bool inQuotes = false;
// scan
for ( ; *p ; p++ ) {
// breathe
QUICKPOLL ( niceness );
// escaping a quote? ignore quote then.
if ( *p == '\\' && p[1] == '\"' ) {
// skip two bytes then..

View File

@ -752,6 +752,7 @@ class XmlDoc {
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashDateNumbers ( class HashTableX *tt ) ;
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
@ -854,7 +855,9 @@ class XmlDoc {
long bufLen ,
class HashInfo *hi ) ;
bool hashNumber2 ( float f , class HashInfo *hi ) ;
bool hashNumber2 ( float f ,
class HashInfo *hi ,
char *gbsortByStr ) ;
// print out for PageTitledb.cpp and PageParser.cpp
bool printDoc ( class SafeBuf *pbuf );
@ -1487,6 +1490,12 @@ class XmlDoc {
char m_isWWWDup;
char m_calledMsg0b;
Url m_tmpUrl;
// hack stuff:
SafeBuf m_tmpBuf;
bool m_isJsonProduct;
bool m_isJsonImage;
SafeBuf m_tmpsb1;
SafeBuf m_tmpsb2;
SafeBuf m_turkBuf;