mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot
This commit is contained in:
commit
9e77f1b2f6
58
Json.cpp
58
Json.cpp
@ -330,4 +330,60 @@ void Json::test ( ) {
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// . build this item's full dotted name, like "meta.twitter.title", by
//   walking up the m_parent chain and joining the non-empty names
// . the result is stored, null terminated, in "nameBuf"
// . any ':' in the result is turned into '.' since ':' is reserved
// . returns false if a SafeBuf append/null-term call fails, else true
bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
	// start from an empty buffer (original note: this only zeroes
	// the length and does not free the memory -- not verifiable here)
	nameBuf.reset();
	// names are collected leaf-first while climbing to the root
	char *names[20];
	long  count = 0;
	char *prev  = NULL;
	for ( JsonItem *node = this ; node ; node = node->m_parent ) {
		char *name = node->m_name;
		// skip unnamed ancestors, and skip a name pointer identical
		// to the one just stored. that dup can happen with arrays:
		// "dupname":[{"a":"b"},{"c":"d"}]
		if ( ! name || ! name[0] || name == prev ) continue;
		prev = name;
		names[count++] = name;
		// stop climbing once we hold 15 names
		if ( count >= 15 ) {
			log("build: too many names in json tag");
			break;
		}
	}
	// emit root-first, each name followed by a period
	for ( long i = count - 1 ; i >= 0 ; i-- ) {
		if ( ! nameBuf.safeStrcpy ( names[i] ) ) return false;
		if ( ! nameBuf.pushChar('.') ) return false;
	}
	// drop the trailing period
	nameBuf.removeLastChar('.');
	// and null terminate
	if ( ! nameBuf.nullTerm() ) return false;
	// ':' is reserved, so map every one to '.'
	for ( char *c = nameBuf.getBufStart() ; *c ; c++ ) {
		if ( *c == ':' ) *c = '.';
	}
	return true;
}
|
||||
|
||||
// is this json item in an array of json items?
|
||||
bool JsonItem::isInArray ( ) {
|
||||
JsonItem *p = this;//ji;
|
||||
for ( ; p ; p = p->m_parent ) {
|
||||
// empty name? it's just a "value item" then, i guess.
|
||||
//if ( ! p->m_name ) continue;
|
||||
//if ( ! p->m_name[0] ) continue;
|
||||
if ( p->m_type == JT_ARRAY ) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
4
Json.h
4
Json.h
@ -51,6 +51,10 @@ class JsonItem {
|
||||
return (char *)this + sizeof(JsonItem);
|
||||
};
|
||||
|
||||
// like acme.product.offerPrice if "acme:{product:{offerprice:1.23}}"
|
||||
bool getCompoundName ( SafeBuf &nameBuf ) ;
|
||||
|
||||
bool isInArray ( );
|
||||
};
|
||||
|
||||
|
||||
|
127
PageCrawlBot.cpp
127
PageCrawlBot.cpp
@ -738,6 +738,7 @@ public:
|
||||
char **lastKeyPtr ) ;
|
||||
void printTitledbList ( RdbList *list , SafeBuf *sb ,
|
||||
char **lastKeyPtr );
|
||||
bool printJsonItemInCsv ( char *json , SafeBuf *sb ) ;
|
||||
|
||||
char m_fmt;
|
||||
Msg4 m_msg4;
|
||||
@ -751,6 +752,8 @@ public:
|
||||
bool m_printedEndingBracket;
|
||||
bool m_printedItem;
|
||||
|
||||
bool m_needHeaderRow;
|
||||
|
||||
bool m_needsMime;
|
||||
char m_rdbId;
|
||||
bool m_downloadJSON;
|
||||
@ -810,10 +813,10 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
fmt = FMT_JSON;
|
||||
downloadJSON = true;
|
||||
}
|
||||
else if ( ( xx = strstr ( path , "_data.xml" ) ) ) {
|
||||
else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
|
||||
rdbId = RDB_TITLEDB;
|
||||
downloadJSON = true;
|
||||
fmt = FMT_XML;
|
||||
fmt = FMT_CSV;
|
||||
}
|
||||
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
|
||||
rdbId = RDB_SPIDERDB;
|
||||
@ -886,6 +889,9 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
|
||||
st->m_printedItem = false;
|
||||
st->m_printedEndingBracket = false;
|
||||
|
||||
// for csv...
|
||||
st->m_needHeaderRow = true;
|
||||
|
||||
// debug
|
||||
//log("mnew1: st=%lx",(long)st);
|
||||
|
||||
@ -1027,6 +1033,8 @@ bool StateCD::sendList ( ) {
|
||||
ct = "text/xml";
|
||||
if ( m_fmt == FMT_TXT )
|
||||
ct = "text/plain";
|
||||
if ( m_fmt == FMT_CSV )
|
||||
ct = "text/csv";
|
||||
|
||||
// . if we haven't yet sent an http mime back to the user
|
||||
// then do so here, the content-length will not be in there
|
||||
@ -1557,22 +1565,34 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
||||
// get the json content
|
||||
char *json = xd.ptr_utf8Content;
|
||||
|
||||
if ( m_printedItem )
|
||||
sb->safePrintf("\n,\n");
|
||||
// empty?
|
||||
if ( xd.size_utf8Content <= 1 )
|
||||
continue;
|
||||
|
||||
m_printedItem = true;
|
||||
// if not json, just print the json item out in csv
|
||||
// moved into PageResults.cpp...
|
||||
//if ( m_fmt == FMT_CSV ) {
|
||||
// printJsonItemInCsv ( json , sb );
|
||||
// continue;
|
||||
//}
|
||||
|
||||
// just print that out. encode \n's and \r's back to \\n \\r
|
||||
// and backslash to a \\ ...
|
||||
// but if they originally had a \u<backslash> encoding and
|
||||
// we made into utf8, do not put that back into the \u
|
||||
// encoding because it is not necessary.
|
||||
|
||||
// print in json
|
||||
if ( m_printedItem )
|
||||
sb->safePrintf("\n,\n");
|
||||
|
||||
m_printedItem = true;
|
||||
|
||||
if ( ! sb->safeStrcpyPrettyJSON ( json ) )
|
||||
log("diffbot: error printing json in dump");
|
||||
|
||||
// separate each JSON object with \n i guess
|
||||
//sb->pushChar('\n');
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -3498,20 +3518,77 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"</td><td>"
|
||||
"<a href=/crawlbot/download/%s_data.json>"
|
||||
"json</a>"
|
||||
" "
|
||||
"<a href=/crawlbot/download/%s_data.xml>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Download Products:</b> "
|
||||
"</td><td>"
|
||||
// make it search.csv so excel opens it
|
||||
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
|
||||
"c=%s&n=10000000&rand=%llu&id=1&"
|
||||
"q=gbrevsortby%%3AofferPrice&"
|
||||
"prepend=type%%3Ajson"
|
||||
//"+type%%3Aproduct%%7C"
|
||||
">"
|
||||
"csv</a>"
|
||||
" "
|
||||
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
|
||||
"c=%s&n=10000000&rand=%llu&id=1&"
|
||||
"q=gbrevsortby%%3AofferPrice&"
|
||||
"prepend=type%%3Ajson"
|
||||
">"
|
||||
"html</a>"
|
||||
|
||||
"xml</a>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Download Urls:</b> "
|
||||
"</td><td>"
|
||||
|
||||
"<a href=/crawlbot/download/%s_urls.csv>"
|
||||
"csv</a>"
|
||||
//
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Latest Objects:</b> "
|
||||
"</td><td>"
|
||||
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
|
||||
"c=%s&n=10&rand=%llu&id=1&"
|
||||
"q=gbsortby%%3Agbspiderdate&"
|
||||
"prepend=type%%3Ajson"
|
||||
">"
|
||||
"csv</a>"
|
||||
" "
|
||||
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
|
||||
"c=%s&n=10rand=%llu&id=1&"
|
||||
"q=gbsortby%%3Agbspiderdate&"
|
||||
"prepend=type%%3Ajson"
|
||||
">"
|
||||
"html</a>"
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td><b>Latest Products:</b> "
|
||||
"</td><td>"
|
||||
"<a href=/search.csv?icc=1&format=csv&sc=0&dr=0&"
|
||||
"c=%s&n=10&rand=%llu&id=1&"
|
||||
"q=gbsortby%%3Agbspiderdate&"
|
||||
"prepend=type%%3Ajson+type%%3Aproduct"
|
||||
">"
|
||||
"csv</a>"
|
||||
" "
|
||||
"<a href=/search?icc=1&format=html&sc=0&dr=0&"
|
||||
"c=%s&n=10&rand=%llu&id=1&"
|
||||
"q=gbsortby%%3Agbspiderdate&"
|
||||
"prepend=type%%3Ajson+type%%3Aproduct"
|
||||
">"
|
||||
"html</a>"
|
||||
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
@ -3648,11 +3725,38 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
|
||||
|
||||
, cr->m_coll
|
||||
|
||||
, cr->m_coll
|
||||
, rand64
|
||||
|
||||
// download products html
|
||||
, cr->m_coll
|
||||
, rand64
|
||||
|
||||
//, cr->m_coll
|
||||
//, cr->m_coll
|
||||
//, cr->m_coll
|
||||
|
||||
, cr->m_coll
|
||||
|
||||
// latest objects in html
|
||||
, cr->m_coll
|
||||
, rand64
|
||||
|
||||
// latest objects in csv
|
||||
, cr->m_coll
|
||||
, rand64
|
||||
|
||||
|
||||
// latest products in html
|
||||
, cr->m_coll
|
||||
, rand64
|
||||
|
||||
// latest products in csv
|
||||
, cr->m_coll
|
||||
, rand64
|
||||
|
||||
|
||||
, cr->m_coll
|
||||
|
||||
, cr->m_collectiveRespiderFrequency
|
||||
@ -3878,6 +3982,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
char *ct = "text/html";
|
||||
if ( fmt == FMT_JSON ) ct = "application/json";
|
||||
if ( fmt == FMT_XML ) ct = "text/xml";
|
||||
if ( fmt == FMT_CSV ) ct = "text/csv";
|
||||
|
||||
// this could be in html json or xml
|
||||
return g_httpServer.sendDynamicPage ( socket,
|
||||
|
130
PageResults.cpp
130
PageResults.cpp
@ -52,6 +52,7 @@ public:
|
||||
long m_numDocIds;
|
||||
long long m_took; // how long it took to get the results
|
||||
HttpRequest m_hr;
|
||||
bool m_printedHeaderRow;
|
||||
};
|
||||
|
||||
static int printResult ( SafeBuf &sb,
|
||||
@ -60,6 +61,8 @@ static int printResult ( SafeBuf &sb,
|
||||
CollectionRec *cr ,
|
||||
char *qe ) ;
|
||||
|
||||
bool printJsonItemInCsv ( char *json , SafeBuf *sb , bool *printedHeaderRow ) ;
|
||||
|
||||
bool printPairScore ( SafeBuf &sb , SearchInput *si , PairScore *ps ,
|
||||
Msg20Reply *mr , Msg40 *msg40 , bool first ) ;
|
||||
|
||||
@ -80,6 +83,7 @@ bool sendReply ( State0 *st , char *reply ) {
|
||||
char *ct = "text/html";
|
||||
if ( si && si->m_format == FORMAT_XML ) ct = "text/xml";
|
||||
if ( si && si->m_format == FORMAT_JSON ) ct = "application/json";
|
||||
if ( si && si->m_format == FORMAT_CSV ) ct = "text/csv";
|
||||
char *charset = "utf-8";
|
||||
|
||||
// . filter anything < 0x20 to 0x20 to keep XML legal
|
||||
@ -466,7 +470,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
|
||||
// limit here
|
||||
long maxpp = cr->m_maxSearchResultsPerQuery ;
|
||||
if ( si->m_docsWanted > maxpp ) si->m_docsWanted = maxpp;
|
||||
if ( si->m_docsWanted > maxpp &&
|
||||
// disable serp max per page for custom crawls
|
||||
! cr->m_isCustomCrawl )
|
||||
si->m_docsWanted = maxpp;
|
||||
|
||||
st->m_numDocIds = si->m_docsWanted;
|
||||
|
||||
@ -492,6 +499,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
|
||||
st->m_gotAds = false;
|
||||
st->m_gotSpell = false;
|
||||
|
||||
// reset
|
||||
st->m_printedHeaderRow = false;
|
||||
|
||||
long ip = s->m_ip;
|
||||
long uipLen;
|
||||
char *uip = hr->getString("uip", &uipLen, NULL);
|
||||
@ -1720,6 +1730,19 @@ static int printResult ( SafeBuf &sb,
|
||||
Msg20 *m20 = msg40->m_msg20[ix];
|
||||
Msg20Reply *mr = m20->m_r;
|
||||
|
||||
|
||||
if ( si->m_format == FORMAT_CSV &&
|
||||
mr->ptr_content &&
|
||||
mr->m_contentType == CT_JSON ) {
|
||||
// parse it up
|
||||
char *json = mr->ptr_content;
|
||||
// only print header row once, so pass in that flag
|
||||
printJsonItemInCsv ( json , &sb , &st->m_printedHeaderRow );
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// just print cached web page?
|
||||
if ( mr->ptr_content ) {
|
||||
sb.safeStrcpy ( mr->ptr_content );
|
||||
@ -4680,3 +4703,108 @@ bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) {
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
|
||||
#include "Json.h"
|
||||
|
||||
bool printJsonItemInCsv ( char *json , SafeBuf *sb , bool *printedHeaderRow ) {
|
||||
|
||||
// parse the json
|
||||
Json jp;
|
||||
jp.parseJsonStringIntoJsonItems ( json );
|
||||
|
||||
// . TODO: index individual "Products":[...] as each an
|
||||
// individual title rec.
|
||||
|
||||
SafeBuf nameBuf;
|
||||
bool firstOne = true;
|
||||
|
||||
JsonItem *ji;
|
||||
|
||||
////
|
||||
//
|
||||
// print header row in csv
|
||||
//
|
||||
////
|
||||
for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) {
|
||||
|
||||
if ( *printedHeaderRow )
|
||||
break;
|
||||
|
||||
// skip if not number or string
|
||||
if ( ji->m_type != JT_NUMBER &&
|
||||
ji->m_type != JT_STRING )
|
||||
continue;
|
||||
|
||||
// if in an array, do not print! csv is not
|
||||
// good for arrays... like "media":[....] . that
|
||||
// one might be ok, but if the elements in the
|
||||
// array are not simple types, like, if they are
|
||||
// unflat json objects then it is not well suited
|
||||
// for csv.
|
||||
if ( ji->isInArray() ) continue;
|
||||
|
||||
if ( ! firstOne ) sb->pushChar(',');
|
||||
|
||||
firstOne = false;
|
||||
|
||||
ji->getCompoundName ( nameBuf );
|
||||
|
||||
//
|
||||
// product.offerprice
|
||||
//
|
||||
sb->csvEncode ( nameBuf.getBufStart() , nameBuf.getLength() );
|
||||
}
|
||||
|
||||
if ( ! *printedHeaderRow ) {
|
||||
sb->pushChar('\n');
|
||||
sb->nullTerm();
|
||||
*printedHeaderRow = true;
|
||||
}
|
||||
|
||||
|
||||
firstOne = true;
|
||||
|
||||
///////
|
||||
//
|
||||
// print json item in csv
|
||||
//
|
||||
///////
|
||||
for ( ji = jp.getFirstItem(); ji ; ji = ji->m_next ) {
|
||||
|
||||
// skip if not number or string
|
||||
if ( ji->m_type != JT_NUMBER &&
|
||||
ji->m_type != JT_STRING )
|
||||
continue;
|
||||
|
||||
// skip if not well suited for csv (see above comment)
|
||||
if ( ji->isInArray() ) continue;
|
||||
|
||||
|
||||
if ( ! firstOne ) sb->pushChar(',');
|
||||
|
||||
firstOne = false;
|
||||
|
||||
if ( ji->m_type == JT_NUMBER ) {
|
||||
// print numbers without double quotes
|
||||
if ( ji->m_valueDouble *10000000.0 ==
|
||||
(double)ji->m_valueLong * 10000000.0 )
|
||||
sb->safePrintf("%li",ji->m_valueLong);
|
||||
else
|
||||
sb->safePrintf("%f",ji->m_valueDouble);
|
||||
continue;
|
||||
}
|
||||
|
||||
// print the value
|
||||
sb->pushChar('\"');
|
||||
sb->csvEncode ( ji->getValue() , ji->getValueLen() );
|
||||
sb->pushChar('\"');
|
||||
}
|
||||
|
||||
if ( ! firstOne )
|
||||
sb->pushChar('\n');
|
||||
|
||||
sb->nullTerm();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -344,6 +344,8 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
|
||||
path = "admin/inject"; pathLen = gbstrlen(path); }
|
||||
if ( pathLen == 9 && strncmp ( path , "index.php" , 9 ) == 0 ) {
|
||||
path = "search"; pathLen = gbstrlen(path); }
|
||||
if ( pathLen == 10 && strncmp ( path , "search.csv" , 10 ) == 0 ) {
|
||||
path = "search"; pathLen = gbstrlen(path); }
|
||||
|
||||
// if it is like /GA/Atlanta then call sendPageResults
|
||||
// and that should be smart enough to set the m_where in
|
||||
|
@ -14964,18 +14964,19 @@ void Parms::init ( ) {
|
||||
m->m_sprpp = 0;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "format of the returned search results";
|
||||
m->m_desc = "X is 0 to get back results in regular html, 1 to "
|
||||
"get back results in XML, 2 for JSON.";
|
||||
m->m_def = "0";
|
||||
m->m_soff = (char *)&si.m_format - y;
|
||||
m->m_type = TYPE_CHAR;
|
||||
m->m_soff = (char *)&si.m_formatStr - y;
|
||||
m->m_type = TYPE_STRING;//CHAR;
|
||||
m->m_sparm = 1;
|
||||
m->m_scgi = "format";
|
||||
m->m_smin = 0;
|
||||
m->m_smax = 12;
|
||||
m++;
|
||||
|
||||
*/
|
||||
|
||||
m->m_title = "highlight query terms in summaries.";
|
||||
m->m_desc = "Use to disable or enable "
|
||||
|
@ -4115,7 +4115,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_wikiPhraseId = qw->m_wikiPhraseId;
|
||||
qti->m_quotedStartId = qw->m_quoteStart;
|
||||
// is it gbsortby:?
|
||||
if ( qt->m_fieldCode == FIELD_GBSORTBY )
|
||||
if ( qt->m_fieldCode == FIELD_GBSORTBY ||
|
||||
qt->m_fieldCode == FIELD_GBREVSORTBY )
|
||||
m_sortByTermNum = i;
|
||||
// count
|
||||
long nn = 0;
|
||||
@ -4237,6 +4238,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
// they have a float stored there for sorting etc.
|
||||
if (qt->m_fieldCode == FIELD_GBSORTBY )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBREVSORTBY )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBNUMBERMIN )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBNUMBERMAX )
|
||||
|
11
Posdb.h
11
Posdb.h
@ -206,6 +206,15 @@ class Posdb {
|
||||
float getFloat ( void *vkp ) {
|
||||
return *(float *)(((char *)vkp) + 2); };
|
||||
|
||||
// . set (val != 0) or clear (val == 0) bit 0x02 in the second byte of
//   the key at "vkp"
// . no other bit or byte of the key is touched
void setAlignmentBit ( void *vkp , char val ) {
	char *flagByte = ((char *)vkp) + 1;
	*flagByte = val ? ( *flagByte | 0x02 ) : ( *flagByte & 0xfd );
};
|
||||
|
||||
// returns true if bit 0x02 of the second byte of the key at "vkp" is 0
bool isAlignmentBitClear ( void *vkp ) {
	const char flags = *( ((char *)vkp) + 1 );
	return ! ( flags & 0x02 );
};
|
||||
|
||||
void makeStartKey ( void *kp, long long termId ,
|
||||
long long docId=0LL){
|
||||
@ -436,7 +445,7 @@ class PosdbList : public RdbList {
|
||||
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
|
||||
|
||||
// max # search results that can be viewed without using TopTree
|
||||
#define MAX_RESULTS 1000
|
||||
//#define MAX_RESULTS 1000
|
||||
|
||||
class PosdbTable {
|
||||
|
||||
|
@ -2200,6 +2200,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
fieldCode == FIELD_ISCLEAN ||
|
||||
fieldCode == FIELD_QUOTA ||
|
||||
fieldCode == FIELD_GBSORTBY ||
|
||||
fieldCode == FIELD_GBREVSORTBY ||
|
||||
fieldCode == FIELD_GBNUMBERMIN ||
|
||||
fieldCode == FIELD_GBNUMBERMAX ||
|
||||
fieldCode == FIELD_GBAD ) {
|
||||
@ -2217,6 +2218,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// i've decided not to make
|
||||
// gbsortby:products.offerPrice case sensitive
|
||||
if ( fieldCode == FIELD_GBSORTBY ||
|
||||
fieldCode == FIELD_GBREVSORTBY ||
|
||||
fieldCode == FIELD_GBNUMBERMIN ||
|
||||
fieldCode == FIELD_GBNUMBERMAX )
|
||||
wid = hash64Lower_utf8 ( w , wlen , 0LL );
|
||||
@ -3044,6 +3046,7 @@ struct QueryField g_fields[] = {
|
||||
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
|
||||
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
|
||||
{"gbsortby", FIELD_GBSORTBY, false,""},
|
||||
{"gbrevsortby", FIELD_GBREVSORTBY, false,""},
|
||||
|
||||
{"gbnumbermin", FIELD_GBNUMBERMIN, false,""},
|
||||
{"gbnumbermax", FIELD_GBNUMBERMAX, false,""},
|
||||
|
5
Query.h
5
Query.h
@ -105,8 +105,9 @@ typedef unsigned long long qvec_t;
|
||||
#define FIELD_GBDOCID 52
|
||||
#define FIELD_GBCONTENTHASH 53 // for deduping at spider time
|
||||
#define FIELD_GBSORTBY 54 // i.e. sortby:price -> numeric termlist
|
||||
#define FIELD_GBNUMBERMIN 55
|
||||
#define FIELD_GBNUMBERMAX 56
|
||||
#define FIELD_GBREVSORTBY 55 // i.e. sortby:price -> low to high
|
||||
#define FIELD_GBNUMBERMIN 56
|
||||
#define FIELD_GBNUMBERMAX 57
|
||||
|
||||
#define FIELD_GBOTHER 92
|
||||
|
||||
|
46
SafeBuf.cpp
46
SafeBuf.cpp
@ -3169,3 +3169,49 @@ bool SafeBuf::htmlDecode ( char *src,
|
||||
// good to go
|
||||
return true;
|
||||
}
|
||||
|
||||
void SafeBuf::replaceChar ( char src , char dst ) {
|
||||
char *px = m_buf;
|
||||
char *pxEnd = m_buf + m_length;
|
||||
for ( ; px < pxEnd ; px++ ) if ( *px == src ) *px = dst;
|
||||
}
|
||||
|
||||
|
||||
// encode a double quote char to two double quote chars
|
||||
bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
|
||||
|
||||
if ( ! s ) return true;
|
||||
|
||||
// assume all chars are double quotes and will have to be encoded
|
||||
long need = len * 2 + 1;
|
||||
if ( ! reserve ( need ) ) return false;
|
||||
|
||||
// tmp vars
|
||||
char *dst = m_buf + m_length;
|
||||
//char *dstEnd = m_buf + m_capacity;
|
||||
|
||||
// scan through all
|
||||
char *send = s + len;
|
||||
for ( ; s < send ; s++ ) {
|
||||
// breathe
|
||||
QUICKPOLL ( niceness );
|
||||
// convert it?
|
||||
if ( *s == '\"' ) {
|
||||
*dst++ = '\"';
|
||||
*dst++ = '\"';
|
||||
continue;
|
||||
}
|
||||
//if ( *s == '\\' ) {
|
||||
// *dst++ = '\\';
|
||||
// *dst++ = '\\';
|
||||
// continue;
|
||||
//}
|
||||
*dst++ = *s;
|
||||
}
|
||||
|
||||
m_length += dst - (m_buf + m_length);
|
||||
|
||||
nullTerm();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -97,6 +97,9 @@ struct SafeBuf {
|
||||
bool safeStrcpy ( char *s ) ;
|
||||
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
|
||||
bool safeUtf8ToJSON ( char *utf8 ) ;
|
||||
|
||||
bool csvEncode ( char *s , long len , long niceness = 0 );
|
||||
|
||||
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
|
||||
bool cat(SafeBuf& c);
|
||||
// . only cat the sections/tag that start with "tagFilter"
|
||||
@ -144,6 +147,7 @@ struct SafeBuf {
|
||||
char *t , long tlen ,
|
||||
long niceness ,
|
||||
long startOff = 0 );
|
||||
void replaceChar ( char src , char dst );
|
||||
bool copyToken(char* s);;
|
||||
//output encoding
|
||||
bool setEncoding(short cs);
|
||||
|
@ -354,8 +354,16 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
|
||||
return log("query: unable to strcpy whitelist");
|
||||
|
||||
|
||||
char format = FORMAT_HTML;
|
||||
|
||||
// what format should search results be in? default is html
|
||||
long format = r->getLong("format", FORMAT_HTML );
|
||||
char *formatStr = r->getString("format", NULL );
|
||||
|
||||
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
|
||||
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
|
||||
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
|
||||
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
|
||||
|
||||
|
||||
// support old api &xml=1 to mean &format=1
|
||||
if ( r->getLong("xml",0) ) {
|
||||
@ -367,6 +375,10 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
|
||||
format = FORMAT_JSON;
|
||||
}
|
||||
|
||||
if ( r->getLong("csv",0) ) {
|
||||
format = FORMAT_CSV;
|
||||
}
|
||||
|
||||
|
||||
// now override automatic defaults for special cases
|
||||
if ( format != FORMAT_HTML ) {
|
||||
|
@ -46,6 +46,7 @@ class TopicGroup {
|
||||
#define FORMAT_HTML 0
|
||||
#define FORMAT_XML 1
|
||||
#define FORMAT_JSON 2
|
||||
#define FORMAT_CSV 3
|
||||
|
||||
class SearchInput {
|
||||
|
||||
@ -217,6 +218,10 @@ class SearchInput {
|
||||
// so can search results
|
||||
//long m_xml; // msg40
|
||||
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON
|
||||
//long m_formatStrLen;
|
||||
//char *m_formatStr;
|
||||
|
||||
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
|
||||
char m_format;
|
||||
|
||||
// this should be part of the key because it will affect the results!
|
||||
|
@ -47,6 +47,11 @@ Wiktionary::~Wiktionary () {
|
||||
|
||||
|
||||
bool Wiktionary::test ( ) {
|
||||
|
||||
// test words parsing here
|
||||
//Words w;
|
||||
//w.set9 ("get $4,500.00 now",0);
|
||||
|
||||
// test it out!
|
||||
char *str = "love";//pie"; //forsake";
|
||||
//long long wid = hash64Lower_utf8(str);
|
||||
|
11
Words.cpp
11
Words.cpp
@ -419,10 +419,11 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
|
||||
|
||||
// comma is ok if like ,ddd!d
|
||||
if ( s[i]==',' &&
|
||||
j-i <= 3 &&
|
||||
is_digit(s[i-1]) &&
|
||||
(j==i-1 || is_digit(s[i-2]) ) &&
|
||||
(j==i-2 || is_digit(s[i-3]) ) ) {
|
||||
i-j <= 3 &&
|
||||
is_digit(s[i-1]) ) {
|
||||
// if word so far is 2 or 3 chars, make sure digits
|
||||
if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
|
||||
if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
|
||||
// scan forward
|
||||
subloop:
|
||||
if ( s[i] == ',' &&
|
||||
@ -445,6 +446,8 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
|
||||
while ( is_digit(s[i]) ) i++;
|
||||
}
|
||||
|
||||
nogo:
|
||||
|
||||
// allow for words like we're dave's and i'm
|
||||
if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
|
||||
i++;
|
||||
|
197
XmlDoc.cpp
197
XmlDoc.cpp
@ -99,7 +99,11 @@ static bool getWordPosVec ( Words *words ,
|
||||
|
||||
static void getMetaListWrapper ( void *state ) ;
|
||||
|
||||
char *getNextJSONObject ( char *p ) ;
|
||||
char *getFirstJSONObject ( char *p ,
|
||||
long niceness ,
|
||||
bool *isProduct ,
|
||||
bool *isImage ) ;
|
||||
char *getNextJSONObject ( char *p , long niceness ) ;
|
||||
|
||||
XmlDoc::XmlDoc() {
|
||||
for ( long i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
|
||||
@ -19286,10 +19290,18 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
}
|
||||
mnew ( m_dx , sizeof(XmlDoc),"xmldocdx");
|
||||
// init cursor to first json object
|
||||
m_diffbotObj = m_diffbotReply.getBufStart();
|
||||
//m_diffbotObj = m_diffbotReply.getBufStart();
|
||||
char *rp = m_diffbotReply.getBufStart();
|
||||
// we now parse the array of products out of the
|
||||
// diffbot reply. each product is an item/object.
|
||||
m_diffbotObj = getFirstJSONObject ( rp ,
|
||||
m_niceness ,
|
||||
&m_isJsonProduct ,
|
||||
&m_isJsonImage );
|
||||
m_diffbotJSONCount = 0;
|
||||
// set end of it
|
||||
m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj );
|
||||
m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj,
|
||||
m_niceness);
|
||||
// temp null it
|
||||
m_diffbotSavedChar = *m_diffbotObjEnd;
|
||||
*m_diffbotObjEnd = '\0';
|
||||
@ -19322,6 +19334,33 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
sreq.m_hopCountValid = 1;
|
||||
sreq.m_fakeFirstIp = 1;
|
||||
sreq.m_firstIp = firstIp;
|
||||
|
||||
// copy the content
|
||||
m_tmpBuf.reset();
|
||||
// how much
|
||||
long clen = m_diffbotObjEnd - m_diffbotObj;
|
||||
// include \0
|
||||
long need = clen + 1;
|
||||
// insert ,"type":"product" or
|
||||
// possibly ,"type":"image" to make it kosher
|
||||
need += 32;
|
||||
// reserve the mem
|
||||
if ( ! m_tmpBuf.reserve ( need ) )
|
||||
return NULL;
|
||||
// sanity
|
||||
if ( m_diffbotObj[0] != '{' ) { char *xx=NULL;*xx=0;}
|
||||
// copy first '{'
|
||||
m_tmpBuf.pushChar(m_diffbotObj[0]);
|
||||
// HACK: insert the type: thing here
|
||||
if ( m_isJsonProduct )
|
||||
m_tmpBuf.safePrintf("\"type\":\"product\",");
|
||||
else if ( m_isJsonImage )
|
||||
m_tmpBuf.safePrintf("\"type\":\"image\",");
|
||||
// do the copy of the rest, title, etc.
|
||||
m_tmpBuf.safeMemcpy ( m_diffbotObj+1 , clen-1 );
|
||||
// null term
|
||||
m_tmpBuf.nullTerm();
|
||||
|
||||
// set this
|
||||
if (!m_dx->set4 ( &sreq ,
|
||||
NULL ,
|
||||
@ -19332,7 +19371,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
// niceness of 0!!!!
|
||||
m_niceness, // 1 ,
|
||||
// inject this content
|
||||
m_diffbotObj, // content ,
|
||||
m_tmpBuf.getBufStart(), // content ,
|
||||
false, // deleteFromIndex ,
|
||||
0, // forcedIp ,
|
||||
CT_JSON, // contentType ,
|
||||
@ -19347,6 +19386,13 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
m_dx->m_usePlacedb = false;
|
||||
m_dx->m_useLinkdb = false;
|
||||
m_dx->m_isChildDoc = true;
|
||||
// we like to sort json objects using
|
||||
// 'gbsortby:spiderdate' query to get the most
|
||||
// recent json objects, so this must be valid
|
||||
if ( m_spideredTimeValid ) {
|
||||
m_dx->m_spideredTimeValid = true;
|
||||
m_dx->m_spideredTime = m_spideredTime;
|
||||
}
|
||||
|
||||
m_dx->m_isDiffbotJSONObject = true;
|
||||
}
|
||||
@ -19377,7 +19423,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
// we successfully index the json object, skip to next one
|
||||
m_diffbotObj = m_diffbotObjEnd;
|
||||
// point to next json object again
|
||||
m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj );
|
||||
m_diffbotObjEnd = getNextJSONObject ( m_diffbotObj ,
|
||||
m_niceness );
|
||||
// re-save
|
||||
m_diffbotSavedChar = *m_diffbotObjEnd;
|
||||
// but gotta set this crap back
|
||||
@ -22065,10 +22112,14 @@ bool XmlDoc::addTable144 ( HashTableX *tt1 , bool nosplit ) {
|
||||
// store it as is
|
||||
memcpy ( m_p , kp , sizeof(key144_t) );
|
||||
// sanity check
|
||||
//long long final = 202176590884090LL;
|
||||
//final &= TERMID_MASK;
|
||||
//if ( g_posdb.getTermId(kp) == final )
|
||||
// log("hey");
|
||||
//long long final = hash64n("products.offerprice",0);
|
||||
//long long prefix = hash64n("gbsortby",0);
|
||||
//long long h64 = hash64 ( final , prefix);
|
||||
//h64 &= TERMID_MASK;
|
||||
//if ( g_posdb.getTermId(kp) == h64 ) {
|
||||
// log("hey: docid=%lli float=%f",m_docId,
|
||||
// g_posdb.getFloat(kp) );
|
||||
//}
|
||||
/*
|
||||
// get the score
|
||||
long score = tt1->getScoreFromSlot ( i ) ;
|
||||
@ -22091,10 +22142,25 @@ bool XmlDoc::addTable144 ( HashTableX *tt1 , bool nosplit ) {
|
||||
*/
|
||||
// this was zero when we added these keys to zero, so fix it
|
||||
g_posdb.setDocIdBits ( m_p , m_docId );
|
||||
// this too
|
||||
g_posdb.setSiteRankBits ( m_p , siteRank );
|
||||
// set language here too
|
||||
g_posdb.setLangIdBits ( m_p , m_langId );
|
||||
// if this is a numeric field we do not want to set
|
||||
// the siterank or langid bits because it will mess up
|
||||
// sorting by the float which is basically in the position
|
||||
// of the word position bits.
|
||||
if ( g_posdb.isAlignmentBitClear ( m_p ) ) {
|
||||
// make sure it is set again. it was just cleared
|
||||
// to indicate that this key contains a float
|
||||
// like a price or something, and we should not
|
||||
// set siterank or langid so that its termlist
|
||||
// remains sorted just by that float
|
||||
g_posdb.setAlignmentBit ( m_p , 1 );
|
||||
}
|
||||
// otherwise, set the siterank and langid
|
||||
else {
|
||||
// this too
|
||||
g_posdb.setSiteRankBits ( m_p , siteRank );
|
||||
// set language here too
|
||||
g_posdb.setLangIdBits ( m_p , m_langId );
|
||||
}
|
||||
// advance over it
|
||||
m_p += sizeof(key144_t);
|
||||
}
|
||||
@ -22839,6 +22905,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// country?
|
||||
if ( ! hashCountry ( table ) ) return NULL;
|
||||
if ( ! hashTagRec ( table ) ) return NULL;
|
||||
// hash for gbsortby:gbspiderdate
|
||||
if ( ! hashDateNumbers ( table ) ) return NULL;
|
||||
// and the json itself
|
||||
return hashJSON ( table );
|
||||
}
|
||||
@ -22880,6 +22948,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
if ( ! hashLinks ( table ) ) return NULL;
|
||||
if ( ! hashContentType ( table ) ) return NULL;
|
||||
if ( ! hashUrl ( table ) ) return NULL;
|
||||
if ( ! hashDateNumbers ( table ) ) return NULL;
|
||||
if ( ! hashMetaTags ( table ) ) return NULL;
|
||||
if ( ! hashMetaZip ( table ) ) return NULL;
|
||||
if ( ! hashDMOZCategories( table ) ) return NULL;
|
||||
@ -23071,6 +23140,31 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// . hash dates for sorting by using gbsortby: and gbrevsortby:
|
||||
// . do 'gbsortby:gbspiderdate' as your query to see this in action
|
||||
bool XmlDoc::hashDateNumbers ( HashTableX *tt ) {
|
||||
|
||||
// stop if already set
|
||||
if ( ! m_spideredTimeValid ) return true;
|
||||
|
||||
|
||||
// first the last spidered date
|
||||
HashInfo hi;
|
||||
hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field
|
||||
hi.m_tt = tt;
|
||||
hi.m_desc = "last spidered date";
|
||||
hi.m_prefix = "gbspiderdate";
|
||||
|
||||
char buf[64];
|
||||
long bufLen = sprintf ( buf , "%lu", m_spideredTime );
|
||||
|
||||
if ( ! hashNumber ( buf , buf , bufLen , &hi ) )
|
||||
return false;
|
||||
|
||||
// all done
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XmlDoc::hashMetaZip ( HashTableX *tt ) {
|
||||
|
||||
setStatus ( "hashing meta zip" );
|
||||
@ -23760,6 +23854,9 @@ bool XmlDoc::hashUrl ( HashTableX *tt ) {
|
||||
sprintf(buf2,"%llu",(m_docId) );
|
||||
if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false;
|
||||
|
||||
// hash
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -28506,10 +28603,19 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
|
||||
// . this now allows for commas in numbers like "1,500.62"
|
||||
float f = atof2 ( p , bufEnd - p );
|
||||
|
||||
return hashNumber2 ( f , hi );
|
||||
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
|
||||
return false;
|
||||
|
||||
// also hash in reverse order for sorting from low to high
|
||||
f = -1.0 * f;
|
||||
|
||||
if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
|
||||
bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
|
||||
|
||||
// prefix is something like price. like the meta "name" or
|
||||
// the json name with dots in it like "product.info.price" or something
|
||||
@ -28523,7 +28629,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
|
||||
|
||||
// combine prefix hash with a special hash to make it unique to avoid
|
||||
// collisions. this is the "TRUE" prefix.
|
||||
long long truePrefix64 = hash64n ( "gbsortby");
|
||||
long long truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
|
||||
// hash with the "TRUE" prefix
|
||||
long long ph2 = hash64 ( nameHash , truePrefix64 );
|
||||
|
||||
@ -28534,7 +28640,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
|
||||
key144_t k;
|
||||
g_posdb.makeKey ( &k ,
|
||||
ph2 ,
|
||||
0LL,//docid
|
||||
0,//docid
|
||||
0,// word pos #
|
||||
0,// densityRank , // 0-15
|
||||
0 , // MAXDIVERSITYRANK
|
||||
@ -28554,9 +28660,25 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi ) {
|
||||
false, // syn?
|
||||
false ); // delkey?
|
||||
|
||||
//long long final = hash64n("products.offerprice",0);
|
||||
//long long prefix = hash64n("gbsortby",0);
|
||||
//long long h64 = hash64 ( final , prefix);
|
||||
//if ( ph2 == h64 )
|
||||
// log("hey: got offer price");
|
||||
|
||||
// now set the float in that key
|
||||
g_posdb.setFloat ( &k , f );
|
||||
|
||||
// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
|
||||
// so that we can b-step into a posdb list and make sure
|
||||
// we are aligned on a 6 byte or 12 byte key, since they come
|
||||
// in both sizes. but for this, hack it off to tell
|
||||
// addTable144() that we are a special posdb key, a "numeric"
|
||||
// key that has a float stored in it. then it will NOT
|
||||
// set the siterank and langid bits which throw our sorting
|
||||
// off!!
|
||||
g_posdb.setAlignmentBit ( &k , 0 );
|
||||
|
||||
// sanity
|
||||
float t = g_posdb.getFloat ( &k );
|
||||
if ( t != f ) { char *xx=NULL;*xx=0; }
|
||||
@ -43553,12 +43675,49 @@ SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) {
|
||||
//void XmlDoc::getGigabitExcerpts ( ) {
|
||||
//}
|
||||
|
||||
|
||||
// . the products and image types are listed as arrays in the json object.
|
||||
// . so go to those first if there...
|
||||
char *getFirstJSONObject ( char *p ,
|
||||
long niceness ,
|
||||
bool *isProduct ,
|
||||
bool *isImage ) {
|
||||
|
||||
// do we have a "products": array?
|
||||
char *needle = ",\"products\":[";
|
||||
char *s = strstr(p,needle);
|
||||
|
||||
*isProduct = false;
|
||||
*isImage = false;
|
||||
|
||||
// return ptr to first product if there
|
||||
if ( s ) {
|
||||
*isProduct = true;
|
||||
return s + gbstrlen(needle);
|
||||
}
|
||||
|
||||
QUICKPOLL ( niceness );
|
||||
|
||||
// images?
|
||||
needle = ",\"images\":[";
|
||||
s = strstr(p,needle);
|
||||
// return ptr to first product if there
|
||||
if ( s ) {
|
||||
*isImage = true;
|
||||
return s + gbstrlen(needle);
|
||||
}
|
||||
|
||||
// default to just that json otherwise
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
// . advance p to skip over the json object it is pointing to and return
|
||||
// ptr to the following json object
|
||||
// . deal with nested {}'s
|
||||
// . basically skips over current json object in a list of json objects to
|
||||
// point to the next brother object
|
||||
char *getNextJSONObject ( char *p ) {
|
||||
char *getNextJSONObject ( char *p , long niceness ) {
|
||||
// otherwise, *p must be {
|
||||
for ( ; *p && *p != '{' ; p++ );
|
||||
// empty?
|
||||
@ -43571,6 +43730,8 @@ char *getNextJSONObject ( char *p ) {
|
||||
bool inQuotes = false;
|
||||
// scan
|
||||
for ( ; *p ; p++ ) {
|
||||
// breathe
|
||||
QUICKPOLL ( niceness );
|
||||
// escaping a quote? ignore quote then.
|
||||
if ( *p == '\\' && p[1] == '\"' ) {
|
||||
// skip two bytes then..
|
||||
|
11
XmlDoc.h
11
XmlDoc.h
@ -752,6 +752,7 @@ class XmlDoc {
|
||||
bool hashDMOZCategories ( class HashTableX *table ) ;
|
||||
bool hashLinks ( class HashTableX *table ) ;
|
||||
bool hashUrl ( class HashTableX *table ) ;
|
||||
bool hashDateNumbers ( class HashTableX *tt ) ;
|
||||
bool hashSections ( class HashTableX *table ) ;
|
||||
bool hashIncomingLinkText ( class HashTableX *table ,
|
||||
bool hashAnomalies ,
|
||||
@ -854,7 +855,9 @@ class XmlDoc {
|
||||
long bufLen ,
|
||||
class HashInfo *hi ) ;
|
||||
|
||||
bool hashNumber2 ( float f , class HashInfo *hi ) ;
|
||||
bool hashNumber2 ( float f ,
|
||||
class HashInfo *hi ,
|
||||
char *gbsortByStr ) ;
|
||||
|
||||
// print out for PageTitledb.cpp and PageParser.cpp
|
||||
bool printDoc ( class SafeBuf *pbuf );
|
||||
@ -1487,6 +1490,12 @@ class XmlDoc {
|
||||
char m_isWWWDup;
|
||||
char m_calledMsg0b;
|
||||
Url m_tmpUrl;
|
||||
|
||||
// hack stuff:
|
||||
SafeBuf m_tmpBuf;
|
||||
bool m_isJsonProduct;
|
||||
bool m_isJsonImage;
|
||||
|
||||
SafeBuf m_tmpsb1;
|
||||
SafeBuf m_tmpsb2;
|
||||
SafeBuf m_turkBuf;
|
||||
|
Loading…
Reference in New Issue
Block a user