mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
customizable api list in url filters
This commit is contained in:
parent
0edcbcc7d8
commit
20952eedbe
@ -317,7 +317,10 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
|
||||
m_spidersEnabled[n] = 1;
|
||||
m_numRegExs7++;
|
||||
|
||||
m_spiderDiffbotApiNum[n] = 1;
|
||||
//m_spiderDiffbotApiNum[n] = 1;
|
||||
//m_numRegExs11++;
|
||||
m_spiderDiffbotApiUrl[n].set("");
|
||||
m_spiderDiffbotApiUrl[n].nullTerm();
|
||||
m_numRegExs11++;
|
||||
}
|
||||
|
||||
|
@ -389,11 +389,11 @@ class CollectionRec {
|
||||
// an alternate name for the collection. we tend to create
|
||||
// collection names as a random sequence of hex digits. this
|
||||
// will allow a user to give them an alternate name.
|
||||
SafeBuf m_collectionNameAlias;
|
||||
//SafeBuf m_collectionNameAlias;
|
||||
//SafeBuf m_diffbotSeed;
|
||||
// this will be NULL or "none" to not pass off to diffbot
|
||||
//SafeBuf m_diffbotApi;
|
||||
SafeBuf m_diffbotApiQueryString;
|
||||
SafeBuf m_diffbotApiList;//QueryString;
|
||||
//SafeBuf m_diffbotUrlCrawlPattern;
|
||||
//SafeBuf m_diffbotUrlProcessPattern;
|
||||
//SafeBuf m_diffbotPageProcessPattern;
|
||||
@ -403,9 +403,9 @@ class CollectionRec {
|
||||
char m_isCustomCrawl;
|
||||
//char m_isDiffbotCollection;
|
||||
// format of output. "csv" or "xml" or "json" or null
|
||||
SafeBuf m_diffbotFormat;
|
||||
//SafeBuf m_diffbotFormat;
|
||||
// what fields to return in the json output: (api dependent)
|
||||
SafeBuf m_diffbotFields;
|
||||
//SafeBuf m_diffbotFields;
|
||||
long long m_diffbotMaxToCrawl;
|
||||
long long m_diffbotMaxToProcess;
|
||||
long long m_diffbotCrawlStartTime;
|
||||
@ -466,8 +466,11 @@ class CollectionRec {
|
||||
|
||||
// should urls in this queue be sent to diffbot for processing
|
||||
// when we are trying to index them?
|
||||
//long m_numRegExs11;
|
||||
//char m_spiderDiffbotApiNum [ MAX_FILTERS ];
|
||||
|
||||
long m_numRegExs11;
|
||||
char m_spiderDiffbotApiNum [ MAX_FILTERS ];
|
||||
SafeBuf m_spiderDiffbotApiUrl [ MAX_FILTERS ];
|
||||
|
||||
// dummy?
|
||||
long m_numRegExs9;
|
||||
|
@ -34,6 +34,7 @@ CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) ;
|
||||
CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) ;
|
||||
//bool isAliasUnique ( CollectionRec *cr , char *token , char *alias ) ;
|
||||
|
||||
/*
|
||||
char *g_diffbotFields [] = {
|
||||
"Unused-ERROR",
|
||||
"None",
|
||||
@ -46,12 +47,12 @@ char *g_diffbotFields [] = {
|
||||
"Image (autodetect)",
|
||||
"FrontPage (force)",
|
||||
"FrontPage (autodetect)",
|
||||
|
||||
//
|
||||
// last field must be empty. add new fields above this.
|
||||
//
|
||||
NULL
|
||||
};
|
||||
*/
|
||||
|
||||
/*
|
||||
class StateNC {
|
||||
@ -918,7 +919,7 @@ void StateCD::gotRdbList ( ) {
|
||||
sb.safeMemcpy(mime.getMime(),mime.getMimeLen() );
|
||||
}
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
|
||||
|
||||
// we set this to true below if any one shard has more spiderdb
|
||||
// records left to read
|
||||
@ -933,8 +934,9 @@ void StateCD::gotRdbList ( ) {
|
||||
RdbList *list = &m_lists[i];
|
||||
|
||||
// get the format
|
||||
char *format = cr->m_diffbotFormat.getBufStart();
|
||||
if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
|
||||
//char *format = cr->m_diffbotFormat.getBufStart();
|
||||
//if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
|
||||
char *format = NULL;
|
||||
|
||||
char *ek = list->getEndKey();
|
||||
|
||||
@ -1684,6 +1686,11 @@ static class HelpItem s_his[] = {
|
||||
{"maxtoprocess", "Specify max pages to successfully process through "
|
||||
"diffbot"},
|
||||
{"urt","Use robots.txt?"},
|
||||
{"dbapilist","Special list of diffbot API urls. The URL Filters "
|
||||
"will display these options in a drop down menu. "
|
||||
"Example (unencoded): "
|
||||
"&dbapilist=All|/api/analyze?mode=auto&u=,Article (forced)|/api/"
|
||||
"article?u="},
|
||||
{"fe[N]","Filter expression #N. The first expression in the url "
|
||||
"filters table is 0. But if N is 0, leave N out, only specify it "
|
||||
"if N is > 0. Example &fe=onsamedomain to change the expression in "
|
||||
@ -1695,7 +1702,10 @@ static class HelpItem s_his[] = {
|
||||
{"mspi[N]","Max outstanding spiders for this IP."},
|
||||
{"xg[N]","Wait this many milliseconds between spiders of same IP."},
|
||||
{"fsp[N]","Spider priority. Higher priorities spidered first. Can be from 0 to 127. But -3 means to ignore the URL. -2 means the URL is banned because it comes from an evil site."},
|
||||
{"dapi[N]","Diffbot api number. Process through this diffbot api."},
|
||||
{"dapi[N]","Diffbot API Url. This is a string. Usually it "
|
||||
"corresponds to dbapilist parm above. But it is the url we use when "
|
||||
"accessing diffbot for this url filter. "
|
||||
"Example (unencoded): &dapi2=/api/article?u="},
|
||||
{"injecturl","Specify a seed url to inject."},
|
||||
{"urldata","A huge string of whitespace separated URLs to add to "
|
||||
"spiderdb for crawling."},
|
||||
@ -3164,7 +3174,7 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
|
||||
cr->m_spiderIpMaxSpiders[i] = 3; // keep it respectful
|
||||
cr->m_spidersEnabled [i] = 1;
|
||||
cr->m_spiderFreqs [i] = 7.0;
|
||||
cr->m_spiderDiffbotApiNum[i] = DBA_NONE; // 1
|
||||
cr->m_spiderDiffbotApiUrl[i].purge();// = DBA_NONE; // 1
|
||||
}
|
||||
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
#define CRAWLBOT_H
|
||||
|
||||
// values for the diffbot dropdown
|
||||
/*
|
||||
#define DBA_NONE 0
|
||||
#define DBA_ALL 1
|
||||
#define DBA_ARTICLE_FORCE 2
|
||||
@ -17,6 +18,7 @@
|
||||
// add new fields to END of list since i think we store the
|
||||
// field we use as a number in the coll.conf, starting at 0
|
||||
extern char *g_diffbotFields [];
|
||||
*/
|
||||
|
||||
bool sendPageCrawlbot ( TcpSocket *s , HttpRequest *hr );
|
||||
|
||||
|
153
Parms.cpp
153
Parms.cpp
@ -222,7 +222,7 @@ unsigned long Parms::calcChecksum() {
|
||||
if ( m->m_type == TYPE_BOOL2 ) size = 1;
|
||||
if ( m->m_type == TYPE_PRIORITY ) size = 1;
|
||||
if ( m->m_type == TYPE_PRIORITY2 ) size = 1;
|
||||
if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
|
||||
//if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
|
||||
if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1;
|
||||
if ( m->m_type == TYPE_RETRIES ) size = 1;
|
||||
if ( m->m_type == TYPE_TIME ) size = 6;
|
||||
@ -1064,14 +1064,57 @@ char *printDropDown ( long n , char *p, char *pend, char *name, long select,
|
||||
}
|
||||
*/
|
||||
|
||||
bool printDiffbotDropDown ( long n , SafeBuf *sb , char *name , long select ) {
|
||||
sb->safePrintf ( "<select name=%s>", name );
|
||||
for ( long i = 0 ; i < 100 ; i++ ) {
|
||||
char *s = "";
|
||||
char *field = g_diffbotFields[i];
|
||||
if ( ! field || field[0] == '\0' ) break;
|
||||
if ( i == select ) s = " selected";
|
||||
sb->safePrintf ("<option value=%li%s>%s",i,s,field);
|
||||
bool printDiffbotDropDown ( SafeBuf *sb,char *name,char *THIS , SafeBuf *sx) {
|
||||
CollectionRec *cr = (CollectionRec *)THIS;
|
||||
// . get the string we have selected
|
||||
// . the list of available strings to select is in
|
||||
// m_diffbotApiList for this collection, and that can
|
||||
// be changed by john to add custom diffbot api urls.
|
||||
// . should just be m_spiderDiffbotApiUrl[i] safebuf
|
||||
char *usingApi = sx->getBufStart();
|
||||
if ( sx->length() == 0 ) usingApi = NULL;
|
||||
// now scan each item in the list. see the setting of
|
||||
// "m_def" for "diffbotApiList" below to see the
|
||||
// comma separated list of default strings. each item in
|
||||
// this list is of the format "<title>|<urlPath>,"
|
||||
char *p = cr->m_diffbotApiList.getBufStart();
|
||||
// wtf?
|
||||
if ( ! p ) return true;
|
||||
// print out. cgi is "dapi%li".
|
||||
sb->safePrintf("<select name=%s>\n",name);
|
||||
// print "none" as the first option
|
||||
char *sel = "";
|
||||
if ( ! usingApi ) sel = " selected";
|
||||
sb->safePrintf("<option value=\"\"%s>None</option>",sel);
|
||||
// the various "diffbot urls" are separated by commas
|
||||
for ( ; *p ; ) {
|
||||
// point to start of item name
|
||||
char *name = p;
|
||||
// p should now point to name of the item
|
||||
char *end1 = p;
|
||||
// point to start of url for that item
|
||||
for ( ; *end1 && *end1 != '|' ;end1++);
|
||||
// save that
|
||||
char *url = end1;
|
||||
if ( *url == '|' ) url++;
|
||||
// find end of url
|
||||
char *urlEnd = url;
|
||||
for ( ; *urlEnd && *urlEnd != ',' ; urlEnd++ );
|
||||
// do we match it?
|
||||
sel = "";
|
||||
if ( usingApi && strncmp(usingApi,url,urlEnd-url)== 0 )
|
||||
sel = " selected";
|
||||
// advance p
|
||||
p = urlEnd;
|
||||
// skip over comma to get next one
|
||||
if ( *p == ',' ) p++;
|
||||
// use the api url itself as the option value
|
||||
sb->safePrintf("<option value=\"");
|
||||
sb->safeMemcpy ( url, urlEnd - url );
|
||||
sb->safePrintf("\"%s>",sel);
|
||||
// print item name
|
||||
sb->safeMemcpy ( name , end1 - name );
|
||||
sb->safePrintf("</option>\n");
|
||||
}
|
||||
sb->safePrintf("</select>");
|
||||
return true;
|
||||
@ -2101,18 +2144,12 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
printDropDown ( MAX_SPIDER_PRIORITIES , sb , cgi , *s ,
|
||||
true , true );
|
||||
}
|
||||
else if ( t == TYPE_DIFFBOT_DROPDOWN ) {
|
||||
// just show the parm name and value if printing in json
|
||||
if ( isJSON ) {
|
||||
// convert diffbot # to string
|
||||
long apiNum = (long)*s;
|
||||
char *str = g_diffbotFields [apiNum];
|
||||
sb->safePrintf("\"%s-str\":\"%s\",\n",cgi,str);
|
||||
sb->safePrintf("\"%s\":%li,\n",cgi,apiNum);
|
||||
}
|
||||
else
|
||||
printDiffbotDropDown ( 8, sb , cgi , *s );
|
||||
}
|
||||
// this url filters parm is an array of SAFEBUFs now, so each is
|
||||
// a string and that string is the diffbot api url to use.
|
||||
// the string is empty or zero length to indicate none.
|
||||
//else if ( t == TYPE_DIFFBOT_DROPDOWN ) {
|
||||
// char *xx=NULL;*xx=0;
|
||||
//}
|
||||
else if ( t == TYPE_RETRIES )
|
||||
printDropDown ( 4 , sb , cgi , *s , false , false );
|
||||
else if ( t == TYPE_PRIORITY_BOXES ) {
|
||||
@ -2173,6 +2210,31 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
sb->dequote ( s , gbstrlen(s) );
|
||||
sb->safePrintf ("\">");
|
||||
}
|
||||
// HACK: print a drop down not a textbox for selecting the
|
||||
// m_spiderDiffbotApiUrl[]. we can't just store this selection
|
||||
// as a number because m_diffbotApiList (a string of comma separated
|
||||
// items to select from) can change! it is not a typical dropdown.
|
||||
// so we have to record the actual text we selected, which is
|
||||
// basically the diffbot api url. this is because john can add
|
||||
// custom diffbot api urls to the list at any time.
|
||||
else if ( t == TYPE_SAFEBUF && strcmp(m->m_cgi,"dapi") == 0 ) {
|
||||
SafeBuf *sx = (SafeBuf *)s;
|
||||
// just show the parm name and value if printing in json
|
||||
if ( isJSON ) {
|
||||
// this can be empty for the empty row i guess
|
||||
if ( sx->length() ) {
|
||||
// print the parm name followed by the diffbot api url string
|
||||
sb->safePrintf("\"%s\":\"",cgi);
|
||||
// this is just the url path, not the title
|
||||
// of the menu option... so this would be
|
||||
// like "/api/article?u="
|
||||
sb->safeUtf8ToJSON (sx->getBufStart() );
|
||||
sb->safePrintf("\",\n");
|
||||
}
|
||||
}
|
||||
else
|
||||
printDiffbotDropDown ( sb , cgi , THIS , sx );
|
||||
}
|
||||
else if ( t == TYPE_SAFEBUF ) {
|
||||
long size = m->m_size;
|
||||
// give regular expression box on url filters page more room
|
||||
@ -2842,7 +2904,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
|
||||
t == TYPE_BOOL2 ||
|
||||
t == TYPE_PRIORITY ||
|
||||
t == TYPE_PRIORITY2 ||
|
||||
t == TYPE_DIFFBOT_DROPDOWN ||
|
||||
//t == TYPE_DIFFBOT_DROPDOWN ||
|
||||
t == TYPE_PRIORITY_BOXES ||
|
||||
t == TYPE_RETRIES ||
|
||||
t == TYPE_FILTER ) {
|
||||
@ -3047,10 +3109,10 @@ void Parms::setToDefault ( char *THIS ) {
|
||||
// . this is a backwards-compatibility hack since this new parm
|
||||
// will not be in old coll.conf files and will not be properly
|
||||
// initialized when displaying a url filter row.
|
||||
if ( THIS != (char *)&g_conf ) {
|
||||
CollectionRec *cr = (CollectionRec *)THIS;
|
||||
memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS);
|
||||
}
|
||||
//if ( THIS != (char *)&g_conf ) {
|
||||
// CollectionRec *cr = (CollectionRec *)THIS;
|
||||
// memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS);
|
||||
//}
|
||||
|
||||
for ( long i = 0 ; i < m_numParms ; i++ ) {
|
||||
Parm *m = &m_parms[i];
|
||||
@ -3662,7 +3724,7 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) {
|
||||
if ( t == TYPE_CHAR || t == TYPE_BOOL ||
|
||||
t == TYPE_CHECKBOX ||
|
||||
t == TYPE_PRIORITY || t == TYPE_PRIORITY2 ||
|
||||
t == TYPE_DIFFBOT_DROPDOWN ||
|
||||
//t == TYPE_DIFFBOT_DROPDOWN ||
|
||||
t == TYPE_PRIORITY_BOXES || t == TYPE_RETRIES ||
|
||||
t == TYPE_RETRIES || t == TYPE_FILTER ||
|
||||
t == TYPE_BOOL2 || t == TYPE_CHAR2 )
|
||||
@ -3762,7 +3824,7 @@ bool Parms::serialize( char *buf, long *bufSize ) {
|
||||
if ( m->m_type == TYPE_BOOL2 ) size = 1;
|
||||
if ( m->m_type == TYPE_PRIORITY ) size = 1;
|
||||
if ( m->m_type == TYPE_PRIORITY2 ) size = 1;
|
||||
if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
|
||||
//if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
|
||||
if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1;
|
||||
if ( m->m_type == TYPE_RETRIES ) size = 1;
|
||||
if ( m->m_type == TYPE_TIME ) size = 6;
|
||||
@ -8154,12 +8216,25 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
*/
|
||||
|
||||
m->m_cgi = "dbapiqs";
|
||||
m->m_xml = "diffbotApiQueryString";
|
||||
m->m_off = (char *)&cr.m_diffbotApiQueryString - x;
|
||||
m->m_cgi = "dbapilist";
|
||||
m->m_xml = "diffbotApiList";//QueryString";
|
||||
m->m_off = (char *)&cr.m_diffbotApiList - x;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_def = "";
|
||||
// XmlDoc.cpp when it first computes "ufn" it also sets
|
||||
// m_diffbotApiUrl to one of these. lest we change the url filters
|
||||
// table AFTER it gets the ufn and BEFORE it gets the diffbot api url.
|
||||
m->m_def =
|
||||
"All|http://www.diffbot.com/api/analzye?mode=auto,"
|
||||
"Article (autodetect)|http://www.diffbot.com/api/analyze?mode=article,"
|
||||
"Article (force)|http://www.diffbot.com/api/article?,"
|
||||
"Product (autodetect)|http://www.diffbot.com/api/analyze?mode=product,"
|
||||
"Product (force)|http://www.diffbot.com/api/product?,"
|
||||
"Image (autodetect)|http://www.diffbot.com/api/analyze?mode=image,"
|
||||
"Image (force)|http://www.diffbot.com/api/image?,"
|
||||
"FrontPage (autodetect)|http://www.diffbot.com/api/analyze?mode=frontpage,"
|
||||
"FrontPage (force)|http://www.diffbot.com/api/frontpage?"
|
||||
;
|
||||
m++;
|
||||
|
||||
/*
|
||||
@ -12746,10 +12821,16 @@ void Parms::init ( ) {
|
||||
m->m_cgi = "dapi";
|
||||
m->m_xml = "diffbotAPI";
|
||||
m->m_max = MAX_FILTERS;
|
||||
m->m_off = (char *)cr.m_spiderDiffbotApiNum - x;
|
||||
m->m_type = TYPE_DIFFBOT_DROPDOWN;
|
||||
m->m_def = "0";
|
||||
m->m_off = (char *)cr.m_spiderDiffbotApiUrl - x;
|
||||
// HACK: we print a dropdown for this but the value is a string
|
||||
// because the items in the drop down can change so we can't store
|
||||
// an item # here, it has to be a string, i.e. the diffbot api url.
|
||||
// john might add a new custom api to m_diffbotApiList at any time.
|
||||
// so we select the item in the drop down if it matches THIS string.
|
||||
m->m_type = TYPE_SAFEBUF;//DIFFBOT_DROPDOWN;
|
||||
m->m_def = "";
|
||||
m->m_page = PAGE_FILTERS;
|
||||
m->m_size = sizeof(SafeBuf);
|
||||
m->m_rowid = 1;
|
||||
m->m_addin = 1; // "insert" follows?
|
||||
m++;
|
||||
@ -15462,7 +15543,7 @@ void Parms::init ( ) {
|
||||
if ( t == TYPE_CHECKBOX ) size = 1;
|
||||
if ( t == TYPE_PRIORITY ) size = 1;
|
||||
if ( t == TYPE_PRIORITY2 ) size = 1;
|
||||
if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1;
|
||||
//if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1;
|
||||
if ( t == TYPE_PRIORITY_BOXES ) size = 1;
|
||||
if ( t == TYPE_RETRIES ) size = 1;
|
||||
if ( t == TYPE_TIME ) size = 6;
|
||||
|
4
Parms.h
4
Parms.h
@ -51,8 +51,8 @@ enum {
|
||||
TYPE_MONOM2 ,
|
||||
TYPE_LONG_CONST ,
|
||||
TYPE_SITERULE , // 29
|
||||
TYPE_SAFEBUF ,
|
||||
TYPE_DIFFBOT_DROPDOWN
|
||||
TYPE_SAFEBUF
|
||||
//TYPE_DIFFBOT_DROPDOWN
|
||||
};
|
||||
|
||||
//forward decls to make compiler happy:
|
||||
|
85
XmlDoc.cpp
85
XmlDoc.cpp
@ -11860,28 +11860,34 @@ skip:
|
||||
THIS->m_masterLoop ( THIS->m_masterState );
|
||||
}
|
||||
|
||||
long *XmlDoc::getDiffbotApiNum ( ) {
|
||||
SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
|
||||
|
||||
if ( m_diffbotApiNumValid )
|
||||
return &m_diffbotApiNum;
|
||||
if ( m_diffbotApiUrlValid )
|
||||
return &m_diffbotApiUrl;
|
||||
|
||||
// if we are a diffbot json object, do not re-send to diffbot!
|
||||
if ( m_isDiffbotJSONObject ) {
|
||||
m_diffbotApiNum = DBA_NONE;
|
||||
m_diffbotApiNumValid = true;
|
||||
return &m_diffbotApiNum;
|
||||
//m_diffbotApiNum = DBA_NONE;
|
||||
m_diffbotApiUrlValid = true;
|
||||
return &m_diffbotApiUrl;
|
||||
}
|
||||
|
||||
// this now automatically sets m_diffbotApiUrl and m_diffbotApiUrlValid
|
||||
// in case the url filters table changes while spidering this!!!
|
||||
// gotta be careful of that.
|
||||
long *ufn = getUrlFilterNum();
|
||||
if ( ! ufn || ufn == (void *)-1 ) return (long *)ufn;
|
||||
if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;
|
||||
|
||||
m_diffbotApiNum = m_cr->m_spiderDiffbotApiNum[*ufn];
|
||||
// ensure it does set it!
|
||||
if ( ! m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
//m_diffbotApiNum = m_cr->m_spiderDiffbotApiNum[*ufn];
|
||||
|
||||
// sanity check
|
||||
if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
|
||||
//if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
m_diffbotApiNumValid = true;
|
||||
return &m_diffbotApiNum;
|
||||
//m_diffbotApiNumValid = true;
|
||||
return &m_diffbotApiUrl;
|
||||
}
|
||||
|
||||
// the diffbot reply will be a list of json objects we want to index
|
||||
@ -11903,12 +11909,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
return &m_diffbotReply;
|
||||
}
|
||||
|
||||
// check the url filters table to see if diffbot api is specified
|
||||
long *an = getDiffbotApiNum();
|
||||
if ( ! an || an == (void *)-1 ) return (SafeBuf *)an;
|
||||
// . check the url filters table to see if diffbot api is specified
|
||||
// . just return "\0" if none, but NULL means error i guess
|
||||
SafeBuf *au = getDiffbotApiUrl();
|
||||
if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
|
||||
|
||||
// if "NONE" is in the diffbot api drop down, do not send to diffbot
|
||||
if ( *an == DBA_NONE ) {
|
||||
// if no url, assume do not access diffbot
|
||||
if ( au->length() <= 0 ) {
|
||||
m_diffbotReplyValid = true;
|
||||
return &m_diffbotReply;
|
||||
}
|
||||
@ -11975,7 +11982,15 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
// from this url
|
||||
SafeBuf diffbotUrl;
|
||||
// TODO: make sure "api" works as hostname for not just product...
|
||||
diffbotUrl.safePrintf("http://www.diffbot.com/api/");
|
||||
//diffbotUrl.safePrintf("http://www.diffbot.com/");
|
||||
// skip extra '/'?
|
||||
//char *api = au->getBufStart();
|
||||
//long apiLen = au->length();
|
||||
//if ( api && api[0] == '/' ) { api++; apiLen--; }
|
||||
// append the custom url. i.e. /api/analyze?mode=auto&u=
|
||||
//if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
|
||||
// store the api url into here
|
||||
diffbotUrl.safeMemcpy ( au );
|
||||
|
||||
// . m_diffbotApi Is like "article" or "product" etc.
|
||||
// . if classify is true we always return the classification
|
||||
@ -11985,6 +12000,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
// if there is no json objects of the specified page type, "api"
|
||||
// . BUT if api is "all" return all types of json objects
|
||||
// . SHOULD we return "type" in the json output?
|
||||
/*
|
||||
if ( *an == DBA_ALL )
|
||||
diffbotUrl.safePrintf("analyze?mode=auto&" );
|
||||
else if ( *an == DBA_ARTICLE_FORCE )
|
||||
@ -12007,19 +12023,20 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
log("build: unknown diffbot api num = %li. assuming all",*an );
|
||||
diffbotUrl.safePrintf("analyze?mode=auto&" );
|
||||
}
|
||||
*/
|
||||
|
||||
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
|
||||
diffbotUrl.safePrintf("token=%s",m_cr->m_diffbotToken.getBufStart());
|
||||
diffbotUrl.safePrintf("&token=%s",m_cr->m_diffbotToken.getBufStart());
|
||||
diffbotUrl.safePrintf("&url=");
|
||||
// give diffbot the url to process
|
||||
diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
|
||||
// append this just in case the next thing doesn't have it.
|
||||
if ( m_cr->m_diffbotApiQueryString.length() &&
|
||||
m_cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
|
||||
diffbotUrl.pushChar('&');
|
||||
//if ( m_cr->m_diffbotApiQueryString.length() &&
|
||||
// m_cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
|
||||
// diffbotUrl.pushChar('&');
|
||||
// then user provided parms that are dependent on if it is an
|
||||
// article, product, etc. like "&dontstripads=1" or whatever
|
||||
diffbotUrl.safeStrcpy ( m_cr->m_diffbotApiQueryString.getBufStart());
|
||||
//diffbotUrl.safeStrcpy ( m_cr->m_diffbotApiQueryString.getBufStart());
|
||||
// null term it
|
||||
diffbotUrl.nullTerm();
|
||||
|
||||
@ -15147,6 +15164,17 @@ long *XmlDoc::getUrlFilterNum ( ) {
|
||||
// store it
|
||||
m_urlFilterNum = ufn;
|
||||
m_urlFilterNumValid = true;
|
||||
|
||||
// set this too in case the url filters table changes while
|
||||
// we are spidering this and a row is inserted or deleted or something
|
||||
SafeBuf *yy = &m_cr->m_spiderDiffbotApiUrl[ufn];
|
||||
// copy to ours
|
||||
m_diffbotApiUrl.safeMemcpy ( yy );
|
||||
// ensure null term
|
||||
m_diffbotApiUrl.nullTerm();
|
||||
m_diffbotApiUrlValid = true;
|
||||
|
||||
|
||||
return &m_urlFilterNum;
|
||||
}
|
||||
|
||||
@ -15881,7 +15909,9 @@ bool XmlDoc::logIt ( ) {
|
||||
sb.safePrintf("urlfilternum=%li ",(long)m_urlFilterNum);
|
||||
|
||||
|
||||
if ( m_diffbotApiNumValid && m_diffbotApiNum != DBA_NONE )
|
||||
if ( m_diffbotApiUrlValid &&
|
||||
m_diffbotApiUrl.getBufStart() &&
|
||||
m_diffbotApiUrl.getBufStart()[0] )
|
||||
sb.safePrintf("diffbotjsonobjects=%li ",
|
||||
(long)m_diffbotJSONCount);
|
||||
|
||||
@ -17443,8 +17473,9 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
od->m_useTagdb = false;
|
||||
// do not use diffbot for old doc since we call
|
||||
// od->nukeJSONObjects() below
|
||||
od->m_diffbotApiNumValid = true;
|
||||
od->m_diffbotApiNum = DBA_NONE;
|
||||
od->m_diffbotApiUrlValid = true;
|
||||
// api url should be empty by default
|
||||
//od->m_diffbotApiNum = DBA_NONE;
|
||||
// if we are doing diffbot stuff, we are still indexing this
|
||||
// page, so we need to get the old doc meta list
|
||||
oldList = od->getMetaList ( true );
|
||||
@ -17752,8 +17783,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
return (char *)linkSiteHashes;
|
||||
}
|
||||
|
||||
long *an = getDiffbotApiNum();
|
||||
if ( ! an || an == (void *)-1 ) return (char *)an;
|
||||
//SafeBuf *au = getDiffbotApiUrl();
|
||||
//if ( ! au || au == (void *)-1 ) return (char *)au;
|
||||
|
||||
|
||||
// test json parser
|
||||
|
28
XmlDoc.h
28
XmlDoc.h
@ -775,7 +775,8 @@ class XmlDoc {
|
||||
char *getIsCompromised ( ) ;
|
||||
char *getIsNoArchive ( ) ;
|
||||
long *getUrlFilterNum();
|
||||
long *getDiffbotApiNum();
|
||||
//long *getDiffbotApiNum();
|
||||
SafeBuf *getDiffbotApiUrl();
|
||||
long long **getAdVector ( ) ;
|
||||
char *getIsLinkSpam ( ) ;
|
||||
char *getIsHijacked();
|
||||
@ -1248,11 +1249,12 @@ class XmlDoc {
|
||||
bool m_baseUrlValid;
|
||||
bool m_replyValid;
|
||||
bool m_diffbotReplyValid;
|
||||
bool m_diffbotUrlCrawlPatternMatchValid;
|
||||
bool m_diffbotUrlProcessPatternMatchValid;
|
||||
bool m_diffbotPageProcessPatternMatchValid;
|
||||
//bool m_diffbotUrlCrawlPatternMatchValid;
|
||||
//bool m_diffbotUrlProcessPatternMatchValid;
|
||||
//bool m_diffbotPageProcessPatternMatchValid;
|
||||
//bool m_useDiffbotValid;
|
||||
bool m_diffbotApiNumValid;
|
||||
//bool m_diffbotApiNumValid;
|
||||
bool m_diffbotApiUrlValid;
|
||||
bool m_crawlInfoValid;
|
||||
bool m_isPageParserValid;
|
||||
bool m_imageUrlValid;
|
||||
@ -1507,16 +1509,18 @@ class XmlDoc {
|
||||
char m_diffbotSavedChar;
|
||||
SafeBuf m_diffbotReply;
|
||||
long m_diffbotReplyError;
|
||||
bool m_diffbotUrlCrawlPatternMatch;
|
||||
bool m_diffbotUrlProcessPatternMatch;
|
||||
bool m_diffbotPageProcessPatternMatch;
|
||||
long m_diffbotApiNum;
|
||||
//bool m_diffbotUrlCrawlPatternMatch;
|
||||
//bool m_diffbotUrlProcessPatternMatch;
|
||||
//bool m_diffbotPageProcessPatternMatch;
|
||||
//long m_diffbotApiNum;
|
||||
//bool m_useDiffbot;
|
||||
// url to access diffbot with
|
||||
SafeBuf m_diffbotApiUrl;
|
||||
|
||||
SafeBuf *getDiffbotReply ( ) ;
|
||||
bool doesUrlMatchDiffbotCrawlPattern() ;
|
||||
bool doesUrlMatchDiffbotProcessPattern() ;
|
||||
bool doesPageContentMatchDiffbotProcessPattern() ;
|
||||
//bool doesUrlMatchDiffbotCrawlPattern() ;
|
||||
//bool doesUrlMatchDiffbotProcessPattern() ;
|
||||
//bool doesPageContentMatchDiffbotProcessPattern() ;
|
||||
char *hashJSON ( HashTableX *table );
|
||||
long *nukeJSONObjects ( ) ;
|
||||
long m_joc;
|
||||
|
6
gb.conf
6
gb.conf
@ -57,7 +57,7 @@
|
||||
<doNarrowSearch>0</>
|
||||
|
||||
# Overrides all spidering for all collections on just this host.
|
||||
<localSpideringEnabled>0</>
|
||||
<localSpideringEnabled>1</>
|
||||
|
||||
# Overrides all add urls for all collections on just this host.
|
||||
<localAddUrlEnabled>1</>
|
||||
@ -73,10 +73,10 @@
|
||||
<qaSearchTestEnabled>1</>
|
||||
|
||||
# Enable spidering on all hosts
|
||||
<allSpidersOn>0</>
|
||||
<allSpidersOn>1</>
|
||||
|
||||
# Disable spidering on all hosts
|
||||
<allSpidersOff>0</>
|
||||
<allSpidersOff>1</>
|
||||
|
||||
# Serves ads unless pure=1 is in cgi parms.
|
||||
<adFeedEnabled>0</>
|
||||
|
Loading…
Reference in New Issue
Block a user