diff --git a/CollectionRec.cpp b/CollectionRec.cpp
index 37560b33..86b8523d 100644
--- a/CollectionRec.cpp
+++ b/CollectionRec.cpp
@@ -317,7 +317,10 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
m_spidersEnabled[n] = 1;
m_numRegExs7++;
- m_spiderDiffbotApiNum[n] = 1;
+ //m_spiderDiffbotApiNum[n] = 1;
+ //m_numRegExs11++;
+ m_spiderDiffbotApiUrl[n].set("");
+ m_spiderDiffbotApiUrl[n].nullTerm();
m_numRegExs11++;
}
diff --git a/CollectionRec.h b/CollectionRec.h
index a196e8c8..d4c81828 100644
--- a/CollectionRec.h
+++ b/CollectionRec.h
@@ -389,11 +389,11 @@ class CollectionRec {
// an alternate name for the collection. we tend to create
// collection names as a random sequence of hex digits. this
// will allow a user to give them an alternate name.
- SafeBuf m_collectionNameAlias;
+ //SafeBuf m_collectionNameAlias;
//SafeBuf m_diffbotSeed;
// this will be NULL or "none" to not pass off to diffbot
//SafeBuf m_diffbotApi;
- SafeBuf m_diffbotApiQueryString;
+ SafeBuf m_diffbotApiList;//QueryString;
//SafeBuf m_diffbotUrlCrawlPattern;
//SafeBuf m_diffbotUrlProcessPattern;
//SafeBuf m_diffbotPageProcessPattern;
@@ -403,9 +403,9 @@ class CollectionRec {
char m_isCustomCrawl;
//char m_isDiffbotCollection;
// format of output. "csv" or "xml" or "json" or null
- SafeBuf m_diffbotFormat;
+ //SafeBuf m_diffbotFormat;
// what fields to return in the json output: (api dependent)
- SafeBuf m_diffbotFields;
+ //SafeBuf m_diffbotFields;
long long m_diffbotMaxToCrawl;
long long m_diffbotMaxToProcess;
long long m_diffbotCrawlStartTime;
@@ -466,8 +466,11 @@ class CollectionRec {
// should urls in this queue be sent to diffbot for processing
// when we are trying to index them?
+ //long m_numRegExs11;
+ //char m_spiderDiffbotApiNum [ MAX_FILTERS ];
+
long m_numRegExs11;
- char m_spiderDiffbotApiNum [ MAX_FILTERS ];
+ SafeBuf m_spiderDiffbotApiUrl [ MAX_FILTERS ];
// dummy?
long m_numRegExs9;
diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp
index f2eb4ca3..c7f053ef 100644
--- a/PageCrawlBot.cpp
+++ b/PageCrawlBot.cpp
@@ -34,6 +34,7 @@ CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) ;
CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) ;
//bool isAliasUnique ( CollectionRec *cr , char *token , char *alias ) ;
+/*
char *g_diffbotFields [] = {
"Unused-ERROR",
"None",
@@ -46,12 +47,12 @@ char *g_diffbotFields [] = {
"Image (autodetect)",
"FrontPage (force)",
"FrontPage (autodetect)",
-
//
// last field must be empty. add new fields above this.
//
NULL
};
+*/
/*
class StateNC {
@@ -918,7 +919,7 @@ void StateCD::gotRdbList ( ) {
sb.safeMemcpy(mime.getMime(),mime.getMimeLen() );
}
- CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
+ //CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
// we set this to true below if any one shard has more spiderdb
// records left to read
@@ -933,8 +934,9 @@ void StateCD::gotRdbList ( ) {
RdbList *list = &m_lists[i];
// get the format
- char *format = cr->m_diffbotFormat.getBufStart();
- if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
+ //char *format = cr->m_diffbotFormat.getBufStart();
+ //if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
+ char *format = NULL;
char *ek = list->getEndKey();
@@ -1684,6 +1686,11 @@ static class HelpItem s_his[] = {
{"maxtoprocess", "Specify max pages to successfully process through "
"diffbot"},
{"urt","Use robots.txt?"},
+ {"dbapilist","Special list of diffbot API urls. The URL Filters "
+ "will display these options in a drop down menu. "
+ "Example (unencoded): "
+ "&dbapilist=All|http://www.diffbot.com/api/analyze?mode=auto,"
+ "Article (force)|http://www.diffbot.com/api/article?"},
{"fe[N]","Filter expression #N. The first expression in the url "
"filters table is 0. But if N is 0, leave N out, only specify it "
"if N is > 0. Example &fe=onsamedomain to change the expression in "
@@ -1695,7 +1702,10 @@ static class HelpItem s_his[] = {
{"mspi[N]","Max outstanding spiders for this IP."},
{"xg[N]","Wait this many milliseconds between spiders of same IP."},
{"fsp[N]","Spider priority. Higher priorities spidered first. Can be from 0 to 127. But -3 means to ignore the URL. -2 means the URL is banned because it comes from an evil site."},
- {"dapi[N]","Diffbot api number. Process through this diffbot api."},
+ {"dapi[N]","Diffbot API Url. This is a string. Usually it "
+ "corresponds to dbapilist parm above. But it is the url we use when "
+ "accessing diffbot for this url filter. "
+ "Example (unencoded): &dapi2=http://www.diffbot.com/api/article?"},
{"injecturl","Specify a seed url to inject."},
{"urldata","A huge string of whitespace separated URLs to add to "
"spiderdb for crawling."},
@@ -3164,7 +3174,7 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
cr->m_spiderIpMaxSpiders[i] = 3; // keep it respectful
cr->m_spidersEnabled [i] = 1;
cr->m_spiderFreqs [i] = 7.0;
- cr->m_spiderDiffbotApiNum[i] = DBA_NONE; // 1
+ cr->m_spiderDiffbotApiUrl[i].purge();// = DBA_NONE; // 1
}
diff --git a/PageCrawlBot.h b/PageCrawlBot.h
index f1cf69e2..6a42ed97 100644
--- a/PageCrawlBot.h
+++ b/PageCrawlBot.h
@@ -3,6 +3,7 @@
#define CRAWLBOT_H
// values for the diffbot dropdown
+/*
#define DBA_NONE 0
#define DBA_ALL 1
#define DBA_ARTICLE_FORCE 2
@@ -17,6 +18,7 @@
// add new fields to END of list since i think we store the
// field we use as a number in the coll.conf, starting at 0
extern char *g_diffbotFields [];
+*/
bool sendPageCrawlbot ( TcpSocket *s , HttpRequest *hr );
diff --git a/Parms.cpp b/Parms.cpp
index c7a8c090..7aed6df7 100644
--- a/Parms.cpp
+++ b/Parms.cpp
@@ -222,7 +222,7 @@ unsigned long Parms::calcChecksum() {
if ( m->m_type == TYPE_BOOL2 ) size = 1;
if ( m->m_type == TYPE_PRIORITY ) size = 1;
if ( m->m_type == TYPE_PRIORITY2 ) size = 1;
- if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
+ //if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1;
if ( m->m_type == TYPE_RETRIES ) size = 1;
if ( m->m_type == TYPE_TIME ) size = 6;
@@ -1064,16 +1064,59 @@ char *printDropDown ( long n , char *p, char *pend, char *name, long select,
}
*/
-bool printDiffbotDropDown ( long n , SafeBuf *sb , char *name , long select ) {
- sb->safePrintf ( "");
return true;
}
@@ -2101,18 +2144,12 @@ bool Parms::printParm ( SafeBuf* sb,
printDropDown ( MAX_SPIDER_PRIORITIES , sb , cgi , *s ,
true , true );
}
- else if ( t == TYPE_DIFFBOT_DROPDOWN ) {
- // just show the parm name and value if printing in json
- if ( isJSON ) {
- // convert diffbot # to string
- long apiNum = (long)*s;
- char *str = g_diffbotFields [apiNum];
- sb->safePrintf("\"%s-str\":\"%s\",\n",cgi,str);
- sb->safePrintf("\"%s\":%li,\n",cgi,apiNum);
- }
- else
- printDiffbotDropDown ( 8, sb , cgi , *s );
- }
+ // this url filters parm is an array of SAFEBUFs now, so each is
+ // a string and that string is the diffbot api url to use.
+ // the string is empty or zero length to indicate none.
+ //else if ( t == TYPE_DIFFBOT_DROPDOWN ) {
+ // char *xx=NULL;*xx=0;
+ //}
else if ( t == TYPE_RETRIES )
printDropDown ( 4 , sb , cgi , *s , false , false );
else if ( t == TYPE_PRIORITY_BOXES ) {
@@ -2173,6 +2210,31 @@ bool Parms::printParm ( SafeBuf* sb,
sb->dequote ( s , gbstrlen(s) );
sb->safePrintf ("\">");
}
+ // HACK: print a drop down not a textbox for selecting the
+ // m_spiderDiffbotApiUrl[]. we can't just store this selection
+ // as a number because m_diffbotApiList (a string of comma separated
+ // items to select from) can change! it is not a typical dropdown.
+ // so we have to record the actual text we selected, which is
+ // basically the diffbot api url. this is because john can add
+ // custom diffbot api urls at anytime to the list.
+ else if ( t == TYPE_SAFEBUF && strcmp(m->m_cgi,"dapi") == 0 ) {
+ SafeBuf *sx = (SafeBuf *)s;
+ // just show the parm name and value if printing in json
+ if ( isJSON ) {
+ // this can be empty for the empty row i guess
+ if ( sx->length() ) {
+ // print the parm name, then the api url as a json string
+ sb->safePrintf("\"%s\":\"",cgi);
+ // this is just the url path, not the title
+ // of the menu option... so this would be
+ // like "/api/article?u="
+ sb->safeUtf8ToJSON (sx->getBufStart() );
+ sb->safePrintf("\",\n");
+ }
+ }
+ else
+ printDiffbotDropDown ( sb , cgi , THIS , sx );
+ }
else if ( t == TYPE_SAFEBUF ) {
long size = m->m_size;
// give regular expression box on url filters page more room
@@ -2842,7 +2904,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
t == TYPE_BOOL2 ||
t == TYPE_PRIORITY ||
t == TYPE_PRIORITY2 ||
- t == TYPE_DIFFBOT_DROPDOWN ||
+ //t == TYPE_DIFFBOT_DROPDOWN ||
t == TYPE_PRIORITY_BOXES ||
t == TYPE_RETRIES ||
t == TYPE_FILTER ) {
@@ -3047,10 +3109,10 @@ void Parms::setToDefault ( char *THIS ) {
// . this is a backwards-compatibility hack since this new parm
// will not be in old coll.conf files and will not be properly
// initialize when displaying a url filter row.
- if ( THIS != (char *)&g_conf ) {
- CollectionRec *cr = (CollectionRec *)THIS;
- memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS);
- }
+ //if ( THIS != (char *)&g_conf ) {
+ // CollectionRec *cr = (CollectionRec *)THIS;
+ // memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS);
+ //}
for ( long i = 0 ; i < m_numParms ; i++ ) {
Parm *m = &m_parms[i];
@@ -3662,7 +3724,7 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) {
if ( t == TYPE_CHAR || t == TYPE_BOOL ||
t == TYPE_CHECKBOX ||
t == TYPE_PRIORITY || t == TYPE_PRIORITY2 ||
- t == TYPE_DIFFBOT_DROPDOWN ||
+ //t == TYPE_DIFFBOT_DROPDOWN ||
t == TYPE_PRIORITY_BOXES || t == TYPE_RETRIES ||
t == TYPE_RETRIES || t == TYPE_FILTER ||
t == TYPE_BOOL2 || t == TYPE_CHAR2 )
@@ -3762,7 +3824,7 @@ bool Parms::serialize( char *buf, long *bufSize ) {
if ( m->m_type == TYPE_BOOL2 ) size = 1;
if ( m->m_type == TYPE_PRIORITY ) size = 1;
if ( m->m_type == TYPE_PRIORITY2 ) size = 1;
- if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
+ //if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1;
if ( m->m_type == TYPE_RETRIES ) size = 1;
if ( m->m_type == TYPE_TIME ) size = 6;
@@ -8154,12 +8216,25 @@ void Parms::init ( ) {
m++;
*/
- m->m_cgi = "dbapiqs";
- m->m_xml = "diffbotApiQueryString";
- m->m_off = (char *)&cr.m_diffbotApiQueryString - x;
+ m->m_cgi = "dbapilist";
+ m->m_xml = "diffbotApiList";//QueryString";
+ m->m_off = (char *)&cr.m_diffbotApiList - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
- m->m_def = "";
+ // default drop down items: comma separated "title|diffbot api url" pairs.
+ // XmlDoc.cpp copies the selected url into m_diffbotApiUrl when it first
+ // computes "ufn", in case the url filters table changes afterwards.
+ m->m_def =
+ "All|http://www.diffbot.com/api/analyze?mode=auto,"
+ "Article (autodetect)|http://www.diffbot.com/api/analyze?mode=article,"
+ "Article (force)|http://www.diffbot.com/api/article?,"
+ "Product (autodetect)|http://www.diffbot.com/api/analyze?mode=product,"
+ "Product (force)|http://www.diffbot.com/api/product?,"
+ "Image (autodetect)|http://www.diffbot.com/api/analyze?mode=image,"
+ "Image (force)|http://www.diffbot.com/api/image?,"
+ "FrontPage (autodetect)|http://www.diffbot.com/api/analyze?mode=frontpage,"
+ "FrontPage (force)|http://www.diffbot.com/api/frontpage?"
+ ;
m++;
/*
@@ -12746,10 +12821,16 @@ void Parms::init ( ) {
m->m_cgi = "dapi";
m->m_xml = "diffbotAPI";
m->m_max = MAX_FILTERS;
- m->m_off = (char *)cr.m_spiderDiffbotApiNum - x;
- m->m_type = TYPE_DIFFBOT_DROPDOWN;
- m->m_def = "0";
+ m->m_off = (char *)cr.m_spiderDiffbotApiUrl - x;
+ // HACK: we print a dropdown for this but the value is a string
+ // because the items in the drop down can change so we can't store
+ // an item # here, it has to be a string, i.e. the diffbot api url.
+ // john might add a new custom api to m_diffbotApiList at any time.
+ // so we select the item in the drop down if it matches THIS string.
+ m->m_type = TYPE_SAFEBUF;//DIFFBOT_DROPDOWN;
+ m->m_def = "";
m->m_page = PAGE_FILTERS;
+ m->m_size = sizeof(SafeBuf);
m->m_rowid = 1;
m->m_addin = 1; // "insert" follows?
m++;
@@ -15462,7 +15543,7 @@ void Parms::init ( ) {
if ( t == TYPE_CHECKBOX ) size = 1;
if ( t == TYPE_PRIORITY ) size = 1;
if ( t == TYPE_PRIORITY2 ) size = 1;
- if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1;
+ //if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( t == TYPE_PRIORITY_BOXES ) size = 1;
if ( t == TYPE_RETRIES ) size = 1;
if ( t == TYPE_TIME ) size = 6;
diff --git a/Parms.h b/Parms.h
index b53b7118..3f42804c 100644
--- a/Parms.h
+++ b/Parms.h
@@ -51,8 +51,8 @@ enum {
TYPE_MONOM2 ,
TYPE_LONG_CONST ,
TYPE_SITERULE , // 29
- TYPE_SAFEBUF ,
- TYPE_DIFFBOT_DROPDOWN
+ TYPE_SAFEBUF
+ //TYPE_DIFFBOT_DROPDOWN
};
//forward decls to make compiler happy:
diff --git a/XmlDoc.cpp b/XmlDoc.cpp
index 5a965d39..4a3deaa3 100644
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@@ -11860,28 +11860,34 @@ skip:
THIS->m_masterLoop ( THIS->m_masterState );
}
-long *XmlDoc::getDiffbotApiNum ( ) {
+SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
- if ( m_diffbotApiNumValid )
- return &m_diffbotApiNum;
+ if ( m_diffbotApiUrlValid )
+ return &m_diffbotApiUrl;
// if we are a diffbot json object, do not re-send to diffbot!
if ( m_isDiffbotJSONObject ) {
- m_diffbotApiNum = DBA_NONE;
- m_diffbotApiNumValid = true;
- return &m_diffbotApiNum;
+ //m_diffbotApiNum = DBA_NONE;
+ m_diffbotApiUrlValid = true;
+ return &m_diffbotApiUrl;
}
+ // getUrlFilterNum() now also sets m_diffbotApiUrl and
+ // m_diffbotApiUrlValid, in case the url filters table changes while
+ // we are spidering this url. gotta be careful of that.
long *ufn = getUrlFilterNum();
- if ( ! ufn || ufn == (void *)-1 ) return (long *)ufn;
+ if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;
- m_diffbotApiNum = m_cr->m_spiderDiffbotApiNum[*ufn];
+ // ensure it does set it!
+ if ( ! m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; }
+
+ //m_diffbotApiNum = m_cr->m_spiderDiffbotApiNum[*ufn];
// sanity check
- if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
+ //if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
- m_diffbotApiNumValid = true;
- return &m_diffbotApiNum;
+ //m_diffbotApiNumValid = true;
+ return &m_diffbotApiUrl;
}
// the diffbot reply will be a list of json objects we want to index
@@ -11903,12 +11909,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
return &m_diffbotReply;
}
- // check the url filters table to see if diffbot api is specified
- long *an = getDiffbotApiNum();
- if ( ! an || an == (void *)-1 ) return (SafeBuf *)an;
+ // . check the url filters table to see if diffbot api is specified
+ // . just return "\0" if none, but NULL means error i guess
+ SafeBuf *au = getDiffbotApiUrl();
+ if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
- // if "NONE" is in the diffbot api drop down, do not send to diffbot
- if ( *an == DBA_NONE ) {
+ // if no url, assume do not access diffbot
+ if ( au->length() <= 0 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
@@ -11975,7 +11982,15 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// from this url
SafeBuf diffbotUrl;
// TODO: make sure "api" works as hostname for not just product...
- diffbotUrl.safePrintf("http://www.diffbot.com/api/");
+ //diffbotUrl.safePrintf("http://www.diffbot.com/");
+ // skip extra '/'?
+ //char *api = au->getBufStart();
+ //long apiLen = au->length();
+ //if ( api && api[0] == '/' ) { api++; apiLen--; }
+ // append the custom url. i.e. /api/analyze?mode=auto&u=
+ //if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
+ // store the api url into here
+ diffbotUrl.safeMemcpy ( au );
// . m_diffbotApi Is like "article" or "product" etc.
// . if classify is true we always return the classification
@@ -11985,6 +12000,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// if there is no json objects of the specified page type, "api"
// . BUT if api is "all" return all types of json objects
// . SHOULD we return "type" in the json output?
+ /*
if ( *an == DBA_ALL )
diffbotUrl.safePrintf("analyze?mode=auto&" );
else if ( *an == DBA_ARTICLE_FORCE )
@@ -12007,19 +12023,20 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
log("build: unknown diffbot api num = %li. assuming all",*an );
diffbotUrl.safePrintf("analyze?mode=auto&" );
}
+ */
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
- diffbotUrl.safePrintf("token=%s",m_cr->m_diffbotToken.getBufStart());
+ diffbotUrl.safePrintf("&token=%s",m_cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("&url=");
// give diffbot the url to process
diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
// append this just in case the next thing doesn't have it.
- if ( m_cr->m_diffbotApiQueryString.length() &&
- m_cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
- diffbotUrl.pushChar('&');
+ //if ( m_cr->m_diffbotApiQueryString.length() &&
+ // m_cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
+ // diffbotUrl.pushChar('&');
// then user provided parms that are dependent on if it is an
// article, product, etc. like "&dontstripads=1" or whatever
- diffbotUrl.safeStrcpy ( m_cr->m_diffbotApiQueryString.getBufStart());
+ //diffbotUrl.safeStrcpy ( m_cr->m_diffbotApiQueryString.getBufStart());
// null term it
diffbotUrl.nullTerm();
@@ -15147,6 +15164,17 @@ long *XmlDoc::getUrlFilterNum ( ) {
// store it
m_urlFilterNum = ufn;
m_urlFilterNumValid = true;
+
+ // set this too in case the url filters table changes while
+ // we are spidering this and a row is inserted or deleted or something
+ SafeBuf *yy = &m_cr->m_spiderDiffbotApiUrl[ufn];
+ // copy to ours
+ m_diffbotApiUrl.safeMemcpy ( yy );
+ // ensure null term
+ m_diffbotApiUrl.nullTerm();
+ m_diffbotApiUrlValid = true;
+
+
return &m_urlFilterNum;
}
@@ -15881,7 +15909,9 @@ bool XmlDoc::logIt ( ) {
sb.safePrintf("urlfilternum=%li ",(long)m_urlFilterNum);
- if ( m_diffbotApiNumValid && m_diffbotApiNum != DBA_NONE )
+ if ( m_diffbotApiUrlValid &&
+ m_diffbotApiUrl.getBufStart() &&
+ m_diffbotApiUrl.getBufStart()[0] )
sb.safePrintf("diffbotjsonobjects=%li ",
(long)m_diffbotJSONCount);
@@ -17443,8 +17473,9 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
od->m_useTagdb = false;
// do not use diffbot for old doc since we call
// od->nukeJSONObjects below()
- od->m_diffbotApiNumValid = true;
- od->m_diffbotApiNum = DBA_NONE;
+ od->m_diffbotApiUrlValid = true;
+ // api url should be empty by default
+ //od->m_diffbotApiNum = DBA_NONE;
// if we are doing diffbot stuff, we are still indexing this
// page, so we need to get the old doc meta list
oldList = od->getMetaList ( true );
@@ -17752,8 +17783,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
return (char *)linkSiteHashes;
}
- long *an = getDiffbotApiNum();
- if ( ! an || an == (void *)-1 ) return (char *)an;
+ //SafeBuf *au = getDiffbotApiUrl();
+ //if ( ! au || au == (void *)-1 ) return (char *)au;
// test json parser
diff --git a/XmlDoc.h b/XmlDoc.h
index 1e386933..51b5a427 100644
--- a/XmlDoc.h
+++ b/XmlDoc.h
@@ -775,7 +775,8 @@ class XmlDoc {
char *getIsCompromised ( ) ;
char *getIsNoArchive ( ) ;
long *getUrlFilterNum();
- long *getDiffbotApiNum();
+ //long *getDiffbotApiNum();
+ SafeBuf *getDiffbotApiUrl();
long long **getAdVector ( ) ;
char *getIsLinkSpam ( ) ;
char *getIsHijacked();
@@ -1248,11 +1249,12 @@ class XmlDoc {
bool m_baseUrlValid;
bool m_replyValid;
bool m_diffbotReplyValid;
- bool m_diffbotUrlCrawlPatternMatchValid;
- bool m_diffbotUrlProcessPatternMatchValid;
- bool m_diffbotPageProcessPatternMatchValid;
+ //bool m_diffbotUrlCrawlPatternMatchValid;
+ //bool m_diffbotUrlProcessPatternMatchValid;
+ //bool m_diffbotPageProcessPatternMatchValid;
//bool m_useDiffbotValid;
- bool m_diffbotApiNumValid;
+ //bool m_diffbotApiNumValid;
+ bool m_diffbotApiUrlValid;
bool m_crawlInfoValid;
bool m_isPageParserValid;
bool m_imageUrlValid;
@@ -1507,16 +1509,18 @@ class XmlDoc {
char m_diffbotSavedChar;
SafeBuf m_diffbotReply;
long m_diffbotReplyError;
- bool m_diffbotUrlCrawlPatternMatch;
- bool m_diffbotUrlProcessPatternMatch;
- bool m_diffbotPageProcessPatternMatch;
- long m_diffbotApiNum;
+ //bool m_diffbotUrlCrawlPatternMatch;
+ //bool m_diffbotUrlProcessPatternMatch;
+ //bool m_diffbotPageProcessPatternMatch;
+ //long m_diffbotApiNum;
//bool m_useDiffbot;
+ // url to access diffbot with
+ SafeBuf m_diffbotApiUrl;
SafeBuf *getDiffbotReply ( ) ;
- bool doesUrlMatchDiffbotCrawlPattern() ;
- bool doesUrlMatchDiffbotProcessPattern() ;
- bool doesPageContentMatchDiffbotProcessPattern() ;
+ //bool doesUrlMatchDiffbotCrawlPattern() ;
+ //bool doesUrlMatchDiffbotProcessPattern() ;
+ //bool doesPageContentMatchDiffbotProcessPattern() ;
char *hashJSON ( HashTableX *table );
long *nukeJSONObjects ( ) ;
long m_joc;
diff --git a/gb.conf b/gb.conf
index 2653c9d5..0791635c 100644
--- a/gb.conf
+++ b/gb.conf
@@ -57,7 +57,7 @@
0>
# Overrides all spidering for all collections on just this host.
-0>
+1>
# Overrides all add urls for all collections on just this host.
1>
@@ -73,10 +73,10 @@
1>
# Enable spidering on all hosts
-0>
+1>
# Disable spidering on all hosts
-0>
+1>
# Serves ads unless pure=1 is in cgi parms.
0>