customizable api list in url filters

This commit is contained in:
mwells 2013-09-30 09:18:22 -06:00
parent 0edcbcc7d8
commit 20952eedbe
9 changed files with 227 additions and 93 deletions

View File

@ -317,7 +317,10 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
m_spidersEnabled[n] = 1;
m_numRegExs7++;
m_spiderDiffbotApiNum[n] = 1;
//m_spiderDiffbotApiNum[n] = 1;
//m_numRegExs11++;
m_spiderDiffbotApiUrl[n].set("");
m_spiderDiffbotApiUrl[n].nullTerm();
m_numRegExs11++;
}

View File

@ -389,11 +389,11 @@ class CollectionRec {
// an alternate name for the collection. we tend to create
// collection names as a random sequence of hex digits. this
// will allow a user to give them an alternate name.
SafeBuf m_collectionNameAlias;
//SafeBuf m_collectionNameAlias;
//SafeBuf m_diffbotSeed;
// this will be NULL or "none" to not pass off to diffbot
//SafeBuf m_diffbotApi;
SafeBuf m_diffbotApiQueryString;
SafeBuf m_diffbotApiList;//QueryString;
//SafeBuf m_diffbotUrlCrawlPattern;
//SafeBuf m_diffbotUrlProcessPattern;
//SafeBuf m_diffbotPageProcessPattern;
@ -403,9 +403,9 @@ class CollectionRec {
char m_isCustomCrawl;
//char m_isDiffbotCollection;
// format of output. "csv" or "xml" or "json" or null
SafeBuf m_diffbotFormat;
//SafeBuf m_diffbotFormat;
// what fields to return in the json output: (api dependent)
SafeBuf m_diffbotFields;
//SafeBuf m_diffbotFields;
long long m_diffbotMaxToCrawl;
long long m_diffbotMaxToProcess;
long long m_diffbotCrawlStartTime;
@ -466,8 +466,11 @@ class CollectionRec {
// should urls in this queue be sent to diffbot for processing
// when we are trying to index them?
//long m_numRegExs11;
//char m_spiderDiffbotApiNum [ MAX_FILTERS ];
long m_numRegExs11;
char m_spiderDiffbotApiNum [ MAX_FILTERS ];
SafeBuf m_spiderDiffbotApiUrl [ MAX_FILTERS ];
// dummy?
long m_numRegExs9;

View File

@ -34,6 +34,7 @@ CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) ;
CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) ;
//bool isAliasUnique ( CollectionRec *cr , char *token , char *alias ) ;
/*
char *g_diffbotFields [] = {
"Unused-ERROR",
"None",
@ -46,12 +47,12 @@ char *g_diffbotFields [] = {
"Image (autodetect)",
"FrontPage (force)",
"FrontPage (autodetect)",
//
// last field must be empty. add new fields above this.
//
NULL
};
*/
/*
class StateNC {
@ -918,7 +919,7 @@ void StateCD::gotRdbList ( ) {
sb.safeMemcpy(mime.getMime(),mime.getMimeLen() );
}
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
// we set this to true below if any one shard has more spiderdb
// records left to read
@ -933,8 +934,9 @@ void StateCD::gotRdbList ( ) {
RdbList *list = &m_lists[i];
// get the format
char *format = cr->m_diffbotFormat.getBufStart();
if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
//char *format = cr->m_diffbotFormat.getBufStart();
//if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
char *format = NULL;
char *ek = list->getEndKey();
@ -1684,6 +1686,11 @@ static class HelpItem s_his[] = {
{"maxtoprocess", "Specify max pages to successfully process through "
"diffbot"},
{"urt","Use robots.txt?"},
{"dbapilist","Special list of diffbot API urls. The URL Filters "
"will display these options in a drop down menu. "
"Example (unencoded): "
"&dbapilist=All|/api/analyze?mode=auto&u=,Article (forced)|/api/"
"article?u="},
{"fe[N]","Filter expression #N. The first expression in the url "
"filters table is 0. But if N is 0, leave N out, only specify it "
"if N is > 0. Example &fe=onsamedomain to change the expression in "
@ -1695,7 +1702,10 @@ static class HelpItem s_his[] = {
{"mspi[N]","Max outstanding spiders for this IP."},
{"xg[N]","Wait this many milliseconds between spiders of same IP."},
{"fsp[N]","Spider priority. Higher priorities spidered first. Can be from 0 to 127. But -3 means to ignore the URL. -2 means the URL is banned because it comes from an evil site."},
{"dapi[N]","Diffbot api number. Process through this diffbot api."},
{"dapi[N]","Diffbot API Url. This is a string. Usually it "
"corresponds to dbapilist parm above. But it is the url we use when "
"accessing diffbot for this url filter. "
"Example (unencoded): &dapi2=/api/article?u="},
{"injecturl","Specify a seed url to inject."},
{"urldata","A huge string of whitespace separated URLs to add to "
"spiderdb for crawling."},
@ -3164,7 +3174,7 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) {
cr->m_spiderIpMaxSpiders[i] = 3; // keep it respectful
cr->m_spidersEnabled [i] = 1;
cr->m_spiderFreqs [i] = 7.0;
cr->m_spiderDiffbotApiNum[i] = DBA_NONE; // 1
cr->m_spiderDiffbotApiUrl[i].purge();// = DBA_NONE; // 1
}

View File

@ -3,6 +3,7 @@
#define CRAWLBOT_H
// values for the diffbot dropdown
/*
#define DBA_NONE 0
#define DBA_ALL 1
#define DBA_ARTICLE_FORCE 2
@ -17,6 +18,7 @@
// add new fields to END of list since i think we store the
// field we use as a number in the coll.conf, starting at 0
extern char *g_diffbotFields [];
*/
bool sendPageCrawlbot ( TcpSocket *s , HttpRequest *hr );

155
Parms.cpp
View File

@ -222,7 +222,7 @@ unsigned long Parms::calcChecksum() {
if ( m->m_type == TYPE_BOOL2 ) size = 1;
if ( m->m_type == TYPE_PRIORITY ) size = 1;
if ( m->m_type == TYPE_PRIORITY2 ) size = 1;
if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
//if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1;
if ( m->m_type == TYPE_RETRIES ) size = 1;
if ( m->m_type == TYPE_TIME ) size = 6;
@ -1064,16 +1064,59 @@ char *printDropDown ( long n , char *p, char *pend, char *name, long select,
}
*/
bool printDiffbotDropDown ( long n , SafeBuf *sb , char *name , long select ) {
sb->safePrintf ( "<select name=%s>", name );
for ( long i = 0 ; i < 100 ; i++ ) {
char *s = "";
char *field = g_diffbotFields[i];
if ( ! field || field[0] == '\0' ) break;
if ( i == select ) s = " selected";
sb->safePrintf ("<option value=%li%s>%s",i,s,field);
// . print the <select> drop down used to pick the diffbot api url of one
//   url filter row
// . "sb" receives the html
// . "name" is the cgi name to give the <select>
// . "THIS" is really a CollectionRec. its m_diffbotApiList holds the menu
//   items formatted as "<title>|<apiUrl>,<title>|<apiUrl>,..."
// . "sx" is the api url currently stored for this row. zero length means
//   "None" is selected.
// . always returns true. prints nothing at all if the api list has no buffer.
bool printDiffbotDropDown ( SafeBuf *sb,char *name,char *THIS , SafeBuf *sx) {
	CollectionRec *cr = (CollectionRec *)THIS;
	// . get the string we have selected
	// . we can't store the selection as an item # because the list of
	//   available items can be changed at any time, so the stored value
	//   is the api url itself
	char *usingApi = sx->getBufStart();
	if ( sx->length() == 0 ) usingApi = NULL;
	// now scan each item in the list. see the setting of
	// "m_def" for "diffbotApiList" to see the comma separated list
	// of default strings.
	char *p = cr->m_diffbotApiList.getBufStart();
	// no list buffer at all? then there is nothing to choose from
	if ( ! p ) return true;
	// print out. cgi is "dapi%li".
	sb->safePrintf("<select name=%s>\n",name);
	// print "none" as the first option
	char *sel = "";
	if ( ! usingApi ) sel = " selected";
	sb->safePrintf("<option value=\"\"%s>None</option>",sel);
	// NOTE(review): titles and urls are copied into the html as-is.
	// a '"' or '<' in a custom item would break the markup. confirm
	// whether the list needs to be html encoded here.
	// the various "diffbot urls" are separated by commas
	for ( ; *p ; ) {
		// point to start of item title
		char *title = p;
		// . the title ends at the '|'
		// . also stop at ',' so that a malformed item without a '|'
		//   does not swallow the item that follows it
		char *titleEnd = p;
		for ( ; *titleEnd && *titleEnd != '|' && *titleEnd != ',' ;
		      titleEnd++ );
		// point to start of url for that item
		char *url = titleEnd;
		if ( *url == '|' ) url++;
		// find end of url
		char *urlEnd = url;
		for ( ; *urlEnd && *urlEnd != ',' ; urlEnd++ );
		long urlLen = urlEnd - url;
		// . do we match it?
		// . must match the WHOLE stored string, not just a prefix of
		//   it, otherwise several options could be marked selected
		//   and an item with an empty url would match everything
		sel = "";
		if ( usingApi &&
		     strncmp(usingApi,url,urlLen)== 0 &&
		     usingApi[urlLen] == '\0' )
			sel = " selected";
		// advance p
		p = urlEnd;
		// skip over comma to get next one
		if ( *p == ',' ) p++;
		// the option value is the api url itself
		sb->safePrintf("<option value=\"");
		sb->safeMemcpy ( url, urlLen );
		sb->safePrintf("\"%s>",sel);
		// print item title
		sb->safeMemcpy ( title , titleEnd - title );
		sb->safePrintf("</option>\n");
	}
	sb->safePrintf("</select>");
	return true;
}
@ -2101,18 +2144,12 @@ bool Parms::printParm ( SafeBuf* sb,
printDropDown ( MAX_SPIDER_PRIORITIES , sb , cgi , *s ,
true , true );
}
else if ( t == TYPE_DIFFBOT_DROPDOWN ) {
// just show the parm name and value if printing in json
if ( isJSON ) {
// convert diffbot # to string
long apiNum = (long)*s;
char *str = g_diffbotFields [apiNum];
sb->safePrintf("\"%s-str\":\"%s\",\n",cgi,str);
sb->safePrintf("\"%s\":%li,\n",cgi,apiNum);
}
else
printDiffbotDropDown ( 8, sb , cgi , *s );
}
// this url filters parm is an array of SAFEBUFs now, so each is
// a string and that string is the diffbot api url to use.
// the string is empty or zero length to indicate none.
//else if ( t == TYPE_DIFFBOT_DROPDOWN ) {
// char *xx=NULL;*xx=0;
//}
else if ( t == TYPE_RETRIES )
printDropDown ( 4 , sb , cgi , *s , false , false );
else if ( t == TYPE_PRIORITY_BOXES ) {
@ -2173,6 +2210,31 @@ bool Parms::printParm ( SafeBuf* sb,
sb->dequote ( s , gbstrlen(s) );
sb->safePrintf ("\">");
}
// HACK: print a drop down not a textbox for selecting the
// m_spiderDiffbotApiUrl[]. we can't just store this selection
// as a number because m_diffbotApiList (a string of comma separated
// items to select from) can change! it is not a typical dropdown.
// so we have to record the actual text we selected, which is
// basically the diffbot api url. this is because john can add
// custom diffbot api urls at anytime to the list.
// the "dapi" parm is a TYPE_SAFEBUF but gets its own printing logic.
// this branch is tested before the generic TYPE_SAFEBUF branch.
else if ( t == TYPE_SAFEBUF && strcmp(m->m_cgi,"dapi") == 0 ) {
// "s" points to the SafeBuf holding the diffbot api url of this row
SafeBuf *sx = (SafeBuf *)s;
// just show the parm name and value if printing in json
if ( isJSON ) {
// this can be empty for the empty row i guess. print nothing then.
if ( sx->length() ) {
// print it as a json string member named after the cgi parm
sb->safePrintf("\"%s\":\"",cgi);
// the value is just the stored api url, not the title
// of the menu option it was selected from
sb->safeUtf8ToJSON (sx->getBufStart() );
sb->safePrintf("\",\n");
}
}
// otherwise print the drop down with the stored url selected
else
printDiffbotDropDown ( sb , cgi , THIS , sx );
}
else if ( t == TYPE_SAFEBUF ) {
long size = m->m_size;
// give regular expression box on url filters page more room
@ -2842,7 +2904,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
t == TYPE_BOOL2 ||
t == TYPE_PRIORITY ||
t == TYPE_PRIORITY2 ||
t == TYPE_DIFFBOT_DROPDOWN ||
//t == TYPE_DIFFBOT_DROPDOWN ||
t == TYPE_PRIORITY_BOXES ||
t == TYPE_RETRIES ||
t == TYPE_FILTER ) {
@ -3047,10 +3109,10 @@ void Parms::setToDefault ( char *THIS ) {
// . this is a backwards-compatibility hack since this new parm
// will not be in old coll.conf files and will not be properly
// initialize when displaying a url filter row.
if ( THIS != (char *)&g_conf ) {
CollectionRec *cr = (CollectionRec *)THIS;
memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS);
}
//if ( THIS != (char *)&g_conf ) {
// CollectionRec *cr = (CollectionRec *)THIS;
// memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS);
//}
for ( long i = 0 ; i < m_numParms ; i++ ) {
Parm *m = &m_parms[i];
@ -3662,7 +3724,7 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) {
if ( t == TYPE_CHAR || t == TYPE_BOOL ||
t == TYPE_CHECKBOX ||
t == TYPE_PRIORITY || t == TYPE_PRIORITY2 ||
t == TYPE_DIFFBOT_DROPDOWN ||
//t == TYPE_DIFFBOT_DROPDOWN ||
t == TYPE_PRIORITY_BOXES || t == TYPE_RETRIES ||
t == TYPE_RETRIES || t == TYPE_FILTER ||
t == TYPE_BOOL2 || t == TYPE_CHAR2 )
@ -3762,7 +3824,7 @@ bool Parms::serialize( char *buf, long *bufSize ) {
if ( m->m_type == TYPE_BOOL2 ) size = 1;
if ( m->m_type == TYPE_PRIORITY ) size = 1;
if ( m->m_type == TYPE_PRIORITY2 ) size = 1;
if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
//if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1;
if ( m->m_type == TYPE_RETRIES ) size = 1;
if ( m->m_type == TYPE_TIME ) size = 6;
@ -8154,12 +8216,25 @@ void Parms::init ( ) {
m++;
*/
m->m_cgi = "dbapiqs";
m->m_xml = "diffbotApiQueryString";
m->m_off = (char *)&cr.m_diffbotApiQueryString - x;
m->m_cgi = "dbapilist";
m->m_xml = "diffbotApiList";//QueryString";
m->m_off = (char *)&cr.m_diffbotApiList - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
// . default menu of diffbot apis for the url filters drop down
// . format is "<title>|<apiUrl>" with items separated by commas
// . each url is used verbatim and "&token=...&url=..." is appended to it
//   when we call diffbot, so each must already contain its '?'
// . XmlDoc.cpp copies the selected url into m_diffbotApiUrl at the same
//   time it first computes "ufn", lest we change the url filters
//   table AFTER it gets the ufn and BEFORE it gets the diffbot api url.
m->m_def =
"All|http://www.diffbot.com/api/analyze?mode=auto,"
"Article (autodetect)|http://www.diffbot.com/api/analyze?mode=article,"
"Article (force)|http://www.diffbot.com/api/article?,"
"Product (autodetect)|http://www.diffbot.com/api/analyze?mode=product,"
"Product (force)|http://www.diffbot.com/api/product?,"
"Image (autodetect)|http://www.diffbot.com/api/analyze?mode=image,"
"Image (force)|http://www.diffbot.com/api/image?,"
"FrontPage (autodetect)|http://www.diffbot.com/api/analyze?mode=frontpage,"
"FrontPage (force)|http://www.diffbot.com/api/frontpage?"
;
m++;
/*
@ -12746,10 +12821,16 @@ void Parms::init ( ) {
m->m_cgi = "dapi";
m->m_xml = "diffbotAPI";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_spiderDiffbotApiNum - x;
m->m_type = TYPE_DIFFBOT_DROPDOWN;
m->m_def = "0";
m->m_off = (char *)cr.m_spiderDiffbotApiUrl - x;
// HACK: we print a dropdown for this but the value is a string
// because the items in the drop down can change so we can't store
// an item # here, it has to be a string, i.e. the diffbot api url.
// john might add a new custom api to m_diffbotApiList at any time.
// so we select the item in the drop down if it matches THIS string.
m->m_type = TYPE_SAFEBUF;//DIFFBOT_DROPDOWN;
m->m_def = "";
m->m_page = PAGE_FILTERS;
m->m_size = sizeof(SafeBuf);
m->m_rowid = 1;
m->m_addin = 1; // "insert" follows?
m++;
@ -15462,7 +15543,7 @@ void Parms::init ( ) {
if ( t == TYPE_CHECKBOX ) size = 1;
if ( t == TYPE_PRIORITY ) size = 1;
if ( t == TYPE_PRIORITY2 ) size = 1;
if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1;
//if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1;
if ( t == TYPE_PRIORITY_BOXES ) size = 1;
if ( t == TYPE_RETRIES ) size = 1;
if ( t == TYPE_TIME ) size = 6;

View File

@ -51,8 +51,8 @@ enum {
TYPE_MONOM2 ,
TYPE_LONG_CONST ,
TYPE_SITERULE , // 29
TYPE_SAFEBUF ,
TYPE_DIFFBOT_DROPDOWN
TYPE_SAFEBUF
//TYPE_DIFFBOT_DROPDOWN
};
//forward decls to make compiler happy:

View File

@ -11860,28 +11860,34 @@ skip:
THIS->m_masterLoop ( THIS->m_masterState );
}
long *XmlDoc::getDiffbotApiNum ( ) {
SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
if ( m_diffbotApiNumValid )
return &m_diffbotApiNum;
if ( m_diffbotApiUrlValid )
return &m_diffbotApiUrl;
// if we are a diffbot json object, do not re-send to diffbot!
if ( m_isDiffbotJSONObject ) {
m_diffbotApiNum = DBA_NONE;
m_diffbotApiNumValid = true;
return &m_diffbotApiNum;
//m_diffbotApiNum = DBA_NONE;
m_diffbotApiUrlValid = true;
return &m_diffbotApiUrl;
}
// this now automatically sets m_diffbotApiUrl and m_diffbotApiUrlValid
// in case the url filters table changes while spidering this!!!
// gotta be careful of that.
long *ufn = getUrlFilterNum();
if ( ! ufn || ufn == (void *)-1 ) return (long *)ufn;
if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn;
m_diffbotApiNum = m_cr->m_spiderDiffbotApiNum[*ufn];
// ensure it does set it!
if ( ! m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; }
//m_diffbotApiNum = m_cr->m_spiderDiffbotApiNum[*ufn];
// sanity check
if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
//if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; }
m_diffbotApiNumValid = true;
return &m_diffbotApiNum;
//m_diffbotApiNumValid = true;
return &m_diffbotApiUrl;
}
// the diffbot reply will be a list of json objects we want to index
@ -11903,12 +11909,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
return &m_diffbotReply;
}
// check the url filters table to see if diffbot api is specified
long *an = getDiffbotApiNum();
if ( ! an || an == (void *)-1 ) return (SafeBuf *)an;
// . check the url filters table to see if diffbot api is specified
// . just return "\0" if none, but NULL means error i guess
SafeBuf *au = getDiffbotApiUrl();
if ( ! au || au == (void *)-1 ) return (SafeBuf *)au;
// if "NONE" is in the diffbot api drop down, do not send to diffbot
if ( *an == DBA_NONE ) {
// if no url, assume do not access diffbot
if ( au->length() <= 0 ) {
m_diffbotReplyValid = true;
return &m_diffbotReply;
}
@ -11975,7 +11982,15 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// from this url
SafeBuf diffbotUrl;
// TODO: make sure "api" works as hostname for not just product...
diffbotUrl.safePrintf("http://www.diffbot.com/api/");
//diffbotUrl.safePrintf("http://www.diffbot.com/");
// skip extra '/'?
//char *api = au->getBufStart();
//long apiLen = au->length();
//if ( api && api[0] == '/' ) { api++; apiLen--; }
// append the custom url. i.e. /api/analyze?mode=auto&u=
//if ( api ) diffbotUrl.safeMemcpy ( api , apiLen );
// store the api url into here
diffbotUrl.safeMemcpy ( au );
// . m_diffbotApi Is like "article" or "product" etc.
// . if classify is true we always return the classification
@ -11985,6 +12000,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// if there is no json objects of the specified page type, "api"
// . BUT if api is "all" return all types of json objects
// . SHOULD we return "type" in the json output?
/*
if ( *an == DBA_ALL )
diffbotUrl.safePrintf("analyze?mode=auto&" );
else if ( *an == DBA_ARTICLE_FORCE )
@ -12007,19 +12023,20 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
log("build: unknown diffbot api num = %li. assuming all",*an );
diffbotUrl.safePrintf("analyze?mode=auto&" );
}
*/
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
diffbotUrl.safePrintf("token=%s",m_cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("&token=%s",m_cr->m_diffbotToken.getBufStart());
diffbotUrl.safePrintf("&url=");
// give diffbot the url to process
diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
// append this just in case the next thing doesn't have it.
if ( m_cr->m_diffbotApiQueryString.length() &&
m_cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
diffbotUrl.pushChar('&');
//if ( m_cr->m_diffbotApiQueryString.length() &&
// m_cr->m_diffbotApiQueryString.getBufStart()[0] != '&' )
// diffbotUrl.pushChar('&');
// then user provided parms that are dependent on if it is an
// article, product, etc. like "&dontstripads=1" or whatever
diffbotUrl.safeStrcpy ( m_cr->m_diffbotApiQueryString.getBufStart());
//diffbotUrl.safeStrcpy ( m_cr->m_diffbotApiQueryString.getBufStart());
// null term it
diffbotUrl.nullTerm();
@ -15147,6 +15164,17 @@ long *XmlDoc::getUrlFilterNum ( ) {
// store it
m_urlFilterNum = ufn;
m_urlFilterNumValid = true;
// set this too in case the url filters table changes while
// we are spidering this and a row is inserted or deleted or something
SafeBuf *yy = &m_cr->m_spiderDiffbotApiUrl[ufn];
// copy to ours
m_diffbotApiUrl.safeMemcpy ( yy );
// ensure null term
m_diffbotApiUrl.nullTerm();
m_diffbotApiUrlValid = true;
return &m_urlFilterNum;
}
@ -15881,7 +15909,9 @@ bool XmlDoc::logIt ( ) {
sb.safePrintf("urlfilternum=%li ",(long)m_urlFilterNum);
if ( m_diffbotApiNumValid && m_diffbotApiNum != DBA_NONE )
if ( m_diffbotApiUrlValid &&
m_diffbotApiUrl.getBufStart() &&
m_diffbotApiUrl.getBufStart()[0] )
sb.safePrintf("diffbotjsonobjects=%li ",
(long)m_diffbotJSONCount);
@ -17443,8 +17473,9 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
od->m_useTagdb = false;
// do not use diffbot for old doc since we call
// od->nukeJSONObjects below()
od->m_diffbotApiNumValid = true;
od->m_diffbotApiNum = DBA_NONE;
od->m_diffbotApiUrlValid = true;
// api url should be empty by default
//od->m_diffbotApiNum = DBA_NONE;
// if we are doing diffbot stuff, we are still indexing this
// page, so we need to get the old doc meta list
oldList = od->getMetaList ( true );
@ -17752,8 +17783,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
return (char *)linkSiteHashes;
}
long *an = getDiffbotApiNum();
if ( ! an || an == (void *)-1 ) return (char *)an;
//SafeBuf *au = getDiffbotApiUrl();
//if ( ! au || au == (void *)-1 ) return (char *)au;
// test json parser

View File

@ -775,7 +775,8 @@ class XmlDoc {
char *getIsCompromised ( ) ;
char *getIsNoArchive ( ) ;
long *getUrlFilterNum();
long *getDiffbotApiNum();
//long *getDiffbotApiNum();
SafeBuf *getDiffbotApiUrl();
long long **getAdVector ( ) ;
char *getIsLinkSpam ( ) ;
char *getIsHijacked();
@ -1248,11 +1249,12 @@ class XmlDoc {
bool m_baseUrlValid;
bool m_replyValid;
bool m_diffbotReplyValid;
bool m_diffbotUrlCrawlPatternMatchValid;
bool m_diffbotUrlProcessPatternMatchValid;
bool m_diffbotPageProcessPatternMatchValid;
//bool m_diffbotUrlCrawlPatternMatchValid;
//bool m_diffbotUrlProcessPatternMatchValid;
//bool m_diffbotPageProcessPatternMatchValid;
//bool m_useDiffbotValid;
bool m_diffbotApiNumValid;
//bool m_diffbotApiNumValid;
bool m_diffbotApiUrlValid;
bool m_crawlInfoValid;
bool m_isPageParserValid;
bool m_imageUrlValid;
@ -1507,16 +1509,18 @@ class XmlDoc {
char m_diffbotSavedChar;
SafeBuf m_diffbotReply;
long m_diffbotReplyError;
bool m_diffbotUrlCrawlPatternMatch;
bool m_diffbotUrlProcessPatternMatch;
bool m_diffbotPageProcessPatternMatch;
long m_diffbotApiNum;
//bool m_diffbotUrlCrawlPatternMatch;
//bool m_diffbotUrlProcessPatternMatch;
//bool m_diffbotPageProcessPatternMatch;
//long m_diffbotApiNum;
//bool m_useDiffbot;
// url to access diffbot with
SafeBuf m_diffbotApiUrl;
SafeBuf *getDiffbotReply ( ) ;
bool doesUrlMatchDiffbotCrawlPattern() ;
bool doesUrlMatchDiffbotProcessPattern() ;
bool doesPageContentMatchDiffbotProcessPattern() ;
//bool doesUrlMatchDiffbotCrawlPattern() ;
//bool doesUrlMatchDiffbotProcessPattern() ;
//bool doesPageContentMatchDiffbotProcessPattern() ;
char *hashJSON ( HashTableX *table );
long *nukeJSONObjects ( ) ;
long m_joc;

View File

@ -57,7 +57,7 @@
<doNarrowSearch>0</>
# Overrides all spidering for all collections on just this host.
<localSpideringEnabled>0</>
<localSpideringEnabled>1</>
# Overrides all add urls for all collections on just this host.
<localAddUrlEnabled>1</>
@ -73,10 +73,10 @@
<qaSearchTestEnabled>1</>
# Enable spidering on all hosts
<allSpidersOn>0</>
<allSpidersOn>1</>
# Disable spidering on all hosts
<allSpidersOff>0</>
<allSpidersOff>1</>
# Serves ads unless pure=1 is in cgi parms.
<adFeedEnabled>0</>