diff --git a/CollectionRec.cpp b/CollectionRec.cpp index 37560b33..86b8523d 100644 --- a/CollectionRec.cpp +++ b/CollectionRec.cpp @@ -317,7 +317,10 @@ void CollectionRec::setUrlFiltersToDefaults ( ) { m_spidersEnabled[n] = 1; m_numRegExs7++; - m_spiderDiffbotApiNum[n] = 1; + //m_spiderDiffbotApiNum[n] = 1; + //m_numRegExs11++; + m_spiderDiffbotApiUrl[n].set(""); + m_spiderDiffbotApiUrl[n].nullTerm(); m_numRegExs11++; } diff --git a/CollectionRec.h b/CollectionRec.h index a196e8c8..d4c81828 100644 --- a/CollectionRec.h +++ b/CollectionRec.h @@ -389,11 +389,11 @@ class CollectionRec { // an alternate name for the collection. we tend to create // collection names as a random sequence of hex digits. this // will allow a user to give them an alternate name. - SafeBuf m_collectionNameAlias; + //SafeBuf m_collectionNameAlias; //SafeBuf m_diffbotSeed; // this will be NULL or "none" to not pass off to diffbot //SafeBuf m_diffbotApi; - SafeBuf m_diffbotApiQueryString; + SafeBuf m_diffbotApiList;//QueryString; //SafeBuf m_diffbotUrlCrawlPattern; //SafeBuf m_diffbotUrlProcessPattern; //SafeBuf m_diffbotPageProcessPattern; @@ -403,9 +403,9 @@ class CollectionRec { char m_isCustomCrawl; //char m_isDiffbotCollection; // format of output. "csv" or "xml" or "json" or null - SafeBuf m_diffbotFormat; + //SafeBuf m_diffbotFormat; // what fields to return in the json output: (api dependent) - SafeBuf m_diffbotFields; + //SafeBuf m_diffbotFields; long long m_diffbotMaxToCrawl; long long m_diffbotMaxToProcess; long long m_diffbotCrawlStartTime; @@ -466,8 +466,11 @@ class CollectionRec { // should urls in this queue be sent to diffbot for processing // when we are trying to index them? + //long m_numRegExs11; + //char m_spiderDiffbotApiNum [ MAX_FILTERS ]; + long m_numRegExs11; - char m_spiderDiffbotApiNum [ MAX_FILTERS ]; + SafeBuf m_spiderDiffbotApiUrl [ MAX_FILTERS ]; // dummy? 
long m_numRegExs9; diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp index f2eb4ca3..c7f053ef 100644 --- a/PageCrawlBot.cpp +++ b/PageCrawlBot.cpp @@ -34,6 +34,7 @@ CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) ; CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) ; //bool isAliasUnique ( CollectionRec *cr , char *token , char *alias ) ; +/* char *g_diffbotFields [] = { "Unused-ERROR", "None", @@ -46,12 +47,12 @@ char *g_diffbotFields [] = { "Image (autodetect)", "FrontPage (force)", "FrontPage (autodetect)", - // // last field must be empty. add new fields above this. // NULL }; +*/ /* class StateNC { @@ -918,7 +919,7 @@ void StateCD::gotRdbList ( ) { sb.safeMemcpy(mime.getMime(),mime.getMimeLen() ); } - CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); + //CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); // we set this to true below if any one shard has more spiderdb // records left to read @@ -933,8 +934,9 @@ void StateCD::gotRdbList ( ) { RdbList *list = &m_lists[i]; // get the format - char *format = cr->m_diffbotFormat.getBufStart(); - if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL; + //char *format = cr->m_diffbotFormat.getBufStart(); + //if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL; + char *format = NULL; char *ek = list->getEndKey(); @@ -1684,6 +1686,11 @@ static class HelpItem s_his[] = { {"maxtoprocess", "Specify max pages to successfully process through " "diffbot"}, {"urt","Use robots.txt?"}, + {"dbapilist","Special list of diffbot API urls. The URL Filters " + "will display these options in a drop down menu. " + "Example (unencoded): " + "&dbapilist=All|/api/analyze?mode=auto&u=,Article (forced)|/api/" + "article?u="}, {"fe[N]","Filter expression #N. The first expression in the url " "filters table is 0. But if N is 0, leave N out, only specify it " "if N is > 0. 
Example &fe=onsamedomain to change the expression in " @@ -1695,7 +1702,10 @@ static class HelpItem s_his[] = { {"mspi[N]","Max outstanding spiders for this IP."}, {"xg[N]","Wait this many milliseconds between spiders of same IP."}, {"fsp[N]","Spider priority. Higher priorities spidered first. Can be from 0 to 127. But -3 means to ignore the URL. -2 means the URL is banned because it comes from an evil site."}, - {"dapi[N]","Diffbot api number. Process through this diffbot api."}, + {"dapi[N]","Diffbot API Url. This is a string. Usually it " + "corresponds to dbapilist parm above. But it is the url we use when " + "accessing diffbot for this url filter. " + "Example (unencoded): &dapi2=/api/article?u="}, {"injecturl","Specify a seed url to inject."}, {"urldata","A huge string of whitespace separated URLs to add to " "spiderdb for crawling."}, @@ -3164,7 +3174,7 @@ CollectionRec *addNewDiffbotColl ( char *addColl , HttpRequest *hr ) { cr->m_spiderIpMaxSpiders[i] = 3; // keep it respectful cr->m_spidersEnabled [i] = 1; cr->m_spiderFreqs [i] = 7.0; - cr->m_spiderDiffbotApiNum[i] = DBA_NONE; // 1 + cr->m_spiderDiffbotApiUrl[i].purge();// = DBA_NONE; // 1 } diff --git a/PageCrawlBot.h b/PageCrawlBot.h index f1cf69e2..6a42ed97 100644 --- a/PageCrawlBot.h +++ b/PageCrawlBot.h @@ -3,6 +3,7 @@ #define CRAWLBOT_H // values for the diffbot dropdown +/* #define DBA_NONE 0 #define DBA_ALL 1 #define DBA_ARTICLE_FORCE 2 @@ -17,6 +18,7 @@ // add new fields to END of list since i think we store the // field we use as a number in the coll.conf, starting at 0 extern char *g_diffbotFields []; +*/ bool sendPageCrawlbot ( TcpSocket *s , HttpRequest *hr ); diff --git a/Parms.cpp b/Parms.cpp index c7a8c090..7aed6df7 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -222,7 +222,7 @@ unsigned long Parms::calcChecksum() { if ( m->m_type == TYPE_BOOL2 ) size = 1; if ( m->m_type == TYPE_PRIORITY ) size = 1; if ( m->m_type == TYPE_PRIORITY2 ) size = 1; - if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 
1; + //if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1; if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1; if ( m->m_type == TYPE_RETRIES ) size = 1; if ( m->m_type == TYPE_TIME ) size = 6; @@ -1064,16 +1064,59 @@ char *printDropDown ( long n , char *p, char *pend, char *name, long select, } */ -bool printDiffbotDropDown ( long n , SafeBuf *sb , char *name , long select ) { - sb->safePrintf ( "\n",name); + // print "none" as the first option + char *sel = ""; + if ( ! usingApi ) sel = " selected"; + sb->safePrintf("",sel); + // the various "diffbot urls" are separated by commas + for ( ; *p ; ) { + // point to start of item name + char *name = p; + // p should now point to name of the item + char *end1 = p; + // point to start of url for that item + for ( ; *end1 && *end1 != '|' ;end1++); + // save that + char *url = end1; + if ( *url == '|' ) url++; + // find end of url + char *urlEnd = url; + for ( ; *urlEnd && *urlEnd != ',' ; urlEnd++ ); + // do we match it? + sel = ""; + if ( usingApi && strncmp(usingApi,url,urlEnd-url)== 0 ) + sel = " selected"; + // advance p + p = urlEnd; + // skip over comma to get next one + if ( *p == ',' ) p++; + // use the hash as the identifier + sb->safePrintf("\n"); } - sb->safePrintf ( "" ); + sb->safePrintf(""); return true; } @@ -2101,18 +2144,12 @@ bool Parms::printParm ( SafeBuf* sb, printDropDown ( MAX_SPIDER_PRIORITIES , sb , cgi , *s , true , true ); } - else if ( t == TYPE_DIFFBOT_DROPDOWN ) { - // just show the parm name and value if printing in json - if ( isJSON ) { - // convert diffbot # to string - long apiNum = (long)*s; - char *str = g_diffbotFields [apiNum]; - sb->safePrintf("\"%s-str\":\"%s\",\n",cgi,str); - sb->safePrintf("\"%s\":%li,\n",cgi,apiNum); - } - else - printDiffbotDropDown ( 8, sb , cgi , *s ); - } + // this url filters parm is an array of SAFEBUFs now, so each is + // a string and that string is the diffbot api url to use. + // the string is empty or zero length to indicate none. 
+ //else if ( t == TYPE_DIFFBOT_DROPDOWN ) { + // char *xx=NULL;*xx=0; + //} else if ( t == TYPE_RETRIES ) printDropDown ( 4 , sb , cgi , *s , false , false ); else if ( t == TYPE_PRIORITY_BOXES ) { @@ -2173,6 +2210,31 @@ bool Parms::printParm ( SafeBuf* sb, sb->dequote ( s , gbstrlen(s) ); sb->safePrintf ("\">"); } + // HACK: print a drop down not a textbox for selecting the + // m_spiderDiffbotApiUrl[]. we can't just store this selection + // as a number because m_diffbotApiList (a string of comma separated + // items to select from) can change! it is not a typical dropdown. + // so we have to record the actual text we selected, which is + // basically the diffbot api url. this is because john can add + // custom diffbot api urls at anytime to the list. + else if ( t == TYPE_SAFEBUF && strcmp(m->m_cgi,"dapi") == 0 ) { + SafeBuf *sx = (SafeBuf *)s; + // just show the parm name and value if printing in json + if ( isJSON ) { + // this can be empty for the empty row i guess + if ( sx->length() ) { + // convert diffbot # to string + sb->safePrintf("\"%s\":\"",cgi); + // this is just the url path, not the title + // of the menu option... so this would be + // like "/api/article?u=" + sb->safeUtf8ToJSON (sx->getBufStart() ); + sb->safePrintf("\",\n"); + } + } + else + printDiffbotDropDown ( sb , cgi , THIS , sx ); + } else if ( t == TYPE_SAFEBUF ) { long size = m->m_size; // give regular expression box on url filters page more room @@ -2842,7 +2904,7 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s , t == TYPE_BOOL2 || t == TYPE_PRIORITY || t == TYPE_PRIORITY2 || - t == TYPE_DIFFBOT_DROPDOWN || + //t == TYPE_DIFFBOT_DROPDOWN || t == TYPE_PRIORITY_BOXES || t == TYPE_RETRIES || t == TYPE_FILTER ) { @@ -3047,10 +3109,10 @@ void Parms::setToDefault ( char *THIS ) { // . this is a backwards-compatibility hack since this new parm // will not be in old coll.conf files and will not be properly // initialize when displaying a url filter row. 
- if ( THIS != (char *)&g_conf ) { - CollectionRec *cr = (CollectionRec *)THIS; - memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS); - } + //if ( THIS != (char *)&g_conf ) { + // CollectionRec *cr = (CollectionRec *)THIS; + // memset ( cr->m_spiderDiffbotApiNum , 0 , MAX_FILTERS); + //} for ( long i = 0 ; i < m_numParms ; i++ ) { Parm *m = &m_parms[i]; @@ -3662,7 +3724,7 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) { if ( t == TYPE_CHAR || t == TYPE_BOOL || t == TYPE_CHECKBOX || t == TYPE_PRIORITY || t == TYPE_PRIORITY2 || - t == TYPE_DIFFBOT_DROPDOWN || + //t == TYPE_DIFFBOT_DROPDOWN || t == TYPE_PRIORITY_BOXES || t == TYPE_RETRIES || t == TYPE_RETRIES || t == TYPE_FILTER || t == TYPE_BOOL2 || t == TYPE_CHAR2 ) @@ -3762,7 +3824,7 @@ bool Parms::serialize( char *buf, long *bufSize ) { if ( m->m_type == TYPE_BOOL2 ) size = 1; if ( m->m_type == TYPE_PRIORITY ) size = 1; if ( m->m_type == TYPE_PRIORITY2 ) size = 1; - if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1; + //if ( m->m_type == TYPE_DIFFBOT_DROPDOWN) size = 1; if ( m->m_type == TYPE_PRIORITY_BOXES ) size = 1; if ( m->m_type == TYPE_RETRIES ) size = 1; if ( m->m_type == TYPE_TIME ) size = 6; @@ -8154,12 +8216,25 @@ void Parms::init ( ) { m++; */ - m->m_cgi = "dbapiqs"; - m->m_xml = "diffbotApiQueryString"; - m->m_off = (char *)&cr.m_diffbotApiQueryString - x; + m->m_cgi = "dbapilist"; + m->m_xml = "diffbotApiList";//QueryString"; + m->m_off = (char *)&cr.m_diffbotApiList - x; m->m_type = TYPE_SAFEBUF; m->m_page = PAGE_NONE; - m->m_def = ""; + // XmlDoc.cpp when it first computes "ufn" it also sets + // m_diffbotApiUrl to one of these. lest we change the url filters + // table AFTER it gets the ufn and BEFORE it gets the diffbot api url. 
+ m->m_def = + "All|http://www.diffbot.com/api/analyze?mode=auto," + "Article (autodetect)|http://www.diffbot.com/api/analyze?mode=article," + "Article (force)|http://www.diffbot.com/api/article?," + "Product (autodetect)|http://www.diffbot.com/api/analyze?mode=product," + "Product (force)|http://www.diffbot.com/api/product?," + "Image (autodetect)|http://www.diffbot.com/api/analyze?mode=image," + "Image (force)|http://www.diffbot.com/api/image?," + "FrontPage (autodetect)|http://www.diffbot.com/api/analyze?mode=frontpage," + "FrontPage (force)|http://www.diffbot.com/api/frontpage?" + ; m++; /* @@ -12746,10 +12821,16 @@ void Parms::init ( ) { m->m_cgi = "dapi"; m->m_xml = "diffbotAPI"; m->m_max = MAX_FILTERS; - m->m_off = (char *)cr.m_spiderDiffbotApiNum - x; - m->m_type = TYPE_DIFFBOT_DROPDOWN; - m->m_def = "0"; + m->m_off = (char *)cr.m_spiderDiffbotApiUrl - x; + // HACK: we print a dropdown for this but the value is a string + // because the items in the drop down can change so we can't store + // an item # here, it has to be a string, i.e. the diffbot api url. + // john might add a new custom api to m_diffbotApiList at any time. + // so we select the item in the drop down if it matches THIS string. + m->m_type = TYPE_SAFEBUF;//DIFFBOT_DROPDOWN; + m->m_def = ""; m->m_page = PAGE_FILTERS; + m->m_size = sizeof(SafeBuf); m->m_rowid = 1; m->m_addin = 1; // "insert" follows? 
m++; @@ -15462,7 +15543,7 @@ void Parms::init ( ) { if ( t == TYPE_CHECKBOX ) size = 1; if ( t == TYPE_PRIORITY ) size = 1; if ( t == TYPE_PRIORITY2 ) size = 1; - if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1; + //if ( t ==TYPE_DIFFBOT_DROPDOWN) size = 1; if ( t == TYPE_PRIORITY_BOXES ) size = 1; if ( t == TYPE_RETRIES ) size = 1; if ( t == TYPE_TIME ) size = 6; diff --git a/Parms.h b/Parms.h index b53b7118..3f42804c 100644 --- a/Parms.h +++ b/Parms.h @@ -51,8 +51,8 @@ enum { TYPE_MONOM2 , TYPE_LONG_CONST , TYPE_SITERULE , // 29 - TYPE_SAFEBUF , - TYPE_DIFFBOT_DROPDOWN + TYPE_SAFEBUF + //TYPE_DIFFBOT_DROPDOWN }; //forward decls to make compiler happy: diff --git a/XmlDoc.cpp b/XmlDoc.cpp index 5a965d39..4a3deaa3 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -11860,28 +11860,34 @@ skip: THIS->m_masterLoop ( THIS->m_masterState ); } -long *XmlDoc::getDiffbotApiNum ( ) { +SafeBuf *XmlDoc::getDiffbotApiUrl ( ) { - if ( m_diffbotApiNumValid ) - return &m_diffbotApiNum; + if ( m_diffbotApiUrlValid ) + return &m_diffbotApiUrl; // if we are a diffbot json object, do not re-send to diffbot! if ( m_isDiffbotJSONObject ) { - m_diffbotApiNum = DBA_NONE; - m_diffbotApiNumValid = true; - return &m_diffbotApiNum; + //m_diffbotApiNum = DBA_NONE; + m_diffbotApiUrlValid = true; + return &m_diffbotApiUrl; } + // this now automatically sets m_diffbotApiUrl and m_diffbotApiUrlValid + // in case the url filters table changes while spidering this!!! + // gotta be careful of that. long *ufn = getUrlFilterNum(); - if ( ! ufn || ufn == (void *)-1 ) return (long *)ufn; + if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn; - m_diffbotApiNum = m_cr->m_spiderDiffbotApiNum[*ufn]; + // ensure it does set it! + if ( ! 
m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; } + + //m_diffbotApiNum = m_cr->m_spiderDiffbotApiNum[*ufn]; // sanity check - if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; } + //if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; } - m_diffbotApiNumValid = true; - return &m_diffbotApiNum; + //m_diffbotApiNumValid = true; + return &m_diffbotApiUrl; } // the diffbot reply will be a list of json objects we want to index @@ -11903,12 +11909,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) { return &m_diffbotReply; } - // check the url filters table to see if diffbot api is specified - long *an = getDiffbotApiNum(); - if ( ! an || an == (void *)-1 ) return (SafeBuf *)an; + // . check the url filters table to see if diffbot api is specified + // . just return "\0" if none, but NULL means error i guess + SafeBuf *au = getDiffbotApiUrl(); + if ( ! au || au == (void *)-1 ) return (SafeBuf *)au; - // if "NONE" is in the diffbot api drop down, do not send to diffbot - if ( *an == DBA_NONE ) { + // if no url, assume do not access diffbot + if ( au->length() <= 0 ) { m_diffbotReplyValid = true; return &m_diffbotReply; } @@ -11975,7 +11982,15 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) { // from this url SafeBuf diffbotUrl; // TODO: make sure "api" works as hostname for not just product... - diffbotUrl.safePrintf("http://www.diffbot.com/api/"); + //diffbotUrl.safePrintf("http://www.diffbot.com/"); + // skip extra '/'? + //char *api = au->getBufStart(); + //long apiLen = au->length(); + //if ( api && api[0] == '/' ) { api++; apiLen--; } + // append the custom url. i.e. /api/analyze?mode=auto&u= + //if ( api ) diffbotUrl.safeMemcpy ( api , apiLen ); + // store the api url into here + diffbotUrl.safeMemcpy ( au ); // . m_diffbotApi Is like "article" or "product" etc. // . if classify is true we always return the classification @@ -11985,6 +12000,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) { // if there is no json objects of the specified page type, "api" // . 
BUT if api is "all" return all types of json objects // . SHOULD we return "type" in the json output? + /* if ( *an == DBA_ALL ) diffbotUrl.safePrintf("analyze?mode=auto&" ); else if ( *an == DBA_ARTICLE_FORCE ) @@ -12007,19 +12023,20 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) { log("build: unknown diffbot api num = %li. assuming all",*an ); diffbotUrl.safePrintf("analyze?mode=auto&" ); } + */ //diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u=" - diffbotUrl.safePrintf("token=%s",m_cr->m_diffbotToken.getBufStart()); + diffbotUrl.safePrintf("&token=%s",m_cr->m_diffbotToken.getBufStart()); diffbotUrl.safePrintf("&url="); // give diffbot the url to process diffbotUrl.urlEncode ( m_firstUrl.getUrl() ); // append this just in case the next thing doesn't have it. - if ( m_cr->m_diffbotApiQueryString.length() && - m_cr->m_diffbotApiQueryString.getBufStart()[0] != '&' ) - diffbotUrl.pushChar('&'); + //if ( m_cr->m_diffbotApiQueryString.length() && + // m_cr->m_diffbotApiQueryString.getBufStart()[0] != '&' ) + // diffbotUrl.pushChar('&'); // then user provided parms that are dependent on if it is an // article, product, etc. 
like "&dontstripads=1" or whatever - diffbotUrl.safeStrcpy ( m_cr->m_diffbotApiQueryString.getBufStart()); + //diffbotUrl.safeStrcpy ( m_cr->m_diffbotApiQueryString.getBufStart()); // null term it diffbotUrl.nullTerm(); @@ -15147,6 +15164,17 @@ long *XmlDoc::getUrlFilterNum ( ) { // store it m_urlFilterNum = ufn; m_urlFilterNumValid = true; + + // set this too in case the url filters table changes while + // we are spidering this and a row is inserted or deleted or something + SafeBuf *yy = &m_cr->m_spiderDiffbotApiUrl[ufn]; + // copy to ours + m_diffbotApiUrl.safeMemcpy ( yy ); + // ensure null term + m_diffbotApiUrl.nullTerm(); + m_diffbotApiUrlValid = true; + + return &m_urlFilterNum; } @@ -15881,7 +15909,9 @@ bool XmlDoc::logIt ( ) { sb.safePrintf("urlfilternum=%li ",(long)m_urlFilterNum); - if ( m_diffbotApiNumValid && m_diffbotApiNum != DBA_NONE ) + if ( m_diffbotApiUrlValid && + m_diffbotApiUrl.getBufStart() && + m_diffbotApiUrl.getBufStart()[0] ) sb.safePrintf("diffbotjsonobjects=%li ", (long)m_diffbotJSONCount); @@ -17443,8 +17473,9 @@ char *XmlDoc::getMetaList ( bool forDelete ) { od->m_useTagdb = false; // do not use diffbot for old doc since we call // od->nukeJSONObjects below() - od->m_diffbotApiNumValid = true; - od->m_diffbotApiNum = DBA_NONE; + od->m_diffbotApiUrlValid = true; + // api url should be empty by default + //od->m_diffbotApiNum = DBA_NONE; // if we are doing diffbot stuff, we are still indexing this // page, so we need to get the old doc meta list oldList = od->getMetaList ( true ); @@ -17752,8 +17783,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) { return (char *)linkSiteHashes; } - long *an = getDiffbotApiNum(); - if ( ! an || an == (void *)-1 ) return (char *)an; + //SafeBuf *au = getDiffbotApiUrl(); + //if ( ! 
au || au == (void *)-1 ) return (char *)au; // test json parser diff --git a/XmlDoc.h b/XmlDoc.h index 1e386933..51b5a427 100644 --- a/XmlDoc.h +++ b/XmlDoc.h @@ -775,7 +775,8 @@ class XmlDoc { char *getIsCompromised ( ) ; char *getIsNoArchive ( ) ; long *getUrlFilterNum(); - long *getDiffbotApiNum(); + //long *getDiffbotApiNum(); + SafeBuf *getDiffbotApiUrl(); long long **getAdVector ( ) ; char *getIsLinkSpam ( ) ; char *getIsHijacked(); @@ -1248,11 +1249,12 @@ class XmlDoc { bool m_baseUrlValid; bool m_replyValid; bool m_diffbotReplyValid; - bool m_diffbotUrlCrawlPatternMatchValid; - bool m_diffbotUrlProcessPatternMatchValid; - bool m_diffbotPageProcessPatternMatchValid; + //bool m_diffbotUrlCrawlPatternMatchValid; + //bool m_diffbotUrlProcessPatternMatchValid; + //bool m_diffbotPageProcessPatternMatchValid; //bool m_useDiffbotValid; - bool m_diffbotApiNumValid; + //bool m_diffbotApiNumValid; + bool m_diffbotApiUrlValid; bool m_crawlInfoValid; bool m_isPageParserValid; bool m_imageUrlValid; @@ -1507,16 +1509,18 @@ class XmlDoc { char m_diffbotSavedChar; SafeBuf m_diffbotReply; long m_diffbotReplyError; - bool m_diffbotUrlCrawlPatternMatch; - bool m_diffbotUrlProcessPatternMatch; - bool m_diffbotPageProcessPatternMatch; - long m_diffbotApiNum; + //bool m_diffbotUrlCrawlPatternMatch; + //bool m_diffbotUrlProcessPatternMatch; + //bool m_diffbotPageProcessPatternMatch; + //long m_diffbotApiNum; //bool m_useDiffbot; + // url to access diffbot with + SafeBuf m_diffbotApiUrl; SafeBuf *getDiffbotReply ( ) ; - bool doesUrlMatchDiffbotCrawlPattern() ; - bool doesUrlMatchDiffbotProcessPattern() ; - bool doesPageContentMatchDiffbotProcessPattern() ; + //bool doesUrlMatchDiffbotCrawlPattern() ; + //bool doesUrlMatchDiffbotProcessPattern() ; + //bool doesPageContentMatchDiffbotProcessPattern() ; char *hashJSON ( HashTableX *table ); long *nukeJSONObjects ( ) ; long m_joc; diff --git a/gb.conf b/gb.conf index 2653c9d5..0791635c 100644 --- a/gb.conf +++ b/gb.conf @@ -57,7 +57,7 
@@ 0 # Overrides all spidering for all collections on just this host. -0 +1 # Overrides all add urls for all collections on just this host. 1 @@ -73,10 +73,10 @@ 1 # Enable spidering on all hosts -0 +1 # Disable spidering on all hosts -0 +1 # Serves ads unless pure=1 is in cgi parms. 0