// diffbot api implementation
//
// WHAT APIs are here?
//
// . 1. the CrawlBot API to start a crawl
// . 2. To directly process a provided URL (injection)
// . 3. the Cache API so phantomjs can quickly check the cache for files
//      and quickly add files to the cache.
//
// Related pages:
//
// * http://diffbot.com/dev/docs/  (Crawlbot API tab, and others)
// * http://diffbot.com/dev/crawl/

#include "TcpServer.h"
#include "HttpRequest.h"
#include "HttpServer.h"
#include "Pages.h"  // g_msg
#include "XmlDoc.h" // for checkRegex()

//void printCrawlStats ( SafeBuf *sb , CollectionRec *cr ) ;
void doneSendingWrapper ( void *state , TcpSocket *sock ) ;
bool sendBackDump ( TcpSocket *s,HttpRequest *hr );
//void gotMsg4ReplyWrapper ( void *state ) ;
//bool showAllCrawls ( TcpSocket *s , HttpRequest *hr ) ;
char *getTokenFromHttpRequest ( HttpRequest *hr ) ;
char *getCrawlIdFromHttpRequest ( HttpRequest *hr ) ;
CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) ;
//CollectionRec *getCollRecFromCrawlId ( char *crawlId );
//void printCrawlStatsWrapper ( void *state ) ;
CollectionRec *addNewDiffbotColl ( HttpRequest *hr ) ;

char *g_diffbotFields [] = {
	"None",
	"All",                   // /api/analyze?mode=auto
	"Article (force)",       // /api/article
	"Article (autodetect)",  // /api/analyze?mode=article
	"Product (force)",
	"Product (autodetect)",
	"Image (force)",
	"Image (autodetect)",
	"FrontPage (force)",
	"FrontPage (autodetect)",
	//
	// last field must be empty. add new fields above this.
	//
	NULL
};

/*
class StateNC {
public:
	Msg4 m_msg4;
	collnum_t m_collnum;
	TcpSocket *m_socket;
};

class StateXX {
public:
	TcpSocket *m_socket;
	collnum_t m_collnum;
};

// . HttpServer.cpp calls handleDiffbotRequest() when it senses
//   a diffbot api request, like "GET /api/ *"
// . incoming request format described in diffbot.com/dev/docs/
// . use the incoming request to create a new collection and set the crawl
//   parameters of the collection if it is "/api/startcrawl"
// . url format is like: live.diffbot.com/api/startcrawl
//   or /api/stopcrawl etc.
// . it does not seem to matter if the handler returns true or false!
bool handleDiffbotRequest ( TcpSocket *s , HttpRequest *hr ) {
	// . parse stuff out of the url call
	// . these 3 are required
	long tokenLen = 0;
	char *token = hr->getString("token",&tokenLen);
	// the seed url
	char *seed = hr->getString("seed");
	// this can be "article" "product" "frontpage" "image"
	char *api = hr->getString("api");
	// apiQueryString holds the cgi parms to pass to the specific diffbot
	// api like /api/article?...
	char *apiQueryString = hr->getString("apiQueryString",NULL);
	// these are regular expressions
	char *urlCrawlPattern    = hr->getString("urlCrawlPattern",NULL);
	char *urlProcessPattern  = hr->getString("urlProcessPattern",NULL);
	char *pageProcessPattern = hr->getString("pageProcessPattern",NULL);
	// this is 1 or 0. if enabled then diffbot.com will only try to
	// extract json objects from page types that match the "api" page type
	// specified above. so if "api" is "product" and a page is identified
	// as "image" then no json objects will be extracted.
	long classify = hr->getLong("classify",0);
	// default to 100,000 pages? max pages successfully downloaded,
	// so does not include tcp timeouts or dns timeouts, but does include
	// bad http status codes, like 404.
	long long maxCrawled = hr->getLongLong("maxCrawled",100000LL);
	// default to 100,000 pages? # of pages we SUCCESSFULLY got a reply
	// from diffbot for.
long long maxToProcess = hr->getLongLong("maxProcessed",100000LL); char *id = hr->getString("id",NULL); // crawl id // start or stop a crawl or download? /api/startcrawl /api/stopcrawl or // /api/downloadcrawl /api/activecrawls char *path = hr->getPath(); if ( ! path || strncmp(path,"/api/",5) != 0 ) { g_errno = EBADREQUEST; g_msg = " (error: diffbot api path invalid)"; return g_httpServer.sendErrorReply(s,500,"invalid diffbot " "api path"); } #define DB_STARTCRAWL 1 #define DB_STOPCRAWL 2 #define DB_CRAWLS 3 #define DB_ACTIVECRAWLS 4 #define DB_DOWNLOADURLS 5 #define DB_DOWNLOADOBJECTS 6 #define DB_RESUMECRAWL 7 long func = 0; bool hasFormat = hr->hasField("format"); if ( strncmp(path,"/api/startcrawl" ,15) == 0 ) func=DB_STARTCRAWL; if ( strncmp(path,"/api/stopcrawl" ,14) == 0 ) func=DB_STOPCRAWL; if ( strncmp(path,"/api/resumecrawl" ,16) == 0 ) func=DB_RESUMECRAWL; if ( strncmp(path,"/api/crawls" ,11) == 0 ) func=DB_CRAWLS; if ( strncmp(path,"/api/activecrawls" ,17) == 0 ) func=DB_ACTIVECRAWLS; if ( strncmp(path,"/api/downloadurls" ,17) == 0 ) func=DB_DOWNLOADURLS; if ( strncmp(path,"/api/downloadcrawl",18) == 0 ) { if ( ! hasFormat ) func = DB_DOWNLOADURLS; else func = DB_DOWNLOADOBJECTS; } if ( ! func ) { g_errno = EBADREQUEST; g_msg = " (error: diffbot api command invalid)"; return g_httpServer.sendErrorReply(s,500,"invalid diffbot " "api command"); } // token is not required for /api/activecrawls or stopcrawl, only id if ( (func == DB_STARTCRAWL || func == DB_CRAWLS ) && ! token ) { g_errno = EBADREQUEST; g_msg = " (error: need \"token\" parm)"; return g_httpServer.sendErrorReply(s,500, "missing \"token\" parm"); } if ( (func == DB_STOPCRAWL || func == DB_RESUMECRAWL || func == DB_ACTIVECRAWLS || func == DB_DOWNLOADURLS || func == DB_DOWNLOADOBJECTS ) && ! id ) { g_errno = EBADREQUEST; g_msg = " (error: need \"id\" parm)"; return g_httpServer.sendErrorReply(s,500, "missing \"id\" parm"); } CollectionRec *cr = NULL; // get collrec if ( func == DB_STOPCRAWL || func == DB_RESUMECRAWL || func == DB_ACTIVECRAWLS || func == DB_DOWNLOADURLS || func == DB_DOWNLOADOBJECTS ) { // the crawlid needs a valid collection cr = getCollRecFromCrawlId ( id ); // complain if not there if ( ! cr ) { g_errno = EBADREQUEST; g_msg = " (error: invalid diffbot crawl id or token)"; return g_httpServer.sendErrorReply(s,500,"invalid " "diffbot crawl id " "or token"); } } // if stopping crawl... if ( func == DB_STOPCRAWL ) { cr->m_spideringEnabled = 0; char *reply = "{\"reply\":\"success\"}"; return g_httpServer.sendDynamicPage( s, reply, gbstrlen(reply), 0, // cacheTime true, // POSTReply? "application/json" ); } // resuming crawl if ( func == DB_RESUMECRAWL ) { cr->m_spideringEnabled = 1; char *reply = "{\"reply\":\"success\"}"; return g_httpServer.sendDynamicPage( s, reply, gbstrlen(reply), 0, // cacheTime true, // POSTReply? "application/json" ); } // downloading the urls from spiderdb... sorted by time? if ( func == DB_DOWNLOADURLS ) return sendBackDump ( s , hr , RDB_SPIDERDB ); if ( func == DB_DOWNLOADOBJECTS ) return sendBackDump ( s , hr , RDB_TITLEDB ); // viewing crawl stats just for this one collection/crawl if ( func == DB_ACTIVECRAWLS ) { // state class in case update blocks StateXX *sxx; try { sxx = new (StateXX); } catch ( ... ) { g_msg = "(error: no mem for diffbot2)"; return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno)); } mnew ( sxx , sizeof(StateXX), "statexx"); // set this shit sxx->m_collnum = cr->m_collnum; sxx->m_socket = s; // . 
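	// For reference, a summary of the /api/ commands dispatched above.
	// These example requests are illustrative only; hostnames and parm
	// values are hypothetical, and the required parms match the checks
	// earlier in this handler (token for startcrawl/crawls, id for the
	// per-crawl commands):
	//
	//   GET /api/startcrawl?token=mytoken&seed=http://www.example.com/&api=article
	//   GET /api/stopcrawl?id=<crawlid>
	//   GET /api/resumecrawl?id=<crawlid>
	//   GET /api/downloadcrawl?id=<crawlid>              (urls, csv)
	//   GET /api/downloadcrawl?id=<crawlid>&format=json  (json objects)
	//   GET /api/crawls?token=mytoken
	//   GET /api/activecrawls?id=<crawlid>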
if blocks then return and wait for callback 2 be called // . set useCache to false to get semi-exact stats if ( ! updateCrawlInfo ( cr , sxx , printCrawlStatsWrapper , false )) return false; // it did not block, so call wrapper directly printCrawlStatsWrapper ( sxx ); // all done return true; } // show stats of ALL crawls done by this token if ( func == DB_CRAWLS ) return showAllCrawls ( s , hr ); // at this point they must be starting a new crawl.no other cmds remain if ( func != DB_STARTCRAWL ) { g_errno = EBADREQUEST; g_msg = " (error: diffbot api command invalid)"; return g_httpServer.sendErrorReply(s,500,"invalid diffbot " "api command"); } //////////////// // // SUPPORT FOR GET /api/startcrawl // // Adds a new CollectionRec, injects the seed url into it, and // turns spidering on. // //////////////// if ( ! seed ) { g_errno = EBADREQUEST; g_msg = " (error: need seed parm)"; return g_httpServer.sendErrorReply(s,500,"need seed parm"); } if ( ! api ) { g_errno = EBADREQUEST; g_msg = " (error: need seed api)"; return g_httpServer.sendErrorReply(s,500,"need api parm"); } // sanity if ( gbstrlen(seed)+1 >= MAX_URL_LEN ) { g_errno = EBADREQUEST; g_msg = " (error: seed url too long)"; return g_httpServer.sendErrorReply(s,500,"seed url must " "be less than 1023 " "bytes"); } // make sure the provided regular expressions compile ok SafeBuf sb; bool boolVal; bool boolValValid; long compileError; // test the url crawl pattern sb.set ( urlCrawlPattern ); checkRegex (&sb,"x",&boolVal,&boolValValid,&compileError,cr); if ( compileError ) { g_errno = EBADREQUEST; g_msg = "(error: bad url crawl pattern)"; return g_httpServer.sendErrorReply(s,500, "bad url crawl pattern"); } // test the url process pattern sb.set ( urlProcessPattern ); checkRegex (&sb,"x",&boolVal,&boolValValid,&compileError,cr); if ( compileError ) { g_errno = EBADREQUEST; g_msg = "(error: bad url process pattern)"; return g_httpServer.sendErrorReply(s,500, "bad url process pattern"); } // test the page process pattern sb.set ( pageProcessPattern ); checkRegex (&sb,"x",&boolVal,&boolValValid,&compileError,cr); if ( compileError ) { g_errno = EBADREQUEST; g_msg = "(error: bad page process pattern)"; return g_httpServer.sendErrorReply(s,500, "bad page process pattern"); } // // crap we need a new state: NC = New Collection state // because we do a msg4 below that could block... // StateNC *nc; try { nc = new (StateNC); } catch ( ... ) { g_msg = "(error: no mem for diffbot)"; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } mnew ( nc , sizeof(StateNC), "statenc"); // let's create a new crawl id. dan was making it 32 characters // with 4 hyphens in it for a total of 36 bytes, but since // MAX_COLL_LEN, the maximum length of a collection name, is just // 64 bytes, and the token is already 32, let's limit to 16 bytes // for the crawlerid. so if we print that out in hex, 16 hex chars // 0xffffffff 0xffffffff is 64 bits. so let's make a random 64-bit // value here. unsigned long r1 = rand(); unsigned long r2 = rand(); unsigned long long crawlId64 = (unsigned long long) r1; crawlId64 <<= 32; crawlId64 |= r2; // the name of the new collection we are creating for this crawl // will be -. if it is a "test" crawl as // specified as an option in the diffbot crawlbot api page, // then make it --test. Test crawls do not index, // they only crawl. 
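	// For example (hypothetical token and random value): a token of
	// "mytoken" and a crawlId64 of 0x00000000075bcd15 would give the
	// collection name "mytoken-00000000075bcd15", or
	// "mytoken-00000000075bcd15-test" for a test crawl, per the
	// sprintf() below.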
	char collBuf[MAX_COLL_LEN+1];
	// include a +5 for "-test"
	// include 16 for the crawlid (16 char hex #)
	if ( tokenLen + 16 + 5 >= MAX_COLL_LEN ) { char *xx=NULL;*xx=0;}
	char *testStr = "";
	//if ( cr->m_isDiffbotTestCrawl ) testStr = "-test";
	// ensure the crawlid is the full 16 characters long so we
	// can quickly extricate the crawlid from the collection name
	sprintf(collBuf,"%s-%016llx%s",token,crawlId64,testStr);

	/////////////
	//
	// . make a new collection! "cr" is the collectionRec.
	// . the collection name is the token plus the crawl id
	//
	/////////////
	if ( ! g_collectiondb.addRec ( collBuf ,
				       NULL ,  // copy from
				       0 ,     // copy from len
				       true ,  // it is a brand new one
				       -1 ,    // we are new, this is -1
				       false , // is NOT a dump
				       true    // save it for sure!
				       ) ) {
		log("diffbot: failed to add new coll rec");
		g_msg = " (error: diffbot failed to allocate crawl)";
		return g_httpServer.sendErrorReply(s,500,"diffbot crawl "
						   "alloc failed?");
	}
	// get the collrec
	cr = g_collectiondb.getRec ( collBuf );
	// did an alloc fail?
	if ( ! cr ) { char *xx=NULL;*xx=0; }
	// normalize the seed url
	Url norm;
	norm.set ( seed );
	cr->m_diffbotSeed.set ( norm.getUrl() );
	// these must be there too
	//cr->m_diffbotToken.set ( token );
	cr->m_diffbotApi.set ( api );
	// these are optional, may be NULL
	cr->m_diffbotApiQueryString.set     ( apiQueryString );
	cr->m_diffbotUrlCrawlPattern.set    ( urlCrawlPattern );
	cr->m_diffbotUrlProcessPattern.set  ( urlProcessPattern );
	cr->m_diffbotPageProcessPattern.set ( pageProcessPattern );
	cr->m_diffbotClassify = classify;
	// let's make these all NULL terminated strings
	cr->m_diffbotSeed.nullTerm();
	//cr->m_diffbotToken.nullTerm();
	cr->m_diffbotApi.nullTerm();
	cr->m_diffbotApiQueryString.nullTerm();
	cr->m_diffbotUrlCrawlPattern.nullTerm();
	cr->m_diffbotUrlProcessPattern.nullTerm();
	cr->m_diffbotPageProcessPattern.nullTerm();
	// do not spider more than this many urls total. -1 means no max.
	cr->m_diffbotMaxToCrawl = maxCrawled;
	// do not process more than this. -1 means no max.
	cr->m_diffbotMaxToProcess = maxToProcess;
	// reset the crawl stats
	cr->m_diffbotCrawlStartTime = gettimeofdayInMillisecondsGlobal();
	cr->m_diffbotCrawlEndTime   = 0LL;
	// reset crawler stats. they should be loaded from crawlinfo.txt
	memset ( &cr->m_localCrawlInfo  , 0 , sizeof(CrawlInfo) );
	memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) );
	//cr->m_globalCrawlInfoUpdateTime = 0;
	cr->m_replies  = 0;
	cr->m_requests = 0;
	// support current web page api i guess for test crawls
	//cr->m_isDiffbotTestCrawl = false;
	//char *strange = hr->getString("href",NULL);
	//if ( strange && strcmp ( strange,"/dev/crawl#testCrawl" ) == 0 )
	//	cr->m_isDiffbotTestCrawl = true;

	///////
	//
	// extra diffbot ARTICLE parms
	//
	///////
	// . ppl mostly use meta, html and tags.
	// . dropping support for dontStripAds. mike is ok with that.
	// . use for jsonp requests. needed for cross-domain ajax.
	//char *callback = hr->getString("callback",NULL);
	// a download timeout
	//long timeout = hr->getLong("timeout",5000);
	// "xml" or "json"
	char *format = hr->getString("format",NULL,"json");
	// save that
	cr->m_diffbotFormat.safeStrcpy(format);
	// return all content from the page? for the frontpage api.
	// TODO: can we put "all" into "fields="?
	//bool all = hr->hasField("all");

	/////////
	//
	// specify diffbot fields to return in the json output
	//
	/////////
	// point to the safebuf that holds the fields the user wants to
	// extract from each url. comma separated list of supported diffbot
	// fields like "meta","tags", ...
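	// For example (hypothetical request): "&fields=meta,html" together
	// with a "tags" checkbox field would leave m_diffbotFields holding
	// "meta,html,tags" once the trailing comma is removed below.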
SafeBuf *f = &cr->m_diffbotFields; // transcribe provided fields if any char *fields = hr->getString("fields",NULL); // appends those to our field buf if ( fields ) f->safeStrcpy(fields); // if something there push a comma in case we add more below if ( f->length() ) f->pushChar(','); // return contents of the page's meta tags? twitter card metadata, .. if ( hr->hasField("meta" ) ) f->safeStrcpy("meta,"); if ( hr->hasField("html" ) ) f->safeStrcpy("html,"); if ( hr->hasField("tags" ) ) f->safeStrcpy("tags,"); if ( hr->hasField("comments") ) f->safeStrcpy("comments,"); if ( hr->hasField("summary" ) ) f->safeStrcpy("summary,"); if ( hr->hasField("all" ) ) f->safeStrcpy("all,"); // if we added crap to "fields" safebuf remove trailing comma f->removeLastChar(','); // set some defaults. max spiders for all priorities in this collection cr->m_maxNumSpiders = 10; // make the gigablast regex table just "default" so it does not // filtering, but accepts all urls. we will add code to pass the urls // through m_diffbotUrlCrawlPattern alternatively. if that itself // is empty, we will just restrict to the seed urls subdomain. for ( long i = 0 ; i < MAX_FILTERS ; i++ ) { cr->m_regExs[i].purge(); cr->m_spiderPriorities[i] = 0; cr->m_maxSpidersPerRule [i] = 10; cr->m_spiderIpWaits [i] = 250; // 250 ms for now cr->m_spiderIpMaxSpiders[i] = 10; cr->m_spidersEnabled [i] = 1; cr->m_spiderFreqs [i] = 7.0; } // // by default to not spider image or movie links or // links with /print/ in them // long i = 0; cr->m_regExs[i].safePrintf("$.css"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("$.mpeg"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("$.mpg"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("$.wmv"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf(".css?"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("$.jpg"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("$.JPG"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("$.gif"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("$.GIF"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("$.ico"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; cr->m_regExs[i].safePrintf("/print/"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; // if user did not specify a url crawl pattern then keep // the crawl limited to the same subdomain of the seed url if ( cr->m_diffbotUrlCrawlPattern.length() == 0 ) { // first limit to http://subdomain cr->m_regExs[i].safePrintf("^http://"); cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen()); cr->m_regExs[i].pushChar('/'); cr->m_regExs[i].nullTerm(); cr->m_spiderPriorities [i] = 50; cr->m_maxSpidersPerRule [i] = 10; cr->m_spiderIpWaits [i] = 250; // 500 ms for now cr->m_spiderIpMaxSpiders[i] = 10; cr->m_spidersEnabled [i] = 1; i++; // then include HTTPS cr->m_regExs[i].safePrintf("^https://"); cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen()); cr->m_regExs[i].pushChar('/'); cr->m_regExs[i].nullTerm(); cr->m_spiderPriorities [i] = 50; cr->m_maxSpidersPerRule [i] = 10; cr->m_spiderIpWaits [i] = 250; // 500 ms for now cr->m_spiderIpMaxSpiders[i] = 10; cr->m_spidersEnabled [i] = 1; i++; // and make all else filtered 
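		// (Illustration only, not from the original code: with no
		// urlCrawlPattern and a seed of http://www.example.com/,
		// the rules built above plus the host rules and the
		// "default" rule below would look roughly like:
		//   $.css, $.mpeg, ... , /print/   -> SPIDER_PRIORITY_FILTERED
		//   ^http://www.example.com/       -> priority 50
		//   ^https://www.example.com/      -> priority 50
		//   default                        -> SPIDER_PRIORITY_FILTERED )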
		cr->m_regExs[i].safePrintf("default");
		cr->m_spiderPriorities  [i] = SPIDER_PRIORITY_FILTERED;
		cr->m_maxSpidersPerRule [i] = 10;
		cr->m_spiderIpWaits     [i] = 250; // 250 ms for now
		cr->m_spiderIpMaxSpiders[i] = 10;
		cr->m_spidersEnabled    [i] = 1;
		i++;
	}
	else {
		cr->m_regExs[i].safePrintf("default");
		cr->m_spiderPriorities  [i] = 50;
		cr->m_maxSpidersPerRule [i] = 10;
		cr->m_spiderIpWaits     [i] = 250; // 250 ms for now
		cr->m_spiderIpMaxSpiders[i] = 10;
		cr->m_spidersEnabled    [i] = 1;
		i++;
	}

	// just the default rule!
	cr->m_numRegExs   = i;
	cr->m_numRegExs2  = i;
	cr->m_numRegExs3  = i;
	cr->m_numRegExs10 = i;
	cr->m_numRegExs5  = i;
	cr->m_numRegExs6  = i;
	cr->m_numRegExs7  = i;

	//cr->m_spiderPriorities [1] = -1; // filtered? or banned?
	//cr->m_maxSpidersPerRule [1] = 10;
	//cr->m_spiderIpWaits    [1] = 500; // 500 ms for now

	cr->m_needsSave = 1;

	// start the spiders!
	cr->m_spideringEnabled = true;

	// and global spidering must be on...
	// do not turn it off on shutdown i guess, too
	g_conf.m_spideringEnabled = true;

	// . add the seed url to spiderdb
	// . make a "meta" list to add to spiderdb using msg4 below
	SafeBuf listBuf;
	listBuf.pushChar ( RDB_SPIDERDB );
	SpiderRequest sreq;
	// the constructor does not call reset i guess so we must call it
	sreq.reset();
	// string ptr
	char *url = cr->m_diffbotSeed.getBufStart();
	// use this as the url
	strcpy( sreq.m_url, url );
	// parentdocid of 0
	long firstIp = hash32n ( url );
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	sreq.setKey( firstIp,0LL, false );
	sreq.m_isInjecting   = 0;
	sreq.m_isPageInject  = 0;
	sreq.m_hopCount      = 0;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp   = 1;
	sreq.m_firstIp       = firstIp;
	// store it into the list to add to spiderdb
	listBuf.safeMemcpy ( (char *)&sreq , sreq.getRecSize() );

	// allow search queries to take precedence over this operation.
	// otherwise, we'd make it niceness 0.
	long niceness = 1;

	Msg4 *m4 = &nc->m_msg4;

	// save this in our state
	nc->m_collnum = cr->m_collnum;
	nc->m_socket  = s;

	if ( m4->addMetaList ( listBuf.getBufStart(),
			       listBuf.length() ,
			       // add the spiderrequest to our new coll name
			       collBuf,
			       nc ,
			       gotMsg4ReplyWrapper ,
			       niceness ) ) {
		// i guess it did not block
		gotMsg4ReplyWrapper ( nc );
		return true;
	}

	// it blocked
	return false;
}

// . come here after the SpiderRequest was added to Spiderdb
// . just transmit back the crawlerid, just like dan does now
void gotMsg4ReplyWrapper ( void *state ) {
	// cast it
	StateNC *nc = (StateNC *)state;
	// get the special ptr we hid in there
	CollectionRec *cr = g_collectiondb.getRec(nc->m_collnum);
	// the crawlid is the last 16 characters of the collection name
	char *crawlIdStr = cr->m_coll + cr->m_collLen - 16;
	// get it
	TcpSocket *socket = nc->m_socket;
	// nuke it
	delete nc;
	mdelete ( nc , sizeof(StateNC) , "stnc" );
	// httpserver.cpp copies the reply so don't worry that it is on
	// the stack
	//char reply[128];
	//sprintf(reply,"%016llx", crawlIdStr );
	// we successfully started the crawl...
	g_httpServer.sendDynamicPage ( socket ,
				       crawlIdStr,
				       gbstrlen(crawlIdStr) );
}
*/

////////////////
//
// SUPPORT FOR DOWNLOADING an RDB DUMP
//
// We ask each shard for 10MB of Spiderdb records. If 10MB was returned
// then we repeat. Every time we get 10MB from each shard we print the
// Spiderdb records out into "safebuf" and transmit it to the user. Once
// the buffer has been transmitted we ask the shards for another 10MB
// worth of spider records.
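//
// A sketch of the flow (illustrative; shard count, sizes and the <coll>
// value are hypothetical, the function and path names are the ones used
// below):
//
//   GET /crawlbot/downloadurls?c=<coll>   (sendBackDump)
//     -> StateCD::sendBackDump2()         reset the per-shard cursors
//     -> StateCD::readDataFromRdb()       Msg0 to each shard, ~10MB each
//     -> StateCD::gotRdbList()            print csv/json, send the chunk
//     -> doneSendingWrapper()             if any shard still has more,
//                                         call readDataFromRdb() again,
//                                         otherwise hang up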
// //////////////// // use this as a state while dumping out spiderdb for a collection class StateCD { public: StateCD () { m_needsMime = true; }; void sendBackDump2 ( ) ; void readDataFromRdb ( ) ; void gotRdbList ( ) ; void printSpiderdbList ( RdbList *list , SafeBuf *sb , char *format ) ; void printTitledbList ( RdbList *list , SafeBuf *sb , char *format ) ; bool m_needsMime; char m_rdbId; bool m_downloadJSON; collnum_t m_collnum; long m_numRequests; long m_numReplies; long m_minRecSizes; bool m_someoneNeedsMore; TcpSocket *m_socket; Msg0 m_msg0s[MAX_HOSTS]; key128_t m_spiderdbStartKeys[MAX_HOSTS]; key_t m_titledbStartKeys[MAX_HOSTS]; RdbList m_lists[MAX_HOSTS]; bool m_needMore[MAX_HOSTS]; }; // . basically dump out spiderdb // . returns urls in csv format in reply to a "GET /api/downloadcrawl " // . the ordering of the urls is not specified so whatever order they are // in spiderdb will do // . the gui that lists the urls as they are spidered in real time when you // do a test crawl will just have to call this repeatedly. it shouldn't // be too slow because of disk caching, and, most likely, the spider requests // will all be in spiderdb's rdbtree any how // . because we are distributed we have to send a msg0 request to each // shard/group asking for all the spider urls. dan says 30MB is typical // for a csv file, so for now we will just try to do a single spiderdb // request. bool sendBackDump ( TcpSocket *s, HttpRequest *hr ) { CollectionRec *cr = getCollRecFromHttpRequest ( hr ); if ( ! cr ) { char *msg = "token or id (crawlid) invalid"; log("crawlbot: invalid token or crawlid to dump"); g_httpServer.sendErrorReply(s,500,msg); return true; } char *path = hr->getPath(); char rdbId = RDB_NONE; bool downloadJSON = false; if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 ) rdbId = RDB_SPIDERDB; if ( strncmp ( path ,"/crawlbot/downloadpages",23 ) == 0 ) rdbId = RDB_TITLEDB; if ( strncmp ( path ,"/crawlbot/downloadobjects",25 ) == 0 ) { downloadJSON = true; rdbId = RDB_TITLEDB; } // sanity, must be one of 3 download calls if ( rdbId == RDB_NONE ) { char *msg ; msg = "usage: downloadurls, downloadpages, downloadobjects"; log("crawlbot: %s",msg); g_httpServer.sendErrorReply(s,500,msg); return true; } StateCD *st; try { st = new (StateCD); } catch ( ... ) { return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } mnew ( st , sizeof(StateCD), "statecd"); // initialize the new state st->m_rdbId = rdbId; st->m_downloadJSON = downloadJSON; st->m_socket = s; // the name of the collections whose spiderdb we read from st->m_collnum = cr->m_collnum; // begin the possible segmented process of sending back spiderdb // to the user's browser st->sendBackDump2(); // i dont think this return values matters at all since httpserver.cpp // does not look at it when it calls sendReply() return true; } void StateCD::sendBackDump2 ( ) { m_numRequests = 0; m_numReplies = 0; // read 10MB from each shard's spiderdb at a time m_minRecSizes = 9999999; // we stop reading from all shards when this becomes false m_someoneNeedsMore = true; // initialize the spiderdb startkey "cursor" for each shard's spiderdb for ( long i = 0 ; i < g_hostdb.m_numGroups ; i++ ) { m_needMore[i] = true; KEYMIN((char *)&m_spiderdbStartKeys[i],sizeof(key128_t)); KEYMIN((char *)&m_titledbStartKeys[i],sizeof(key_t)); } // begin reading from each shard and sending the spiderdb records // over the network readDataFromRdb ( ); } void gotRdbListWrapper ( void *state ) ; void StateCD::readDataFromRdb ( ) { // set end key to max key. 
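	// (note: each pass resumes from the per-shard start key that
	// gotRdbList() saves after printing, so repeated calls walk the
	// whole rdb in roughly 10MB chunks rather than one giant read)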
	// we are limiting the read using m_minRecSizes for this
	key128_t ek;
	KEYMAX((char *)&ek,sizeof(key128_t));

	CollectionRec *cr = g_collectiondb.getRec(m_collnum);

	// launch one request to each shard
	for ( long i = 0 ; i < g_hostdb.m_numGroups ; i++ ) {
		// count it
		m_numRequests++;
		// this is the least nice. crawls will yield to it mostly.
		long niceness = 0;
		// point to the right startkey
		char *sk ;
		if ( m_rdbId == RDB_SPIDERDB )
			sk = (char *)&m_spiderdbStartKeys[i];
		else
			sk = (char *)&m_titledbStartKeys[i];
		// get host
		Host *h = g_hostdb.getLiveHostInGroup(i);
		// msg0 uses multicast in case one of the hosts in a shard is
		// dead or dies during this call.
		if ( ! m_msg0s[i].getList ( h->m_hostId , // use multicast
					    h->m_ip,
					    h->m_port,
					    0, // maxcacheage
					    false, // addtocache?
					    m_rdbId,
					    cr->m_coll,
					    &m_lists[i],
					    sk,
					    (char *)&ek,
					    // get at most about
					    // "minRecSizes" worth of spiderdb
					    // records
					    m_minRecSizes,
					    this,
					    gotRdbListWrapper ,
					    niceness ) )
			// continue if it blocked
			continue;
		// we got a reply back right away...
		m_numReplies++;
	}

	// all done? return if still waiting on more msg0s to get their data
	if ( m_numReplies < m_numRequests ) return;

	// done i guess, print and return
	gotRdbList();
}

void gotRdbListWrapper ( void *state ) {
	// get the crawler dump state
	StateCD *st = (StateCD *)state;
	st->gotRdbList();
}

void StateCD::gotRdbList ( ) {
	// inc it
	m_numReplies++;
	// return if still awaiting more replies
	if ( m_numReplies < m_numRequests ) return;

	SafeBuf sb;
	//sb.setLabel("dbotdmp");

	// . if we haven't yet sent an http mime back to the user
	//   then do so here. the content-length will not be in there
	//   because we might have to call for more spiderdb data.
	if ( m_needsMime ) {
		HttpMime mime;
		mime.makeMime ( -1, // total content-length is unknown!
				0 , // do not cache (cacheTime)
				0 , // lastModified
				0 , // offset
				-1 , // bytesToSend
				NULL , // ext
				false, // POSTReply
				"text/csv", // contenttype
				"utf-8" , // charset
				-1 , // httpstatus
				NULL ); //cookie
		sb.safeMemcpy(mime.getMime(),mime.getMimeLen() );
	}

	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	// we set this to true below if any one shard has more spiderdb
	// records left to read
	m_someoneNeedsMore = false;

	//
	// got all the replies... create the HTTP reply and send it back
	//
	for ( long i = 0 ; i < g_hostdb.m_numGroups ; i++ ) {
		if ( ! m_needMore[i] ) continue;
		// get the list from that group
		RdbList *list = &m_lists[i];
		// get the format
		char *format = cr->m_diffbotFormat.getBufStart();
		if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
		char *ek = list->getEndKey();
		// now print the spiderdb list out into "sb"
		if ( m_rdbId == RDB_SPIDERDB ) {
			// print SPIDERDB list into "sb"
			printSpiderdbList ( list , &sb , format );
			// update spiderdb startkey for this shard
			KEYSET((char *)&m_spiderdbStartKeys[i],ek,
			       sizeof(key128_t));
			// advance by 1
			m_spiderdbStartKeys[i] += 1;
		}
		if ( m_rdbId == RDB_TITLEDB ) {
			// print TITLEDB list into "sb"
			printTitledbList ( list , &sb , format );
			// update titledb startkey for this shard
			KEYSET((char *)&m_titledbStartKeys[i],ek,
			       sizeof(key_t));
			// advance by 1
			m_titledbStartKeys[i] += 1;
		}
		// should we try to read more?
		m_needMore[i] = false;
		if ( list->m_listSize >= m_minRecSizes ) {
			m_needMore[i] = true;
			m_someoneNeedsMore = true;
		}
	}

	// if first time, send it back
	if ( m_needsMime ) {
		// only do this once
		m_needsMime = false;
		// start the send process
		TcpServer *tcp = &g_httpServer.m_tcp;
		if ( !
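		     // (descriptive note added for clarity) the first chunk
		     // goes out via sendMsg() with a mime whose
		     // content-length is -1 (unknown); later chunks are
		     // appended by putting the socket into ST_SEND_AGAIN
		     // mode further below.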
tcp->sendMsg ( m_socket , sb.getBufStart(), // sendBuf , sb.getCapacity(),//sendBufSize , sb.length(),//sendBufSize , sb.length(), // msgtotalsize this , // data for callback doneSendingWrapper ) ) { // callback // do not free sendbuf we are transmitting it sb.detachBuf(); return; } // error? //TcpSocket *s = m_socket; // sometimes it does not block and is successful if ( ! g_errno ) sb.detachBuf(); // nuke state delete this; mdelete ( this , sizeof(StateCD) , "stcd" ); if ( g_errno ) log("diffbot: tcp sendmsg did not block. error: %s", mstrerror(g_errno)); //g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); return ; } // if nothing to send back we are done. return true since we // did not block sending back. if ( sb.length() == 0 ) return; // put socket in sending-again mode m_socket->m_sendBuf = sb.getBufStart(); m_socket->m_sendBufSize = sb.getCapacity(); m_socket->m_sendBufUsed = sb.length(); m_socket->m_sendOffset = 0; m_socket->m_totalSent = 0; m_socket->m_totalToSend = sb.length(); // tell TcpServer.cpp to send this latest buffer! HACK! m_socket->m_sockState = ST_SEND_AGAIN; // do not let safebuf free this, we will take care of it sb.detachBuf(); // . when it is done sending call this callback, don't hang up! // . if m_someoneNeedsMore is false then this callback should just // destroy the socket and delete "this" m_socket->m_callback = doneSendingWrapper; m_socket->m_state = this; // we blocked sending back return; } // TcpServer.cpp calls this when done sending TcpSocket's m_sendBuf void doneSendingWrapper ( void *state , TcpSocket *sock ) { StateCD *st = (StateCD *)state; TcpSocket *socket = st->m_socket; // free the old sendbuf then i guess since we might replace it // in the above function. mfree ( socket->m_sendBuf , socket->m_sendBufSize , "dbsbuf"); // in case we have nothing to send back do not let socket free // what we just freed above. it'll core. socket->m_sendBuf = NULL; // all done? if ( st->m_someoneNeedsMore ) { // . read more from spiderdb from one or more shards // . will also put into socket's write buf and set // TcpSocket::m_sockState to ST_SEND_AGAIN so that // TcpServer.cpp resumes the sending process and does not // destroy the socket st->readDataFromRdb(); return; } // delete that state delete st; mdelete ( st , sizeof(StateCD) , "stcd" ); } void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb , char *format) { // declare these up here SpiderRequest *sreq = NULL; SpiderReply *srep = NULL; long long lastUh48 = 0LL; long long prevReplyUh48 = 0LL; long prevReplyError = 0; time_t prevReplyDownloadTime = 0LL; // parse through it for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) { // this record is either a SpiderRequest or SpiderReply char *rec = list->getCurrentRec(); // we encounter the spiderreplies first then the // spiderrequests for the same url if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) { srep = (SpiderReply *)rec; sreq = NULL; prevReplyUh48 = srep->getUrlHash48(); // 0 means indexed successfully. not sure if // this includes http status codes like 404 etc. // i don't think it includes those types of errors! 
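			// (illustration of the rows this loop eventually
			// prints; url and timestamp are hypothetical. since
			// the reply for a url is seen just before its
			// requests, a row in the default csv output looks
			// like
			//   http://www.example.com/page,1378233159,1,"Successfully Crawled"
			// with status 1 = crawled, 0 = "Unexamined",
			// -1 = the error message)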
			prevReplyError        = srep->m_errCode;
			prevReplyDownloadTime = srep->m_spideredTime;
			continue;
		}

		// ok, we got a spider request
		sreq = (SpiderRequest *)rec;

		// sanity check
		if ( srep && srep->getUrlHash48() != sreq->getUrlHash48()){
			log("diffbot: had a spider reply with no "
			    "corresponding spider request for uh48=%lli"
			    , srep->getUrlHash48());
			//char *xx=NULL;*xx=0;
		}

		// print the url if not yet printed
		long long uh48 = sreq->getUrlHash48 ();
		bool printIt = false;
		// there can be multiple spiderrequests for the same url!
		if ( lastUh48 != uh48 ) printIt = true;
		if ( ! printIt ) continue;
		lastUh48 = uh48;

		// debug point
		//if ( strstr(sreq->m_url,"chief") )
		//	log("hey");

		// 1 means spidered, 0 means not spidered, -1 means error
		long status = 1;
		// if unspidered, then we don't match the prev reply,
		// so set "status" to 0 to indicate it hasn't been
		// downloaded yet.
		if ( lastUh48 != prevReplyUh48 ) status = 0;
		// if it matches, perhaps there was an error spidering it?
		if ( status && prevReplyError ) status = -1;

		// use the time it was added to spiderdb if the url
		// was not spidered
		time_t time = sreq->m_addedTime;
		// if it was spidered, successfully or with an error,
		// then use the time it was spidered
		if ( status ) time = prevReplyDownloadTime;

		char *msg = "Successfully Crawled";
		if ( status ==  0 ) msg = "Unexamined";
		if ( status == -1 ) msg = mstrerror(prevReplyError);

		// "csv" is the default if json is not specified
		if ( format && strcmp(format,"json")==0 )
			sb->safePrintf("[{"
				       "{\"url\":"
				       "\"%s\"},"
				       "{\"time\":"
				       "\"%lu\"},"
				       "{\"status\":"
				       "\"%li\"},"
				       "{\"statusMsg\":"
				       "\"%s\"}"
				       "}]\n"
				       , sreq->m_url
				       // when was it first added to spiderdb?
				       , sreq->m_addedTime
				       , status
				       , msg
				       );
		// but default to csv
		else
			sb->safePrintf("%s,%lu,%li,\"%s\""
				       //",%s"
				       "\n"
				       , sreq->m_url
				       // when was it first added to spiderdb?
				       , sreq->m_addedTime
				       , status
				       , msg
				       //, iptoa(sreq->m_firstIp)
				       );
	}
}

void StateCD::printTitledbList ( RdbList *list , SafeBuf *sb , char *format ) {

	XmlDoc xd;

	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	// parse through it
	for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
		// point to the current titledb record
		char *rec = list->getCurrentRec();
		// skip if it is a negative (delete) key
		if ( (rec[0] & 0x01) == 0x00 ) continue;
		// uncompress it
		if ( ! xd.set2 ( rec ,
				 0, // maxSize unused
				 cr->m_coll ,
				 NULL , // ppbuf
				 0 , // niceness
				 NULL ) ) { // spiderRequest
			log("diffbot: error setting titlerec in dump");
			continue;
		}
		// must be of type json to be a diffbot json object
		if ( m_downloadJSON && xd.m_contentType != CT_JSON )
			continue;
		// or if downloading web pages...
		if ( ! m_downloadJSON ) {
			// skip if it is a json object content type
			if ( xd.m_contentType == CT_JSON ) continue;
			// . just print the cached page
			// . size should include the \0
			sb->safeStrcpy ( xd.m_firstUrl.m_url);
			// then \n
			sb->pushChar('\n');
			// then the page content
			sb->safeStrcpy ( xd.ptr_utf8Content );
			// null term just in case
			//sb->nullTerm();
			// separate pages with \0 i guess
			sb->pushChar('\0');
			// \n
			sb->pushChar('\n');
			continue;
		}
		// skip if not a diffbot json url
		if ( ! xd.m_isDiffbotJSONObject ) continue;
		// get the json content
		char *json = xd.ptr_utf8Content;
		// just print that out. encode \n's and \r's back to \\n \\r
		// and backslash to a \\ ...
		// but if they originally had a \u encoding and
		// we made it into utf8, do not put that back into the \u
		// encoding because it is not necessary.
		if ( !
sb->safeStrcpyPrettyJSON ( json ) ) log("diffbot: error printing json in dump"); // separate each JSON object with \n i guess sb->pushChar('\n'); } } /* //////////////// // // SUPPORT FOR GET /api/crawls and /api/activecrawls // // Just scan each collection record whose collection name includes the // provided "token" of the user. then print out the stats of just // //////////////// // example output for http://live.diffbot.com/api/crawls?token=matt // [{"id":"c421f09d-7c31-4131-9da2-21e35d8130a9","finish":1378233585887,"matched":274,"status":"Stopped","start":1378233159848,"token":"matt","parameterMap":{"token":"matt","seed":"www.techcrunch.com","api":"article"},"crawled":274}] // example output from activecrawls?id=.... // {"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":null,"matched":27,"status":"Crawling","start":1378322184332,"token":"matt","parameterMap":{"token":"matt","seed":"www.alleyinsider.com","api":"article"},"crawled":34} // NOTE: it does not seem to include active crawls! bad!! like if you lost // the crawlid... // "cr" is NULL if showing all crawls! bool showAllCrawls ( TcpSocket *s , HttpRequest *hr ) { long tokenLen = 0; char *token = hr->getString("token",&tokenLen); // token MUST be there because this function's caller checked for it if ( ! token ) { char *xx=NULL;*xx=0; } // store the crawl stats as html into "sb" SafeBuf sb; // scan the collection recs for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { // get it CollectionRec *cr = g_collectiondb.m_recs[i]; // skip if empty if ( ! cr ) continue; // get name char *coll = cr->m_coll; //long collLen = cr->m_collLen; // skip if first 16 or whatever characters does not match // the user token because the name of a collection is // - if ( coll[0] != token[0] ) continue; if ( coll[1] != token[1] ) continue; if ( coll[2] != token[2] ) continue; // scan the rest bool match = true; for ( long i = 3 ; coll[i] && token[i] ; i++ ) { // the name of a collection is - // so if we hit the hyphen we are done if ( coll[i] == '-' ) break; if ( coll[i] != token[i] ) { match = false; break; } } if ( ! match ) continue; // we got a match, print them out printCrawlStats ( &sb , cr ); } // and send back now return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length(), -1);// cachetime } */ char *getTokenFromHttpRequest ( HttpRequest *hr ) { // provided directly? char *token = hr->getString("token",NULL,NULL); if ( token ) return token; // extract token from coll? char *c = hr->getString("c",NULL,NULL); if ( ! c ) return NULL; CollectionRec *cr = g_collectiondb.getRec(c); if ( ! cr ) return NULL; if ( cr->m_diffbotToken.length() <= 0 ) return NULL; token = cr->m_diffbotToken.getBufStart(); return token; } CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) { // if we have the collection name explicitly, get the coll rec then char *c = hr->getString("c",NULL,NULL); if ( c ) return g_collectiondb.getRec ( c ); // otherwise, get it from token/crawlid char *token = getTokenFromHttpRequest( hr ); for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cr = g_collectiondb.m_recs[i]; if ( ! cr ) continue; if ( strcmp ( cr->m_diffbotToken.getBufStart(),token)==0 ) return cr; } // no matches return NULL; } /* // doesn't have to be fast, so just do a scan CollectionRec *getCollRecFromCrawlId ( char *crawlId ) { long idLen = gbstrlen(crawlId); // scan collection names for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { // get it CollectionRec *cr = g_collectiondb.m_recs[i]; // skip if empty if ( ! 
cr ) continue; // get name char *coll = cr->m_coll; long collLen = cr->m_collLen; if ( collLen < 16 ) continue; // skip if first 16 or whatever characters does not match // the user token because the name of a collection is // - if ( coll[collLen-1] != crawlId[idLen-1] ) continue; if ( coll[collLen-2] != crawlId[idLen-2] ) continue; if ( coll[collLen-3] != crawlId[idLen-3] ) continue; if ( ! strstr ( coll , crawlId ) ) continue; return cr; } return NULL; } void printCrawlStatsWrapper ( void *state ) { StateXX *sxx = (StateXX *)state; // get collection rec CollectionRec *cr = g_collectiondb.getRec(sxx->m_collnum); // print out the crawl SafeBuf sb; printCrawlStats ( &sb , cr ); // save before nuking state TcpSocket *sock = sxx->m_socket; // nuke the state delete sxx; mdelete ( sxx , sizeof(StateXX) , "stxx" ); // and send back now g_httpServer.sendDynamicPage ( sock , sb.getBufStart(), sb.length(), -1 ); // cachetime } void printCrawlStats ( SafeBuf *sb , CollectionRec *cr ) { // if we are the first, print a '[' to start a json thingy if ( sb->length() == 0 ) sb->pushChar('['); // otherwise, remove the previous ']' since we are not the last else { char *p = sb->getBufStart(); long plen = sb->length(); if ( p[plen-1]=='[' ) sb->incrementLength(-1); } sb->safePrintf( "{" "\"id\":\"" ); // get the token from coll name char *token = cr->m_coll; // and the length, up to the hyphen that separates it from crawl id long tokenLen = 0; for ( ; token[tokenLen] && token[tokenLen] != '-' ; tokenLen++ ); // now crawl id char *crawlId = token + tokenLen; // skip hyphen if ( crawlId[0] == '-' ) crawlId++; // print crawl id out sb->safeStrcpy ( crawlId ); // end its quote sb->safeStrcpy ( "\","); // now the time the crawl finished. if ( cr->m_spideringEnabled ) sb->safePrintf("\"finish\":null,"); else sb->safePrintf("\"finish\":%lli,",cr->m_diffbotCrawlEndTime); // how many urls we handoff to diffbot api. that implies successful // download and that it matches the url crawl pattern and // url process pattern and content regular expression pattern. // // NOTE: pageProcessAttempts can be higher than m_pageDownloadAttempts // when we call getMetaList() on an *old* (in titledb) xmldoc, // where we just get the cached content from titledb to avoid a // download, but we still call getDiffbotReply(). perhaps reconstruct // the diffbot reply from XmlDoc::m_diffbotJSONCount // // "processed" here corresponds to the "maxProcessed" cgi parm // specified when instantiating the crawl parms for the first time. // // likewise "crawled" corresponds to "maxCrawled" // sb->safePrintf("\"processedAttempts\":%lli,", cr->m_globalCrawlInfo.m_pageProcessAttempts); sb->safePrintf("\"processed\":%lli,", cr->m_globalCrawlInfo.m_pageProcessSuccesses); sb->safePrintf("\"crawlAttempts\":%lli,", cr->m_globalCrawlInfo.m_pageDownloadAttempts); sb->safePrintf("\"crawled\":%lli,", cr->m_globalCrawlInfo.m_pageDownloadSuccesses); sb->safePrintf("\"urlsConsidered\":%lli,", cr->m_globalCrawlInfo.m_urlsConsidered); // how many spiders outstanding for this coll right now? SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum); long spidersOut = sc->getTotalOutstandingSpiders(); // . status of the crawl: "Stopped" or "Active"? // . 
TODO: check with dan to see if Active is correct and // ShuttingDown is allowable if ( cr->m_spideringEnabled ) sb->safePrintf("\"status\":\"Active\","); else if ( spidersOut ) sb->safePrintf("\"status\":\"ShuttingDown\","); else sb->safePrintf("\"status\":\"Stopped\","); // spider crawl start time sb->safePrintf("\"start\":%lli,",cr->m_diffbotCrawlStartTime); // the token sb->safePrintf("\"token\":\""); sb->safeMemcpy(token,tokenLen); sb->safePrintf("\","); // // BEGIN parameter map // // the token again sb->safePrintf("{"); sb->safePrintf("\"token\":\""); sb->safeMemcpy(token,tokenLen); sb->safePrintf("\","); // the seed url sb->safePrintf("\"seed\":\"%s\",",cr->m_diffbotSeed.getBufStart()); // the api sb->safePrintf("\"api\":\"%s\",",cr->m_diffbotApi.getBufStart()); sb->safePrintf("},"); // // END parameter map // // crawl count. counts non-errors. successful downloads. //sb->safePrintf("\"crawled\":%lli", // cr->m_globalCrawlInfo.m_pageCrawlAttempts); sb->safePrintf("}"); // assume we are the last json object in the array sb->pushChar(']'); } */ //////////////// // // **** THE CRAWLBOT CONTROL PANEL ***** // // . Based on http://diffbot.com/dev/crawl/ page. // . got to /dev/crawl to see this! // //////////////// // generate a random collection name char *getNewCollName ( ) { // char *token , long tokenLen ) { // let's create a new crawl id. dan was making it 32 characters // with 4 hyphens in it for a total of 36 bytes, but since // MAX_COLL_LEN, the maximum length of a collection name, is just // 64 bytes, and the token is already 32, let's limit to 16 bytes // for the crawlerid. so if we print that out in hex, 16 hex chars // 0xffffffff 0xffffffff is 64 bits. so let's make a random 64-bit // value here. unsigned long r1 = rand(); unsigned long r2 = rand(); unsigned long long crawlId64 = (unsigned long long) r1; crawlId64 <<= 32; crawlId64 |= r2; static char s_collBuf[MAX_COLL_LEN+1]; //long tokenLen = gbstrlen(token); // include a +5 for "-test" // include 16 for crawlid (16 char hex #) //if ( tokenLen + 16 + 5>= MAX_COLL_LEN ) { char *xx=NULL;*xx=0;} // ensure the crawlid is the full 16 characters long so we // can quickly extricate the crawlid from the collection name //memcpy ( s_collBuf, token, tokenLen ); //sprintf(s_collBuf + tokenLen ,"-%016llx",crawlId64); sprintf(s_collBuf ,"%016llx",crawlId64); return s_collBuf; } bool printCrawlBotPage ( TcpSocket *s , HttpRequest *hr , SafeBuf *injectionResponse ) { SafeBuf sb; sb.safePrintf( "" "Crawlbot - " "Web Data Extraction and Search Made Easy" "" ); // // if no token... they need to login or signup // char *token = getTokenFromHttpRequest ( hr ); if ( ! token ) { // || tokenLen == 0 ) { sb.safePrintf("In order to use crawlbot you must " "first LOGIN:" "
" "
" "" "" "
" "
" "- OR -" "
SIGN UP" "
" "Name: " "
" "Email: " "
" "" "
" "" "" ); return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length(), -1); // cachetime } //char *crawlId = getCrawlIdFromHttpRequest(); CollectionRec *cr = NULL; // . add new collection? // . make a new collection and it also becomes the cursor long addColl = hr->getLong("addcoll",0); if ( addColl ) cr = addNewDiffbotColl ( hr ); char *delColl = hr->getString("delcoll",NULL,NULL); if ( delColl ) g_collectiondb.deleteRec ( delColl , true ); char *resetColl = hr->getString("resetcoll",NULL,NULL); if ( resetColl ) g_collectiondb.resetColl ( resetColl ); // set this to current collection. if only token was provided // then it will return the first collection owned by token. // if token has no collections it will be NULL. if ( ! cr ) cr = getCollRecFromHttpRequest ( hr ); // if you reset from crawlbot api page then enable spiders if ( resetColl && cr ) cr->m_spideringEnabled = 1; // if no collection, then it is the first time or this token // so automatically add one for them if ( ! cr ) cr = addNewDiffbotColl ( hr ); if ( ! cr ) { log("crawlbot: failed to add new coll rec"); g_msg = " (error: crawlbot failed to allocate crawl)"; return g_httpServer.sendErrorReply(s,500, "crawlbot crawl " "alloc failed?"); } sb.safePrintf("" "" "
" "" "" "Crawlbot" "
" "" "Crawl, Datamine and Index the Web" "" "
" , token ); sb.safePrintf("

"); // first print "add new collection" sb.safePrintf("" "add new collection" "   " "" "all collections" "   " , token , token ); long tokenLen = gbstrlen(token); // // print list of collections controlled by this token // for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cx = g_collectiondb.m_recs[i]; if ( ! cx ) continue; // get its token if any char *ct = cx->m_diffbotToken.getBufStart(); if ( ! ct ) continue; // skip if token does not match if ( strcmp(ct,token) ) continue; // highlight the tab if it is what we selected bool highlight = false; if ( cx == cr ) highlight = true; char *style = ""; if ( highlight ) { style = "style=text-decoration:none; "; sb.safePrintf ( ""); } // print the crawl id. collection name minus - sb.safePrintf("" "%s" "   " , style , cx->m_coll , cx->m_coll ); if ( highlight ) sb.safePrintf(""); } sb.safePrintf ( "

" ); long maxToCrawl = hr->getLongLong("maxtocrawl",-1LL); long maxToProcess = hr->getLongLong("maxtoprocess",-1LL); if ( maxToCrawl != -1 ) cr->m_diffbotMaxToCrawl = maxToCrawl; if ( maxToProcess != -1 ) cr->m_diffbotMaxToProcess = maxToProcess; //char *api = hr->getString("diffbotapi",NULL,NULL); //if ( api ) { // cr->m_diffbotApi.set(api); // cr->m_diffbotApi.nullTerm(); //} #define FMT_HTML 1 #define FMT_XML 2 #define FMT_JSON 3 // . now show stats for the current crawl // . put in xml or json if format=xml or format=json or // xml=1 or json=1 ... char format = FMT_HTML; long pause = hr->getLong("pause",-1); if ( pause == 0 ) cr->m_spideringEnabled = 1; if ( pause == 1 ) cr->m_spideringEnabled = 0; // // show urls being crawled (ajax) (from Spider.cpp) // sb.safePrintf ( "" //"bgcolor=#%s>\n" "\n" ); // the table headers so SpiderRequest::printToTable() works if ( ! SpiderRequest::printTableHeaderSimple(&sb,true) ) return false; // shortcut XmlDoc **docs = g_spiderLoop.m_docs; // first print the spider recs we are spidering for ( long i = 0 ; i < (long)MAX_SPIDERS ; i++ ) { // get it XmlDoc *xd = docs[i]; // skip if empty if ( ! xd ) continue; // sanity check if ( ! xd->m_oldsrValid ) { char *xx=NULL;*xx=0; } // skip if not our coll rec! if ( xd->m_cr != cr ) continue; // grab it SpiderRequest *oldsr = &xd->m_oldsr; // get status char *status = xd->m_statusMsg; // show that if ( ! oldsr->printToTableSimple ( &sb , status,xd) ) return false; } // end the table sb.safePrintf ( "
"// bgcolor=#%s>" "Last 10 URLs (%li spiders active)" //,LIGHT_BLUE //,DARK_BLUE ,(long)g_spiderLoop.m_numSpidersOut); if ( cr->m_spideringEnabled ) sb.safePrintf(" " "" "Pause spiders" "" , cr->m_coll ); else sb.safePrintf(" " "" "Resume spidering" "" , cr->m_coll ); sb.safePrintf("
\n" ); sb.safePrintf ( "
\n" ); // // show stats // if ( format == FMT_HTML ) { sb.safePrintf("
" "
" "" , cr->m_coll ); sb.safePrintf("" "" , cr->m_globalCrawlInfo.m_objectsAdded - cr->m_globalCrawlInfo.m_objectsDeleted , cr->m_globalCrawlInfo.m_urlsHarvested , cr->m_globalCrawlInfo.m_urlsConsidered , cr->m_globalCrawlInfo.m_pageDownloadAttempts , cr->m_globalCrawlInfo.m_pageDownloadSuccesses , cr->m_globalCrawlInfo.m_pageProcessAttempts , cr->m_globalCrawlInfo.m_pageProcessSuccesses ); // spacer column sb.safePrintf("" ); // what diffbot api to use? /* char *api = cr->m_diffbotApi.getBufStart(); char *s[10]; for ( long i = 0 ; i < 10 ; i++ ) s[i] = ""; if ( api && strcmp(api,"all") == 0 ) s[0] = " selected"; if ( api && strcmp(api,"article") == 0 ) s[1] = " selected"; if ( api && strcmp(api,"product") == 0 ) s[2] = " selected"; if ( api && strcmp(api,"image") == 0 ) s[3] = " selected"; if ( api && strcmp(api,"frontpage") == 0 ) s[4] = " selected"; if ( api && strcmp(api,"none") == 0 ) s[5] = " selected"; if ( ! api || ! api[0] ) s[5] = " selected"; */ sb.safePrintf( "" "" "
" "" // this will have to be in crawlinfo too! //"" //"" //"" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "
pages indexed" //"%lli
Objects Found%lli
URLs Harvested%lli
URLs Examined%lli
Page Download Attempts%lli
Page Download Successes%lli
Page Process Attempts%lli
Page Process Successes%lli
" "
" "    " "    " "" "" /* "" "" "" , s[0] , s[1] , s[2] , s[3] , s[4] , s[5] */ ); sb.safePrintf( // "" "" "" "" "" "" "" "" "" "" "" "" "" // // "" "" "" "" "" "" "" "" "" "" "
" "Diffbot API" "" "" "
Token:%s
Download Objects: " "" "" "json" "  " "" "xml" "
Download Urls: " "" /* "" "json" "   " "" "xml" "  " */ "" "csv" // "
Download Pages: " "" "" "json" // "
Max Page Download Successes: " "" " " "" "
Max Page Process Successes:" "" " " "" "
" "Use Robots.txt when crawling? " "" "" "
" "Use spider proxies on AWS? " "" "" "
" "
" "
" , cr->m_diffbotToken.getBufStart() , cr->m_coll , cr->m_coll //, cr->m_coll //, cr->m_coll , cr->m_coll , cr->m_coll , cr->m_diffbotMaxToCrawl , cr->m_diffbotMaxToProcess ); } unsigned long r1 = rand(); unsigned long r2 = rand(); unsigned long long rand64 = (unsigned long long) r1; rand64 <<= 32; rand64 |= r2; sb.safePrintf( "" // OBJECT search input box "" "" "" "" // PAGE search input box "" "" "" "" // add url input box "" "" "" "" "" "" "" "" "" "
" "Search Objects:" "" "" "" "" // restrict search to json objects "" " " "" "
" "Search Pages:" "" "" "" "" // restrict search to NON json objects "" " " "" "
" "Inject Url: " "" "" " " "" "     " " crawl links on this page?" , cr->m_coll , rand64 , cr->m_coll , rand64 ); if ( injectionResponse ) sb.safePrintf("
%s\n" ,injectionResponse->getBufStart() ); sb.safePrintf("" "" "
Upload URLs" "     " " crawl links on those pages?" "
" "
" , cr->m_coll ); // xml or json does not show the input boxes //if ( format != FMT_HTML ) // return g_httpServer.sendDynamicPage ( s, // sb.getBufStart(), // sb.length(), // -1 ); // cachetime // // print url filters. use "multimedia" to handle jpg etc. // // use "notindexable" for images/movies/css etc. // add a "process" column to send to diffbot... // // char *s1 = "Show"; char *s2 = "none"; if ( hr->getLongFromCookie("showtable",0) ) { s1 = "Hide"; s2 = ""; } sb.safePrintf( "" "" "
" "%s URL Filters Table" "
" "
" "
" "
" "
" "" "" , s1 , s2 , cr->m_coll ); ////////// // // . update the parms for this collection // . just update the url filters for now since that is complicated // ////////// long page = PAGE_FILTERS; WebPage *pg = g_pages.getPage ( page ) ; g_parms.setFromRequest ( hr , s, pg->m_function); // // print url filters. HACKy... // g_parms.sendPageGeneric ( s , hr , PAGE_FILTERS , NULL , &sb , cr->m_coll // coll override ); // // end HACKy hack // sb.safePrintf( "
" "
" "
" "
" ); // // add search box to your site // /* sb.safePrintf("
" "" "" "" "" "
" "Add this search box to your site" "" "
"); */ // // show input boxes // sb.safePrintf( "" "" "" "" "" "
" // reset collection form "
" "" "" // also show it in the display, so set "c" "" "" "
" // end reset collection form "
" // delete collection form "
" "m_coll , cr->m_coll ); sb.safeMemcpy ( token , tokenLen ); sb.safePrintf("\">" "" "" "
" // end delete collection form "
" "" , cr->m_coll ); return g_httpServer.sendDynamicPage ( s, sb.getBufStart(), sb.length(), -1 ); // cachetime /* "

API for Diffbot

" "
" "" "" "
" "
" "

API for Crawlbot

" // "
" "" "
" "
" "
" "Token:

" "API: (article, product)

" "
" "
" "" "Page-type is required" "API calls will be made using your current token." "
" "
" "
" "
" "
" "
" "
" "?" "
" "
" "
" "
" "
" "
" "
" "
" "" "
Seed URL is required
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "" "Diffbot uses Java regex syntax. Be sure to escape your characters." "
" "
" "
" "
" "

 
" "" "" "
" ); */ } CollectionRec *addNewDiffbotColl ( HttpRequest *hr ) { char *token = getTokenFromHttpRequest ( hr ); if ( ! token ) { log("crawlbot: need token to add new coll"); return NULL; } char *collBuf = getNewCollName ( );//token , tokenLen ); if ( ! g_collectiondb.addRec ( collBuf , NULL , // copy from 0 , // copy from len true , // it is a brand new one -1 , // we are new, this is -1 false , // is NOT a dump true // save it for sure! ) ) return NULL; // get the collrec CollectionRec *cr = g_collectiondb.getRec ( collBuf ); // did an alloc fail? if ( ! cr ) { char *xx=NULL;*xx=0; } // normalize the seed url //Url norm; //norm.set ( seed ); //cr->m_diffbotSeed.set ( norm.getUrl() ); // remember the token cr->m_diffbotToken.set ( token ); cr->m_diffbotToken.nullTerm(); /* this stuff can be set later. cr->m_diffbotApi.set ( api ); // these are optional, may be NULL cr->m_diffbotApiQueryString.set ( apiQueryString ); cr->m_diffbotUrlCrawlPattern.set ( urlCrawlPattern ); cr->m_diffbotUrlProcessPattern.set ( urlProcessPattern ); cr->m_diffbotPageProcessPattern.set ( pageProcessPattern ); cr->m_diffbotClassify = classify; // let's make these all NULL terminated strings cr->m_diffbotSeed.nullTerm(); cr->m_diffbotApi.nullTerm(); cr->m_diffbotApiQueryString.nullTerm(); cr->m_diffbotUrlCrawlPattern.nullTerm(); cr->m_diffbotUrlProcessPattern.nullTerm(); cr->m_diffbotPageProcessPattern.nullTerm(); */ // do not spider more than this many urls total. -1 means no max. cr->m_diffbotMaxToCrawl = 100000; // do not process more than this. -1 means no max. cr->m_diffbotMaxToProcess = 100000; // this collection should always hit diffbot //cr->m_useDiffbot = true; // show the ban links in the search results. the collection name // is cryptographic enough to show that cr->m_isCustomCrawl = true; // reset the crawl stats cr->m_diffbotCrawlStartTime = gettimeofdayInMillisecondsGlobal(); cr->m_diffbotCrawlEndTime = 0LL; // reset crawler stats. they should be loaded from crawlinfo.txt memset ( &cr->m_localCrawlInfo , 0 , sizeof(CrawlInfo) ); memset ( &cr->m_globalCrawlInfo , 0 , sizeof(CrawlInfo) ); //cr->m_globalCrawlInfoUpdateTime = 0; cr->m_replies = 0; cr->m_requests = 0; // support current web page api i guess for test crawls //cr->m_isDiffbotTestCrawl = false; //char *strange = hr->getString("href",NULL); //if ( strange && strcmp ( strange,"/dev/crawl#testCrawl" ) == 0 ) // cr->m_isDiffbotTestCrawl = true; /////// // // extra diffbot ARTICLE parms // /////// // . ppl mostly use meta, html and tags. // . dropping support for dontStripAds. mike is ok with that. // . use for jsonp requests. needed for cross-domain ajax. //char *callback = hr->getString("callback",NULL); // a download timeout //long timeout = hr->getLong("timeout",5000); // "xml" or "json" char *format = hr->getString("format",NULL,"json"); // save that cr->m_diffbotFormat.safeStrcpy(format); // return all content from page? for frontpage api. // TODO: can we put "all" into "fields="? //bool all = hr->hasField("all"); ///////// // // specify diffbot fields to return in the json output // ///////// // point to the safebuf that holds the fields the user wants to // extract from each url. comma separated list of supported diffbot // fields like "meta","tags", ... 
SafeBuf *f = &cr->m_diffbotFields; // transcribe provided fields if any char *fields = hr->getString("fields",NULL); // appends those to our field buf if ( fields ) f->safeStrcpy(fields); // if something there push a comma in case we add more below if ( f->length() ) f->pushChar(','); // return contents of the page's meta tags? twitter card metadata, .. if ( hr->hasField("meta" ) ) f->safeStrcpy("meta,"); if ( hr->hasField("html" ) ) f->safeStrcpy("html,"); if ( hr->hasField("tags" ) ) f->safeStrcpy("tags,"); if ( hr->hasField("comments") ) f->safeStrcpy("comments,"); if ( hr->hasField("summary" ) ) f->safeStrcpy("summary,"); if ( hr->hasField("all" ) ) f->safeStrcpy("all,"); // if we added crap to "fields" safebuf remove trailing comma f->removeLastChar(','); // set some defaults. max spiders for all priorities in this collection cr->m_maxNumSpiders = 10; // make the gigablast regex table just "default" so it does not // filtering, but accepts all urls. we will add code to pass the urls // through m_diffbotUrlCrawlPattern alternatively. if that itself // is empty, we will just restrict to the seed urls subdomain. for ( long i = 0 ; i < MAX_FILTERS ; i++ ) { cr->m_regExs[i].purge(); cr->m_spiderPriorities[i] = 0; cr->m_maxSpidersPerRule [i] = 10; cr->m_spiderIpWaits [i] = 250; // 250 ms for now cr->m_spiderIpMaxSpiders[i] = 10; cr->m_spidersEnabled [i] = 1; cr->m_spiderFreqs [i] = 7.0; cr->m_spiderDiffbotApiNum[i] = DBA_NONE; } // // by default to not spider image or movie links or // links with /print/ in them // long i = 0; cr->m_regExs[i].safePrintf("isinjected"); cr->m_spiderPriorities[i] = 49; i++; cr->m_regExs[i].safePrintf("ismedia"); cr->m_spiderPriorities[i] = SPIDER_PRIORITY_FILTERED; i++; // if user did not specify a url crawl pattern then keep // the crawl limited to the same subdomain of the seed url //if ( cr->m_diffbotUrlCrawlPattern.length() == 0 ) { // first limit to http://subdomain cr->m_regExs[i].safePrintf("isonsamedomain");//^http://"); //cr->m_regExs[i].safeMemcpy(norm.getHost(),norm.getHostLen()); //cr->m_regExs[i].pushChar('/'); cr->m_regExs[i].nullTerm(); cr->m_spiderPriorities [i] = 50; cr->m_maxSpidersPerRule [i] = 10; cr->m_spiderIpWaits [i] = 250; // 500 ms for now cr->m_spiderIpMaxSpiders[i] = 10; cr->m_spidersEnabled [i] = 1; i++; // and make all else filtered cr->m_regExs[i].safePrintf("default"); cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED; cr->m_maxSpidersPerRule [i] = 10; cr->m_spiderIpWaits [i] = 250; // 500 ms for now cr->m_spiderIpMaxSpiders[i] = 10; cr->m_spidersEnabled [i] = 1; i++; // just the default rule! cr->m_numRegExs = i; cr->m_numRegExs2 = i; cr->m_numRegExs3 = i; cr->m_numRegExs10 = i; cr->m_numRegExs11 = i; cr->m_numRegExs5 = i; cr->m_numRegExs6 = i; cr->m_numRegExs7 = i; //cr->m_spiderPriorities [1] = -1; // filtered? or banned? //cr->m_maxSpidersPerRule [1] = 10; //cr->m_spiderIpWaits [1] = 500; // 500 ms for now cr->m_needsSave = 1; // start the spiders! cr->m_spideringEnabled = true; return cr; }
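// A sketch of the url filter table that addNewDiffbotColl() leaves behind,
// summarized from the assignments above (the priorities are the literal
// values used there):
//
//   isinjected     -> priority 49
//   ismedia        -> SPIDER_PRIORITY_FILTERED
//   isonsamedomain -> priority 50, 10 spiders per rule, 250 ms ip wait
//   default        -> SPIDER_PRIORITY_FILTERED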