// diffbot api implementation // // WHAT APIs are here? // // . 1. the CrawlBot API to start a crawl // . 2. To directly process a provided URL (injection) // . 3. the Cache API so phantomjs can quickly check the cache for files // and quickly add files to the cache. // // Related pages: // // * http://diffbot.com/dev/docs/ (Crawlbot API tab, and others) // * http://diffbot.com/dev/crawl/ #include "Errno.h" #include "PageCrawlBot.h" #include "TcpServer.h" #include "HttpRequest.h" #include "HttpServer.h" #include "Pages.h" // g_msg #include "XmlDoc.h" // for checkRegex() #include "PageInject.h" // Msg7 #include "Repair.h" #include "Parms.h" // so user can specify the format of the reply/output //#define FMT_HTML 1 //#define FMT_XML 2 //#define FMT_JSON 3 //#define FMT_CSV 4 //#define FMT_TXT 5 void doneSendingWrapper ( void *state , TcpSocket *sock ) ; bool sendBackDump ( TcpSocket *s,HttpRequest *hr ); CollectionRec *addNewDiffbotColl ( char *addColl , char *token,char *name , class HttpRequest *hr ) ; bool resetUrlFilters ( CollectionRec *cr ) ; bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket , HttpRequest *hr , CollectionRec *cr ) ; //////////////// // // SUPPORT FOR DOWNLOADING an RDB DUMP // // We ask each shard for 10MB of Spiderdb records. If 10MB was returned // then we repeat. Every time we get 10MB from each shard we print the // Spiderdb records out into "safebuf" and transmit it to the user. once // the buffer has been transmitted then we ask the shards for another 10MB // worth of spider records. // //////////////// // use this as a state while dumping out spiderdb for a collection class StateCD { public: StateCD () { m_needsMime = true; }; void sendBackDump2 ( ) ; bool readDataFromRdb ( ) ; bool sendList ( ) ; void printSpiderdbList ( RdbList *list , SafeBuf *sb , char **lastKeyPtr ) ; void printTitledbList ( RdbList *list , SafeBuf *sb , char **lastKeyPtr ); bool printJsonItemInCsv ( char *json , SafeBuf *sb ) ; int64_t m_lastUh48; int32_t m_lastFirstIp; int64_t m_prevReplyUh48; int32_t m_prevReplyFirstIp; int32_t m_prevReplyError; time_t m_prevReplyDownloadTime; char m_fmt; Msg4 m_msg4; HttpRequest m_hr; Msg7 m_msg7; int32_t m_dumpRound; int64_t m_accumulated; WaitEntry m_waitEntry; bool m_isFirstTime; bool m_printedFirstBracket; bool m_printedEndingBracket; bool m_printedItem; bool m_needHeaderRow; SafeBuf m_seedBank; SafeBuf m_listBuf; bool m_needsMime; char m_rdbId; bool m_downloadJSON; collnum_t m_collnum; int32_t m_numRequests; int32_t m_numReplies; int32_t m_minRecSizes; bool m_someoneNeedsMore; TcpSocket *m_socket; Msg0 m_msg0s[MAX_HOSTS]; key128_t m_spiderdbStartKeys[MAX_HOSTS]; key_t m_titledbStartKeys[MAX_HOSTS]; RdbList m_lists[MAX_HOSTS]; bool m_needMore[MAX_HOSTS]; }; // . basically dump out spiderdb // . returns urls in csv format in reply to a // "GET /api/download/%s_data.json" // "GET /api/download/%s_data.xml" // "GET /api/download/%s_urls.csv" // "GET /api/download/%s_pages.txt" // where %s is the collection name // . the ordering of the urls is not specified so whatever order they are // in spiderdb will do // . the gui that lists the urls as they are spidered in real time when you // do a test crawl will just have to call this repeatedly. it shouldn't // be too slow because of disk caching, and, most likely, the spider requests // will all be in spiderdb's rdbtree any how // . because we are distributed we have to send a msg0 request to each // shard/group asking for all the spider urls. 
dan says 30MB is typical // for a csv file, so for now we will just try to do a single spiderdb // request. bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) { char *path = hr->getPath(); int32_t pathLen = hr->getPathLen(); char *pathEnd = path + pathLen; char *str = strstr ( path , "/download/" ); if ( ! str ) { char *msg = "bad download request"; log("crawlbot: %s",msg); g_httpServer.sendErrorReply(sock,500,msg); return true; } // when downloading csv socket closes because we can take minutes // before we send over the first byte, so try to keep open //int parm = 1; //if(setsockopt(sock->m_sd,SOL_TCP,SO_KEEPALIVE,&parm,sizeof(int))<0){ // log("crawlbot: setsockopt: %s",mstrerror(errno)); // errno = 0; //} //int32_t pathLen = hr->getPathLen(); char rdbId = RDB_NONE; bool downloadJSON = false; int32_t fmt; char *xx; int32_t dt = CT_JSON; if ( ( xx = strstr ( path , "_data.json" ) ) ) { rdbId = RDB_TITLEDB; fmt = FORMAT_JSON; downloadJSON = true; dt = CT_JSON; } else if ( ( xx = strstr ( path , "_html.json" ) ) ) { rdbId = RDB_TITLEDB; fmt = FORMAT_JSON; downloadJSON = true; dt = CT_HTML; } else if ( ( xx = strstr ( path , "_data.csv" ) ) ) { rdbId = RDB_TITLEDB; downloadJSON = true; fmt = FORMAT_CSV; } else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) { rdbId = RDB_SPIDERDB; fmt = FORMAT_CSV; } else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) { rdbId = RDB_SPIDERDB; fmt = FORMAT_TXT; } else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) { rdbId = RDB_TITLEDB; fmt = FORMAT_TXT; } // sanity, must be one of 3 download calls if ( rdbId == RDB_NONE ) { char *msg ; msg = "usage: downloadurls, downloadpages, downloaddata"; log("crawlbot: %s",msg); g_httpServer.sendErrorReply(sock,500,msg); return true; } char *coll = str + 10; if ( coll >= pathEnd ) { char *msg = "bad download request2"; log("crawlbot: %s",msg); g_httpServer.sendErrorReply(sock,500,msg); return true; } // get coll char *collEnd = xx; //CollectionRec *cr = getCollRecFromHttpRequest ( hr ); CollectionRec *cr = g_collectiondb.getRec ( coll , collEnd - coll ); if ( ! cr ) { char *msg = "token or id (crawlid) invalid"; log("crawlbot: invalid token or crawlid to dump"); g_httpServer.sendErrorReply(sock,500,msg); return true; } // . if doing download of csv, make it search results now! // . make an httprequest on stack and call it if ( fmt == FORMAT_CSV && rdbId == RDB_TITLEDB ) { char tmp2[5000]; SafeBuf sb2(tmp2,5000); int32_t dr = 1; // do not dedup bulk jobs if ( cr->m_isCustomCrawl == 2 ) dr = 0; // do not dedup for crawls either it is too confusing!!!! // ppl wonder where the results are! dr = 0; sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&" // dedup. since stream=1 and pss=0 below // this will dedup on page content hash only // which is super fast. "dr=%"INT32"&" "c=%s&n=1000000&" // stream it now "stream=1&" // no summary similarity dedup, only exact // doc content hash. otherwise too slow!! "pss=0&" // no gigabits "dsrt=0&" // do not compute summary. 0 lines. "ns=0&" "q=gbsortby%%3Agbspiderdate&" "prepend=type%%3Ajson" "\r\n\r\n" , dr , cr->m_coll ); log("crawlbot: %s",sb2.getBufStart()); HttpRequest hr2; hr2.set ( sb2.getBufStart() , sb2.length() , sock ); return sendPageResults ( sock , &hr2 ); } // . if doing download of json, make it search results now! // . 
make an httprequest on stack and call it if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB && dt == CT_HTML ) { char tmp2[5000]; SafeBuf sb2(tmp2,5000); int32_t dr = 1; // do not dedup bulk jobs if ( cr->m_isCustomCrawl == 2 ) dr = 0; // do not dedup for crawls either it is too confusing!!!! // ppl wonder where the results are! dr = 0; sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&" // dedup. since stream=1 and pss=0 below // this will dedup on page content hash only // which is super fast. "dr=%"INT32"&" "c=%s&n=1000000&" // we can stream this because unlink csv it // has no header row that needs to be // computed from all results. "stream=1&" // no summary similarity dedup, only exact // doc content hash. otherwise too slow!! "pss=0&" // no gigabits "dsrt=0&" // do not compute summary. 0 lines. "ns=0&" //"q=gbsortby%%3Agbspiderdate&" //"prepend=type%%3A%s" "q=type%%3Ahtml" "\r\n\r\n" , dr , cr->m_coll ); log("crawlbot: %s",sb2.getBufStart()); HttpRequest hr2; hr2.set ( sb2.getBufStart() , sb2.length() , sock ); return sendPageResults ( sock , &hr2 ); } if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB ) { char tmp2[5000]; SafeBuf sb2(tmp2,5000); int32_t dr = 1; // do not dedup bulk jobs if ( cr->m_isCustomCrawl == 2 ) dr = 0; // do not dedup for crawls either it is too confusing!!!! // ppl wonder where the results are! dr = 0; sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&" // dedup. since stream=1 and pss=0 below // this will dedup on page content hash only // which is super fast. "dr=%"INT32"&" "c=%s&n=1000000&" // we can stream this because unlink csv it // has no header row that needs to be // computed from all results. "stream=1&" // no summary similarity dedup, only exact // doc content hash. otherwise too slow!! "pss=0&" // no gigabits "dsrt=0&" // do not compute summary. 0 lines. "ns=0&" "q=gbsortby%%3Agbspiderdate&" "prepend=type%%3Ajson" "\r\n\r\n" , dr , cr->m_coll ); log("crawlbot: %s",sb2.getBufStart()); HttpRequest hr2; hr2.set ( sb2.getBufStart() , sb2.length() , sock ); return sendPageResults ( sock , &hr2 ); } // . now the urls.csv is also a query on gbss files // . make an httprequest on stack and call it // . only do this for version 3 // i.e. GET /v3/crawl/download/token-collectionname_urls.csv if ( fmt == FORMAT_CSV && rdbId == RDB_SPIDERDB && path[0] == '/' && path[1] == 'v' && path[2] == '3' ) { char tmp2[5000]; SafeBuf sb2(tmp2,5000); // never dedup int32_t dr = 0; // do not dedup for crawls either it is too confusing!!!! // ppl wonder where the results are! dr = 0; sb2.safePrintf("GET /search?" // this is not necessary //"icc=1&" "format=csv&" // no site clustering "sc=0&" // never dedup. "dr=0&" "c=%s&" "n=10000000&" // stream it now // can't stream until we fix headers be printed // in Msg40.cpp. so gbssUrl->Url etc. // mdw: ok should work now "stream=1&" //"stream=0&" // no summary similarity dedup, only exact // doc content hash. otherwise too slow!! "pss=0&" // no gigabits "dsrt=0&" // do not compute summary. 0 lines. 
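			      // note: the query pieces below request spider
			      // status ("gbss") documents, sorted newest-first
			      // by gbssSpiderTime, and exclude diffbot json
			      // objects, which is what turns this internal
			      // /search request into the v3 urls.csv dump.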
//"ns=0&" "q=gbrevsortbyint%%3AgbssSpiderTime+" "gbssIsDiffbotObject%%3A0" "&" //"prepend=type%%3Ajson" "\r\n\r\n" , cr->m_coll ); log("crawlbot: %s",sb2.getBufStart()); HttpRequest hr2; hr2.set ( sb2.getBufStart() , sb2.length() , sock ); return sendPageResults ( sock , &hr2 ); } //if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 ) // rdbId = RDB_SPIDERDB; //if ( strncmp ( path ,"/crawlbot/downloadpages",23 ) == 0 ) // rdbId = RDB_TITLEDB; //if ( strncmp ( path ,"/crawlbot/downloaddata",22 ) == 0 ) { // downloadJSON = true; // rdbId = RDB_TITLEDB; //} StateCD *st; try { st = new (StateCD); } catch ( ... ) { return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno)); } mnew ( st , sizeof(StateCD), "statecd"); // initialize the new state st->m_rdbId = rdbId; st->m_downloadJSON = downloadJSON; st->m_socket = sock; // the name of the collections whose spiderdb we read from st->m_collnum = cr->m_collnum; st->m_fmt = fmt; st->m_isFirstTime = true; st->m_printedFirstBracket = false; st->m_printedItem = false; st->m_printedEndingBracket = false; // for csv... st->m_needHeaderRow = true; st->m_lastUh48 = 0LL; st->m_lastFirstIp = 0; st->m_prevReplyUh48 = 0LL; st->m_prevReplyFirstIp = 0; st->m_prevReplyError = 0; st->m_prevReplyDownloadTime = 0LL; st->m_dumpRound = 0; st->m_accumulated = 0LL; // debug //log("mnew1: st=%"XINT32"",(int32_t)st); // begin the possible segmented process of sending back spiderdb // to the user's browser st->sendBackDump2(); // i dont think this return values matters at all since httpserver.cpp // does not look at it when it calls sendReply() return true; } // . all wrappers call this // . returns false if would block, true otherwise bool readAndSendLoop ( StateCD *st , bool readFirst ) { subloop: // if we had a broken pipe on the sendChunk() call then hopefully // this will kick in... if ( g_errno ) { log("crawlbot: readAndSendLoop: %s",mstrerror(g_errno)); readFirst = true; st->m_someoneNeedsMore = false; } // wait if some are outstanding. how can this happen? if ( st->m_numRequests > st->m_numReplies ) { log("crawlbot: only got %"INT32" of %"INT32" replies. waiting for " "all to come back in.", st->m_numReplies,st->m_numRequests); return false; } // are we all done? we still have to call sendList() to // set socket's streamingMode to false to close things up if ( readFirst && ! st->m_someoneNeedsMore ) { log("crawlbot: done sending for download request"); mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; return true; } // begin reading from each shard and sending the spiderdb records // over the network. return if that blocked if ( readFirst && ! st->readDataFromRdb ( ) ) return false; // did user delete their collection midstream on us? if ( g_errno ) { log("crawlbot: read shard data had error: %s", mstrerror(g_errno)); goto subloop; } // send it to the browser socket. returns false if blocks. if ( ! st->sendList() ) return false; // read again i guess readFirst = true; // hey, it did not block... tcpserver caches writes... 
goto subloop; } void StateCD::sendBackDump2 ( ) { m_numRequests = 0; m_numReplies = 0; // read 10MB from each shard's spiderdb at a time //m_minRecSizes = 9999999; // 1ook to be more fluid m_minRecSizes = 99999; // we stop reading from all shards when this becomes false m_someoneNeedsMore = true; // initialize the spiderdb startkey "cursor" for each shard's spiderdb for ( int32_t i = 0 ; i < g_hostdb.m_numShards ; i++ ) { m_needMore[i] = true; KEYMIN((char *)&m_spiderdbStartKeys[i],sizeof(key128_t)); KEYMIN((char *)&m_titledbStartKeys[i],sizeof(key_t)); } // begin reading from the shards and trasmitting back on m_socket readAndSendLoop ( this , true ); } static void gotListWrapper7 ( void *state ) { // get the Crawler dump State StateCD *st = (StateCD *)state; // inc it up here st->m_numReplies++; // wait for all if ( st->m_numReplies < st->m_numRequests ) return; // read and send loop readAndSendLoop( st , false ); } bool StateCD::readDataFromRdb ( ) { // set end key to max key. we are limiting using m_minRecSizes for this key128_t ek; KEYMAX((char *)&ek,sizeof(key128_t)); CollectionRec *cr = g_collectiondb.getRec(m_collnum); // collection got nuked? if ( ! cr ) { log("crawlbot: readdatafromrdb: coll %"INT32" got nuked", (int32_t)m_collnum); g_errno = ENOCOLLREC; return true; } // top: // launch one request to each shard for ( int32_t i = 0 ; i < g_hostdb.m_numShards ; i++ ) { // reset each one m_lists[i].freeList(); // if last list was exhausted don't bother if ( ! m_needMore[i] ) continue; // count it m_numRequests++; // this is the least nice. crawls will yield to it mostly. int32_t niceness = 0; // point to right startkey char *sk ; if ( m_rdbId == RDB_SPIDERDB ) sk = (char *)&m_spiderdbStartKeys[i]; else sk = (char *)&m_titledbStartKeys[i]; // get host Host *h = g_hostdb.getLiveHostInShard(i); // show it int32_t ks = getKeySizeFromRdbId(m_rdbId); log("dump: asking host #%"INT32" for list sk=%s", h->m_hostId,KEYSTR(sk,ks)); // msg0 uses multicast in case one of the hosts in a shard is // dead or dies during this call. if ( ! m_msg0s[i].getList ( h->m_hostId , // use multicast h->m_ip, h->m_port, 0, // maxcacheage false, // addtocache? m_rdbId, cr->m_collnum, &m_lists[i], sk, (char *)&ek, // get at most about // "minRecSizes" worth of spiderdb // records m_minRecSizes, this, gotListWrapper7 , niceness ) ) { log("crawlbot: blocked getting list from shard"); // continue if it blocked continue; } log("crawlbot: did not block getting list from shard err=%s", mstrerror(g_errno)); // we got a reply back right away... m_numReplies++; } // all done? return if still waiting on more msg0s to get their data if ( m_numReplies < m_numRequests ) return false; // i guess did not block, empty single shard? no, must have been // error because sendList() would have sent back on the tcp // socket and blocked and returned false if not error sending return true; } bool StateCD::sendList ( ) { // get the Crawler dump State // inc it //m_numReplies++; // show it log("crawlbot: got list from shard. req=%"INT32" rep=%"INT32"", m_numRequests,m_numReplies); // return if still awaiting more replies if ( m_numReplies < m_numRequests ) return false; SafeBuf sb; //sb.setLabel("dbotdmp"); char *ct = "text/csv"; if ( m_fmt == FORMAT_JSON ) ct = "application/json"; if ( m_fmt == FORMAT_XML ) ct = "text/xml"; if ( m_fmt == FORMAT_TXT ) ct = "text/plain"; if ( m_fmt == FORMAT_CSV ) ct = "text/csv"; // . 
if we haven't yet sent an http mime back to the user // then do so here, the content-length will not be in there // because we might have to call for more spiderdb data if ( m_needsMime ) { m_needsMime = false; HttpMime mime; mime.makeMime ( -1, // totel content-lenght is unknown! 0 , // do not cache (cacheTime) 0 , // lastModified 0 , // offset -1 , // bytesToSend NULL , // ext false, // POSTReply ct, // "text/csv", // contenttype "utf-8" , // charset -1 , // httpstatus NULL ); //cookie sb.safeMemcpy(mime.getMime(),mime.getMimeLen() ); } //CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); if ( ! m_printedFirstBracket && m_fmt == FORMAT_JSON ) { sb.safePrintf("[\n"); m_printedFirstBracket = true; } // these are csv files not xls //if ( ! m_printedFirstBracket && m_fmt == FORMAT_CSV ) { // sb.safePrintf("sep=,\n"); // m_printedFirstBracket = true; //} // we set this to true below if any one shard has more spiderdb // records left to read m_someoneNeedsMore = false; // // got all replies... create the HTTP reply and send it back // for ( int32_t i = 0 ; i < g_hostdb.m_numShards ; i++ ) { if ( ! m_needMore[i] ) continue; // get the list from that group RdbList *list = &m_lists[i]; // should we try to read more? m_needMore[i] = false; // report it log("dump: got list of %"INT32" bytes from host #%"INT32" round #%"INT32"", list->getListSize(),i,m_dumpRound); if ( list->isEmpty() ) { list->freeList(); continue; } // get the format //char *format = cr->m_diffbotFormat.getBufStart(); //if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL; //char *format = NULL; // this cores because msg0 does not transmit lastkey //char *ek = list->getLastKey(); char *lastKeyPtr = NULL; // now print the spiderdb list out into "sb" if ( m_rdbId == RDB_SPIDERDB ) { // print SPIDERDB list into "sb" printSpiderdbList ( list , &sb , &lastKeyPtr ); // update spiderdb startkey for this shard KEYSET((char *)&m_spiderdbStartKeys[i],lastKeyPtr, sizeof(key128_t)); // advance by 1 m_spiderdbStartKeys[i] += 1; } else if ( m_rdbId == RDB_TITLEDB ) { // print TITLEDB list into "sb" printTitledbList ( list , &sb , &lastKeyPtr ); // update titledb startkey for this shard KEYSET((char *)&m_titledbStartKeys[i],lastKeyPtr, sizeof(key_t)); // advance by 1 m_titledbStartKeys[i] += 1; } else { char *xx=NULL;*xx=0; } // figure out why we do not get the full list???? //if ( list->m_listSize >= 0 ) { // m_minRecSizes ) { m_needMore[i] = true; m_someoneNeedsMore = true; //} // save mem list->freeList(); } m_dumpRound++; //log("rdbid=%"INT32" fmt=%"INT32" some=%"INT32" printed=%"INT32"", // (int32_t)m_rdbId,(int32_t)m_fmt,(int32_t)m_someoneNeedsMore, // (int32_t)m_printedEndingBracket); m_socket->m_streamingMode = true; // if nobody needs to read more... if ( ! m_someoneNeedsMore && ! m_printedEndingBracket ) { // use this for printing out urls.csv as well... m_printedEndingBracket = true; // end array of json objects. might be empty! if ( m_rdbId == RDB_TITLEDB && m_fmt == FORMAT_JSON ) sb.safePrintf("\n]\n"); //log("adding ]. len=%"INT32"",sb.length()); // i'd like to exit streaming mode here. i fixed tcpserver.cpp // so if we are called from makecallback() there it won't // call destroysocket if we WERE in streamingMode just yet m_socket->m_streamingMode = false; } TcpServer *tcp = &g_httpServer.m_tcp; // . transmit the chunk in sb // . steals the allocated buffer from sb and stores in the // TcpSocket::m_sendBuf, which it frees when socket is // ultimately destroyed or we call sendChunk() again. // . 
when TcpServer is done transmitting, it does not close the // socket but rather calls doneSendingWrapper() which can call // this function again to send another chunk if ( ! tcp->sendChunk ( m_socket , &sb , this , doneSendingWrapper ) ) return false; // we are done sending this chunk, i guess tcp write was cached // in the network card buffer or something return true; } // TcpServer.cpp calls this when done sending TcpSocket's m_sendBuf void doneSendingWrapper ( void *state , TcpSocket *sock ) { StateCD *st = (StateCD *)state; // error on socket? //if ( g_errno ) st->m_socketError = g_errno; //TcpSocket *socket = st->m_socket; st->m_accumulated += sock->m_totalSent; log("crawlbot: done sending on socket %"INT32"/%"INT32" [%"INT64"] bytes", sock->m_totalSent, sock->m_sendBufUsed, st->m_accumulated); readAndSendLoop ( st , true ); return; } void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ // declare these up here SpiderRequest *sreq = NULL; SpiderReply *srep = NULL; int32_t badCount = 0; int32_t nowGlobalMS = gettimeofdayInMillisecondsGlobal(); CollectionRec *cr = g_collectiondb.getRec(m_collnum); uint32_t lastSpidered = 0; // parse through it for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) { // this record is either a SpiderRequest or SpiderReply char *rec = list->getCurrentRec(); // save it *lastKeyPtr = rec; // we encounter the spiderreplies first then the // spiderrequests for the same url if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) { srep = (SpiderReply *)rec; if ( sreq ) lastSpidered = 0; sreq = NULL; if ( lastSpidered == 0 ) lastSpidered = srep->m_spideredTime; else if ( srep->m_spideredTime > lastSpidered ) lastSpidered = srep->m_spideredTime; m_prevReplyUh48 = srep->getUrlHash48(); m_prevReplyFirstIp = srep->m_firstIp; // 0 means indexed successfully. not sure if // this includes http status codes like 404 etc. // i don't think it includes those types of errors! m_prevReplyError = srep->m_errCode; m_prevReplyDownloadTime = srep->m_spideredTime; continue; } // ok, we got a spider request sreq = (SpiderRequest *)rec; if ( sreq->isCorrupt() ) { log("spider: encountered a corrupt spider req " "when dumping cn=%"INT32". skipping.", (int32_t)cr->m_collnum); continue; } // sanity check if ( srep && srep->getUrlHash48() != sreq->getUrlHash48()){ badCount++; //log("diffbot: had a spider reply with no " // "corresponding spider request for uh48=%"INT64"" // , srep->getUrlHash48()); //char *xx=NULL;*xx=0; } // print the url if not yet printed int64_t uh48 = sreq->getUrlHash48 (); int32_t firstIp = sreq->m_firstIp; bool printIt = false; // there can be multiple spiderrequests for the same url! if ( m_lastUh48 != uh48 ) printIt = true; // sometimes the same url has different firstips now that // we have the EFAKEFIRSTIP spider error to avoid spidering // seeds twice... if ( m_lastFirstIp != firstIp ) printIt = true; if ( ! printIt ) continue; m_lastUh48 = uh48; m_lastFirstIp = firstIp; // make sure spiderreply is for the same url! if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() ) srep = NULL; if ( ! srep ) lastSpidered = 0; bool isProcessed = false; if ( srep ) isProcessed = srep->m_sentToDiffbotThisTime; if ( srep && srep->m_hadDiffbotError ) isProcessed = false; // debug point //if ( strstr(sreq->m_url,"chief") ) // log("hey"); // 1 means spidered, 0 means not spidered, -1 means error int32_t status = 1; // if unspidered, then we don't match the prev reply // so set "status" to 0 to indicate hasn't been // downloaded yet. 
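		// note: m_prevReplyUh48/m_prevReplyFirstIp were filled in
		// above while scanning the SpiderReply records that precede
		// this url's SpiderRequest in the list, so a mismatch here
		// means no reply has been seen for this url yet.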
if ( m_lastUh48 != m_prevReplyUh48 ) status = 0; if ( m_lastFirstIp != m_prevReplyFirstIp ) status = 0; // if it matches, perhaps an error spidering it? if ( status && m_prevReplyError ) status = -1; // use the time it was added to spiderdb if the url // was not spidered time_t time = sreq->m_addedTime; // if it was spidered, successfully or got an error, // then use the time it was spidered if ( status ) time = m_prevReplyDownloadTime; char *msg = "Successfully Downloaded";//Crawled"; if ( status == 0 ) msg = "Not downloaded";//Unexamined"; if ( status == -1 ) { msg = mstrerror(m_prevReplyError); // do not print "Fake First Ip"... if ( m_prevReplyError == EFAKEFIRSTIP ) msg = "Initial crawl request"; // if the initial crawl request got a reply then that // means the spiderrequest was added under the correct // firstip... so skip it. i am assuming that the // correct spidrerequest got added ok here... if ( m_prevReplyError == EFAKEFIRSTIP ) continue; } if ( srep && srep->m_hadDiffbotError ) msg = "Diffbot processing error"; // indicate specific diffbot error if we have it if ( srep && srep->m_hadDiffbotError && srep->m_errCode && // stick with "diffbot processing error" for these... srep->m_errCode != EDIFFBOTINTERNALERROR ) msg = mstrerror(srep->m_errCode); // matching url filter, print out the expression int32_t ufn ; ufn = ::getUrlFilterNum(sreq, srep, nowGlobalMS, false, MAX_NICENESS, cr, false, // isoutlink? NULL, -1); // langIdArg char *expression = NULL; int32_t priority = -4; // sanity check if ( ufn >= 0 ) { expression = cr->m_regExs[ufn].getBufStart(); priority = cr->m_spiderPriorities[ufn]; } if ( ! expression ) { expression = "error. matches no expression!"; priority = -4; } // when spidering rounds we use the // lastspidertime>={roundstart} --> spiders disabled rule // so that we do not spider a url twice in the same round if ( ufn >= 0 && //! cr->m_spidersEnabled[ufn] ) { cr->m_regExs[ufn].length() && // we set this to 0 instead of using the checkbox strstr(cr->m_regExs[ufn].getBufStart(),"round") ) { //cr->m_maxSpidersPerRule[ufn] <= 0 ) { priority = -5; } char *as = "discovered"; if ( sreq && ( sreq->m_isInjecting || sreq->m_isAddUrl ) ) { as = "manually added"; } // print column headers? if ( m_isFirstTime ) { m_isFirstTime = false; sb->safePrintf("\"Url\"," "\"Entry Method\"," ); if ( cr->m_isCustomCrawl ) sb->safePrintf("\"Processed?\","); sb->safePrintf( "\"Add Time\"," "\"Last Crawled\"," "\"Last Status\"," "\"Matching Expression\"," "\"Matching Action\"\n"); } // "csv" is default if json not specified if ( m_fmt == FORMAT_JSON ) sb->safePrintf("[{" "{\"url\":" "\"%s\"}," "{\"time\":" "\"%"UINT32"\"}," "{\"status\":" "\"%"INT32"\"}," "{\"statusMsg\":" "\"%s\"}" "}]\n" , sreq->m_url // when was it first added to spiderdb? , sreq->m_addedTime , status , msg ); // but default to csv else { if (cr && cr->m_isCustomCrawl == 1 && sreq && !sreq->m_isAddUrl && !sreq->m_isInjecting) { if (cr->m_diffbotUrlCrawlPattern.m_length == 0 && cr->m_diffbotUrlProcessPattern.m_length == 0) { // If a crawl and there are no urlCrawlPattern or urlCrawlRegEx values, only return URLs from seed domain if (sreq && !sreq->m_sameDom) continue; } else { // TODO: if we get here, we have a crawl with a custom urlCrawlPattern and/or custom // urlProcessPattern. 
We have to check if the current url matches the pattern } } sb->safePrintf("\"%s\",\"%s\"," , sreq->m_url , as ); if ( cr->m_isCustomCrawl ) sb->safePrintf("%"INT32",",(int32_t)isProcessed); sb->safePrintf( "%"UINT32",%"UINT32",\"%s\",\"%s\",\"" //",%s" //"\n" // when was it first added to spiderdb? , sreq->m_addedTime // last time spidered, 0 if none , lastSpidered //, status , msg // the url filter expression it matches , expression // the priority //, priorityMsg //, iptoa(sreq->m_firstIp) ); // print priority //if ( priority == SPIDER_PRIORITY_FILTERED ) // we just turn off the spiders now if ( ufn >= 0 && cr->m_maxSpidersPerRule[ufn] <= 0 ) sb->safePrintf("url ignored"); //else if ( priority == SPIDER_PRIORITY_BANNED ) // sb->safePrintf("url banned"); else if ( priority == -4 ) sb->safePrintf("error"); else if ( priority == -5 ) sb->safePrintf("will spider next round"); else sb->safePrintf("%"INT32"",priority); sb->safePrintf("\"" "\n"); } } if ( ! badCount ) return; log("diffbot: had a spider reply with no " "corresponding spider request %"INT32" times", badCount); } void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ XmlDoc xd; CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); // save it *lastKeyPtr = NULL; // parse through it for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) { // this record is either a SpiderRequest or SpiderReply char *rec = list->getCurrentRec(); // skip ifnegative if ( (rec[0] & 0x01) == 0x00 ) continue; // set it *lastKeyPtr = rec; // reset first since set2() can't call reset() xd.reset(); // uncompress it if ( ! xd.set2 ( rec , 0, // maxSize unused cr->m_coll , NULL , // ppbuf 0 , // niceness NULL ) ) { // spiderRequest log("diffbot: error setting titlerec in dump"); continue; } // must be of type json to be a diffbot json object if ( m_downloadJSON && xd.m_contentType != CT_JSON ) continue; // or if downloading web pages... if ( ! m_downloadJSON ) { // skip if json object content type if ( xd.m_contentType == CT_JSON ) continue; // . just print the cached page // . size should include the \0 sb->safeStrcpy ( xd.m_firstUrl.m_url); // then \n sb->pushChar('\n'); // then page content sb->safeStrcpy ( xd.ptr_utf8Content ); // null term just in case //sb->nullTerm(); // separate pages with \0 i guess sb->pushChar('\0'); // \n sb->pushChar('\n'); continue; } // skip if not a diffbot json url if ( ! xd.m_isDiffbotJSONObject ) continue; // get the json content char *json = xd.ptr_utf8Content; // empty? if ( xd.size_utf8Content <= 1 ) continue; // if not json, just print the json item out in csv // moved into PageResults.cpp... //if ( m_fmt == FORMAT_CSV ) { // printJsonItemInCsv ( json , sb ); // continue; //} // just print that out. encode \n's and \r's back to \\n \\r // and backslash to a \\ ... // but if they originally had a \u encoding and // we made into utf8, do not put that back into the \u // encoding because it is not necessary. // print in json if ( m_printedItem ) sb->safePrintf("\n,\n"); m_printedItem = true; //if ( ! sb->safeStrcpyPrettyJSON ( json ) ) // log("diffbot: error printing json in dump"); sb->safeStrcpy ( json ); sb->nullTerm(); // separate each JSON object with \n i guess //sb->pushChar('\n'); } } /* //////////////// // // SUPPORT FOR GET /api/crawls and /api/activecrawls // // Just scan each collection record whose collection name includes the // provided "token" of the user. 
then print out the stats of just // //////////////// // example output for http://live.diffbot.com/api/crawls?token=matt // [{"id":"c421f09d-7c31-4131-9da2-21e35d8130a9","finish":1378233585887,"matched":274,"status":"Stopped","start":1378233159848,"token":"matt","parameterMap":{"token":"matt","seed":"www.techcrunch.com","api":"article"},"crawled":274}] // example output from activecrawls?id=.... // {"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":null,"matched":27,"status":"Crawling","start":1378322184332,"token":"matt","parameterMap":{"token":"matt","seed":"www.alleyinsider.com","api":"article"},"crawled":34} // NOTE: it does not seem to include active crawls! bad!! like if you lost // the crawlid... // "cr" is NULL if showing all crawls! bool showAllCrawls ( TcpSocket *s , HttpRequest *hr ) { int32_t tokenLen = 0; char *token = hr->getString("token",&tokenLen); // token MUST be there because this function's caller checked for it if ( ! token ) { char *xx=NULL;*xx=0; } // store the crawl stats as html into "sb" SafeBuf sb; // scan the collection recs for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { // get it CollectionRec *cr = g_collectiondb.m_recs[i]; // skip if empty if ( ! cr ) continue; // get name char *coll = cr->m_coll; //int32_t collLen = cr->m_collLen; // skip if first 16 or whatever characters does not match // the user token because the name of a collection is // - if ( coll[0] != token[0] ) continue; if ( coll[1] != token[1] ) continue; if ( coll[2] != token[2] ) continue; // scan the rest bool match = true; for ( int32_t i = 3 ; coll[i] && token[i] ; i++ ) { // the name of a collection is - // so if we hit the hyphen we are done if ( coll[i] == '-' ) break; if ( coll[i] != token[i] ) { match = false; break; } } if ( ! match ) continue; // we got a match, print them out printCrawlStats ( &sb , cr ); } // and send back now return g_httpServer.sendDynamicPage (s, sb.getBufStart(), sb.length(), -1);// cachetime } */ /* char *getTokenFromHttpRequest ( HttpRequest *hr ) { // provided directly? char *token = hr->getString("token",NULL,NULL); if ( token ) return token; // extract token from coll? char *c = hr->getString("c",NULL,NULL); // try new "id" approach if ( ! c ) c = hr->getString("id",NULL,NULL); if ( ! c ) return NULL; CollectionRec *cr = g_collectiondb.getRec(c); if ( ! cr ) return NULL; if ( cr->m_diffbotToken.length() <= 0 ) return NULL; token = cr->m_diffbotToken.getBufStart(); return token; } CollectionRec *getCollRecFromHttpRequest ( HttpRequest *hr ) { // if we have the collection name explicitly, get the coll rec then char *c = hr->getString("c",NULL,NULL); // try new "id" approach if ( ! c ) c = hr->getString("id",NULL,NULL); if ( c ) return g_collectiondb.getRec ( c ); // no matches return NULL; } */ /* // doesn't have to be fast, so just do a scan CollectionRec *getCollRecFromCrawlId ( char *crawlId ) { int32_t idLen = gbstrlen(crawlId); // scan collection names for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { // get it CollectionRec *cr = g_collectiondb.m_recs[i]; // skip if empty if ( ! cr ) continue; // get name char *coll = cr->m_coll; int32_t collLen = cr->m_collLen; if ( collLen < 16 ) continue; // skip if first 16 or whatever characters does not match // the user token because the name of a collection is // - if ( coll[collLen-1] != crawlId[idLen-1] ) continue; if ( coll[collLen-2] != crawlId[idLen-2] ) continue; if ( coll[collLen-3] != crawlId[idLen-3] ) continue; if ( ! 
strstr ( coll , crawlId ) ) continue; return cr; } return NULL; } void printCrawlStatsWrapper ( void *state ) { StateXX *sxx = (StateXX *)state; // get collection rec CollectionRec *cr = g_collectiondb.getRec(sxx->m_collnum); // print out the crawl SafeBuf sb; printCrawlStats ( &sb , cr ); // save before nuking state TcpSocket *sock = sxx->m_socket; // nuke the state mdelete ( sxx , sizeof(StateXX) , "stxx" ); delete sxx; // and send back now g_httpServer.sendDynamicPage ( sock , sb.getBufStart(), sb.length(), -1 ); // cachetime } void printCrawlStats ( SafeBuf *sb , CollectionRec *cr ) { // if we are the first, print a '[' to start a json thingy if ( sb->length() == 0 ) sb->pushChar('['); // otherwise, remove the previous ']' since we are not the last else { char *p = sb->getBufStart(); int32_t plen = sb->length(); if ( p[plen-1]=='[' ) sb->incrementLength(-1); } sb->safePrintf( "{" "\"id\":\"" ); // get the token from coll name char *token = cr->m_coll; // and the length, up to the hyphen that separates it from crawl id int32_t tokenLen = 0; for ( ; token[tokenLen] && token[tokenLen] != '-' ; tokenLen++ ); // now crawl id char *crawlId = token + tokenLen; // skip hyphen if ( crawlId[0] == '-' ) crawlId++; // print crawl id out sb->safeStrcpy ( crawlId ); // end its quote sb->safeStrcpy ( "\","); // now the time the crawl finished. if ( cr->m_spideringEnabled ) sb->safePrintf("\"finish\":null,"); else sb->safePrintf("\"finish\":%"INT64",",cr->m_diffbotCrawlEndTime); // how many urls we handoff to diffbot api. that implies successful // download and that it matches the url crawl pattern and // url process pattern and content regular expression pattern. // // NOTE: pageProcessAttempts can be higher than m_pageDownloadAttempts // when we call getMetaList() on an *old* (in titledb) xmldoc, // where we just get the cached content from titledb to avoid a // download, but we still call getDiffbotReply(). perhaps reconstruct // the diffbot reply from XmlDoc::m_diffbotJSONCount // // "processed" here corresponds to the "maxProcessed" cgi parm // specified when instantiating the crawl parms for the first time. // // likewise "crawled" corresponds to "maxCrawled" // sb->safePrintf("\"processedAttempts\":%"INT64",", cr->m_globalCrawlInfo.m_pageProcessAttempts); sb->safePrintf("\"processed\":%"INT64",", cr->m_globalCrawlInfo.m_pageProcessSuccesses); sb->safePrintf("\"crawlAttempts\":%"INT64",", cr->m_globalCrawlInfo.m_pageDownloadAttempts); sb->safePrintf("\"crawled\":%"INT64",", cr->m_globalCrawlInfo.m_pageDownloadSuccesses); sb->safePrintf("\"urlsConsidered\":%"INT64",", cr->m_globalCrawlInfo.m_urlsConsidered); // how many spiders outstanding for this coll right now? SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum); int32_t spidersOut = sc->getTotalOutstandingSpiders(); // . status of the crawl: "Stopped" or "Active"? // . 
TODO: check with dan to see if Active is correct and // ShuttingDown is allowable if ( cr->m_spideringEnabled ) sb->safePrintf("\"status\":\"Active\","); else if ( spidersOut ) sb->safePrintf("\"status\":\"ShuttingDown\","); else sb->safePrintf("\"status\":\"Stopped\","); // spider crawl start time sb->safePrintf("\"start\":%"INT64",",cr->m_diffbotCrawlStartTime); // the token sb->safePrintf("\"token\":\""); sb->safeMemcpy(token,tokenLen); sb->safePrintf("\","); // // BEGIN parameter map // // the token again sb->safePrintf("{"); sb->safePrintf("\"token\":\""); sb->safeMemcpy(token,tokenLen); sb->safePrintf("\","); // the seed url sb->safePrintf("\"seed\":\"%s\",",cr->m_diffbotSeed.getBufStart()); // the api sb->safePrintf("\"api\":\"%s\",",cr->m_diffbotApi.getBufStart()); sb->safePrintf("},"); // // END parameter map // // crawl count. counts non-errors. successful downloads. //sb->safePrintf("\"crawled\":%"INT64"", // cr->m_globalCrawlInfo.m_pageCrawlAttempts); sb->safePrintf("}"); // assume we are the last json object in the array sb->pushChar(']'); } */ //////////////// // // **** THE CRAWLBOT CONTROL PANEL ***** // // . Based on http://diffbot.com/dev/crawl/ page. // . got to /dev/crawl to see this! // //////////////// /* // generate a random collection name char *getNewCollName ( ) { // char *token , int32_t tokenLen ) { // let's create a new crawl id. dan was making it 32 characters // with 4 hyphens in it for a total of 36 bytes, but since // MAX_COLL_LEN, the maximum length of a collection name, is just // 64 bytes, and the token is already 32, let's limit to 16 bytes // for the crawlerid. so if we print that out in hex, 16 hex chars // 0xffffffff 0xffffffff is 64 bits. so let's make a random 64-bit // value here. uint32_t r1 = rand(); uint32_t r2 = rand(); uint64_t crawlId64 = (uint64_t) r1; crawlId64 <<= 32; crawlId64 |= r2; static char s_collBuf[MAX_COLL_LEN+1]; //int32_t tokenLen = gbstrlen(token); // include a +5 for "-test" // include 16 for crawlid (16 char hex #) //if ( tokenLen + 16 + 5>= MAX_COLL_LEN ) { char *xx=NULL;*xx=0;} // ensure the crawlid is the full 16 characters long so we // can quickly extricate the crawlid from the collection name //gbmemcpy ( s_collBuf, token, tokenLen ); //sprintf(s_collBuf + tokenLen ,"-%016"XINT64"",crawlId64); sprintf(s_collBuf ,"%016"XINT64"",crawlId64); return s_collBuf; } */ ////////////////////////////////////////// // // MAIN API STUFF I GUESS // ////////////////////////////////////////// bool sendReply2 (TcpSocket *socket , int32_t fmt , char *msg ) { // log it log("crawlbot: %s",msg); char *ct = "text/html"; // send this back to browser SafeBuf sb; if ( fmt == FORMAT_JSON ) { sb.safePrintf("{\n\"response\":\"success\",\n" "\"message\":\"%s\"\n}\n" , msg ); ct = "application/json"; } else sb.safePrintf("" "success: %s" "" , msg ); //return g_httpServer.sendErrorReply(socket,500,sb.getBufStart()); return g_httpServer.sendDynamicPage (socket, sb.getBufStart(), sb.length(), 0, // cachetime false, // POST reply? 
ct); } bool sendErrorReply2 ( TcpSocket *socket , int32_t fmt , char *msg ) { // log it log("crawlbot: sending back 500 http status '%s'",msg); char *ct = "text/html"; // send this back to browser SafeBuf sb; if ( fmt == FORMAT_JSON ) { sb.safePrintf("{\"error\":\"%s\"}\n" , msg ); ct = "application/json"; } else sb.safePrintf("" "failed: %s" "" , msg ); // log it //log("crawlbot: %s",msg ); //return g_httpServer.sendErrorReply(socket,500,sb.getBufStart()); return g_httpServer.sendDynamicPage (socket, sb.getBufStart(), sb.length(), 0, // cachetime false, // POST reply? ct , 500 ); // error! not 200... } bool printCrawlBotPage2 ( class TcpSocket *s , class HttpRequest *hr , char fmt, class SafeBuf *injectionResponse , class SafeBuf *urlUploadResponse , collnum_t collnum ) ; void addedUrlsToSpiderdbWrapper ( void *state ) { StateCD *st = (StateCD *)state; SafeBuf rr; rr.safePrintf("Successfully added urls for spidering."); printCrawlBotPage2 ( st->m_socket, &st->m_hr , st->m_fmt, NULL , &rr , st->m_collnum ); mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; //log("mdel2: st=%"XINT32"",(int32_t)st); } /* void injectedUrlWrapper ( void *state ) { StateCD *st = (StateCD *)state; Msg7 *msg7 = &st->m_msg7; // the doc we injected... XmlDoc *xd = &msg7->m_xd; // make a status msg for the url SafeBuf sb; SafeBuf js; // for json reply if ( xd->m_indexCode == 0 ) { sb.safePrintf("" "Successfully added "); js.safePrintf("Seed Successful. "); } else if ( xd->m_indexCode == EDOCFILTERED ) { sb.safePrintf("" "Error: %s by matching " "url filter #%"INT32" " "when adding " , mstrerror(xd->m_indexCode) // divide by 2 because we add a // "manualadd &&" rule with every url filter // that the client adds , (xd->m_urlFilterNum - 2) / 2 ); js.safePrintf("Seed URL filtered by URL filter #%"INT32"" , (xd->m_urlFilterNum - 2) / 2 ); } else { sb.safePrintf("" "Error: %s when adding " , mstrerror(xd->m_indexCode) ); js.safePrintf("Error adding seed url: %s" , mstrerror(xd->m_indexCode) ); } sb.safeTruncateEllipsis(xd->m_firstUrl.getUrl(),60); if ( xd->m_indexCode == 0 ) { if ( xd->m_numOutlinksAddedValid ) { sb.safePrintf("   (added %"INT32" outlinks)" ,(int32_t)xd->m_numOutlinksAdded); js.safePrintf("Added %"INT32" outlinks from same domain. " "%"INT32" outlinks were filtered." ,(int32_t)xd->m_numOutlinksAddedFromSameDomain ,(int32_t)xd->m_numOutlinksFiltered ); } else { sb.safePrintf("   (added 0 outlinks)"); js.safePrintf("Added 0 outlinks from same domain. " "0 links were filtered." ); } } sb.safePrintf(""); sb.nullTerm(); js.nullTerm(); // send back the html or json response? SafeBuf *response = &sb; if ( st->m_fmt == FORMAT_JSON ) response = &js; // . this will call g_httpServer.sendReply() // . pass it in the injection response, "sb" printCrawlBotPage2 ( st->m_socket, &st->m_hr , st->m_fmt, response, NULL , st->m_collnum ); mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; } */ class HelpItem { public: char *m_parm; char *m_desc; }; static class HelpItem s_his[] = { {"format","Use &format=html to show HTML output. Default is JSON."}, {"token","Required for all operations below."}, {"name","Name of the crawl. If missing will just show " "all crawls owned by the given token."}, {"delete=1","Deletes the crawl."}, {"reset=1","Resets the crawl. Removes all seeds."}, {"restart=1","Restarts the crawl. Keeps the seeds."}, {"pause", "Specify 1 or 0 to pause or resume the crawl respectively."}, {"repeat","Specify number of days as floating point to " "recrawl the pages. 
Set to 0.0 to NOT repeat the crawl."}, {"crawlDelay","Wait this many seconds between crawling urls from the " "same IP address. Can be a floating point number."}, //{"deleteCrawl","Same as delete."}, //{"resetCrawl","Same as delete."}, //{"pauseCrawl","Same as pause."}, //{"repeatCrawl","Same as repeat."}, {"seeds","Whitespace separated list of URLs used to seed the crawl. " "Will only follow outlinks on the same domain of seed URLs." }, {"spots", "Whitespace separated list of URLs to add to the crawl. " "Outlinks will not be followed." }, {"urls", "Same as spots."}, //{"spiderLinks","Use 1 or 0 to spider the links or NOT spider " // "the links, respectively, from " // "the provided seed or addUrls parameters. " // "The default is 1."}, {"maxToCrawl", "Specify max pages to successfully download."}, //{"maxToDownload", "Specify max pages to successfully download."}, {"maxToProcess", "Specify max pages to successfully process through " "diffbot."}, {"maxRounds", "Specify maximum number of crawl rounds. Use " "-1 to indicate no max."}, {"onlyProcessIfNew", "Specify 1 to avoid re-processing pages " "that have already been processed once before."}, {"notifyEmail","Send email alert to this email when crawl hits " "the maxtocrawl or maxtoprocess limit, or when the crawl " "completes."}, {"notifyWebhook","Fetch this URL when crawl hits " "the maxtocrawl or maxtoprocess limit, or when the crawl " "completes."}, {"obeyRobots","Obey robots.txt files?"}, //{"restrictDomain","Restrict downloaded urls to domains of seeds?"}, {"urlCrawlPattern","List of || separated strings. If the url " "contains any of these then we crawl the url, otherwise, we do not. " "An empty pattern matches all urls."}, {"urlProcessPattern","List of || separated strings. If the url " "contains any of these then we send url to diffbot for processing. " "An empty pattern matches all urls."}, {"pageProcessPattern","List of || separated strings. If the page " "contains any of these then we send it to diffbot for processing. " "An empty pattern matches all pages."}, {"urlCrawlRegEx","Regular expression that the url must match " "in order to be crawled. If present then the urlCrawlPattern will " "be ignored. " "An empty regular expression matches all urls."}, {"urlProcessRegEx","Regular expression that the url must match " "in order to be processed. " "If present then the urlProcessPattern will " "be ignored. " "An empty regular expression matches all urls."}, {"apiUrl","Diffbot api url to use. We automatically append " "token and url to it."}, //{"expression","A pattern to match in a URL. List up to 100 " // "expression/action pairs in the HTTP request. " // "Example expressions:"}, //{"action","Take the appropriate action when preceeding pattern is " // "matched. Specify multiple expression/action pairs to build a " // "table of filters. Each URL being spidered will take the given " // "action of the first expression it matches. Example actions:"}, {NULL,NULL} }; /* // get the input string from the httprequest or the json post char *getInputString ( char *string , HttpRequest *hr , Json *JS ) { // try to get it from http request char *val = hr->getString(string); // if token in json post, use that if ( ! 
val ) {
		JsonItem *ji = JS.getItem(string);
		if ( ji ) val = ji->getValue();
	}
	return val;
}
*/

void collOpDoneWrapper ( void *state ) {
	StateCD *st = (StateCD *)state;
	TcpSocket *socket = st->m_socket;
	log("crawlbot: done with blocked op.");
	mdelete ( st , sizeof(StateCD) , "stcd" );
	delete st;
	//log("mdel3: st=%"XINT32"",(int32_t)st);
	g_httpServer.sendDynamicPage (socket,"OK",2);
}

// . when we receive the request from john we call broadcastRequest() from
//   Pages.cpp. then msg28 sends this reply with a &cast=0 appended to it
//   to every host in the network. then when msg28 gets back replies from all
//   those hosts it calls sendPageCrawlbot() here but without a &cast=0
// . so if no &cast is present we are the original!!!
bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {

	// print help
	int32_t help = hr->getLong("help",0);
	if ( help ) {
		SafeBuf sb;
		// print each HelpItem in s_his[] as a row in an html table
		sb.safePrintf("<html>"
			      "<title>Crawlbot API</title>"
			      "<h1>Crawlbot API</h1>"
			      "<br>"
			      "Use the parameters below on the "
			      "/crawlbot page."
			      "<br>"
			      "<br>"
			      "<table border=1>");
		for ( int32_t i = 0 ; i < 1000 ; i++ ) {
			HelpItem *h = &s_his[i];
			if ( ! h->m_parm ) break;
			sb.safePrintf("<tr>"
				      "<td>%s</td>"
				      "<td>%s</td>"
				      "</tr>"
				      , h->m_parm
				      , h->m_desc );
		}
		sb.safePrintf("</table>"
			      "</body>"
			      "</html>");
		return g_httpServer.sendDynamicPage (socket,
						     sb.getBufStart(),
						     sb.length(),
						     0); // cachetime
	}

	// . Pages.cpp by default broadcasts all PageCrawlbot /crawlbot
	//   requests to every host in the network unless a cast=0 is
	//   explicitly given
	// . Msg28::massConfig() puts a &cast=0 on the secondary requests
	//   sent to each host in the network
	//int32_t cast = hr->getLong("cast",1);

	// httpserver/httprequest should not try to decode post if
	// it's application/json.
	//char *json = hr->getPOST();
	//Json JS;
	//if ( json ) JS.parseJsonStringIntoJsonItems ( json );

	// . now show stats for the current crawl
	// . put in xml or json if format=xml or format=json or
	//   xml=1 or json=1 ...
	char fmt = FORMAT_JSON;

	// token is always required. get from json or html form input
	//char *token = getInputString ( "token" );
	char *token = hr->getString("token");
	char *name  = hr->getString("name");

	// . try getting token-name from ?c=
	// . the name of the collection is encoded as token-name
	char *c = hr->getString("c");
	char tmp[MAX_COLL_LEN+100];
	if ( ! token && c ) {
		strncpy ( tmp , c , MAX_COLL_LEN );
		// strncpy does not guarantee null termination
		tmp[MAX_COLL_LEN] = '\0';
		token = tmp;
		name = strstr(tmp,"-");
		if ( name ) {
			*name = '\0';
			name++;
		}
		// change default formatting to html
		fmt = FORMAT_HTML;
	}

	if ( token ) {
		for ( int32_t i = 0 ; i < gbstrlen(token) ; i++ )
			token[i] = tolower(token[i]);
	}

	char *fs = hr->getString("format",NULL,NULL);
	// give john a json api
	if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
	if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
	if ( fs && strcmp(fs,"xml" ) == 0 ) fmt = FORMAT_XML;

	// if we got json as input, give it as output
	//if ( JS.getFirstItem() ) fmt = FORMAT_JSON;

	if ( ! token && fmt == FORMAT_JSON ) { // (cast==0|| fmt == FORMAT_JSON ) ) {
		char *msg = "invalid token";
		return sendErrorReply2 (socket,fmt,msg);
	}

	if ( ! token ) {
		// print token form if html
		SafeBuf sb;
		sb.safePrintf("In order to use crawlbot you must "
			      "first LOGIN:"
			      "<form action=/crawlbot method=get>"
			      "<br>"
			      "<input type=text name=token size=50>"
			      "<input type=submit name=submit value=OK>"
			      "</form>"
			      "<br>"
			      "- OR -"
			      "<br>"
			      "SIGN UP"
			      "<form action=/crawlbot method=get>"
			      "Name: <input type=text name=name size=50>"
			      "<br>"
			      "Email: <input type=text name=email size=50>"
			      "<br>"
			      "<input type=submit name=submit value=OK>"
			      "</form></body></html>
" "" ""); return g_httpServer.sendDynamicPage (socket, sb.getBufStart(), sb.length(), 0); // cachetime } if ( gbstrlen(token) > 32 ) { //log("crawlbot: token is over 32 chars"); char *msg = "crawlbot: token is over 32 chars"; return sendErrorReply2 (socket,fmt,msg); } char *seeds = hr->getString("seeds"); char *spots = hr->getString("spots"); // just existence is the operation //bool delColl = hr->hasField("deleteCrawl"); //bool resetColl = hr->hasField("resetCrawl"); // /v2/bulk api support: if ( ! spots ) spots = hr->getString("urls"); if ( spots && ! spots[0] ) spots = NULL; if ( seeds && ! seeds[0] ) seeds = NULL; //if ( ! delColl ) delColl = hr->hasField("delete"); //if ( ! resetColl ) resetColl = hr->hasField("reset"); bool restartColl = hr->hasField("restart"); //if ( delColl && ! && cast == 0 ) { // log("crawlbot: no collection found to delete."); // char *msg = "Could not find crawl to delete."; // return sendErrorReply2 (socket,fmt,msg); //} // just send back a list of all the collections after the delete //if ( delColl && cast && fmt == FORMAT_JSON ) { // char *msg = "Collection deleted."; // return sendReply2 (socket,fmt,msg); //} // default name to next available collection crawl name in the // case of a delete operation... char *msg = NULL; if ( hr->hasField("delete") ) msg = "deleted"; // need to re-add urls for a restart //if ( hr->hasField("restart") ) msg = "restarted"; if ( hr->hasField("reset") ) msg = "reset"; if ( msg ) { // delColl && cast ) { // this was deleted... so is invalid now name = NULL; // no longer a delete function, we need to set "name" below //delColl = false;//NULL; // john wants just a brief success reply SafeBuf tmp; tmp.safePrintf("{\"response\":\"Successfully %s job.\"}", msg); char *reply = tmp.getBufStart(); if ( ! reply ) { if ( ! g_errno ) g_errno = ENOMEM; return sendErrorReply2(socket,fmt,mstrerror(g_errno)); } return g_httpServer.sendDynamicPage( socket, reply, gbstrlen(reply), 0, // cacheTime false, // POSTReply? "application/json" ); } // if name is missing default to name of first existing // collection for this token. for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { // cast if ( name ) break; // do not do this if doing an // injection (seed) or add url or del coll or reset coll !! if ( seeds ) break; if ( spots ) break; //if ( delColl ) break; //if ( resetColl ) break; if ( restartColl ) break; CollectionRec *cx = g_collectiondb.m_recs[i]; // deleted collections leave a NULL slot if ( ! cx ) continue; // skip if token does not match if ( strcmp ( cx->m_diffbotToken.getBufStart(),token) ) continue; // got it name = cx->m_diffbotCrawlName.getBufStart(); break; } if ( ! 
name ) { // if the token is valid char *ct = "application/json"; char *msg = "{}\n"; return g_httpServer.sendDynamicPage ( socket, msg, gbstrlen(msg) , -1 , // cachetime false , ct , 200 ); // http status //log("crawlbot: no crawl name given"); //char *msg = "invalid or missing name"; //return sendErrorReply2 (socket,fmt,msg); } if ( gbstrlen(name) > 30 ) { //log("crawlbot: name is over 30 chars"); char *msg = "crawlbot: name is over 30 chars"; return sendErrorReply2 (socket,fmt,msg); } // make the collection name so it includes the token and crawl name char collName[MAX_COLL_LEN+1]; // sanity if ( MAX_COLL_LEN < 64 ) { char *xx=NULL;*xx=0; } // make a compound name for collection of token and name sprintf(collName,"%s-%s",token,name); // if they did not specify the token/name of an existing collection // then cr will be NULL and we'll add it below CollectionRec *cr = g_collectiondb.getRec(collName); // i guess bail if not there? if ( ! cr ) { log("crawlbot: missing coll rec for coll %s",collName); //char *msg = "invalid or missing collection rec"; char *msg = "Could not create job because missing seeds or " "urls."; return sendErrorReply2 (socket,fmt,msg); } // if no token... they need to login or signup //char *token = getTokenFromHttpRequest ( hr ); // get coll name if any //char *c = hr->getString("c"); //if ( ! c ) c = hr->getString("id"); // get some other parms provided optionally //char *addColl = hr->getString("addcoll"); // try json //if ( JS.getInputString("addNewCrawl") ) addColl = collName; //if ( JS.getInputString("deleteCrawl") ) delColl = true; //if ( JS.getInputString("resetCrawl") ) resetColl = true; //if ( resetColl && ! cr ) { // //log("crawlbot: no collection found to reset."); // char *msg = "Could not find crawl to reset."; // return sendErrorReply2 (socket,fmt,msg); //} //if ( restartColl && ! cr ) { // char *msg = "Could not find crawl to restart."; // return sendErrorReply2 (socket,fmt,msg); //} // make a new state StateCD *st; try { st = new (StateCD); } catch ( ... ) { return sendErrorReply2 ( socket , fmt , mstrerror(g_errno)); } mnew ( st , sizeof(StateCD), "statecd"); // debug //log("mnew2: st=%"XINT32"",(int32_t)st); // copy crap st->m_hr.copy ( hr ); st->m_socket = socket; st->m_fmt = fmt; if ( cr ) st->m_collnum = cr->m_collnum; else st->m_collnum = -1; // save seeds if ( cr && restartColl ) { // && cast ) { // bail on OOM saving seeds if ( ! st->m_seedBank.safeMemcpy ( &cr->m_diffbotSeeds ) || ! st->m_seedBank.pushChar('\0') ) { mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; return sendErrorReply2(socket,fmt,mstrerror(g_errno)); } } // // if we can't compile the provided regexes, return error // if ( cr ) { char *rx1 = hr->getString("urlCrawlRegEx",NULL); if ( rx1 && ! rx1[0] ) rx1 = NULL; char *rx2 = hr->getString("urlProcessRegEx",NULL); if ( rx2 && ! 
rx2[0] ) rx2 = NULL; // this will store the compiled regular expression into ucr regex_t re1; regex_t re2; int32_t status1 = 0; int32_t status2 = 0; if ( rx1 ) status1 = regcomp ( &re1 , rx1 , REG_EXTENDED|REG_ICASE| REG_NEWLINE|REG_NOSUB); if ( rx2 ) status2 = regcomp ( &re2 , rx2 , REG_EXTENDED|REG_ICASE| REG_NEWLINE|REG_NOSUB); if ( rx1 ) regfree ( &re1 ); if ( rx2 ) regfree ( &re2 ); SafeBuf em; if ( status1 ) { log("xmldoc: regcomp %s failed.",rx1); em.safePrintf("Invalid regular expression: %s",rx1); } else if ( status2 ) { log("xmldoc: regcomp %s failed.",rx2); em.safePrintf("Invalid regular expression: %s",rx2); } if ( status1 || status2 ) { mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; char *msg = em.getBufStart(); return sendErrorReply2(socket,fmt,msg); } } // . if this is a cast=0 request it is received by all hosts in the // network // . this code is the only code run by EVERY host in the network // . the other code is just run once by the receiving host // . so we gotta create a coll rec on each host etc. // . no need to update collectionrec parms here since Pages.cpp calls // g_parms.setFromRequest() for us before calling this function, // pg->m_function(). even though maxtocrawl is on "PAGE_NONE" // hopefully it will still be set // . but we should take care of add/del/reset coll here. // . i guess this will be handled by the new parm syncing logic // which deals with add/del coll requests /* if ( cast == 0 ) { // add a new collection by default if ( ! cr && name && name[0] ) cr = addNewDiffbotColl ( collName , token , name, hr ); // also support the good 'ole html form interface if ( cr ) setSpiderParmsFromHtmlRequest ( socket , hr , cr ); // . we can't sync these operations on a dead host when it // comes back up yet. we can only sync parms, not collection // adds/deletes/resets // . TODO: make new collections just a list of rdb records, // then they can leverage the msg4 and addsinprogress.dat // functionality we have for getting dead hosts back up to // sync. Call it Colldb. // . PROBLEM: when just starting up seems like hasDeadHost() // is returning true because it has not yet received its // first ping reply //if ( addColl || delColl || resetColl ) { // // if any host in network is dead, do not do this // if ( g_hostdb.hasDeadHost() ) { // char *msg = "A host in the network is dead."; // // log it // log("crawlbot: %s",msg); // // make sure this returns in json if required // return sendErrorReply2(socket,fmt,msg); // } //} // problem? if ( ! cr ) { // send back error char *msg = "Collection add failed"; if ( delColl ) msg = "No such collection"; if ( resetColl ) msg = "No such collection"; if ( restartColl ) msg = "No such collection"; // nuke it mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; // log it log("crawlbot: cr is null. %s",msg); // make sure this returns in json if required return sendErrorReply2(socket,fmt,msg); } // set this up WaitEntry *we = &st->m_waitEntry; we->m_state = st; we->m_callback = collOpDoneWrapper; // this won't work, collname is on the stack! //we->m_coll = collName; we->m_coll = cr->m_coll; if ( delColl ) { // note it log("crawlbot: deleting coll"); // delete collection name // this can block if tree is saving, it has to wait // for tree save to complete before removing old // collnum recs from tree if ( ! 
g_collectiondb.deleteRec ( collName , we ) ) return false; // nuke it mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; // all done return g_httpServer.sendDynamicPage (socket,"OK",2); } if ( resetColl || restartColl ) { // note it log("crawlbot: resetting/restarting coll"); //cr = g_collectiondb.getRec ( resetColl ); // this can block if tree is saving, it has to wait // for tree save to complete before removing old // collnum recs from tree bool purgeSeeds = true; if ( restartColl ) purgeSeeds = false; if ( ! g_collectiondb.resetColl ( collName , we , purgeSeeds ) ) return false; // it is a NEW ptr now! cr = g_collectiondb.getRec( collName ); // if reset from crawlbot api page then enable spiders // to avoid user confusion if ( cr ) cr->m_spideringEnabled = 1; // nuke it mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; // all done return g_httpServer.sendDynamicPage (socket,"OK",2); } // nuke it mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; // this will set the the collection parms from json //setSpiderParmsFromJSONPost ( socket , hr , cr , &JS ); // this is a cast, so just return simple response return g_httpServer.sendDynamicPage (socket,"OK",2); } */ ///////// // // after all hosts have replied to the request, we finally send the // request here, with no &cast=0 appended to it. so there is where we // send the final reply back to the browser // ///////// /* // in case collection was just added above... try this!! cr = g_collectiondb.getRec(collName); // collectionrec must be non-null at this point. i.e. we added it if ( ! cr ) { char *msg = "Crawl name was not found."; if ( name && name[0] ) msg = "Failed to add crawl. Crawl name is illegal."; // nuke it mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; //log("crawlbot: no collection found. need to add a crawl"); return sendErrorReply2(socket,fmt, msg); } //char *spots = hr->getString("spots",NULL,NULL); //char *seeds = hr->getString("seeds",NULL,NULL); */ // check seed bank now too for restarting a crawl if ( st->m_seedBank.length() && ! seeds ) seeds = st->m_seedBank.getBufStart(); char *coll = "NONE"; if ( cr ) coll = cr->m_coll; if ( seeds ) log("crawlbot: adding seeds=\"%s\" coll=%s (%"INT32")", seeds,coll,(int32_t)st->m_collnum); char bulkurlsfile[1024]; // when a collection is restarted the collnum changes to avoid // adding any records destined for that collnum that might be on // the wire. so just put these in the root dir snprintf(bulkurlsfile, 1024, "%sbulkurls-%s.txt", g_hostdb.m_dir , coll );//, (int32_t)st->m_collnum ); if ( spots && cr && cr->m_isCustomCrawl == 2 ) { int32_t spotsLen = (int32_t)gbstrlen(spots); log("crawlbot: got spots (len=%"INT32") to add coll=%s (%"INT32")", spotsLen,coll,(int32_t)st->m_collnum); FILE *f = fopen(bulkurlsfile, "w"); if (f != NULL) { // urls are space separated. // as of 5/14/2014, it appears that spots is space-separated for some URLs (the first two) // and newline-separated for the remainder. Make a copy that's space separated so that restarting bulk jobs works. 
// Alternatives: // 1) just write one character to disk at a time, replacing newlines with spaces // 2) just output what you have, and then when you read in, replace newlines with spaces // 3) probably the best option: change newlines to spaces earlier in the pipeline char *spotsCopy = (char*) mmalloc(spotsLen+1, "create a temporary copy of spots that we're about to delete"); for (int i = 0; i < spotsLen; i++) { char c = spots[i]; if (c == '\n') c = ' '; spotsCopy[i] = c; } spotsCopy[spotsLen] = '\0'; fprintf(f, "%s", spotsCopy); fclose(f); mfree(spotsCopy, spotsLen+1, "no longer need copy"); } } // if restart flag is on and the file with bulk urls exists, // get spots from there SafeBuf bb; if ( !spots && restartColl && cr && cr->m_isCustomCrawl == 2 ) { bb.load(bulkurlsfile); bb.nullTerm(); spots = bb.getBufStart(); log("crawlbot: restarting bulk job file=%s bufsize=%"INT32" for %s", bulkurlsfile,bb.length(), cr->m_coll); } /* FILE *f = fopen(bulkurlsfile, "r"); if (f != NULL) { fseek(f, 0, SEEK_END); int32_t size = ftell(f); fseek(f, 0, SEEK_SET); char *bulkurls = (char*) mmalloc(size, "reading in bulk urls"); if ( ! bulkurls ) { mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; return sendErrorReply2(socket,fmt,mstrerror(g_errno)); } fgets(bulkurls, size, f); spots = bulkurls; fclose(f); } } */ /////// // // handle file of urls upload. can be HUGE! // /////// if ( spots || seeds ) { // error if ( g_repair.isRepairActive() && g_repair.m_collnum == st->m_collnum ) { log("crawlbot: repair active. can't add seeds " "or spots while repairing collection."); g_errno = EREPAIRING; return sendErrorReply2(socket,fmt,mstrerror(g_errno)); } // . avoid spidering links for these urls? i would say // . default is to NOT spider the links... // . support camel case and all lower case //int32_t spiderLinks = hr->getLong("spiderLinks",1); //spiderLinks = hr->getLong("spiderlinks",spiderLinks); //bool spiderLinks = false; // make a list of spider requests from these urls //SafeBuf listBuf; // this returns NULL with g_errno set bool status = true; if ( ! getSpiderRequestMetaList ( seeds, &st->m_listBuf , true , // spiderLinks? cr ) ) status = false; // do not spider links for spots if ( ! getSpiderRequestMetaList ( spots, &st->m_listBuf , false , // spiderLinks? NULL ) ) status = false; // empty? int32_t size = st->m_listBuf.length(); // error? if ( ! status ) { // nuke it mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; return sendErrorReply2(socket,fmt,mstrerror(g_errno)); } // if not list if ( ! size ) { // nuke it mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; return sendErrorReply2(socket,fmt,"no urls found"); } // add to spiderdb if ( ! st->m_msg4.addMetaList( st->m_listBuf.getBufStart() , st->m_listBuf.length(), cr->m_coll, st , addedUrlsToSpiderdbWrapper, 0 // niceness ) ) // blocked! return false; // did not block, print page! addedUrlsToSpiderdbWrapper(st); return true; } ///////// // // handle direct injection of a url. looks at "spiderlinks=1" parm // and all the other parms in Msg7::inject() in PageInject.cpp. // ////////// /* if ( injectUrl ) { // a valid collection is required if ( ! cr ) return sendErrorReply2(socket,fmt, "invalid collection"); // begin the injection if ( ! 
st->m_msg7.inject ( st->m_socket, &st->m_hr, st , injectedUrlWrapper , 1 , // spiderLinks default is on collName ) ) // coll override // if blocked, return now return false; // otherwise send back reply injectedUrlWrapper ( st ); return true; } */ // we do not need the state i guess //////////// // // print the html or json page of all the data // printCrawlBotPage2 ( socket,hr,fmt,NULL,NULL,cr->m_collnum); // get rid of that state mdelete ( st , sizeof(StateCD) , "stcd" ); delete st; //log("mdel4: st=%"XINT32"",(int32_t)st); return true; } /* bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , int32_t fmt ) { if ( fmt == FORMAT_JSON ) sb.safePrintf("\"urlFilters\":["); // skip first filters that are: // 0. ismedia->ignore and // 1. !isonsamedomain->ignore // 2. lastspidertime or !isindexed // 3. errorcount rule // 4. errorcount rule int32_t istart = 5; // if respidering then we added an extra filter // lastspidertime>={roundstart} --> FILTERED //if ( cr->m_collectiveRespiderFrequency > 0.0 ) // istart++; for ( int32_t i = istart ; i < cr->m_numRegExs ; i++ ) { //sb.safePrintf char *expression = cr->m_regExs[i].getBufStart(); // do not allow nulls if ( ! expression ) expression = ""; // skip spaces if ( *expression && is_wspace_a(*expression) ) expression++; if ( strcmp(expression,"default") == 0 ) expression = "*"; char *action = cr->m_spiderDiffbotApiUrl[i].getBufStart(); // do not all nulls if ( ! action ) action = ""; // skip spaces if ( *action && is_wspace_a(*action) ) action++; // if no diffbot api url specified, do not process if ( ! *action ) action = "doNotProcess"; // if filtered from crawling, do not even spider int32_t priority = cr->m_spiderPriorities[i]; if ( priority == SPIDER_PRIORITY_FILTERED ) // -3 action = "doNotCrawl"; // we add this supplemental expressin/action for every // one the user adds in order to give manually added // urls higher spider priority, so skip it if ( strncmp(expression,"ismanualadd && ",15) == 0 ) continue; if ( fmt == FORMAT_HTML ) { sb.safePrintf("" "Expression " " " "" "Action " "" "" "\n" , expression , action ); continue; } // show it sb.safePrintf("{\"expression\":\"%s\",",expression); sb.safePrintf("\"action\":\"%s\"}",action); // more follow? 
sb.pushChar(','); sb.pushChar('\n'); } if ( fmt == FORMAT_JSON ) { // remove trailing comma sb.removeLastChar('\n'); sb.removeLastChar(','); sb.safePrintf("]\n"); } return true; } */ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) { return printCrawlDetailsInJson( sb , cx , HTTP_REQUEST_DEFAULT_REQUEST_VERSION); } bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx, int version ) { SafeBuf tmp; int32_t crawlStatus = -1; getSpiderStatusMsg ( cx , &tmp , &crawlStatus ); CrawlInfo *ci = &cx->m_localCrawlInfo; int32_t sentAlert = (int32_t)ci->m_sentCrawlDoneAlert; if ( sentAlert ) sentAlert = 1; char *crawlTypeStr = "crawl"; //char *nomen = "crawl"; if ( cx->m_isCustomCrawl == 2 ) { crawlTypeStr = "bulk"; //nomen = "job"; } // don't print completed time if spidering is going on uint32_t completed = cx->m_diffbotCrawlEndTime; // if not yet done, make this zero if ( crawlStatus == SP_INITIALIZING ) completed = 0; if ( crawlStatus == SP_NOURLS ) completed = 0; //if ( crawlStatus == SP_PAUSED ) completed = 0; //if ( crawlStatus == SP_ADMIN_PAUSED ) completed = 0; if ( crawlStatus == SP_INPROGRESS ) completed = 0; sb->safePrintf("\n\n{" "\"name\":\"%s\",\n" "\"type\":\"%s\",\n" "\"jobCreationTimeUTC\":%"INT32",\n" "\"jobCompletionTimeUTC\":%"INT32",\n" //"\"alias\":\"%s\",\n" //"\"crawlingEnabled\":%"INT32",\n" "\"jobStatus\":{" // nomen = jobStatus / crawlStatus "\"status\":%"INT32"," "\"message\":\"%s\"},\n" "\"sentJobDoneNotification\":%"INT32",\n" //"\"crawlingPaused\":%"INT32",\n" "\"objectsFound\":%"INT64",\n" "\"urlsHarvested\":%"INT64",\n" //"\"urlsExamined\":%"INT64",\n" "\"pageCrawlAttempts\":%"INT64",\n" "\"pageCrawlSuccesses\":%"INT64",\n" "\"pageCrawlSuccessesThisRound\":%"INT64",\n" "\"pageProcessAttempts\":%"INT64",\n" "\"pageProcessSuccesses\":%"INT64",\n" "\"pageProcessSuccessesThisRound\":%"INT64",\n" "\"maxRounds\":%"INT32",\n" "\"repeat\":%f,\n" "\"crawlDelay\":%f,\n" //,cx->m_coll , cx->m_diffbotCrawlName.getBufStart() , crawlTypeStr , cx->m_diffbotCrawlStartTime // this is 0 if not over yet , completed //, alias //, (int32_t)cx->m_spideringEnabled , crawlStatus , tmp.getBufStart() , sentAlert //, (int32_t)paused , cx->m_globalCrawlInfo.m_objectsAdded - cx->m_globalCrawlInfo.m_objectsDeleted , cx->m_globalCrawlInfo.m_urlsHarvested //,cx->m_globalCrawlInfo.m_urlsConsidered , cx->m_globalCrawlInfo.m_pageDownloadAttempts , cx->m_globalCrawlInfo.m_pageDownloadSuccesses , cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound , cx->m_globalCrawlInfo.m_pageProcessAttempts , cx->m_globalCrawlInfo.m_pageProcessSuccesses , cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound , (int32_t)cx->m_maxCrawlRounds , cx->m_collectiveRespiderFrequency , cx->m_collectiveCrawlDelay ); sb->safePrintf("\"obeyRobots\":%"INT32",\n" , (int32_t)cx->m_useRobotsTxt ); // if not a "bulk" injection, show crawl stats if ( cx->m_isCustomCrawl != 2 ) { sb->safePrintf( // settable parms "\"maxToCrawl\":%"INT64",\n" "\"maxToProcess\":%"INT64",\n" //"\"restrictDomain\":%"INT32",\n" "\"onlyProcessIfNew\":%"INT32",\n" , cx->m_maxToCrawl , cx->m_maxToProcess //, (int32_t)cx->m_restrictDomain , (int32_t)cx->m_diffbotOnlyProcessIfNewUrl ); sb->safePrintf("\"seeds\":\""); sb->safeUtf8ToJSON ( cx->m_diffbotSeeds.getBufStart()); sb->safePrintf("\",\n"); } sb->safePrintf("\"roundsCompleted\":%"INT32",\n", cx->m_spiderRoundNum); sb->safePrintf("\"roundStartTime\":%"UINT32",\n", cx->m_spiderRoundStartTime); sb->safePrintf("\"currentTime\":%"UINT32",\n", (uint32_t)getTimeGlobal() ); 
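	// For reference, a hypothetical (abbreviated) example of the JSON
	// object this function emits for one job. The field names come from
	// the format strings above and below; every value here is made up.
	//
	//   {
	//     "name":"sports-sites",
	//     "type":"crawl",
	//     "jobCreationTimeUTC":1400000000,
	//     "jobCompletionTimeUTC":0,
	//     "jobStatus":{"status":7,"message":"Crawl in progress."},
	//     "sentJobDoneNotification":0,
	//     "objectsFound":1234,
	//     "urlsHarvested":56789,
	//     "pageCrawlAttempts":2345,
	//     ...
	//     "downloadJson":"http://api.diffbot.com/v2/crawl/download/abc123-sports-sites_data.json",
	//     "notifyEmail":"",
	//     "notifyWebhook":""
	//   }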
sb->safePrintf("\"currentTimeUTC\":%"UINT32",\n", (uint32_t)getTimeGlobal() ); sb->safePrintf("\"apiUrl\":\""); sb->safeUtf8ToJSON ( cx->m_diffbotApiUrl.getBufStart() ); sb->safePrintf("\",\n"); sb->safePrintf("\"urlCrawlPattern\":\""); sb->safeUtf8ToJSON ( cx->m_diffbotUrlCrawlPattern.getBufStart() ); sb->safePrintf("\",\n"); sb->safePrintf("\"urlProcessPattern\":\""); sb->safeUtf8ToJSON ( cx->m_diffbotUrlProcessPattern.getBufStart() ); sb->safePrintf("\",\n"); sb->safePrintf("\"pageProcessPattern\":\""); sb->safeUtf8ToJSON ( cx->m_diffbotPageProcessPattern.getBufStart() ); sb->safePrintf("\",\n"); sb->safePrintf("\"urlCrawlRegEx\":\""); sb->safeUtf8ToJSON ( cx->m_diffbotUrlCrawlRegEx.getBufStart() ); sb->safePrintf("\",\n"); sb->safePrintf("\"urlProcessRegEx\":\""); sb->safeUtf8ToJSON ( cx->m_diffbotUrlProcessRegEx.getBufStart() ); sb->safePrintf("\",\n"); sb->safePrintf("\"maxHops\":%"INT32",\n", (int32_t)cx->m_diffbotMaxHops); char *token = cx->m_diffbotToken.getBufStart(); char *name = cx->m_diffbotCrawlName.getBufStart(); char *mt = "crawl"; if ( cx->m_isCustomCrawl == 2 ) mt = "bulk"; sb->safePrintf("\"downloadJson\":" "\"http://api.diffbot.com/v%d/%s/download/" "%s-%s_data.json\",\n" , version , mt , token , name ); sb->safePrintf("\"downloadUrls\":" "\"http://api.diffbot.com/v%d/%s/download/" "%s-%s_urls.csv\",\n" , version , mt , token , name ); sb->safePrintf("\"notifyEmail\":\""); sb->safeUtf8ToJSON ( cx->m_notifyEmail.getBufStart() ); sb->safePrintf("\",\n"); sb->safePrintf("\"notifyWebhook\":\""); sb->safeUtf8ToJSON ( cx->m_notifyUrl.getBufStart() ); sb->safePrintf("\"\n"); //sb->safePrintf("\",\n"); ///// // // show url filters table. kinda hacky!! // ///// /* g_parms.sendPageGeneric ( socket , hr , PAGE_FILTERS , NULL , &sb , cr->m_coll, // coll override true // isJSON? 
); */ //printUrlFilters ( sb , cx , FORMAT_JSON ); // end that collection rec sb->safePrintf("}\n"); return true; } bool printCrawlDetails2 (SafeBuf *sb , CollectionRec *cx , char format ) { SafeBuf tmp; int32_t crawlStatus = -1; getSpiderStatusMsg ( cx , &tmp , &crawlStatus ); CrawlInfo *ci = &cx->m_localCrawlInfo; int32_t sentAlert = (int32_t)ci->m_sentCrawlDoneAlert; if ( sentAlert ) sentAlert = 1; // don't print completed time if spidering is going on uint32_t completed = cx->m_diffbotCrawlEndTime; // time_t // if not yet done, make this zero if ( crawlStatus == SP_INITIALIZING ) completed = 0; if ( crawlStatus == SP_NOURLS ) completed = 0; //if ( crawlStatus == SP_PAUSED ) completed = 0; //if ( crawlStatus == SP_ADMIN_PAUSED ) completed = 0; if ( crawlStatus == SP_INPROGRESS ) completed = 0; if ( format == FORMAT_JSON ) { sb->safePrintf("{" "\"response:{\n" "\t\"statusCode\":%"INT32",\n" "\t\"statusMsg\":\"%s\",\n" "\t\"jobCreationTimeUTC\":%"INT32",\n" "\t\"jobCompletionTimeUTC\":%"INT32",\n" "\t\"sentJobDoneNotification\":%"INT32",\n" "\t\"urlsHarvested\":%"INT64",\n" "\t\"pageCrawlAttempts\":%"INT64",\n" "\t\"pageCrawlSuccesses\":%"INT64",\n" , crawlStatus , tmp.getBufStart() , cx->m_diffbotCrawlStartTime , completed , sentAlert , cx->m_globalCrawlInfo.m_urlsHarvested , cx->m_globalCrawlInfo.m_pageDownloadAttempts , cx->m_globalCrawlInfo.m_pageDownloadSuccesses ); sb->safePrintf("\t\"currentTime\":%"UINT32",\n", (uint32_t)getTimeGlobal() ); sb->safePrintf("\t\"currentTimeUTC\":%"UINT32",\n", (uint32_t)getTimeGlobal() ); sb->safePrintf("\t}\n"); sb->safePrintf("}\n"); } if ( format == FORMAT_XML ) { sb->safePrintf("\n" "\t%"INT32"\n" , crawlStatus ); sb->safePrintf( "\t\n" "\t%"INT32"" "\n" , (char *)tmp.getBufStart() , (int32_t)cx->m_diffbotCrawlStartTime ); sb->safePrintf( "\t%"INT32"" "\n" "\t%"INT32"" "\n" "\t%"INT64"\n" "\t%"INT64"" "\n" "\t%"INT64"" "\n" , completed , sentAlert , cx->m_globalCrawlInfo.m_urlsHarvested , cx->m_globalCrawlInfo.m_pageDownloadAttempts , cx->m_globalCrawlInfo.m_pageDownloadSuccesses ); sb->safePrintf("\t%"UINT32"\n", (uint32_t)getTimeGlobal() ); sb->safePrintf("\t%"UINT32"\n", (uint32_t)getTimeGlobal() ); sb->safePrintf("\n"); } return true; } bool printCrawlBotPage2 ( TcpSocket *socket , HttpRequest *hr , char fmt, // format SafeBuf *injectionResponse , SafeBuf *urlUploadResponse , collnum_t collnum ) { // store output into here SafeBuf sb; if ( fmt == FORMAT_HTML ) sb.safePrintf( "" "Crawlbot - " "Web Data Extraction and Search Made " "Easy" "" ); CollectionRec *cr = g_collectiondb.m_recs[collnum]; // was coll deleted while adding urls to spiderdb? if ( ! cr ) { g_errno = EBADREQUEST; char *msg = "invalid crawl. crawl was deleted."; return sendErrorReply2(socket,fmt,msg); } char *token = cr->m_diffbotToken.getBufStart(); char *name = cr->m_diffbotCrawlName.getBufStart(); // this is usefful SafeBuf hb; hb.safePrintf("" "" "" , name , token ); hb.nullTerm(); // and this SafeBuf lb; lb.safePrintf("name="); lb.urlEncode(name); lb.safePrintf ("&token="); lb.urlEncode(token); if ( fmt == FORMAT_HTML ) lb.safePrintf("&format=html"); lb.nullTerm(); // set this to current collection. if only token was provided // then it will return the first collection owned by token. // if token has no collections it will be NULL. //if ( ! cr ) // cr = getCollRecFromHttpRequest ( hr ); //if ( ! 
cr ) { // char *msg = "failed to add new collection"; // g_msg = " (error: crawlbot failed to allocate crawl)"; // return sendErrorReply2 ( socket , fmt , msg ); //} if ( fmt == FORMAT_HTML ) { sb.safePrintf("" "" "
" "" "" "Crawlbot" "
" "" "Crawl, Datamine and Index the Web" "" "
" , token ); sb.safePrintf("

"); // first print help sb.safePrintf("[ " "api help ]   " // json output "[ " "json output" " ]   " , token , name ); // random coll name to add uint32_t r1 = rand(); uint32_t r2 = rand(); uint64_t rand64 = (uint64_t) r1; rand64 <<= 32; rand64 |= r2; char newCollName[MAX_COLL_LEN+1]; snprintf(newCollName,MAX_COLL_LEN,"%s-%016"XINT64"", token , rand64 ); // first print "add new collection" sb.safePrintf("[ " "add new crawl" " ]   " "[ " "show all crawls" " ]   " , rand64 , token , newCollName , token ); } bool firstOne = true; // // print list of collections controlled by this token // for ( int32_t i = 0 ; fmt == FORMAT_HTML && im_diffbotToken.getBufStart(); if ( ! ct ) continue; // skip if token does not match if ( strcmp(ct,token) ) continue; // highlight the tab if it is what we selected bool highlight = false; if ( cx == cr ) highlight = true; char *style = ""; if ( highlight ) { style = "style=text-decoration:none; "; sb.safePrintf ( ""); } // print the crawl id. collection name minus - sb.safePrintf("" "%s (%"INT32")" "   " , cx->m_diffbotCrawlName.getBufStart() , (int32_t)cx->m_collnum ); if ( highlight ) sb.safePrintf(""); } if ( fmt == FORMAT_HTML ) sb.safePrintf ( "

" ); // the ROOT JSON [ if ( fmt == FORMAT_JSON ) sb.safePrintf("{\n"); // injection is currently not in use, so this is an artifact: if ( fmt == FORMAT_JSON && injectionResponse ) sb.safePrintf("\"response\":\"%s\",\n\n" , injectionResponse->getBufStart() ); if ( fmt == FORMAT_JSON && urlUploadResponse ) sb.safePrintf("\"response\":\"%s\",\n\n" , urlUploadResponse->getBufStart() ); ////// // // print collection summary page // ////// // the items in the array now have type:bulk or type:crawl // so call them 'jobs' if ( fmt == FORMAT_JSON ) sb.safePrintf("\"jobs\":[");//\"collections\":"); int32_t summary = hr->getLong("summary",0); // enter summary mode for json if ( fmt != FORMAT_HTML ) summary = 1; // start the table if ( summary && fmt == FORMAT_HTML ) { sb.safePrintf("" "" "" "" "" "" "" "" "" "" "" "" "" ); } char *name3 = hr->getString("name"); // scan each coll and get its stats for ( int32_t i = 0 ; summary && i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cx = g_collectiondb.m_recs[i]; if ( ! cx ) continue; // must belong to us if ( strcmp(cx->m_diffbotToken.getBufStart(),token) ) continue; // just print out single crawl info for json if ( fmt != FORMAT_HTML && cx != cr && name3 ) continue; // if json, print each collectionrec if ( fmt == FORMAT_JSON ) { if ( ! firstOne ) sb.safePrintf(",\n\t"); firstOne = false; //char *alias = ""; //if ( cx->m_collectionNameAlias.length() > 0 ) // alias=cx->m_collectionNameAlias.getBufStart(); //int32_t paused = 1; //if ( cx->m_spideringEnabled ) paused = 0; if ( cx->m_isCustomCrawl ) printCrawlDetailsInJson ( &sb , cx , getVersionFromRequest(hr) ); else printCrawlDetails2 ( &sb,cx,FORMAT_JSON ); // print the next one out continue; } // print in table sb.safePrintf("" "" "" "" //"" "" "" "" "" "" "" "" , cx->m_coll , cx->m_globalCrawlInfo.m_objectsAdded - cx->m_globalCrawlInfo.m_objectsDeleted , cx->m_globalCrawlInfo.m_urlsHarvested //, cx->m_globalCrawlInfo.m_urlsConsidered , cx->m_globalCrawlInfo.m_pageDownloadAttempts , cx->m_globalCrawlInfo.m_pageDownloadSuccesses , cx->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound , cx->m_globalCrawlInfo.m_pageProcessAttempts , cx->m_globalCrawlInfo.m_pageProcessSuccesses , cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound ); } if ( summary && fmt == FORMAT_HTML ) { sb.safePrintf("
CollectionObjects FoundURLs HarvestedURLs ExaminedPage Download AttemptsPage Download SuccessesPage Download Successes This Round" "Page Process AttemptsPage Process SuccessesPage Process Successes This Round" "
%s%"INT64"%"INT64"%"INT64"%"INT64"%"INT64"%"INT64"%"INT64"%"INT64"%"INT64"
" ); return g_httpServer.sendDynamicPage (socket, sb.getBufStart(), sb.length(), 0); // cachetime } if ( fmt == FORMAT_JSON ) // end the array of collection objects sb.safePrintf("\n]\n"); /////// // // end print collection summary page // /////// // // show urls being crawled (ajax) (from Spider.cpp) // if ( fmt == FORMAT_HTML ) { sb.safePrintf ( "" //"bgcolor=#%s>\n" "\n" ); // the table headers so SpiderRequest::printToTable() works if ( ! SpiderRequest::printTableHeaderSimple(&sb,true) ) return false; // int16_tcut XmlDoc **docs = g_spiderLoop.m_docs; // row count int32_t j = 0; // first print the spider recs we are spidering for ( int32_t i = 0 ; i < (int32_t)MAX_SPIDERS ; i++ ) { // get it XmlDoc *xd = docs[i]; // skip if empty if ( ! xd ) continue; // sanity check if ( ! xd->m_sreqValid ) { char *xx=NULL;*xx=0; } // skip if not our coll rec! //if ( xd->m_cr != cr ) continue; if ( xd->m_collnum != cr->m_collnum ) continue; // grab it SpiderRequest *oldsr = &xd->m_sreq; // get status char *status = xd->m_statusMsg; // show that if ( ! oldsr->printToTableSimple ( &sb , status,xd,j)) return false; j++; } // end the table sb.safePrintf ( "
"// bgcolor=#%s>" "Last 10 URLs (%"INT32" spiders active)" //,LIGHT_BLUE //,DARK_BLUE ,(int32_t)g_spiderLoop.m_numSpidersOut); char *str = "Resume Crawl"; int32_t pval = 0; if ( cr->m_spideringEnabled ) { str = "Pause Crawl"; pval = 1; } sb.safePrintf(" " "%s" , lb.getBufStart() // has &name=&token= encoded , pval , str ); sb.safePrintf("
\n" ); sb.safePrintf ( "
\n" ); } // end html format // this is for making sure the search results are not cached uint32_t r1 = rand(); uint32_t r2 = rand(); uint64_t rand64 = (uint64_t) r1; rand64 <<= 32; rand64 |= r2; if ( fmt == FORMAT_HTML ) { sb.safePrintf("
" "" // OBJECT search input box "" "" "" "" // PAGE search input box "" "" "" "" // add url input box "" "" "" "" //"" "" "" "" "" "
" "Search Objects:" "" "" // site clustering off "" // dup removal off "" "" "" // bypass ajax, searchbox, logo, etc. "" // restrict search to json objects "" " " "" "
" "Search Pages:" "" "" // site clustering off "" // dup removal off "" "" "" // bypass ajax, searchbox, logo, etc. "" // restrict search to NON json objects "" " " "" "
" "Add Seed Urls: " "" "" "%s" // hidden tags " " "" //"     " //" crawl links on this page?" , cr->m_coll , rand64 , cr->m_coll , rand64 , hb.getBufStart() // hidden tags ); } if ( injectionResponse && fmt == FORMAT_HTML ) sb.safePrintf("
%s\n" ,injectionResponse->getBufStart() ); if ( fmt == FORMAT_HTML ) sb.safePrintf(//"" //"" "
Add Spot URLs:" // this page will call // printCrawlbotPage2(uploadResponse) 2display it //"
" //"" " " "" "%s" // hidden tags //"     " //" crawl links on those pages?" "
" "
" "
" //, cr->m_coll , hb.getBufStart() ); // // show stats // if ( fmt == FORMAT_HTML ) { char *seedStr = cr->m_diffbotSeeds.getBufStart(); if ( ! seedStr ) seedStr = ""; SafeBuf tmp; int32_t crawlStatus = -1; getSpiderStatusMsg ( cr , &tmp , &crawlStatus ); CrawlInfo *ci = &cr->m_localCrawlInfo; int32_t sentAlert = (int32_t)ci->m_sentCrawlDoneAlert; if ( sentAlert ) sentAlert = 1; sb.safePrintf( "
" "%s" , hb.getBufStart() // hidden input token/name/.. ); sb.safePrintf("" "" // download products html , cr->m_coll , rand64 , cr->m_coll , rand64 //, cr->m_coll //, cr->m_coll //, cr->m_coll // urls.csv old , cr->m_coll // urls.csv new format v3 , cr->m_coll // last 50 downloaded urls , cr->m_coll // latest objects in html , cr->m_coll , rand64 // latest objects in csv , cr->m_coll , rand64 // latest products in html , cr->m_coll , rand64 // latest products in csv , cr->m_coll , rand64 // download pages , cr->m_coll ); // spacer column sb.safePrintf("" ); // what diffbot api to use? /* char *api = cr->m_diffbotApi.getBufStart(); char *s[10]; for ( int32_t i = 0 ; i < 10 ; i++ ) s[i] = ""; if ( api && strcmp(api,"all") == 0 ) s[0] = " selected"; if ( api && strcmp(api,"article") == 0 ) s[1] = " selected"; if ( api && strcmp(api,"product") == 0 ) s[2] = " selected"; if ( api && strcmp(api,"image") == 0 ) s[3] = " selected"; if ( api && strcmp(api,"frontpage") == 0 ) s[4] = " selected"; if ( api && strcmp(api,"none") == 0 ) s[5] = " selected"; if ( ! api || ! api[0] ) s[5] = " selected"; */ sb.safePrintf( "" "" "
" "" // "" "" "" "" "" "" "" "" //"" //"" //"" //"" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" , cr->m_diffbotCrawlName.getBufStart() , (int32_t)cr->m_isCustomCrawl , cr->m_diffbotToken.getBufStart() , seedStr , crawlStatus , tmp.getBufStart() , cr->m_diffbotCrawlStartTime // this is 0 if not over yet , cr->m_diffbotCrawlEndTime , cr->m_spiderRoundNum , cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider ); // show crawlinfo crap CrawlInfo *cis = (CrawlInfo *)cr->m_crawlInfoBuf.getBufStart(); sb.safePrintf("\n"); sb.safePrintf( // this will have to be in crawlinfo too! //"" //"" //"" "" "" "" "" "" "" "" "" //"" //"" //"" //"" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" , cr->m_globalCrawlInfo.m_objectsAdded - cr->m_globalCrawlInfo.m_objectsDeleted , cr->m_globalCrawlInfo.m_urlsHarvested //, cr->m_globalCrawlInfo.m_urlsConsidered , cr->m_globalCrawlInfo.m_pageDownloadAttempts , cr->m_globalCrawlInfo.m_pageDownloadSuccesses , cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound , cr->m_globalCrawlInfo.m_pageProcessAttempts , cr->m_globalCrawlInfo.m_pageProcessSuccesses , cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound ); uint32_t now = (uint32_t)getTimeGlobalNoCore(); sb.safePrintf("" "" "" // json search with gbsortby:gbspiderdate , cr->m_coll , rand64 // json search with gbmin:gbspiderdate , cr->m_coll , rand64 , now - 30 // 60 // last 1 minute ); sb.safePrintf ( "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "
Crawl Name:%s
Crawl Type:%"INT32"
Collection Alias:%s%s
Token:%s
Seeds:%s
Crawl Status:%"INT32"
Crawl Status Msg:%s
Crawl Start Time:%"UINT32"
Last Crawl Completion Time:%"UINT32"
Rounds Completed:%"INT32"
Has Urls Ready to Spider:%"INT32"
Ready Hosts"); for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) { CrawlInfo *ci = &cis[i]; if ( ! ci ) continue; if ( ! ci->m_hasUrlsReadyToSpider ) continue; Host *h = g_hostdb.getHost ( i ); if ( ! h ) continue; sb.safePrintf("" "%i " , iptoa(h->m_ip) , (int)h->m_httpPort , cr->m_coll , (int)i ); } sb.safePrintf("
pages indexed" //"%"INT64"
Objects Found%"INT64"
URLs Harvested (inc. dups)%"INT64"
URLs Examined%"INT64"
Page Crawl Attempts%"INT64"
Page Crawl Successes%"INT64"
Page Crawl Successes This Round%"INT64"
Page Process Attempts%"INT64"
Page Process Successes%"INT64"
Page Process Successes This Round%"INT64"
Download Objects: " "" "" "csv" "   " "" "json full dump" "   " , cr->m_coll , cr->m_coll ); sb.safePrintf( // newest json on top of results "" "json full search (newest on top)" "   " // newest json on top of results, last 10 mins "" "json search (last 30 seconds)" "
Download Products: " "" // make it search.csv so excel opens it "" "csv" "   " "" "html" "
Download Urls: " "" "" "csv" " " "new csv format" " " "last 50 download attempts" "
Latest Objects: " "" "" "csv" "   " "" "html" "
Latest Products: " "" "" "csv" "   " "" "html" "
Download Pages: " "" "" "txt" // "
" "
" "    " "    " "" "" /* "" "" "" , s[0] , s[1] , s[2] , s[3] , s[4] , s[5] */ ); //char *alias = ""; //if ( cr->m_collectionNameAlias.length() > 0 ) // alias = cr->m_collectionNameAlias.getBufStart(); //char *aliasResponse = ""; //if ( alias && ! isAliasUnique(cr,token,alias) ) // aliasResponse = "
" // "Alias not unique"; char *urtYes = " checked"; char *urtNo = ""; if ( ! cr->m_useRobotsTxt ) { urtYes = ""; urtNo = " checked"; } /* char *rdomYes = " checked"; char *rdomNo = ""; if ( ! cr->m_restrictDomain ) { rdomYes = ""; rdomNo = " checked"; } */ char *isNewYes = ""; char *isNewNo = " checked"; if ( cr->m_diffbotOnlyProcessIfNewUrl ) { isNewYes = " checked"; isNewNo = ""; } char *api = cr->m_diffbotApiUrl.getBufStart(); if ( ! api ) api = ""; SafeBuf apiUrl; apiUrl.htmlEncode ( api , gbstrlen(api), true , 0 ); apiUrl.nullTerm(); char *px1 = cr->m_diffbotUrlCrawlPattern.getBufStart(); if ( ! px1 ) px1 = ""; SafeBuf ppp1; ppp1.htmlEncode ( px1 , gbstrlen(px1) , true , 0 ); ppp1.nullTerm(); char *px2 = cr->m_diffbotUrlProcessPattern.getBufStart(); if ( ! px2 ) px2 = ""; SafeBuf ppp2; ppp2.htmlEncode ( px2 , gbstrlen(px2) , true , 0 ); ppp2.nullTerm(); char *px3 = cr->m_diffbotPageProcessPattern.getBufStart(); if ( ! px3 ) px3 = ""; SafeBuf ppp3; ppp3.htmlEncode ( px3 , gbstrlen(px3) , true , 0 ); ppp3.nullTerm(); char *rx1 = cr->m_diffbotUrlCrawlRegEx.getBufStart(); if ( ! rx1 ) rx1 = ""; SafeBuf rrr1; rrr1.htmlEncode ( rx1 , gbstrlen(rx1), true , 0 ); char *rx2 = cr->m_diffbotUrlProcessRegEx.getBufStart(); if ( ! rx2 ) rx2 = ""; SafeBuf rrr2; rrr2.htmlEncode ( rx2 , gbstrlen(rx2), true , 0 ); char *notifEmail = cr->m_notifyEmail.getBufStart(); char *notifUrl = cr->m_notifyUrl.getBufStart(); if ( ! notifEmail ) notifEmail = ""; if ( ! notifUrl ) notifUrl = ""; sb.safePrintf( // // "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" //"" //"" //"" //"" "
" "Diffbot API" "" "" "
Repeat Crawl: " "" " " "" " days" "
Diffbot API Url: " "" " " "" "
Url Crawl Pattern: " "" " " "" "
Url Process Pattern: " "" " " "" "
Page Process Pattern: " "" " " "" "
Url Crawl RegEx: " "" " " "" "
Url Process RegEx: " "" " " "" "
Max hopcount to seeds: " "" " " "" "
Only Process If New: " "" " yes   " " no   " "
Crawl Delay (seconds): " "" " " "" "
Max Page Crawl Successes: " "" " " "" "
Max Page Process Successes:" "" " " "" "
Max Rounds:" "" " " "" "
Notification Email:" "" " " "" "
Notification URL:" "" " " "" "
" "Use Robots.txt when crawling? " "" " yes   " " no   " "
" //"Restrict domain to seeds? " //"" //" yes   " //" no   " //"
" //"Use spider proxies on AWS? " //"" //" //"
" "
" , cr->m_collectiveRespiderFrequency , apiUrl.getBufStart() , ppp1.getBufStart() , ppp2.getBufStart() , ppp3.getBufStart() , rrr1.getBufStart() , rrr2.getBufStart() , cr->m_diffbotMaxHops , isNewYes , isNewNo , cr->m_collectiveCrawlDelay , cr->m_maxToCrawl , cr->m_maxToProcess , (int32_t)cr->m_maxCrawlRounds , notifEmail , notifUrl , urtYes , urtNo //, rdomYes //, rdomNo ); } // xml or json does not show the input boxes //if ( format != FORMAT_HTML ) // return g_httpServer.sendDynamicPage ( s, // sb.getBufStart(), // sb.length(), // -1 ); // cachetime // // print url filters. use "multimedia" to handle jpg etc. // // use "notindexable" for images/movies/css etc. // add a "process" column to send to diffbot... // // /* char *s1 = "Show"; char *s2 = "none"; if ( hr->getLongFromCookie("showtable",0) ) { s1 = "Hide"; s2 = ""; } if ( fmt == FORMAT_HTML ) sb.safePrintf( "" "" "
" "%s URL Filters Table" "
" "
" "
" "
" "" "" "" , s1 , s2 , cr->m_coll ); // // print url filters. HACKy... // if ( fmt == FORMAT_HTML ) g_parms.sendPageGeneric ( socket , hr , PAGE_FILTERS , NULL , &sb , cr->m_coll, // coll override false ); // isJSON? // // end HACKy hack // if ( fmt == FORMAT_HTML ) sb.safePrintf( "" "
" "
" "
" ); */ // // add search box to your site // /* sb.safePrintf("
" "" "" "" "" "
" "Add this search box to your site" "" "
"); */ // // show simpler url filters table // if ( fmt == FORMAT_HTML ) { /* sb.safePrintf ( "" "\n" ); // true means its html input printUrlFilters ( sb , cr , fmt ); // for adding new rule sb.safePrintf("" "" "\n" ); //sb.safePrintf("
" "URL Filters" "
Expression " " " "" "Action " " " "" "
U sb.safePrintf("
\n"); */ // // END THE BIG FORM // sb.safePrintf(""); } // // show reset and delete crawl buttons // if ( fmt == FORMAT_HTML ) { sb.safePrintf( "" "" "" "" // restart collection form "" // restart collection form "" "" "
" // reset collection form "
" "%s" // hidden tags , hb.getBufStart() ); sb.safePrintf( "" // also show it in the display, so set "c" "" "
" // end reset collection form "
" // delete collection form "
" "%s" //, (int32_t)cr->m_collnum , hb.getBufStart() ); sb.safePrintf( "" "" "
" // end delete collection form "
" "
" "%s" "" "" "
" "
" "
" "%s" "" "" "
" "
" //, (int32_t)cr->m_collnum , hb.getBufStart() , hb.getBufStart() //, (int32_t)cr->m_collnum ); } // the ROOT JSON } if ( fmt == FORMAT_JSON ) sb.safePrintf("}\n"); char *ct = "text/html"; if ( fmt == FORMAT_JSON ) ct = "application/json"; if ( fmt == FORMAT_XML ) ct = "text/xml"; if ( fmt == FORMAT_CSV ) ct = "text/csv"; // this could be in html json or xml return g_httpServer.sendDynamicPage ( socket, sb.getBufStart(), sb.length(), -1 , // cachetime false , ct ); /* "

API for Diffbot

" "
" "" "" "
" "
" "

API for Crawlbot

" // "
" "" "
" "
" "
" "Token:

" "API: (article, product)

" "
" "
" "" "Page-type is required" "API calls will be made using your current token." "
" "
" "
" "
" "
" "
" "
" "?" "
" "
" "
" "
" "
" "
" "
" "
" "" "
Seed URL is required
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "
" "" "Diffbot uses Java regex syntax. Be sure to escape your characters." "
" "
" "
" "
" "

 
" "" "" "
" ); */ } // . do not add dups into m_diffbotSeeds safebuf // . return 0 if not in table, 1 if in table. -1 on error adding to table. int32_t isInSeedBuf ( CollectionRec *cr , char *url, int len ) { HashTableX *ht = &cr->m_seedHashTable; // if table is empty, populate it if ( ht->m_numSlotsUsed <= 0 ) { // initialize the hash table if ( ! ht->set(8,0,1024,NULL,0,false,1,"seedtbl") ) return -1; // populate it from list of seed urls char *p = cr->m_diffbotSeeds.getBufStart(); for ( ; p && *p ; ) { // get url char *purl = p; // advance to next for ( ; *p && !is_wspace_a(*p) ; p++ ); // make end then char *end = p; // skip possible white space. might be \0. if ( *p ) p++; // hash it int64_t h64 = hash64 ( purl , end-purl ); if ( ! ht->addKey ( &h64 ) ) return -1; } } // is this url in the hash table? int64_t u64 = hash64 ( url, len ); if ( ht->isInTable ( &u64 ) ) return 1; // add it to hashtable if ( ! ht->addKey ( &u64 ) ) return -1; // WAS not in table return 0; } // just use "fakeips" based on the hash of each url hostname/subdomain // so we don't waste time doing ip lookups. bool getSpiderRequestMetaList ( char *doc , SafeBuf *listBuf , bool spiderLinks , CollectionRec *cr ) { if ( ! doc ) return true; // . scan the list of urls // . assume separated by white space \n \t or space char *p = doc; uint32_t now = (uint32_t)getTimeGlobal(); // a big loop while ( true ) { // skip white space (\0 is not a whitespace) for ( ; is_wspace_a(*p) ; p++ ); // all done? if ( ! *p ) break; // save it char *saved = p; // advance to next white space for ( ; ! is_wspace_a(*p) && *p ; p++ ); // set end char *end = p; // get that url Url url; url.set ( saved , end - saved ); // if not legit skip if ( url.getUrlLen() <= 0 ) continue; // need this int64_t probDocId = g_titledb.getProbableDocId(&url); // make it SpiderRequest sreq; sreq.reset(); sreq.m_firstIp = url.getHostHash32(); // fakeip! // avoid ips of 0 or -1 if ( sreq.m_firstIp == 0 || sreq.m_firstIp == -1 ) sreq.m_firstIp = 1; sreq.m_hostHash32 = url.getHostHash32(); sreq.m_domHash32 = url.getDomainHash32(); sreq.m_siteHash32 = url.getHostHash32(); //sreq.m_probDocId = probDocId; sreq.m_hopCount = 0; // we're a seed sreq.m_hopCountValid = true; sreq.m_addedTime = now; sreq.m_isNewOutlink = 1; sreq.m_isWWWSubdomain = url.isSimpleSubdomain(); // treat seed urls as being on same domain and hostname sreq.m_sameDom = 1; sreq.m_sameHost = 1; sreq.m_sameSite = 1; sreq.m_fakeFirstIp = 1; sreq.m_isAddUrl = 1; // spider links? if ( ! spiderLinks ) sreq.m_avoidSpiderLinks = 1; // save the url! strcpy ( sreq.m_url , url.getUrl() ); // finally, we can set the key. isDel = false sreq.setKey ( sreq.m_firstIp , probDocId , false ); int32_t oldBufSize = listBuf->getCapacity(); int32_t need = listBuf->getLength() + 100 + sreq.getRecSize(); int32_t newBufSize = 0; if ( need > oldBufSize ) newBufSize = oldBufSize + 100000; if ( newBufSize && ! listBuf->reserve ( newBufSize ) ) // return false with g_errno set return false; // store rdbid first if ( ! listBuf->pushChar(RDB_SPIDERDB) ) // return false with g_errno set return false; // store it if ( ! listBuf->safeMemcpy ( &sreq , sreq.getRecSize() ) ) // return false with g_errno set return false; if ( ! cr ) continue; // do not add dups into m_diffbotSeeds safebuf int32_t status = isInSeedBuf ( cr , saved , end - saved ); // error? 
if ( status == -1 ) { log ( "crawlbot: error adding seed to table: %s", mstrerror(g_errno) ); return true; } // already in buf if ( status == 1 ) continue; // add url into m_diffbotSeeds, \n separated list if ( cr->m_diffbotSeeds.length() ) // make it space not \n so it looks better in the // json output i guess cr->m_diffbotSeeds.pushChar(' '); // \n cr->m_diffbotSeeds.safeMemcpy (url.getUrl(), url.getUrlLen()); cr->m_diffbotSeeds.nullTerm(); } // all done return true; } /* bool isAliasUnique ( CollectionRec *cr , char *token , char *alias ) { // scan all collections for ( int32_t i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) { CollectionRec *cx = g_collectiondb.m_recs[i]; if ( ! cx ) continue; // must belong to us if ( strcmp(cx->m_diffbotToken.getBufStart(),token) ) continue; // skip if collection we are putting alias on if ( cx == cr ) continue; // does it match? if ( cx->m_collectionNameAlias.length() <= 0 ) continue; // return false if it matches! not unique if ( strcmp ( cx->m_collectionNameAlias.getBufStart() , alias ) == 0 ) return false; } return true; } */ // json can be provided via get or post but content type must be // url-encoded so we can test with a simple html form page. /* bool setSpiderParmsFromJSONPost ( TcpSocket *socket , HttpRequest *hr , CollectionRec *cr ) { // get the json char *json = hr->getString("json"); if ( ! json ) return sendReply2 ( socket, FORMAT_JSON, "No &json= provided in request."); Json JP; bool status = JP.parseJsonStringIntoJsonItems ( json ); // wtf? if ( ! status ) return sendReply2 ( socket, FORMAT_JSON, "Error with JSON parser."); // error adding it? if ( ! cr ) return sendReply2 ( socket,FORMAT_JSON, "Failed to create new collection."); ji = JP.getFirstItem(); char *seed = NULL; // traverse the json for ( ; ji ; ji = ji->m_next ) { // just get STRINGS or NUMS if ( ji->m_type != JT_STRING && ji->m_type != JT_NUMBER ) continue; // check name char *name = ji->m_name; char *val = ji->getValue(); if ( strcmp(name,"seed") == 0 ) seed = val; if ( strcmp(name,"email") == 0 ) cr->m_notifyEmail.set(val); if ( strcmp(name,"webhook") == 0 ) cr->m_notifyUrl.set(val); if ( strcmp(name,"frequency") == 0 ) cr->m_collectiveRespiderFrequency = atof(val); if ( strcmp(name,"maxToCrawl") == 0 ) cr->m_maxToCrawl = atoll(val); if ( strcmp(name,"maxToProcess") == 0 ) cr->m_maxToProcess = atoll(val); if ( strcmp(name,"pageProcessPattern") == 0 ) cr->m_diffbotPageProcessPattern.set(val); if ( strcmp(name,"obeyRobots") == 0 ) { if ( val[0]=='t' || val[0]=='T' || val[0]==1 ) cr->m_useRobotsTxt = true; else cr->m_useRobotsTxt = false; } if ( strcmp(name,"onlyProcessNew") == 0 ) { if ( val[0]=='t' || val[0]=='T' || val[0]==1 ) cr->m_diffbotOnlyProcessIfNew = true; else cr->m_diffbotOnlyProcessIfNew = false; } if ( strcmp(name,"pauseCrawl") == 0 ) { if ( val[0]=='t' || val[0]=='T' || val[0]==1 ) cr->m_spideringEnabled = 0; else cr->m_spideringEnabled = 1; } } // set collective respider in case just that was passed for ( int32_t i =0 ; i < MAX_FILTERS ; i++ ) cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency; // if url filters not specified, we are done if ( ! JP.getItem("urlFilters") ) return true; // reset the url filters here to the default set. // we will append the client's filters below them below. resetUrlFilters ( cr ); char *expression = NULL; char *action = NULL; // start over at top ji = JP.getFirstItem(); // "urlFilters": [ // { // "value": "*", // MDW - this matches all urls! 
("default") // "action": "http://www.diffbot.com/api/analyze?mode=auto" // } // { // "value": "company", // "action" : "http://www.diffbot.com/api/article?tags&meta" // } // { // "value": "^http://www", // "action": "doNotProcess" // } // { // "value": "$.html && category", // "action": "doNotCrawl" // } // { // "value": "!$.html && $.php", // "action": "doNotCrawl" // } // ] // how many filters do we have so far? int32_t nf = cr->m_numRegExs; for ( ; ji ; ji = ji->m_next ) { // just get STRINGS only if ( ji->m_type != JT_STRING ) continue; // must be right now char *name = ji->m_name; char *value = ji->getValue(); if ( strcmp(name,"value")==0 ) expression = value; if ( strcmp(name,"action")==0 ) action = ji->getValue(); // need both if ( ! action ) continue; if ( ! expression ) continue; // they use "*" instead of "default" so put that back if ( expression[0] == '*' ) expression = "default"; // deal with it cr->m_regExs[1].set(expression); cr->m_numRegExs++; int32_t priority = 50; // default diffbot api call: char *api = NULL; if ( strcasecmp(action,"donotcrawl") == 0 ) priority = SPIDER_PRIORITY_FILTERED; //if ( strcasecmp(action,"donotprocess") == 0 ) // api = NULL; // a new diffbot url? if ( strcasecmp(action,"http") == 0 ) api = action; // add the new filter cr->m_regExs [nf].set(expression); cr->m_spiderPriorities [nf] = priority; cr->m_spiderDiffbotApiUrl[nf].set(api); nf++; // add a mirror of that filter but for manually added, // i.e. injected or via add url, if ( priority < 0 ) continue; // make the priority higher! cr->m_regExs[nf].safePrintf("ismanualadd && %s",expression); cr->m_spiderPriorities [nf] = 70; cr->m_spiderDiffbotApiUrl[nf].set(api); // appends \0 nf++; // NULL out again action = NULL; expression = NULL; if ( nf < MAX_FILTERS ) continue; log("crawlbot: too many url filters!"); break; } // update the counts cr->m_numRegExs = nf; cr->m_numRegExs2 = nf; cr->m_numRegExs3 = nf; cr->m_numRegExs10 = nf; cr->m_numRegExs5 = nf; cr->m_numRegExs6 = nf; cr->m_numRegExs7 = nf; cr->m_numRegExs11 = nf; // set collective respider for ( int32_t i =0 ; i < nf ; i++ ) cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency; return true; } */ /* THIS IS NOW AUTOMATIC from new Parms.cpp broadcast logic bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket , HttpRequest *hr , CollectionRec *cr ) { // update the url filters for now since that is complicated // supply "cr" directly since "c" may not be in the http // request if addcoll=xxxxxx (just created a new rec) //int32_t page = PAGE_FILTERS; //WebPage *pg = g_pages.getPage ( page ) ; //g_parms.setFromRequest ( hr , socket , pg->m_function, cr ); bool rebuild = false; // // set other diffbot parms for this collection // int32_t maxToCrawl = hr->getLongLong("maxToCrawl",-1LL); if ( maxToCrawl == -1 ) maxToCrawl = hr->getLongLong("maxToDownload",-1LL); if ( maxToCrawl != -1 ) { cr->m_maxToCrawl = maxToCrawl; cr->m_needsSave = 1; } int32_t maxToProcess = hr->getLongLong("maxToProcess",-1LL); if ( maxToProcess != -1 ) { cr->m_maxToProcess = maxToProcess; cr->m_needsSave = 1; } // -1 means no max, so use -2 as default here int32_t maxCrawlRounds = hr->getLongLong("maxCrawlRounds",-2LL); if ( maxCrawlRounds == -2 ) maxCrawlRounds = hr->getLongLong("maxRounds",-2LL); if ( maxCrawlRounds != -2 ) { cr->m_maxCrawlRounds = maxCrawlRounds; cr->m_needsSave = 1; } char *email = hr->getString("notifyEmail",NULL,NULL); if ( email ) { cr->m_notifyEmail.set(email); cr->m_needsSave = 1; } char *url = hr->getString("notifyWebHook",NULL,NULL); if ( ! 
url ) url = hr->getString("notifyWebhook",NULL,NULL); if ( url ) { // assume url is invalid, purge it cr->m_notifyUrl.purge(); // normalize Url norm; norm.set ( url ); if ( norm.getDomainLen() > 0 && norm.getHostLen() > 0 ) // set the ssafebuf to it. will \0 terminate it. cr->m_notifyUrl.set(norm.getUrl()); // save the collection rec cr->m_needsSave = 1; } int32_t pause = hr->getLong("pauseCrawl",-1); // /v2/bulk api support if ( pause == -1 ) pause = hr->getLong("pause",-1); if ( pause == 0 ) { cr->m_needsSave = 1; cr->m_spideringEnabled = 1; } if ( pause == 1 ) { cr->m_needsSave = 1; cr->m_spideringEnabled = 0; } int32_t obeyRobots = hr->getLong("obeyRobots",-1); if ( obeyRobots == -1 ) obeyRobots = hr->getLong("robots",-1); if ( obeyRobots != -1 ) { cr->m_useRobotsTxt = obeyRobots; cr->m_needsSave = 1; } int32_t restrictDomain = hr->getLong("restrictDomain",-1); if ( restrictDomain != -1 ) { cr->m_restrictDomain = restrictDomain; cr->m_needsSave = 1; rebuild = true; } char *api = hr->getString("apiUrl",NULL); if ( api ) { cr->m_diffbotApiUrl.set(api); cr->m_needsSave = 1; } char *ppp1 = hr->getString("urlCrawlPattern",NULL); if ( ppp1 ) { cr->m_diffbotUrlCrawlPattern.set(ppp1); cr->m_needsSave = 1; rebuild = true; } char *ppp2 = hr->getString("urlProcessPattern",NULL); if ( ppp2 ) { cr->m_diffbotUrlProcessPattern.set(ppp2); cr->m_needsSave = 1; } char *ppp3 = hr->getString("pageProcessPattern",NULL); if ( ppp3 ) { cr->m_diffbotPageProcessPattern.set(ppp3); cr->m_needsSave = 1; } // reg ex support char *rx1 = hr->getString("urlCrawlRegEx",NULL); // clear what we had if ( rx1 && cr->m_hasucr ) { regfree ( &cr->m_ucr ); cr->m_hasucr = false; cr->m_diffbotUrlCrawlRegEx.purge(); cr->m_needsSave = 1; rebuild = true; } // add a new one if not blank if ( rx1 && rx1[0] ) { cr->m_diffbotUrlCrawlRegEx.set(rx1); cr->m_needsSave = 1; // this will store the compiled regular expression into ucr if ( regcomp ( &cr->m_ucr , // the regular expression to compile rx1 , // some flags REG_EXTENDED|REG_ICASE| REG_NEWLINE|REG_NOSUB) ) { regfree ( &cr->m_ucr); // should never fail! return log("xmldoc: regcomp %s failed: %s. " "Ignoring.", rx1,mstrerror(errno)); } cr->m_hasucr = true; } char *rx2 = hr->getString("urlProcessRegEx",NULL); // clear what we had if ( rx2 && cr->m_hasupr ) { regfree ( &cr->m_upr ); cr->m_hasupr = false; cr->m_diffbotUrlProcessRegEx.purge(); cr->m_needsSave = 1; } // add a new one if not blank if ( rx2 && rx2[0] ) { cr->m_diffbotUrlProcessRegEx.set(rx2); cr->m_needsSave = 1; // this will store the compiled regular expression into upr if ( regcomp ( &cr->m_upr , // the regular expression to compile rx2 , // some flags REG_EXTENDED|REG_ICASE| REG_NEWLINE|REG_NOSUB) ) { regfree ( &cr->m_upr); // error! return log("xmldoc: regcomp %s failed: %s. " "Ignoring.", rx2,mstrerror(errno)); } cr->m_hasupr = true; } float respider = hr->getFloat("repeatJob",-1.0); if ( respider == -1.0 ) respider = hr->getFloat("repeat",-1.0); if ( respider == -1.0 ) respider = hr->getFloat("repeatCrawl",-1.0); if ( respider >= 0.0 ) { // if not 0, then change this by the delta if ( cr->m_spiderRoundStartTime ) { // convert from days into seconds float rfOld = cr->m_collectiveRespiderFrequency; float rfNew = respider; // 86400 seconds in a day int32_t secondsOld = (int32_t)(rfOld * 86400); int32_t secondsNew = (int32_t)(rfNew * 86400); // remove old one. 
cr->m_spiderRoundStartTime -= secondsOld; // add in new one cr->m_spiderRoundStartTime += secondsNew; } // if 0 that means NO recrawling if ( respider == 0.0 ) { cr->m_spiderRoundStartTime = 0;//getTimeGlobal(); } cr->m_collectiveRespiderFrequency = respider; cr->m_needsSave = 1; } float delay = hr->getFloat("crawlDelay",-1.0); //int32_t crawlWait = hr->getLong("wait",-1); if ( delay >= 0.0 ) { rebuild = true; cr->m_collectiveCrawlDelay = delay; } int32_t onlyProcessNew = hr->getLong("onlyProcessIfNew",-1); if ( onlyProcessNew != -1 ) { cr->m_diffbotOnlyProcessIfNew = onlyProcessNew; cr->m_needsSave = 1; } // set collective respider //for ( int32_t i =0 ; i < cr->m_numRegExs ; i++ ) { // if ( cr->m_collectiveRespiderFrequency == 0.0 ) // cr->m_spiderFreqs[i] = 0.000; // else // cr->m_spiderFreqs[i] = 0.001; // //cr->m_collectiveRespiderFrequency; //} char *path = hr->getPath(); bool isBulkApi = false; if ( path && strncmp(path,"/v2/bulk",8)==0 ) isBulkApi = true; // were any url filteres specified? if not, don't reset them //if ( ! hr->hasField("action") ) // return true; // reset the url filters here to the default set. // we will append the client's filters below them below. resetUrlFilters ( cr ); // if it was not recrawling and we made it start we have // to repopulate waiting tree because most entries will // need to be re-added! // really, anytime we change url filters we have to repopulate // the waiting tree SpiderColl *sc = cr->m_spiderColl; if ( sc && rebuild ) { // this is causing a bulk job not to complete because // jenkins keeps checking it every 10 seconds sc->m_waitingTreeNeedsRebuild = true; } return true; // "urlFilters": [ // { // "value": "*", // MDW - this matches all urls! ("default") // "action": "http://www.diffbot.com/api/analyze?mode=auto" // } // { // "value": "company", // "action" : "http://www.diffbot.com/api/article?tags&meta" // } // { // "value": "^http://www", // "action": "doNotProcess" // } // { // "value": "$.html && category", // "action": "doNotCrawl" // } // { // "value": "!$.html && $.php", // "action": "doNotCrawl" // } // ] char *expression = NULL; char *action = NULL; // how many filters do we have so far? int32_t nf = cr->m_numRegExs; // delete the 3rd default filter cuz we should re-add it below // to the bottom of the list. if ( nf >= 3 ) nf--; bool addedDefault = false; // loop over the cgi parms for ( int32_t i = 0 ; i < hr->getNumFields() ; i++ ) { // get cgi parm name char *field = hr->getField ( i ); //int32_t flen = hr->getFieldLen ( i ); if ( strcmp(field,"expression") == 0 ) expression = hr->getValue(i); if ( strcmp(field,"action") == 0 ) action = hr->getValue(i); // need both if ( ! action ) continue; // no! the /v2/bulk api just has a single action if ( isBulkApi ) expression = "*"; // action before expression???? set action to NULL then? if ( ! expression ) continue; //else continue;// { action = NULL; continue; } // skip whitespace while ( is_wspace_a(*expression) ) expression++; while ( is_wspace_a(*action) ) action++; // skip if expression is empty if ( ! expression[0] ) { action = NULL; expression = NULL; continue; } // they use "*" instead of "default" so put that back if ( expression[0] == '*' ) { expression = "default"; addedDefault = true; } // deal with it int32_t priority = 50; // default diffbot api call: //char *api = NULL; if ( strcasecmp(action,"donotcrawl") == 0 ) priority = SPIDER_PRIORITY_FILTERED; //if ( strcasecmp(action,"donotprocess") == 0 ) // api = NULL; // a new diffbot url? 
//if ( strncasecmp(action,"http",4) == 0 ) //api = action; // add a mirror of that filter but for manually added, // i.e. injected or via add url, if ( priority >= 0 ) { // purge because might have been the last "default" // filter that we did nf-- above on. cr->m_regExs [nf].purge(); // make the priority higher! cr->m_regExs [nf].safePrintf("ismanualadd && %s", expression); cr->m_spiderPriorities [nf] = 70; cr->m_spiderDiffbotApiUrl[nf].set(action); // appends\0 cr->m_spiderFreqs[nf]= cr->m_collectiveRespiderFrequency; nf++; } // add the new filter cr->m_regExs [nf].set(expression); cr->m_spiderPriorities [nf] = priority; cr->m_spiderDiffbotApiUrl[nf].set(action); cr->m_spiderFreqs [nf] = cr->m_collectiveRespiderFrequency; nf++; // NULL out again action = NULL; expression = NULL; if ( nf < MAX_FILTERS ) continue; log("crawlbot: too many url filters!"); break; } // if no '*' line was provided, add it here if ( ! addedDefault ) { cr->m_regExs [nf].set("default"); cr->m_spiderPriorities [nf] = 50; cr->m_spiderDiffbotApiUrl[nf].set(NULL); cr->m_spiderFreqs[nf] = cr->m_collectiveRespiderFrequency; nf++; } // update the counts cr->m_numRegExs = nf; cr->m_numRegExs2 = nf; cr->m_numRegExs3 = nf; cr->m_numRegExs10 = nf; cr->m_numRegExs5 = nf; cr->m_numRegExs6 = nf; cr->m_numRegExs7 = nf; cr->m_numRegExs11 = nf; // set collective respider //for ( int32_t i =0 ; i < nf ; i++ ) // cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency; return true; } */ /////////// // // SUPPORT for getting the last 100 spidered urls // // . sends request to each node // . each node returns top 100 after scanning spiderdb (cache for speed) // . master node gets top 100 of the top 100s // . sends pretty html or json back to socket // . then user can see why their crawl isn't working // . also since we are scanning spiderdb indicate how many urls are // ignored because they match "ismedia" or "!isonsamedomain" etc. so // show each url filter expression then show how many urls matched that. // when doing this make the spiderReply null, b/c the purpose is to see // what urls // . BUT url may never be attempted because it matches "ismedia" so that kind // of thing might have to be indicated on the spiderdb dump above, not here. // ////////// //bool sendPageLast100Urls ( TcpSocket *socket , HttpRequest *hr ) {