2013-09-14 03:22:07 +04:00
// diffbot api implementation
//
// WHAT APIs are here?
//
// . 1. the CrawlBot API to start a crawl
// . 2. To directly process a provided URL (injection)
// . 3. the Cache API so phantomjs can quickly check the cache for files
// and quickly add files to the cache.
//
// Related pages:
//
// * http://diffbot.com/dev/docs/ (Crawlbot API tab, and others)
// * http://diffbot.com/dev/crawl/
2013-10-02 01:14:39 +04:00
# include "PageCrawlBot.h"
2013-09-14 03:22:07 +04:00
# include "TcpServer.h"
# include "HttpRequest.h"
# include "HttpServer.h"
# include "Pages.h" // g_msg
# include "XmlDoc.h" // for checkRegex()
2013-09-26 01:37:20 +04:00
# include "PageInject.h" // Msg7
2013-10-15 04:19:59 +04:00
//#include "Json.h"
2013-12-11 03:28:04 +04:00
# include "Parms.h"
2013-09-14 03:22:07 +04:00
2013-09-30 20:46:54 +04:00
// so user can specify the format of the reply/output
# define FMT_HTML 1
# define FMT_XML 2
# define FMT_JSON 3
# define FMT_CSV 4
2013-09-30 21:10:43 +04:00
# define FMT_TXT 5
2013-09-30 20:46:54 +04:00
2013-09-14 03:22:07 +04:00
void doneSendingWrapper ( void * state , TcpSocket * sock ) ;
2013-09-17 21:25:54 +04:00
bool sendBackDump ( TcpSocket * s , HttpRequest * hr ) ;
2013-11-12 03:52:04 +04:00
CollectionRec * addNewDiffbotColl ( char * addColl , char * token , char * name ,
class HttpRequest * hr ) ;
2013-10-15 02:10:48 +04:00
bool resetUrlFilters ( CollectionRec * cr ) ;
2013-09-19 02:07:47 +04:00
2013-10-15 04:19:59 +04:00
bool setSpiderParmsFromHtmlRequest ( TcpSocket * socket ,
HttpRequest * hr ,
CollectionRec * cr ) ;
2013-09-14 03:22:07 +04:00
////////////////
//
2013-09-17 21:25:54 +04:00
// SUPPORT FOR DOWNLOADING an RDB DUMP
2013-09-14 03:22:07 +04:00
//
// We ask each shard for 10MB of Spiderdb records. If 10MB was returned
// then we repeat. Every time we get 10MB from each shard we print the
// Spiderdb records out into "safebuf" and transmit it to the user. once
// the buffer has been transmitted then we ask the shards for another 10MB
// worth of spider records.
//
////////////////
// use this as a state while dumping out spiderdb for a collection
class StateCD {
public :
StateCD ( ) { m_needsMime = true ; } ;
void sendBackDump2 ( ) ;
2013-09-30 23:48:37 +04:00
bool readDataFromRdb ( ) ;
2013-10-01 00:12:22 +04:00
bool sendList ( ) ;
2013-10-26 01:54:24 +04:00
void printSpiderdbList ( RdbList * list , SafeBuf * sb ,
char * * lastKeyPtr ) ;
void printTitledbList ( RdbList * list , SafeBuf * sb ,
char * * lastKeyPtr ) ;
2013-11-13 01:51:52 +04:00
bool printJsonItemInCsv ( char * json , SafeBuf * sb ) ;
2013-09-14 03:22:07 +04:00
2013-12-05 22:09:06 +04:00
long long m_lastUh48 ;
long long m_prevReplyUh48 ;
long m_prevReplyError ;
time_t m_prevReplyDownloadTime ;
2013-09-26 01:37:20 +04:00
char m_fmt ;
Msg4 m_msg4 ;
HttpRequest m_hr ;
Msg7 m_msg7 ;
2013-10-31 00:12:46 +04:00
WaitEntry m_waitEntry ;
2013-11-04 22:49:31 +04:00
bool m_isFirstTime ;
2013-11-04 23:29:22 +04:00
bool m_printedFirstBracket ;
2013-11-04 23:34:22 +04:00
bool m_printedEndingBracket ;
2013-11-04 23:29:22 +04:00
bool m_printedItem ;
2013-11-04 22:49:31 +04:00
2013-11-13 01:51:52 +04:00
bool m_needHeaderRow ;
2013-11-15 01:16:08 +04:00
SafeBuf m_seedBank ;
2013-09-14 03:22:07 +04:00
bool m_needsMime ;
char m_rdbId ;
2013-09-17 21:25:54 +04:00
bool m_downloadJSON ;
2013-09-14 03:22:07 +04:00
collnum_t m_collnum ;
long m_numRequests ;
long m_numReplies ;
long m_minRecSizes ;
bool m_someoneNeedsMore ;
TcpSocket * m_socket ;
Msg0 m_msg0s [ MAX_HOSTS ] ;
key128_t m_spiderdbStartKeys [ MAX_HOSTS ] ;
key_t m_titledbStartKeys [ MAX_HOSTS ] ;
RdbList m_lists [ MAX_HOSTS ] ;
bool m_needMore [ MAX_HOSTS ] ;
} ;
// . basically dump out spiderdb
2013-09-30 21:10:43 +04:00
// . returns urls in csv format in reply to a
// "GET /api/download/%s_data.json"
// "GET /api/download/%s_data.xml"
// "GET /api/download/%s_urls.csv"
// "GET /api/download/%s_pages.txt"
// where %s is the collection name
2013-09-14 03:22:07 +04:00
// . the ordering of the urls is not specified so whatever order they are
// in spiderdb will do
// . the gui that lists the urls as they are spidered in real time when you
// do a test crawl will just have to call this repeatedly. it shouldn't
// be too slow because of disk caching, and, most likely, the spider requests
// will all be in spiderdb's rdbtree any how
// . because we are distributed we have to send a msg0 request to each
// shard/group asking for all the spider urls. dan says 30MB is typical
// for a csv file, so for now we will just try to do a single spiderdb
// request.
2013-09-30 21:10:43 +04:00
// . dump spiderdb or titledb for a collection back over "sock"
// . path looks like "GET /api/download/<coll>_urls.csv" (and the
//   _data.json / _data.csv / _pages.txt variants)
// . always returns true; httpserver.cpp ignores the return value
bool sendBackDump ( TcpSocket *sock , HttpRequest *hr ) {

	char *path    = hr->getPath();
	long  pathLen = hr->getPathLen();
	char *pathEnd = path + pathLen;

	// must be a download request
	char *str = strstr ( path , "/download/" );
	if ( ! str ) {
		char *msg = "bad download request";
		log("crawlbot: %s",msg);
		g_httpServer.sendErrorReply(sock,500,msg);
		return true;
	}

	// what rdb are we dumping, and in what format?
	// BUGFIX: "fmt" was declared uninitialized; it is read below
	// whenever rdbId gets set. every current branch that sets rdbId
	// also sets fmt, but initialize to a sentinel so a future branch
	// can never read an indeterminate value (undefined behavior).
	char  rdbId        = RDB_NONE;
	bool  downloadJSON = false;
	long  fmt          = -1;
	char *xx;
	if ( ( xx = strstr ( path , "_data.json" ) ) ) {
		rdbId        = RDB_TITLEDB;
		fmt          = FMT_JSON;
		downloadJSON = true;
	}
	else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
		rdbId        = RDB_TITLEDB;
		downloadJSON = true;
		fmt          = FMT_CSV;
	}
	else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
		rdbId = RDB_SPIDERDB;
		fmt   = FMT_CSV;
	}
	else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
		rdbId = RDB_TITLEDB;
		fmt   = FMT_TXT;
	}

	// sanity, must be one of the download calls above
	if ( rdbId == RDB_NONE ) {
		char *msg = "usage: downloadurls, downloadpages, downloaddata";
		log("crawlbot: %s",msg);
		g_httpServer.sendErrorReply(sock,500,msg);
		return true;
	}

	// collection name sits between "/download/" and the suffix "xx"
	char *coll = str + 10;
	if ( coll >= pathEnd ) {
		char *msg = "bad download request2";
		log("crawlbot: %s",msg);
		g_httpServer.sendErrorReply(sock,500,msg);
		return true;
	}
	char *collEnd = xx;

	CollectionRec *cr = g_collectiondb.getRec ( coll , collEnd - coll );
	if ( ! cr ) {
		char *msg = "token or id (crawlid) invalid";
		log("crawlbot: invalid token or crawlid to dump");
		g_httpServer.sendErrorReply(sock,500,msg);
		return true;
	}

	// . if doing a csv download of the json objects, make it search
	//   results now! build an httprequest on the stack and hand it
	//   to the results page.
	if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) {
		char tmp2[5000];
		SafeBuf sb2(tmp2,5000);
		sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&dr=0&"
			       "c=%s&n=1000000&"
			       "q=gbsortby%%3Agbspiderdate&"
			       "prepend=type%%3Ajson"
			       "\r\n\r\n"
			       , cr->m_coll
			       );
		HttpRequest hr2;
		hr2.set ( sb2.getBufStart() , sb2.length() , sock );
		return sendPageResults ( sock , &hr2 );
	}

	// allocate the download state
	StateCD *st;
	try { st = new (StateCD); }
	catch ( ... ) {
		return g_httpServer.sendErrorReply(sock,500,
						   mstrerror(g_errno));
	}
	mnew ( st , sizeof(StateCD) , "statecd" );

	// initialize the new state
	st->m_rdbId        = rdbId;
	st->m_downloadJSON = downloadJSON;
	st->m_socket       = sock;
	// the collection whose spiderdb/titledb we read from
	st->m_collnum      = cr->m_collnum;
	st->m_fmt          = fmt;
	st->m_isFirstTime  = true;

	// json array bracket bookkeeping
	st->m_printedFirstBracket  = false;
	st->m_printedItem          = false;
	st->m_printedEndingBracket = false;

	// for csv...
	st->m_needHeaderRow = true;

	// reset the reply/request pairing carry-over
	st->m_lastUh48              = 0LL;
	st->m_prevReplyUh48         = 0LL;
	st->m_prevReplyError        = 0;
	st->m_prevReplyDownloadTime = 0LL;

	// begin the possible segmented process of sending back spiderdb
	// to the user's browser
	st->sendBackDump2();
	// i dont think this return value matters at all since
	// httpserver.cpp does not look at it when it calls sendReply()
	return true;
}
void StateCD : : sendBackDump2 ( ) {
m_numRequests = 0 ;
m_numReplies = 0 ;
// read 10MB from each shard's spiderdb at a time
m_minRecSizes = 9999999 ;
// we stop reading from all shards when this becomes false
m_someoneNeedsMore = true ;
// initialize the spiderdb startkey "cursor" for each shard's spiderdb
2013-10-05 03:18:56 +04:00
for ( long i = 0 ; i < g_hostdb . m_numShards ; i + + ) {
2013-09-14 03:22:07 +04:00
m_needMore [ i ] = true ;
KEYMIN ( ( char * ) & m_spiderdbStartKeys [ i ] , sizeof ( key128_t ) ) ;
KEYMIN ( ( char * ) & m_titledbStartKeys [ i ] , sizeof ( key_t ) ) ;
}
2013-09-30 23:48:37 +04:00
subloop :
2013-09-14 03:22:07 +04:00
// begin reading from each shard and sending the spiderdb records
2013-09-30 23:48:37 +04:00
// over the network. return if that blocked
if ( ! readDataFromRdb ( ) ) return ;
// send it to the browser socket
2013-10-01 00:12:22 +04:00
if ( ! sendList ( ) ) return ;
2013-09-30 23:48:37 +04:00
// . hey, it did not block... i guess no data to send out
// . but if all shards are exhausted from the dump, just return
if ( m_someoneNeedsMore ) goto subloop ;
// note it
log ( " crawlbot: nobody needs more 1 " ) ;
2013-09-14 03:22:07 +04:00
}
2013-10-01 00:12:22 +04:00
void sendListWrapper ( void * state ) ;
2013-09-14 03:22:07 +04:00
2013-09-30 23:48:37 +04:00
bool StateCD : : readDataFromRdb ( ) {
2013-09-14 03:22:07 +04:00
// set end key to max key. we are limiting using m_minRecSizes for this
key128_t ek ; KEYMAX ( ( char * ) & ek , sizeof ( key128_t ) ) ;
CollectionRec * cr = g_collectiondb . getRec ( m_collnum ) ;
2013-09-30 23:48:37 +04:00
// top:
2013-09-14 03:22:07 +04:00
// launch one request to each shard
2013-10-05 03:18:56 +04:00
for ( long i = 0 ; i < g_hostdb . m_numShards ; i + + ) {
2013-10-26 01:54:24 +04:00
// reset each one
m_lists [ i ] . freeList ( ) ;
// if last list was exhausted don't bother
if ( ! m_needMore [ i ] ) continue ;
2013-09-14 03:22:07 +04:00
// count it
m_numRequests + + ;
// this is the least nice. crawls will yield to it mostly.
long niceness = 0 ;
// point to right startkey
char * sk ;
if ( m_rdbId = = RDB_SPIDERDB )
sk = ( char * ) & m_spiderdbStartKeys [ i ] ;
else
sk = ( char * ) & m_titledbStartKeys [ i ] ;
// get host
2013-10-05 03:18:56 +04:00
Host * h = g_hostdb . getLiveHostInShard ( i ) ;
2013-09-14 03:22:07 +04:00
// msg0 uses multicast in case one of the hosts in a shard is
// dead or dies during this call.
if ( ! m_msg0s [ i ] . getList ( h - > m_hostId , // use multicast
h - > m_ip ,
h - > m_port ,
0 , // maxcacheage
false , // addtocache?
m_rdbId ,
cr - > m_coll ,
& m_lists [ i ] ,
sk ,
( char * ) & ek ,
// get at most about
// "minRecSizes" worth of spiderdb
// records
m_minRecSizes ,
this ,
2013-10-01 00:12:22 +04:00
sendListWrapper ,
2013-09-30 23:48:37 +04:00
niceness ) ) {
log ( " crawlbot: blocked getting list from shard " ) ;
2013-09-14 03:22:07 +04:00
// continue if it blocked
continue ;
2013-09-30 23:48:37 +04:00
}
2013-11-07 05:15:29 +04:00
log ( " crawlbot: did not block getting list from shard err=%s " ,
mstrerror ( g_errno ) ) ;
2013-09-14 03:22:07 +04:00
// we got a reply back right away...
m_numReplies + + ;
}
// all done? return if still waiting on more msg0s to get their data
2013-09-30 23:48:37 +04:00
if ( m_numReplies < m_numRequests ) return false ;
// i guess did not block, empty single shard? no, must have been
2013-10-01 00:12:22 +04:00
// error becaues sendList() would have sent back on the tcp
2013-09-30 23:48:37 +04:00
// socket and blocked and returned false if not error sending
return true ;
2013-09-14 03:22:07 +04:00
}
2013-10-01 00:12:22 +04:00
void sendListWrapper ( void * state ) {
2013-09-14 03:22:07 +04:00
// get the Crawler dump State
StateCD * st = ( StateCD * ) state ;
2013-10-01 00:12:22 +04:00
// inc it up here
st - > m_numReplies + + ;
2013-09-30 23:48:37 +04:00
subloop :
// if this blocked sending back some data, return
2013-10-01 00:12:22 +04:00
if ( ! st - > sendList ( ) ) return ;
2013-09-30 23:48:37 +04:00
// otherwise, read more, maybe had no data to send from list
if ( ! st - > readDataFromRdb ( ) ) return ;
// send and read more
if ( st - > m_someoneNeedsMore ) goto subloop ;
// note it
log ( " crawlbot: nobody needs more 2 " ) ;
2013-09-14 03:22:07 +04:00
}
2013-10-01 00:12:22 +04:00
// . called once all outstanding Msg0 replies are in
// . prints each shard's list into "sb", advances that shard's startkey
//   cursor, then writes "sb" to the browser socket — either via
//   tcp->sendMsg() the first time (with the mime) or by hand-loading
//   the socket's m_sendBuf for subsequent chunks
// . returns false if still awaiting replies or if the send blocked
//   (doneSendingWrapper resumes us); true if it completed inline
bool StateCD::sendList ( ) {
	// get the Crawler dump State
	// inc it
	//m_numReplies++;
	// show it
	log("crawlbot: got list from shard. req=%li rep=%li",
	    m_numRequests , m_numReplies );
	// return if still awaiting more replies
	if ( m_numReplies < m_numRequests ) return false;

	SafeBuf sb;
	//sb.setLabel("dbotdmp");

	// pick the reply content type from the download format
	char *ct = "text/csv";
	if ( m_fmt == FMT_JSON )
		ct = "application/json";
	if ( m_fmt == FMT_XML )
		ct = "text/xml";
	if ( m_fmt == FMT_TXT )
		ct = "text/plain";
	if ( m_fmt == FMT_CSV )
		ct = "text/csv";

	// . if we haven't yet sent an http mime back to the user
	//   then do so here, the content-length will not be in there
	//   because we might have to call for more spiderdb data
	if ( m_needsMime ) {
		HttpMime mime;
		mime.makeMime ( -1 ,    // total content-length is unknown!
				0 ,     // do not cache (cacheTime)
				0 ,     // lastModified
				0 ,     // offset
				-1 ,    // bytesToSend
				NULL ,  // ext
				false , // POSTReply
				ct ,    // "text/csv", // contenttype
				"utf-8" , // charset
				-1 ,    // httpstatus
				NULL ); // cookie
		sb.safeMemcpy ( mime.getMime() , mime.getMimeLen() );
	}

	//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	// open the json array exactly once
	if ( ! m_printedFirstBracket && m_fmt == FMT_JSON ) {
		sb.safePrintf("[\n");
		m_printedFirstBracket = true;
	}

	// we set this to true below if any one shard has more spiderdb
	// records left to read
	m_someoneNeedsMore = false;
	//
	// got all replies... create the HTTP reply and send it back
	//
	for ( long i = 0 ; i < g_hostdb.m_numShards ; i++ ) {
		if ( ! m_needMore[i] ) continue;
		// get the list from that group
		RdbList *list = &m_lists[i];
		// should we try to read more?
		m_needMore[i] = false;
		if ( list->isEmpty() ) {
			list->freeList();
			continue;
		}
		// get the format
		//char *format = cr->m_diffbotFormat.getBufStart();
		//if ( cr->m_diffbotFormat.length() <= 0 ) format = NULL;
		//char *format = NULL;

		// this cores because msg0 does not transmit lastkey, so
		// the print functions report the last key they saw instead
		//char *ek = list->getLastKey();
		char *lastKeyPtr = NULL;
		// now print the spiderdb list out into "sb"
		if ( m_rdbId == RDB_SPIDERDB ) {
			// print SPIDERDB list into "sb"
			printSpiderdbList ( list , &sb , &lastKeyPtr );
			// update spiderdb startkey for this shard
			KEYSET ( (char *)&m_spiderdbStartKeys[i] ,
				 lastKeyPtr ,
				 sizeof(key128_t) );
			// advance by 1
			m_spiderdbStartKeys[i] += 1;
		}
		else if ( m_rdbId == RDB_TITLEDB ) {
			// print TITLEDB list into "sb"
			printTitledbList ( list , &sb , &lastKeyPtr );
			// update titledb startkey for this shard
			KEYSET ( (char *)&m_titledbStartKeys[i] ,
				 lastKeyPtr ,
				 sizeof(key_t) );
			// advance by 1
			m_titledbStartKeys[i] += 1;
		}
		// should never happen: unknown rdbId. deliberate crash.
		else { char *xx = NULL; *xx = 0; }
		// figure out why we do not get the full list????
		//if ( list->m_listSize >= 0 ) { // m_minRecSizes ) {
		m_needMore[i] = true;
		m_someoneNeedsMore = true;
		//}
		// save mem
		list->freeList();
	}

	// if nobody needs to read more, close the json array
	if ( m_rdbId == RDB_TITLEDB &&
	     m_fmt == FMT_JSON &&
	     ! m_someoneNeedsMore &&
	     ! m_printedEndingBracket ) {
		m_printedEndingBracket = true;
		// end array of json objects. might be empty!
		sb.safePrintf("\n]");
	}

	// if first time, send it back
	if ( m_needsMime ) {
		// only do once
		m_needsMime = false;
	sendLoop:
		// start the send process
		TcpServer *tcp = &g_httpServer.m_tcp;
		if ( ! tcp->sendMsg ( m_socket ,
				      sb.getBufStart() , // sendBuf
				      sb.getCapacity() , // sendBufSize
				      sb.length() ,      // sendBufUsed
				      sb.length() ,      // msgtotalsize
				      this ,             // data for callback
				      doneSendingWrapper ) ) { // callback
			// do not free sendbuf we are transmitting it
			sb.detachBuf();
			// blocked; doneSendingWrapper will resume us
			return false;
		}
		// error?
		//TcpSocket *s = m_socket;
		// sometimes it does not block and is successful because
		// it just writes its buffer out in one write call.
		//if ( ! g_errno )
		sb.detachBuf();
		// log it
		//log("crawlbot: nuking state. strange");
		// nuke state
		//delete this;
		//mdelete ( this , sizeof(StateCD) , "stcd" );
		//if ( g_errno )
		log("diffbot: tcp sendmsg did not block: %s",
		    mstrerror(g_errno));
		//g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
		// wait for doneSendingWrapper to be called.
		//return false;
		//
		// it did not block... so just keep going. that just
		// means the socket sent the data. it's probably buffered.
		//
		// but we DO have to free the sendbuffer here since
		// we did not block
		mfree ( m_socket->m_sendBuf ,
			m_socket->m_sendBufSize ,
			"dbsbuf" );
		m_socket->m_sendBuf = NULL;
		return true;
	}
	// if nothing to send back we are done. return true since we
	// did not block sending back.
	if ( sb.length() == 0 ) {
		//log("crawlbot: nuking state.");
		//delete this;
		//mdelete ( this , sizeof(StateCD) , "stcd" );
		return true;
	}
	// how can this be? any previous send buffer should have been
	// freed by doneSendingWrapper() by now. deliberate crash.
	if ( m_socket->m_sendBuf ) { char *xx = NULL; *xx = 0; }
	// put socket in sending-again mode by loading its buffer by hand
	m_socket->m_sendBuf     = sb.getBufStart();
	m_socket->m_sendBufSize = sb.getCapacity();
	m_socket->m_sendBufUsed = sb.length();
	m_socket->m_sendOffset  = 0;
	m_socket->m_totalSent   = 0;
	m_socket->m_totalToSend = sb.length();
	// tell TcpServer.cpp to send this latest buffer! HACK!
	//m_socket->m_sockState = ST_SEND_AGAIN;//ST_WRITING;//SEND_AGAIN;

	// this does nothing if we were not called indirectly by
	// TcpServer::writeSocketWrapper_r(). so if we should call
	// sendMsg() ourselves in such a situation.
	// so if the sendMsg() did not block, the first time, and we came
	// here empty except for the ending ']' the 2nd time, then
	// write it out this way... calling sendMsg() directly
	if ( m_socket->m_sockState == ST_NEEDS_CLOSE ) {
		//m_socket->m_sockState = ST_SEND_AGAIN;
		goto sendLoop;
	}
	// do not let safebuf free this, we will take care of it
	sb.detachBuf();
	// . when it is done sending call this callback, don't hang up!
	// . if m_someoneNeedsMore is false then this callback should just
	//   destroy the socket and delete "this"
	m_socket->m_callback = doneSendingWrapper;
	m_socket->m_state    = this;

	//if ( m_socket->m_sendBufUsed == 79 )
	//	log("hey");
	// log it
	log("crawlbot: resending %li bytes on socket",
	    m_socket->m_sendBufUsed );
	// we blocked sending back
	return false;
}
// . TcpServer.cpp calls this when done sending TcpSocket's m_sendBuf
// . also fired by the tcp timer with m_sendBuf == NULL
// . frees the just-sent buffer and either destroys the state (dump
//   finished) or pumps another readDataFromRdb()/sendList() cycle
void doneSendingWrapper ( void *state , TcpSocket *sock ) {
	StateCD *st = (StateCD *)state;
	TcpSocket *socket = st->m_socket;

	log("crawlbot: done sending on socket %li/%li bytes",
	    sock->m_totalSent ,
	    sock->m_sendBufUsed );

	// . if the final callback
	// . sometimes m_sendBuf is NULL if we freed it below and tried to
	//   read more, only to read 0 bytes
	// . but it will be non-null if we read 0 bytes the first time
	//   and just have a mime to send. because sendReply() above
	//   returned true, and then doneSendingWrapper() got called.
	if ( //! socket->m_sendBuf &&
	     st->m_numRequests <= st->m_numReplies &&
	     ! st->m_someoneNeedsMore ) {
		log("crawlbot: done sending for download request");
		// NOTE(review): mdelete() is called AFTER delete here;
		// elsewhere the accounting call precedes delete — verify
		// mdelete does not dereference the freed pointer
		delete st;
		mdelete ( st , sizeof(StateCD) , "stcd" );
		//log("mdel1: st=%lx",(long)st);
		return;
	}
	// if the timer called us, just return
	if ( ! socket->m_sendBuf ) {
		log("crawlbot: timer callback");
		socket->m_sockState = ST_SEND_AGAIN;
		return;
	}
	// free the old sendbuf then i guess since we might replace it
	// in the above function.
	mfree ( socket->m_sendBuf ,
		socket->m_sendBufSize ,
		"dbsbuf" );
	// in case we have nothing to send back do not let socket free
	// what we just freed above. it'll core.
	socket->m_sendBuf = NULL;
	// sometimes this wrapper is called just from the timer...
	// so if we have outstanding msg0s then we gotta wait
	if ( st->m_numRequests > st->m_numReplies ) {
		// deliberate crash sentinel: this path should be
		// unreachable, so fault loudly rather than continue
		char *xx = NULL; *xx = 0;
		socket->m_sockState = ST_SEND_AGAIN;
		return;
	}
	// all done?
	if ( st->m_someoneNeedsMore ) {
		// make sure socket doesn't close up on us!
		socket->m_sockState = ST_SEND_AGAIN;
		log("crawlbot: reading more download data");
		// just enter the little loop here
	subloop:
		// otherwise, read more, maybe had no data to send from list
		if ( ! st->readDataFromRdb() ) return;
		// if this blocked sending back some data, return
		if ( ! st->sendList() ) return;
		// note that
		log("crawlbot: sendList did not block");
		// send and read more
		if ( st->m_someoneNeedsMore ) goto subloop;
		// note it
		log("crawlbot: nobody needs more 3");
		// sanity
		if ( st->m_numRequests > st->m_numReplies ) { char *xx = NULL; *xx = 0; }
	}
	log("crawlbot: no more data available");
	// it's possible that readDataFromRdb() did not block and called
	// sendList which set the socket m_sendBuf again... so check
	// for that... it needs to be sent yet before we delete this state
	//if ( st->m_socket->m_sendBuf ) return;
}
2013-10-26 01:54:24 +04:00
// . format one shard's spiderdb list as csv rows (or json objects) into
//   "sb", one row per unique url
// . spiderdb interleaves SpiderReply records before the SpiderRequest
//   records for the same url, so reply info is carried in the
//   m_prevReply* members until the matching request is printed
// . *lastKeyPtr is set to the last record seen so the caller can
//   advance its startkey cursor
void StateCD::printSpiderdbList ( RdbList *list , SafeBuf *sb , char **lastKeyPtr ) {
	// declare these up here
	SpiderRequest *sreq = NULL;
	SpiderReply   *srep = NULL;
	long badCount = 0;

	long nowGlobalMS = gettimeofdayInMillisecondsGlobal();
	CollectionRec *cr = g_collectiondb.getRec ( m_collnum );

	// most recent spider time seen for the url being scanned
	long lastSpidered = 0;
	// parse through it
	for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) {
		// this record is either a SpiderRequest or SpiderReply
		char *rec = list->getCurrentRec();
		// save it for the caller's cursor update
		*lastKeyPtr = rec;
		// we encounter the spiderreplies first then the
		// spiderrequests for the same url
		if ( g_spiderdb.isSpiderReply ( (key128_t *)rec ) ) {
			srep = (SpiderReply *)rec;
			// a reply following a request means a new url
			// started, so restart the timestamp tracking
			if ( sreq ) lastSpidered = 0;
			sreq = NULL;
			// track the most recent spidered time
			if ( lastSpidered == 0 )
				lastSpidered = srep->m_spideredTime;
			else if ( srep->m_spideredTime > lastSpidered )
				lastSpidered = srep->m_spideredTime;
			m_prevReplyUh48 = srep->getUrlHash48();
			// 0 means indexed successfully. not sure if
			// this includes http status codes like 404 etc.
			// i don't think it includes those types of errors!
			m_prevReplyError = srep->m_errCode;
			m_prevReplyDownloadTime = srep->m_spideredTime;
			continue;
		}
		// ok, we got a spider request
		sreq = (SpiderRequest *)rec;
		// sanity check
		if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() ) {
			badCount++;
			//log("diffbot: had a spider reply with no "
			//    "corresponding spider request for uh48=%lli"
			//    , srep->getUrlHash48());
			//char *xx=NULL;*xx=0;
		}
		// print the url if not yet printed
		long long uh48 = sreq->getUrlHash48();
		bool printIt = false;
		// there can be multiple spiderrequests for the same url!
		if ( m_lastUh48 != uh48 ) printIt = true;
		if ( ! printIt ) continue;
		m_lastUh48 = uh48;

		// make sure spiderreply is for the same url!
		if ( srep && srep->getUrlHash48() != sreq->getUrlHash48() )
			srep = NULL;
		if ( ! srep )
			lastSpidered = 0;

		// was this url's content sent to diffbot for processing?
		bool isProcessed = false;
		if ( srep ) isProcessed = srep->m_sentToDiffbot;
		if ( srep && srep->m_hadDiffbotError )
			isProcessed = false;
		// debug point
		//if ( strstr(sreq->m_url,"chief") )
		//	log("hey");
		// 1 means spidered, 0 means not spidered, -1 means error
		long status = 1;
		// if unspidered, then we don't match the prev reply
		// so set "status" to 0 to indicate hasn't been
		// downloaded yet.
		if ( m_lastUh48 != m_prevReplyUh48 ) status = 0;
		// if it matches, perhaps an error spidering it?
		if ( status && m_prevReplyError ) status = -1;

		// use the time it was added to spiderdb if the url
		// was not spidered
		time_t time = sreq->m_addedTime;
		// if it was spidered, successfully or got an error,
		// then use the time it was spidered
		// NOTE(review): "time" is computed but never printed
		// below — the csv row prints m_addedTime/lastSpidered
		// directly; confirm whether this was meant to be used
		if ( status ) time = m_prevReplyDownloadTime;

		char *msg = "Successfully Downloaded";//Crawled";
		if ( status ==  0 ) msg = "Not downloaded";//Unexamined";
		if ( status == -1 ) msg = mstrerror(m_prevReplyError);
		if ( srep && srep->m_hadDiffbotError )
			msg = "Diffbot processing error";

		// matching url filter, print out the expression
		long ufn;
		ufn = ::getUrlFilterNum ( sreq ,
					  srep ,
					  nowGlobalMS ,
					  false ,
					  MAX_NICENESS ,
					  cr );
		char *expression = NULL;
		long priority = -4;
		// sanity check
		if ( ufn >= 0 ) {
			expression = cr->m_regExs[ufn].getBufStart();
			priority   = cr->m_spiderPriorities[ufn];
		}
		if ( ! expression ) {
			expression = "error. matches no expression!";
			priority = -4;
		}
		// when spidering rounds we use the
		// lastspidertime>={roundstart} --> spiders disabled rule
		// so that we do not spider a url twice in the same round
		if ( ufn >= 0 && ! cr->m_spidersEnabled[ufn] ) {
			priority = -5;
		}

		// how did the url get into spiderdb?
		char *as = "discovered";
		if ( sreq &&
		     ( sreq->m_isInjecting ||
		       sreq->m_isAddUrl ) ) {
			as = "manually added";
		}
		// print column headers?
		if ( m_isFirstTime ) {
			m_isFirstTime = false;
			sb->safePrintf("\"Url\","
				       "\"Entry Method\","
				       "\"Processed?\","
				       "\"Add Time\","
				       "\"Last Crawled\","
				       "\"Last Status\","
				       "\"Matching Expression\","
				       "\"Matching Action\"\n");
		}
		// "csv" is default if json not specified
		if ( m_fmt == FMT_JSON )
			sb->safePrintf("[{"
				       "{\"url\":"
				       "\"%s\"},"
				       "{\"time\":"
				       "\"%lu\"},"
				       "{\"status\":"
				       "\"%li\"},"
				       "{\"statusMsg\":"
				       "\"%s\"}"
				       "}]\n"
				       , sreq->m_url
				       // when was it first added to
				       // spiderdb?
				       , sreq->m_addedTime
				       , status
				       , msg
				       );
		// but default to csv
		else {
			sb->safePrintf("\"%s\",\"%s\","
				       "%li,%lu,%lu,\"%s\",\"%s\",\""
				       //",%s"
				       //"\n"
				       , sreq->m_url
				       , as
				       , (long)isProcessed
				       // when was it first added to
				       // spiderdb?
				       , sreq->m_addedTime
				       // last time spidered, 0 if none
				       , lastSpidered
				       //, status
				       , msg
				       // the url filter expression it
				       // matches
				       , expression
				       // the priority
				       //, priorityMsg
				       //, iptoa(sreq->m_firstIp)
				       );
			// print priority as the "Matching Action" column
			if ( priority == SPIDER_PRIORITY_FILTERED )
				sb->safePrintf("url ignored");
			else if ( priority == SPIDER_PRIORITY_BANNED )
				sb->safePrintf("url banned");
			else if ( priority == -4 )
				sb->safePrintf("error");
			else if ( priority == -5 )
				sb->safePrintf("will spider next round");
			else
				sb->safePrintf("%li",priority);
			// close the quoted column and end the row
			sb->safePrintf("\""
				       "\n");
		}
	}
	if ( ! badCount ) return;
	log("diffbot: had a spider reply with no "
	    "corresponding spider request %li times", badCount );
}
2013-10-26 01:54:24 +04:00
void StateCD : : printTitledbList ( RdbList * list , SafeBuf * sb , char * * lastKeyPtr ) {
2013-09-14 03:22:07 +04:00
XmlDoc xd ;
CollectionRec * cr = g_collectiondb . getRec ( m_collnum ) ;
2013-11-04 23:29:22 +04:00
// save it
* lastKeyPtr = NULL ;
2013-09-14 03:22:07 +04:00
// parse through it
for ( ; ! list - > isExhausted ( ) ; list - > skipCurrentRec ( ) ) {
// this record is either a SpiderRequest or SpiderReply
char * rec = list - > getCurrentRec ( ) ;
// skip ifnegative
if ( ( rec [ 0 ] & 0x01 ) = = 0x00 ) continue ;
2013-11-04 23:29:22 +04:00
// set it
* lastKeyPtr = rec ;
2013-10-01 00:25:33 +04:00
// reset first since set2() can't call reset()
xd . reset ( ) ;
2013-09-14 03:22:07 +04:00
// uncompress it
if ( ! xd . set2 ( rec ,
0 , // maxSize unused
cr - > m_coll ,
NULL , // ppbuf
0 , // niceness
NULL ) ) { // spiderRequest
log ( " diffbot: error setting titlerec in dump " ) ;
continue ;
}
// must be of type json to be a diffbot json object
2013-09-17 21:25:54 +04:00
if ( m_downloadJSON & & xd . m_contentType ! = CT_JSON ) continue ;
// or if downloading web pages...
if ( ! m_downloadJSON ) {
// skip if json object content type
if ( xd . m_contentType = = CT_JSON ) continue ;
// . just print the cached page
// . size should include the \0
sb - > safeStrcpy ( xd . m_firstUrl . m_url ) ;
// then \n
sb - > pushChar ( ' \n ' ) ;
// then page content
sb - > safeStrcpy ( xd . ptr_utf8Content ) ;
// null term just in case
//sb->nullTerm();
// separate pages with \0 i guess
sb - > pushChar ( ' \0 ' ) ;
// \n
sb - > pushChar ( ' \n ' ) ;
continue ;
}
2013-09-14 03:22:07 +04:00
// skip if not a diffbot json url
2013-09-19 04:05:41 +04:00
if ( ! xd . m_isDiffbotJSONObject ) continue ;
2013-09-14 03:22:07 +04:00
// get the json content
char * json = xd . ptr_utf8Content ;
2013-11-13 01:51:52 +04:00
// empty?
if ( xd . size_utf8Content < = 1 )
continue ;
2013-11-04 23:29:22 +04:00
2013-11-13 01:51:52 +04:00
// if not json, just print the json item out in csv
2013-11-13 04:33:45 +04:00
// moved into PageResults.cpp...
//if ( m_fmt == FMT_CSV ) {
// printJsonItemInCsv ( json , sb );
// continue;
//}
2013-11-04 23:29:22 +04:00
2013-09-14 03:22:07 +04:00
// just print that out. encode \n's and \r's back to \\n \\r
// and backslash to a \\ ...
// but if they originally had a \u<backslash> encoding and
// we made into utf8, do not put that back into the \u
// encoding because it is not necessary.
2013-11-13 01:51:52 +04:00
// print in json
if ( m_printedItem )
sb - > safePrintf ( " \n , \n " ) ;
m_printedItem = true ;
2013-09-14 03:22:07 +04:00
if ( ! sb - > safeStrcpyPrettyJSON ( json ) )
log ( " diffbot: error printing json in dump " ) ;
// separate each JSON object with \n i guess
2013-11-04 23:29:22 +04:00
//sb->pushChar('\n');
2013-11-13 01:51:52 +04:00
}
}
2013-09-17 21:25:54 +04:00
/*
2013-09-14 03:22:07 +04:00
////////////////
//
// SUPPORT FOR GET /api/crawls and /api/activecrawls
//
// Just scan each collection record whose collection name includes the
// provided "token" of the user. then print out the stats of just
//
////////////////
// example output for http://live.diffbot.com/api/crawls?token=matt
// [{"id":"c421f09d-7c31-4131-9da2-21e35d8130a9","finish":1378233585887,"matched":274,"status":"Stopped","start":1378233159848,"token":"matt","parameterMap":{"token":"matt","seed":"www.techcrunch.com","api":"article"},"crawled":274}]
// example output from activecrawls?id=....
// {"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":null,"matched":27,"status":"Crawling","start":1378322184332,"token":"matt","parameterMap":{"token":"matt","seed":"www.alleyinsider.com","api":"article"},"crawled":34}
// NOTE: it does not seem to include active crawls! bad!! like if you lost
// the crawlid...
// "cr" is NULL if showing all crawls!
bool showAllCrawls ( TcpSocket * s , HttpRequest * hr ) {
long tokenLen = 0 ;
char * token = hr - > getString ( " token " , & tokenLen ) ;
// token MUST be there because this function's caller checked for it
if ( ! token ) { char * xx = NULL ; * xx = 0 ; }
// store the crawl stats as html into "sb"
SafeBuf sb ;
// scan the collection recs
for ( long i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) {
// get it
CollectionRec * cr = g_collectiondb . m_recs [ i ] ;
// skip if empty
if ( ! cr ) continue ;
// get name
char * coll = cr - > m_coll ;
//long collLen = cr->m_collLen;
// skip if first 16 or whatever characters does not match
// the user token because the name of a collection is
// <TOKEN>-<CRAWLID>
if ( coll [ 0 ] ! = token [ 0 ] ) continue ;
if ( coll [ 1 ] ! = token [ 1 ] ) continue ;
if ( coll [ 2 ] ! = token [ 2 ] ) continue ;
// scan the rest
bool match = true ;
for ( long i = 3 ; coll [ i ] & & token [ i ] ; i + + ) {
// the name of a collection is <TOKEN>-<CRAWLID>
// so if we hit the hyphen we are done
if ( coll [ i ] = = ' - ' ) break ;
if ( coll [ i ] ! = token [ i ] ) { match = false ; break ; }
}
if ( ! match ) continue ;
// we got a match, print them out
printCrawlStats ( & sb , cr ) ;
}
// and send back now
return g_httpServer . sendDynamicPage ( s , sb . getBufStart ( ) ,
sb . length ( ) ,
2013-09-17 21:25:54 +04:00
- 1 ) ; // cachetime
2013-09-14 03:22:07 +04:00
}
2013-09-17 21:25:54 +04:00
*/
2013-09-14 03:22:07 +04:00
2013-10-16 01:08:55 +04:00
/*
2013-09-18 02:32:28 +04:00
char * getTokenFromHttpRequest ( HttpRequest * hr ) {
2013-09-17 22:27:31 +04:00
// provided directly?
2013-09-18 02:32:28 +04:00
char * token = hr - > getString ( " token " , NULL , NULL ) ;
2013-09-17 22:27:31 +04:00
if ( token ) return token ;
// extract token from coll?
char * c = hr - > getString ( " c " , NULL , NULL ) ;
2013-09-27 01:32:11 +04:00
// try new "id" approach
if ( ! c ) c = hr - > getString ( " id " , NULL , NULL ) ;
2013-09-17 22:27:31 +04:00
if ( ! c ) return NULL ;
2013-09-18 02:32:28 +04:00
CollectionRec * cr = g_collectiondb . getRec ( c ) ;
if ( ! cr ) return NULL ;
if ( cr - > m_diffbotToken . length ( ) < = 0 ) return NULL ;
token = cr - > m_diffbotToken . getBufStart ( ) ;
return token ;
2013-09-17 22:27:31 +04:00
}
2013-09-17 21:25:54 +04:00
CollectionRec * getCollRecFromHttpRequest ( HttpRequest * hr ) {
2013-09-17 22:27:31 +04:00
// if we have the collection name explicitly, get the coll rec then
2013-09-17 21:25:54 +04:00
char * c = hr - > getString ( " c " , NULL , NULL ) ;
2013-09-27 01:32:11 +04:00
// try new "id" approach
if ( ! c ) c = hr - > getString ( " id " , NULL , NULL ) ;
2013-09-17 21:25:54 +04:00
if ( c ) return g_collectiondb . getRec ( c ) ;
// no matches
return NULL ;
}
2013-10-16 01:08:55 +04:00
*/
2013-09-17 21:25:54 +04:00
/*
2013-09-14 03:22:07 +04:00
// doesn't have to be fast, so just do a scan
CollectionRec * getCollRecFromCrawlId ( char * crawlId ) {
long idLen = gbstrlen ( crawlId ) ;
// scan collection names
for ( long i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) {
// get it
CollectionRec * cr = g_collectiondb . m_recs [ i ] ;
// skip if empty
if ( ! cr ) continue ;
// get name
char * coll = cr - > m_coll ;
long collLen = cr - > m_collLen ;
if ( collLen < 16 ) continue ;
// skip if first 16 or whatever characters does not match
// the user token because the name of a collection is
// <TOKEN>-<CRAWLID>
if ( coll [ collLen - 1 ] ! = crawlId [ idLen - 1 ] ) continue ;
if ( coll [ collLen - 2 ] ! = crawlId [ idLen - 2 ] ) continue ;
if ( coll [ collLen - 3 ] ! = crawlId [ idLen - 3 ] ) continue ;
if ( ! strstr ( coll , crawlId ) ) continue ;
return cr ;
}
return NULL ;
}
void printCrawlStatsWrapper ( void * state ) {
StateXX * sxx = ( StateXX * ) state ;
// get collection rec
CollectionRec * cr = g_collectiondb . getRec ( sxx - > m_collnum ) ;
// print out the crawl
SafeBuf sb ;
printCrawlStats ( & sb , cr ) ;
// save before nuking state
TcpSocket * sock = sxx - > m_socket ;
// nuke the state
delete sxx ;
mdelete ( sxx , sizeof ( StateXX ) , " stxx " ) ;
// and send back now
g_httpServer . sendDynamicPage ( sock ,
sb . getBufStart ( ) ,
sb . length ( ) ,
2013-09-17 21:25:54 +04:00
- 1 ) ; // cachetime
2013-09-14 03:22:07 +04:00
}
void printCrawlStats ( SafeBuf * sb , CollectionRec * cr ) {
// if we are the first, print a '[' to start a json thingy
if ( sb - > length ( ) = = 0 )
sb - > pushChar ( ' [ ' ) ;
// otherwise, remove the previous ']' since we are not the last
else {
char * p = sb - > getBufStart ( ) ;
long plen = sb - > length ( ) ;
if ( p [ plen - 1 ] = = ' [ ' )
sb - > incrementLength ( - 1 ) ;
}
sb - > safePrintf ( " { "
" \" id \" : \" "
) ;
// get the token from coll name
char * token = cr - > m_coll ;
// and the length, up to the hyphen that separates it from crawl id
long tokenLen = 0 ;
for ( ; token [ tokenLen ] & & token [ tokenLen ] ! = ' - ' ; tokenLen + + ) ;
// now crawl id
char * crawlId = token + tokenLen ;
// skip hyphen
if ( crawlId [ 0 ] = = ' - ' ) crawlId + + ;
// print crawl id out
sb - > safeStrcpy ( crawlId ) ;
// end its quote
sb - > safeStrcpy ( " \" , " ) ;
// now the time the crawl finished.
if ( cr - > m_spideringEnabled )
sb - > safePrintf ( " \" finish \" :null, " ) ;
else
sb - > safePrintf ( " \" finish \" :%lli, " , cr - > m_diffbotCrawlEndTime ) ;
// how many urls we handoff to diffbot api. that implies successful
// download and that it matches the url crawl pattern and
// url process pattern and content regular expression pattern.
//
// NOTE: pageProcessAttempts can be higher than m_pageDownloadAttempts
// when we call getMetaList() on an *old* (in titledb) xmldoc,
// where we just get the cached content from titledb to avoid a
// download, but we still call getDiffbotReply(). perhaps reconstruct
// the diffbot reply from XmlDoc::m_diffbotJSONCount
//
// "processed" here corresponds to the "maxProcessed" cgi parm
// specified when instantiating the crawl parms for the first time.
//
// likewise "crawled" corresponds to "maxCrawled"
//
sb - > safePrintf ( " \" processedAttempts \" :%lli, " ,
cr - > m_globalCrawlInfo . m_pageProcessAttempts ) ;
sb - > safePrintf ( " \" processed \" :%lli, " ,
cr - > m_globalCrawlInfo . m_pageProcessSuccesses ) ;
sb - > safePrintf ( " \" crawlAttempts \" :%lli, " ,
cr - > m_globalCrawlInfo . m_pageDownloadAttempts ) ;
sb - > safePrintf ( " \" crawled \" :%lli, " ,
cr - > m_globalCrawlInfo . m_pageDownloadSuccesses ) ;
sb - > safePrintf ( " \" urlsConsidered \" :%lli, " ,
cr - > m_globalCrawlInfo . m_urlsConsidered ) ;
// how many spiders outstanding for this coll right now?
SpiderColl * sc = g_spiderCache . getSpiderColl ( cr - > m_collnum ) ;
long spidersOut = sc - > getTotalOutstandingSpiders ( ) ;
// . status of the crawl: "Stopped" or "Active"?
// . TODO: check with dan to see if Active is correct and
// ShuttingDown is allowable
if ( cr - > m_spideringEnabled )
sb - > safePrintf ( " \" status \" : \" Active \" , " ) ;
else if ( spidersOut )
sb - > safePrintf ( " \" status \" : \" ShuttingDown \" , " ) ;
else
sb - > safePrintf ( " \" status \" : \" Stopped \" , " ) ;
// spider crawl start time
sb - > safePrintf ( " \" start \" :%lli, " , cr - > m_diffbotCrawlStartTime ) ;
// the token
sb - > safePrintf ( " \" token \" : \" " ) ;
sb - > safeMemcpy ( token , tokenLen ) ;
sb - > safePrintf ( " \" , " ) ;
//
// BEGIN parameter map
//
// the token again
sb - > safePrintf ( " { " ) ;
sb - > safePrintf ( " \" token \" : \" " ) ;
sb - > safeMemcpy ( token , tokenLen ) ;
sb - > safePrintf ( " \" , " ) ;
// the seed url
sb - > safePrintf ( " \" seed \" : \" %s \" , " , cr - > m_diffbotSeed . getBufStart ( ) ) ;
// the api
sb - > safePrintf ( " \" api \" : \" %s \" , " , cr - > m_diffbotApi . getBufStart ( ) ) ;
sb - > safePrintf ( " }, " ) ;
//
// END parameter map
//
// crawl count. counts non-errors. successful downloads.
//sb->safePrintf("\"crawled\":%lli",
// cr->m_globalCrawlInfo.m_pageCrawlAttempts);
sb - > safePrintf ( " } " ) ;
// assume we are the last json object in the array
sb - > pushChar ( ' ] ' ) ;
}
2013-09-17 21:25:54 +04:00
*/
2013-09-14 03:22:07 +04:00
////////////////
//
// **** THE CRAWLBOT CONTROL PANEL *****
//
// . Based on http://diffbot.com/dev/crawl/ page.
// . got to /dev/crawl to see this!
//
////////////////
2013-09-27 20:49:24 +04:00
/*
2013-09-14 03:22:07 +04:00
// generate a random collection name
2013-09-18 02:32:28 +04:00
char * getNewCollName ( ) { // char *token , long tokenLen ) {
2013-09-14 03:22:07 +04:00
// let's create a new crawl id. dan was making it 32 characters
// with 4 hyphens in it for a total of 36 bytes, but since
// MAX_COLL_LEN, the maximum length of a collection name, is just
// 64 bytes, and the token is already 32, let's limit to 16 bytes
// for the crawlerid. so if we print that out in hex, 16 hex chars
// 0xffffffff 0xffffffff is 64 bits. so let's make a random 64-bit
// value here.
unsigned long r1 = rand ( ) ;
unsigned long r2 = rand ( ) ;
unsigned long long crawlId64 = ( unsigned long long ) r1 ;
crawlId64 < < = 32 ;
crawlId64 | = r2 ;
static char s_collBuf [ MAX_COLL_LEN + 1 ] ;
2013-09-17 22:27:31 +04:00
//long tokenLen = gbstrlen(token);
2013-09-14 03:22:07 +04:00
// include a +5 for "-test"
// include 16 for crawlid (16 char hex #)
2013-09-18 02:32:28 +04:00
//if ( tokenLen + 16 + 5>= MAX_COLL_LEN ) { char *xx=NULL;*xx=0;}
2013-09-14 03:22:07 +04:00
// ensure the crawlid is the full 16 characters long so we
// can quickly extricate the crawlid from the collection name
2013-09-18 02:32:28 +04:00
//memcpy ( s_collBuf, token, tokenLen );
//sprintf(s_collBuf + tokenLen ,"-%016llx",crawlId64);
sprintf ( s_collBuf , " %016llx " , crawlId64 ) ;
2013-09-14 03:22:07 +04:00
return s_collBuf ;
}
2013-09-27 20:49:24 +04:00
*/
2013-09-14 03:22:07 +04:00
2013-09-26 01:37:20 +04:00
//////////////////////////////////////////
//
// MAIN API STUFF I GUESS
//
//////////////////////////////////////////
2013-09-14 03:22:07 +04:00
2013-09-27 20:49:24 +04:00
// Send a success reply back to the browser, formatted as JSON or HTML
// depending on "fmt". The message is also written to the log. Returns
// whatever g_httpServer.sendDynamicPage() returns.
bool sendReply2 ( TcpSocket *socket , long fmt , char *msg ) {
	// log it
	log("crawlbot: %s",msg);
	// assume html output
	char *contentType = "text/html";
	// compose the reply body
	SafeBuf reply;
	if ( fmt == FMT_JSON ) {
		contentType = "application/json";
		reply.safePrintf("{\n\"response\":\"success\",\n"
				 "\"message\":\"%s\"\n}\n"
				 , msg );
	}
	else {
		reply.safePrintf("<html><body>"
				 "success: %s"
				 "</body></html>"
				 , msg );
	}
	// ship it back over the socket
	return g_httpServer.sendDynamicPage ( socket ,
					      reply.getBufStart() ,
					      reply.length() ,
					      0 , // cachetime
					      false , // POST reply?
					      contentType );
}
2013-09-26 03:12:01 +04:00
// Send an error reply back to the browser, formatted as JSON or HTML
// depending on "fmt". Logs the error message once (the original logged
// it twice: once at entry and again just before sending the page).
// Returns whatever g_httpServer.sendDynamicPage() returns.
bool sendErrorReply2 ( TcpSocket *socket , long fmt , char *msg ) {

	// log it
	log("crawlbot: %s",msg);

	char *ct = "text/html";

	// send this back to browser
	SafeBuf sb;
	if ( fmt == FMT_JSON ) {
		sb.safePrintf("{\"error\":\"%s\"}\n"
			      , msg );
		ct = "application/json";
	}
	else
		sb.safePrintf("<html><body>"
			      "failed: %s"
			      "</body></html>"
			      , msg );

	return g_httpServer.sendDynamicPage ( socket ,
					      sb.getBufStart() ,
					      sb.length() ,
					      0 , // cachetime
					      false , // POST reply?
					      ct );
}
2013-10-02 01:14:39 +04:00
bool printCrawlBotPage2 ( class TcpSocket * s ,
class HttpRequest * hr ,
char fmt ,
class SafeBuf * injectionResponse ,
class SafeBuf * urlUploadResponse ,
collnum_t collnum ) ;
2013-09-26 01:37:20 +04:00
void addedUrlsToSpiderdbWrapper ( void * state ) {
StateCD * st = ( StateCD * ) state ;
SafeBuf rr ;
2013-10-30 02:26:32 +04:00
rr . safePrintf ( " Successfully added urls for spidering. " ) ;
2013-09-26 01:37:20 +04:00
printCrawlBotPage2 ( st - > m_socket ,
& st - > m_hr ,
st - > m_fmt ,
NULL ,
2013-09-27 20:49:24 +04:00
& rr ,
st - > m_collnum ) ;
2013-09-26 01:37:20 +04:00
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-11-11 21:58:14 +04:00
//log("mdel2: st=%lx",(long)st);
2013-09-26 01:37:20 +04:00
}
2013-10-19 02:21:00 +04:00
/*
2013-09-26 01:37:20 +04:00
void injectedUrlWrapper ( void * state ) {
StateCD * st = ( StateCD * ) state ;
Msg7 * msg7 = & st - > m_msg7 ;
// the doc we injected...
XmlDoc * xd = & msg7 - > m_xd ;
// make a status msg for the url
SafeBuf sb ;
2013-10-17 01:03:14 +04:00
SafeBuf js ; // for json reply
2013-09-26 01:37:20 +04:00
if ( xd - > m_indexCode = = 0 ) {
sb . safePrintf ( " <b><font color=black> "
" Successfully added " ) ;
2013-10-17 01:03:14 +04:00
js . safePrintf ( " Seed Successful. " ) ;
2013-09-26 01:37:20 +04:00
}
else if ( xd - > m_indexCode = = EDOCFILTERED ) {
sb . safePrintf ( " <b><font color=red> "
" Error: <i>%s</i> by matching "
" url filter #%li "
" when adding "
, mstrerror ( xd - > m_indexCode )
2013-10-17 01:03:14 +04:00
// divide by 2 because we add a
// "manualadd &&" rule with every url filter
// that the client adds
, ( xd - > m_urlFilterNum - 2 ) / 2
2013-09-26 01:37:20 +04:00
) ;
2013-10-17 01:03:14 +04:00
js . safePrintf ( " Seed URL filtered by URL filter #%li "
, ( xd - > m_urlFilterNum - 2 ) / 2 ) ;
2013-09-26 01:37:20 +04:00
}
else {
sb . safePrintf ( " <b><font color=red> "
" Error: <i>%s</i> when adding "
, mstrerror ( xd - > m_indexCode ) ) ;
2013-10-17 01:03:14 +04:00
js . safePrintf ( " Error adding seed url: %s "
, mstrerror ( xd - > m_indexCode ) ) ;
2013-09-26 01:37:20 +04:00
}
sb . safeTruncateEllipsis ( xd - > m_firstUrl . getUrl ( ) , 60 ) ;
if ( xd - > m_indexCode = = 0 ) {
2013-10-17 01:03:14 +04:00
if ( xd - > m_numOutlinksAddedValid ) {
sb . safePrintf ( " (added %li outlinks) "
, ( long ) xd - > m_numOutlinksAdded ) ;
js . safePrintf ( " Added %li outlinks from same domain. "
" %li outlinks were filtered. "
, ( long ) xd - > m_numOutlinksAddedFromSameDomain
, ( long ) xd - > m_numOutlinksFiltered
) ;
}
else {
2013-09-26 01:37:20 +04:00
sb . safePrintf ( " (added 0 outlinks) " ) ;
2013-10-17 01:03:14 +04:00
js . safePrintf ( " Added 0 outlinks from same domain. "
" 0 links were filtered. " ) ;
}
2013-09-26 01:37:20 +04:00
}
sb . safePrintf ( " </font></b> " ) ;
sb . nullTerm ( ) ;
2013-10-17 01:03:14 +04:00
js . nullTerm ( ) ;
// send back the html or json response?
SafeBuf * response = & sb ;
if ( st - > m_fmt = = FMT_JSON ) response = & js ;
2013-09-26 01:37:20 +04:00
// . this will call g_httpServer.sendReply()
// . pass it in the injection response, "sb"
printCrawlBotPage2 ( st - > m_socket ,
& st - > m_hr ,
st - > m_fmt ,
2013-10-17 01:03:14 +04:00
response ,
2013-09-27 20:49:24 +04:00
NULL ,
st - > m_collnum ) ;
2013-09-26 01:37:20 +04:00
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
}
2013-10-19 02:21:00 +04:00
*/
2013-09-26 01:37:20 +04:00
// One row of the crawlbot API help table: a cgi parameter name and a
// human-readable description of what it does.
class HelpItem {
public:
	// cgi parameter name, e.g. "token"
	char *m_parm;
	// description displayed next to the parameter
	char *m_desc;
};
2013-10-23 08:27:21 +04:00
2013-09-26 01:37:20 +04:00
static class HelpItem s_his [ ] = {
2013-10-15 22:22:59 +04:00
{ " format " , " Use &format=html to show HTML output. Default is JSON. " } ,
2013-10-15 04:19:59 +04:00
{ " token " , " Required for all operations below. " } ,
{ " name " , " Name of the crawl. If missing will just show "
" all crawls owned by the given token. " } ,
2013-10-23 05:55:19 +04:00
{ " delete=1 " , " Deletes the crawl. " } ,
2013-11-15 01:16:08 +04:00
{ " reset=1 " , " Resets the crawl. Removes all seeds. " } ,
{ " restart=1 " , " Restarts the crawl. Keeps the seeds. " } ,
2013-10-15 04:19:59 +04:00
2013-10-23 05:55:19 +04:00
{ " pause " ,
2013-10-15 22:31:02 +04:00
" Specify 1 or 0 to pause or resume the crawl respectively. " } ,
2013-10-23 05:55:19 +04:00
{ " repeat " , " Specify number of days as floating point to "
2013-10-15 22:17:44 +04:00
" recrawl the pages. Set to 0.0 to NOT repeat the crawl. " } ,
2013-10-15 22:31:02 +04:00
2013-10-29 08:20:44 +04:00
{ " crawlDelay " , " Wait this many seconds between crawling urls from the "
" same IP address. Can be a floating point number. " } ,
2013-10-25 06:05:57 +04:00
2013-11-12 03:52:04 +04:00
//{"deleteCrawl","Same as delete."},
//{"resetCrawl","Same as delete."},
//{"pauseCrawl","Same as pause."},
//{"repeatCrawl","Same as repeat."},
2013-10-15 22:31:02 +04:00
2013-10-18 22:53:14 +04:00
{ " seeds " , " Whitespace separated list of URLs used to seed the crawl. "
" Will only follow outlinks on the same domain of seed URLs. "
2013-10-17 03:27:24 +04:00
} ,
2013-10-18 22:53:14 +04:00
{ " spots " ,
" Whitespace separated list of URLs to add to the crawl. "
" Outlinks will not be followed. " } ,
2013-11-12 03:52:04 +04:00
{ " urls " ,
" Same as spots. " } ,
2013-10-18 22:53:14 +04:00
//{"spiderLinks","Use 1 or 0 to spider the links or NOT spider "
// "the links, respectively, from "
// "the provided seed or addUrls parameters. "
// "The default is 1."},
2013-10-15 04:19:59 +04:00
2013-10-15 22:31:02 +04:00
2013-10-15 04:19:59 +04:00
{ " maxToCrawl " , " Specify max pages to successfully download. " } ,
2013-11-12 03:52:04 +04:00
//{"maxToDownload", "Specify max pages to successfully download."},
2013-10-15 04:19:59 +04:00
{ " maxToProcess " , " Specify max pages to successfully process through "
" diffbot. " } ,
2013-11-12 03:52:04 +04:00
{ " maxRounds " , " Specify maximum number of crawl rounds. Use "
2013-10-23 22:40:30 +04:00
" -1 to indicate no max. " } ,
2013-11-05 01:57:44 +04:00
{ " onlyProcessIfNew " , " Specify 1 to avoid re-processing pages "
" that have already been processed once before. " } ,
2013-10-15 04:19:59 +04:00
{ " notifyEmail " , " Send email alert to this email when crawl hits "
2013-11-12 03:52:04 +04:00
" the maxtocrawl or maxtoprocess limit, or when the crawl "
" completes. " } ,
2013-10-31 00:39:10 +04:00
{ " notifyWebhook " , " Fetch this URL when crawl hits "
2013-11-12 03:52:04 +04:00
" the maxtocrawl or maxtoprocess limit, or when the crawl "
" completes. " } ,
2013-10-15 04:19:59 +04:00
{ " obeyRobots " , " Obey robots.txt files? " } ,
2013-11-12 03:52:04 +04:00
{ " restrictDomain " , " Restrict downloaded urls to domains of seeds? " } ,
2013-11-21 04:41:28 +04:00
{ " urlCrawlPattern " , " List of || separated strings. If the url "
" contains any of these then we crawl the url, otherwise, we do not. "
" An empty pattern matches all urls. " } ,
{ " urlProcessPattern " , " List of || separated strings. If the url "
" contains any of these then we send url to diffbot for processing. "
" An empty pattern matches all urls. " } ,
2013-10-15 04:19:59 +04:00
{ " pageProcessPattern " , " List of || separated strings. If the page "
2013-10-09 04:08:58 +04:00
" contains any of these then we send it to diffbot for processing. "
2013-11-21 04:41:28 +04:00
" An empty pattern matches all pages. " } ,
2013-10-15 04:19:59 +04:00
2013-12-04 04:23:05 +04:00
{ " urlCrawlRegEx " , " Regular expression that the url must match "
" in order to be crawled. If present then the urlCrawlPattern will "
" be ignored. "
" An empty regular expression matches all urls. " } ,
{ " urlProcessRegEx " , " Regular expression that the url must match "
" in order to be processed. "
" If present then the urlProcessPattern will "
" be ignored. "
" An empty regular expression matches all urls. " } ,
2013-12-17 02:10:39 +04:00
{ " apiUrl " , " Diffbot api url to use. We automatically append "
" token and url to it. " } ,
2013-12-04 04:23:05 +04:00
2013-12-17 02:10:39 +04:00
//{"expression","A pattern to match in a URL. List up to 100 "
// "expression/action pairs in the HTTP request. "
// "Example expressions:"},
//{"action","Take the appropriate action when preceeding pattern is "
// "matched. Specify multiple expression/action pairs to build a "
// "table of filters. Each URL being spidered will take the given "
// "action of the first expression it matches. Example actions:"},
2013-10-15 04:19:59 +04:00
2013-09-26 03:51:43 +04:00
2013-09-26 01:37:20 +04:00
{ NULL , NULL }
} ;
2013-10-15 04:19:59 +04:00
/*
2013-10-15 03:19:30 +04:00
// get the input string from the httprequest or the json post
char * getInputString ( char * string , HttpRequest * hr , Json * JS ) {
// try to get it from http request
char * val = hr - > getString ( string ) ;
// if token in json post, use that
if ( ! val ) {
JsonItem * ji = JS . getItem ( string ) ;
if ( ji ) val = ji - > getValue ( ) ;
}
return val ;
}
2013-10-15 04:19:59 +04:00
*/
2013-10-31 00:12:46 +04:00
void collOpDoneWrapper ( void * state ) {
StateCD * st = ( StateCD * ) state ;
TcpSocket * socket = st - > m_socket ;
2013-11-11 04:28:00 +04:00
log ( " crawlbot: done with blocked op. " ) ;
2013-10-31 00:12:46 +04:00
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-11-11 21:58:14 +04:00
//log("mdel3: st=%lx",(long)st);
2013-10-31 00:12:46 +04:00
g_httpServer . sendDynamicPage ( socket , " OK " , 2 ) ;
}
2013-09-27 08:41:05 +04:00
// . when we receive the request from john we call broadcastRequest() from
// Pages.cpp. then msg28 sends this reply with a &cast=0 appended to it
// to every host in the network. then when msg28 gets back replies from all
// those hosts it calls sendPageCrawlbot() here but without a &cast=0
// . so if no &cast is present we are the original!!!
bool sendPageCrawlbot ( TcpSocket * socket , HttpRequest * hr ) {
2013-09-26 01:37:20 +04:00
2013-10-15 22:22:59 +04:00
// print help
long help = hr - > getLong ( " help " , 0 ) ;
if ( help ) {
SafeBuf sb ;
sb . safePrintf ( " <html> "
" <title>Crawlbot API</title> "
" <h1>Crawlbot API</h1> "
" <b>Use the parameters below on the "
" <a href= \" /crawlbot \" >/crawlbot</a> page. "
" </b><br><br> "
" <table> "
) ;
for ( long i = 0 ; i < 1000 ; i + + ) {
HelpItem * h = & s_his [ i ] ;
if ( ! h - > m_parm ) break ;
sb . safePrintf ( " <tr> "
" <td>%s</td> "
" <td>%s</td> "
" </tr> "
, h - > m_parm
, h - > m_desc
) ;
}
sb . safePrintf ( " </table> "
" </html> " ) ;
return g_httpServer . sendDynamicPage ( socket ,
sb . getBufStart ( ) ,
sb . length ( ) ,
0 ) ; // cachetime
}
2013-10-15 03:19:30 +04:00
// . Pages.cpp by default broadcasts all PageCrawlbot /crawlbot
// requests to every host in the network unless a cast=0 is
// explicitly given
// . Msg28::massConfig() puts a &cast=0 on the secondary requests
// sent to each host in the network
2013-12-17 02:35:27 +04:00
//long cast = hr->getLong("cast",1);
2013-09-27 08:41:05 +04:00
2013-10-15 03:19:30 +04:00
// httpserver/httprequest should not try to decode post if
// it's application/json.
2013-10-15 04:19:59 +04:00
//char *json = hr->getPOST();
//Json JS;
//if ( json ) JS.parseJsonStringIntoJsonItems ( json );
2013-09-26 01:37:20 +04:00
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
2013-10-15 22:22:59 +04:00
char fmt = FMT_JSON ;
2013-09-26 01:37:20 +04:00
char * fs = hr - > getString ( " format " , NULL , NULL ) ;
// give john a json api
2013-10-15 22:22:59 +04:00
if ( fs & & strcmp ( fs , " html " ) = = 0 ) fmt = FMT_HTML ;
2013-09-26 01:37:20 +04:00
if ( fs & & strcmp ( fs , " json " ) = = 0 ) fmt = FMT_JSON ;
if ( fs & & strcmp ( fs , " xml " ) = = 0 ) fmt = FMT_XML ;
2013-10-15 03:19:30 +04:00
// if we got json as input, give it as output
2013-10-15 04:19:59 +04:00
//if ( JS.getFirstItem() ) fmt = FMT_JSON;
2013-10-15 03:19:30 +04:00
// token is always required. get from json or html form input
2013-10-15 04:19:59 +04:00
//char *token = getInputString ( "token" );
char * token = hr - > getString ( " token " ) ;
2013-10-15 03:19:30 +04:00
2013-12-17 02:35:27 +04:00
if ( ! token & & fmt = = FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
2013-10-15 03:19:30 +04:00
char * msg = " invalid token " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
if ( ! token ) {
// print token form if html
SafeBuf sb ;
sb . safePrintf ( " In order to use crawlbot you must "
" first LOGIN: "
" <form action=/crawlbot method=get> "
" <br> "
" <input type=text name=token size=50> "
" <input type=submit name=submit value=OK> "
" </form> "
" <br> "
" <b>- OR -</b> "
" <br> SIGN UP "
" <form action=/crawlbot method=get> "
" Name: <input type=text name=name size=50> "
" <br> "
" Email: <input type=text name=email size=50> "
" <br> "
" <input type=submit name=submit value=OK> "
" </form> "
" </body> "
" </html> " ) ;
return g_httpServer . sendDynamicPage ( socket ,
sb . getBufStart ( ) ,
sb . length ( ) ,
0 ) ; // cachetime
}
2013-10-15 22:22:59 +04:00
if ( gbstrlen ( token ) > 32 ) {
2013-10-19 02:21:00 +04:00
//log("crawlbot: token is over 32 chars");
2013-10-15 22:22:59 +04:00
char * msg = " crawlbot: token is over 32 chars " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-18 22:53:14 +04:00
char * seeds = hr - > getString ( " seeds " ) ;
char * spots = hr - > getString ( " spots " ) ;
2013-10-15 03:19:30 +04:00
2013-10-15 21:54:54 +04:00
// just existence is the operation
2013-12-17 02:35:27 +04:00
//bool delColl = hr->hasField("deleteCrawl");
//bool resetColl = hr->hasField("resetCrawl");
2013-10-15 21:54:54 +04:00
2013-10-23 05:51:09 +04:00
// /v2/bulk api support:
if ( ! spots ) spots = hr - > getString ( " urls " ) ;
2013-12-17 02:35:27 +04:00
//if ( ! delColl ) delColl = hr->hasField("delete");
//if ( ! resetColl ) resetColl = hr->hasField("reset");
2013-10-23 05:51:09 +04:00
2013-12-17 02:35:27 +04:00
//bool restartColl = hr->hasField("restart");
2013-10-23 05:51:09 +04:00
2013-10-15 21:54:54 +04:00
char * name = hr - > getString ( " name " ) ;
2013-10-15 23:40:56 +04:00
//if ( delColl && ! && cast == 0 ) {
// log("crawlbot: no collection found to delete.");
// char *msg = "Could not find crawl to delete.";
// return sendErrorReply2 (socket,fmt,msg);
//}
2013-10-17 01:13:28 +04:00
// just send back a list of all the collections after the delete
//if ( delColl && cast && fmt == FMT_JSON ) {
// char *msg = "Collection deleted.";
// return sendReply2 (socket,fmt,msg);
//}
2013-10-15 23:40:56 +04:00
// default name to next available collection crawl name in the
// case of a delete operation...
2013-12-17 02:35:27 +04:00
char * msg = NULL ;
if ( hr - > hasField ( " delete " ) ) msg = " deleted " ;
if ( hr - > hasField ( " restart " ) ) msg = " restarted " ;
if ( hr - > hasField ( " reset " ) ) msg = " reset " ;
if ( msg ) { // delColl && cast ) {
2013-10-15 23:40:56 +04:00
// this was deleted... so is invalid now
name = NULL ;
// no longer a delete function, we need to set "name" below
2013-12-17 02:35:27 +04:00
//delColl = false;//NULL;
2013-11-07 21:55:47 +04:00
// john wants just a brief success reply
2013-12-17 02:35:27 +04:00
SafeBuf tmp ;
tmp . safePrintf ( " { \" response \" : \" Successfully %s job. \" } " ,
msg ) ;
char * reply = tmp . getBufStart ( ) ;
2013-11-07 21:55:47 +04:00
return g_httpServer . sendDynamicPage ( socket ,
reply ,
gbstrlen ( reply ) ,
0 , // cacheTime
false , // POSTReply?
" application/json "
) ;
2013-10-15 23:40:56 +04:00
}
2013-10-15 21:54:54 +04:00
// if name is missing default to name of first existing
// collection for this token.
2013-12-17 02:35:27 +04:00
for ( long i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) { // cast
2013-10-15 21:54:54 +04:00
if ( name ) break ;
// do not do this if doing an
// injection (seed) or add url or del coll or reset coll !!
2013-10-18 22:53:14 +04:00
if ( seeds ) break ;
if ( spots ) break ;
2013-12-17 02:35:27 +04:00
//if ( delColl ) break;
//if ( resetColl ) break;
//if ( restartColl ) break;
2013-10-15 21:54:54 +04:00
CollectionRec * cx = g_collectiondb . m_recs [ i ] ;
// deleted collections leave a NULL slot
if ( ! cx ) continue ;
// skip if token does not match
if ( strcmp ( cx - > m_diffbotToken . getBufStart ( ) , token ) )
continue ;
// got it
name = cx - > m_diffbotCrawlName . getBufStart ( ) ;
break ;
}
if ( ! name ) {
2013-10-19 02:21:00 +04:00
//log("crawlbot: no crawl name given");
2013-10-17 04:17:28 +04:00
char * msg = " invalid or missing name " ;
2013-10-15 03:19:30 +04:00
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-15 21:54:54 +04:00
if ( gbstrlen ( name ) > 30 ) {
2013-10-19 02:21:00 +04:00
//log("crawlbot: name is over 30 chars");
2013-10-15 21:54:54 +04:00
char * msg = " crawlbot: name is over 30 chars " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
// make the collection name so it includes the token and crawl name
char collName [ MAX_COLL_LEN + 1 ] ;
// sanity
if ( MAX_COLL_LEN < 64 ) { char * xx = NULL ; * xx = 0 ; }
// make a compound name for collection of token and name
sprintf ( collName , " %s-%s " , token , name ) ;
2013-10-15 03:19:30 +04:00
// if they did not specify the token/name of an existing collection
// then cr will be NULL and we'll add it below
CollectionRec * cr = g_collectiondb . getRec ( collName ) ;
2013-12-17 02:35:27 +04:00
// i guess bail if not there?
if ( ! cr ) {
char * msg = " invalid or missing collection rec " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-15 03:19:30 +04:00
// if no token... they need to login or signup
//char *token = getTokenFromHttpRequest ( hr );
2013-09-14 03:22:07 +04:00
2013-09-27 20:49:24 +04:00
// get coll name if any
2013-10-15 03:19:30 +04:00
//char *c = hr->getString("c");
//if ( ! c ) c = hr->getString("id");
2013-09-14 03:22:07 +04:00
2013-09-27 20:49:24 +04:00
// get some other parms provided optionally
2013-10-15 03:19:30 +04:00
//char *addColl = hr->getString("addcoll");
2013-10-15 04:19:59 +04:00
2013-10-15 03:19:30 +04:00
// try json
//if ( JS.getInputString("addNewCrawl") ) addColl = collName;
2013-10-15 04:19:59 +04:00
//if ( JS.getInputString("deleteCrawl") ) delColl = true;
//if ( JS.getInputString("resetCrawl") ) resetColl = true;
2013-10-15 03:19:30 +04:00
2013-12-17 02:35:27 +04:00
//if ( resetColl && ! cr ) {
// //log("crawlbot: no collection found to reset.");
// char *msg = "Could not find crawl to reset.";
// return sendErrorReply2 (socket,fmt,msg);
//}
2013-09-27 08:41:05 +04:00
2013-12-17 02:35:27 +04:00
//if ( restartColl && ! cr ) {
// char *msg = "Could not find crawl to restart.";
// return sendErrorReply2 (socket,fmt,msg);
//}
2013-11-15 01:16:08 +04:00
2013-10-31 00:12:46 +04:00
// make a new state
StateCD * st ;
try { st = new ( StateCD ) ; }
catch ( . . . ) {
return sendErrorReply2 ( socket , fmt , mstrerror ( g_errno ) ) ;
}
mnew ( st , sizeof ( StateCD ) , " statecd " ) ;
2013-11-11 04:28:00 +04:00
// debug
2013-11-11 21:58:14 +04:00
//log("mnew2: st=%lx",(long)st);
2013-11-11 04:28:00 +04:00
2013-10-31 00:12:46 +04:00
// copy crap
st - > m_hr . copy ( hr ) ;
st - > m_socket = socket ;
st - > m_fmt = fmt ;
if ( cr ) st - > m_collnum = cr - > m_collnum ;
else st - > m_collnum = - 1 ;
2013-11-15 01:16:08 +04:00
// save seeds
2013-12-17 02:35:27 +04:00
//if ( cr && restartColl && cast ) {
// // bail on OOM saving seeds
// if ( ! st->m_seedBank.safeMemcpy ( &cr->m_diffbotSeeds ) ||
// ! st->m_seedBank.pushChar('\0') )
// return sendErrorReply2(socket,fmt,mstrerror(g_errno));
//}
2013-12-04 04:23:05 +04:00
//
// if we can't compile the provided regexes, return error
//
if ( cr ) {
char * rx1 = hr - > getString ( " urlCrawlRegEx " , NULL ) ;
if ( rx1 & & ! rx1 [ 0 ] ) rx1 = NULL ;
char * rx2 = hr - > getString ( " urlProcessRegEx " , NULL ) ;
if ( rx2 & & ! rx2 [ 0 ] ) rx2 = NULL ;
// this will store the compiled regular expression into ucr
regex_t re1 ;
regex_t re2 ;
long status1 = 0 ;
long status2 = 0 ;
if ( rx1 )
status1 = regcomp ( & re1 , rx1 ,
REG_EXTENDED | REG_ICASE |
REG_NEWLINE | REG_NOSUB ) ;
if ( rx2 )
status2 = regcomp ( & re2 , rx2 ,
REG_EXTENDED | REG_ICASE |
REG_NEWLINE | REG_NOSUB ) ;
if ( rx1 ) regfree ( & re1 ) ;
if ( rx2 ) regfree ( & re2 ) ;
SafeBuf em ;
if ( status1 ) {
log ( " xmldoc: regcomp %s failed. " , rx1 ) ;
em . safePrintf ( " Invalid regular expresion: %s " , rx1 ) ;
}
else if ( status2 ) {
log ( " xmldoc: regcomp %s failed. " , rx2 ) ;
em . safePrintf ( " Invalid regular expresion: %s " , rx2 ) ;
}
if ( status1 | | status2 ) {
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
char * msg = em . getBufStart ( ) ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
}
2013-11-15 01:16:08 +04:00
2013-09-27 20:49:24 +04:00
// . if this is a cast=0 request it is received by all hosts in the
// network
2013-09-27 08:41:05 +04:00
// . this code is the only code run by EVERY host in the network
// . the other code is just run once by the receiving host
// . so we gotta create a coll rec on each host etc.
// . no need to update collectionrec parms here since Pages.cpp calls
// g_parms.setFromRequest() for us before calling this function,
2013-09-27 20:49:24 +04:00
// pg->m_function(). even though maxtocrawl is on "PAGE_NONE"
// hopefully it will still be set
2013-09-27 08:41:05 +04:00
// . but we should take care of add/del/reset coll here.
2013-12-11 01:09:55 +04:00
// . i guess this will be handled by the new parm syncing logic
// which deals with add/del coll requests
/*
2013-09-27 08:41:05 +04:00
if ( cast = = 0 ) {
2013-10-31 00:12:46 +04:00
// add a new collection by default
if ( ! cr & & name & & name [ 0 ] )
2013-11-12 03:52:04 +04:00
cr = addNewDiffbotColl ( collName , token , name , hr ) ;
2013-10-31 00:12:46 +04:00
// also support the good 'ole html form interface
if ( cr ) setSpiderParmsFromHtmlRequest ( socket , hr , cr ) ;
2013-09-27 08:41:05 +04:00
// . we can't sync these operations on a dead host when it
// comes back up yet. we can only sync parms, not collection
// adds/deletes/resets
2013-09-27 20:49:24 +04:00
// . TODO: make new collections just a list of rdb records,
// then they can leverage the msg4 and addsinprogress.dat
// functionality we have for getting dead hosts back up to
// sync. Call it Colldb.
2013-10-02 22:50:11 +04:00
// . PROBLEM: when just starting up seems like hasDeadHost()
// is returning true because it has not yet received its
// first ping reply
//if ( addColl || delColl || resetColl ) {
// // if any host in network is dead, do not do this
// if ( g_hostdb.hasDeadHost() ) {
// char *msg = "A host in the network is dead.";
// // log it
// log("crawlbot: %s",msg);
// // make sure this returns in json if required
// return sendErrorReply2(socket,fmt,msg);
// }
//}
2013-10-31 00:12:46 +04:00
2013-11-11 10:11:13 +04:00
// problem?
if ( ! cr ) {
// send back error
char * msg = " Collection add failed " ;
if ( delColl ) msg = " No such collection " ;
if ( resetColl ) msg = " No such collection " ;
2013-11-15 01:16:08 +04:00
if ( restartColl ) msg = " No such collection " ;
2013-11-11 10:11:13 +04:00
// nuke it
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
// log it
2013-11-11 21:58:14 +04:00
log ( " crawlbot: cr is null. %s " , msg ) ;
2013-11-11 10:11:13 +04:00
// make sure this returns in json if required
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-31 00:12:46 +04:00
// set this up
WaitEntry * we = & st - > m_waitEntry ;
we - > m_state = st ;
we - > m_callback = collOpDoneWrapper ;
2013-11-11 10:11:13 +04:00
// this won't work, collname is on the stack!
//we->m_coll = collName;
we - > m_coll = cr - > m_coll ;
2013-10-31 00:12:46 +04:00
2013-09-27 08:41:05 +04:00
if ( delColl ) {
2013-11-11 21:58:14 +04:00
// note it
log ( " crawlbot: deleting coll " ) ;
2013-09-27 20:49:24 +04:00
// delete collection name
2013-10-31 00:12:46 +04:00
// this can block if tree is saving, it has to wait
// for tree save to complete before removing old
// collnum recs from tree
if ( ! g_collectiondb . deleteRec ( collName , we ) )
return false ;
2013-11-11 04:28:00 +04:00
// nuke it
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-09-27 20:49:24 +04:00
// all done
return g_httpServer . sendDynamicPage ( socket , " OK " , 2 ) ;
2013-09-27 08:41:05 +04:00
}
2013-09-27 20:49:24 +04:00
2013-11-15 01:16:08 +04:00
if ( resetColl | | restartColl ) {
2013-11-11 21:58:14 +04:00
// note it
2013-11-15 02:02:56 +04:00
log ( " crawlbot: resetting/restarting coll " ) ;
2013-10-15 03:19:30 +04:00
//cr = g_collectiondb.getRec ( resetColl );
2013-10-31 00:12:46 +04:00
// this can block if tree is saving, it has to wait
// for tree save to complete before removing old
// collnum recs from tree
2013-11-15 02:02:56 +04:00
bool purgeSeeds = true ;
if ( restartColl ) purgeSeeds = false ;
if ( ! g_collectiondb . resetColl ( collName ,
we ,
purgeSeeds ) )
2013-10-31 00:12:46 +04:00
return false ;
2013-10-15 23:40:56 +04:00
// it is a NEW ptr now!
cr = g_collectiondb . getRec ( collName ) ;
2013-09-27 08:41:05 +04:00
// if reset from crawlbot api page then enable spiders
// to avoid user confusion
if ( cr ) cr - > m_spideringEnabled = 1 ;
2013-11-11 04:28:00 +04:00
// nuke it
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-10-31 00:12:46 +04:00
// all done
return g_httpServer . sendDynamicPage ( socket , " OK " , 2 ) ;
2013-09-27 08:41:05 +04:00
}
2013-11-11 04:28:00 +04:00
// nuke it
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-10-15 03:19:30 +04:00
// this will set the the collection parms from json
2013-10-15 04:19:59 +04:00
//setSpiderParmsFromJSONPost ( socket , hr , cr , &JS );
2013-09-27 08:41:05 +04:00
// this is a cast, so just return simple response
return g_httpServer . sendDynamicPage ( socket , " OK " , 2 ) ;
}
2013-12-11 01:09:55 +04:00
*/
2013-09-27 08:41:05 +04:00
2013-10-15 03:19:30 +04:00
/////////
//
// after all hosts have replied to the request, we finally send the
// request here, with no &cast=0 appended to it. so there is where we
// send the final reply back to the browser
//
/////////
2013-09-27 21:39:23 +04:00
2013-12-17 02:35:27 +04:00
/*
2013-10-25 04:59:15 +04:00
// in case collection was just added above... try this!!
cr = g_collectiondb . getRec ( collName ) ;
2013-09-27 21:39:23 +04:00
2013-10-15 03:19:30 +04:00
// collectionrec must be non-null at this point. i.e. we added it
2013-10-25 04:59:15 +04:00
if ( ! cr ) {
2013-10-30 21:00:46 +04:00
char * msg = " Crawl name was not found. " ;
if ( name & & name [ 0 ] )
msg = " Failed to add crawl. Crawl name is illegal. " ;
2013-11-11 21:58:14 +04:00
// nuke it
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-10-26 01:54:24 +04:00
//log("crawlbot: no collection found. need to add a crawl");
2013-10-30 21:00:46 +04:00
return sendErrorReply2 ( socket , fmt , msg ) ;
2013-10-25 04:59:15 +04:00
}
2013-09-27 21:39:23 +04:00
2013-10-18 22:53:14 +04:00
//char *spots = hr->getString("spots",NULL,NULL);
//char *seeds = hr->getString("seeds",NULL,NULL);
2013-09-27 20:49:24 +04:00
2013-11-15 01:16:08 +04:00
// check seed bank now too for restarting a crawl
if ( st - > m_seedBank . length ( ) & & ! seeds )
seeds = st - > m_seedBank . getBufStart ( ) ;
2013-12-17 02:35:27 +04:00
*/
2013-11-15 01:16:08 +04:00
2013-10-25 04:59:15 +04:00
if ( seeds )
log ( " crawlbot: adding seeds= \" %s \" " , seeds ) ;
2013-11-19 02:13:28 +04:00
if ( spots )
log ( " crawlbot: got spots to add " ) ;
2013-10-25 04:59:15 +04:00
2013-09-26 01:37:20 +04:00
///////
//
// handle file of urls upload. can be HUGE!
//
///////
2013-10-18 22:53:14 +04:00
if ( spots | | seeds ) {
2013-10-15 04:19:59 +04:00
// . avoid spidering links for these urls? i would say
// . default is to NOT spider the links...
// . support camel case and all lower case
2013-10-18 22:53:14 +04:00
//long spiderLinks = hr->getLong("spiderLinks",1);
//spiderLinks = hr->getLong("spiderlinks",spiderLinks);
//bool spiderLinks = false;
2013-09-26 01:37:20 +04:00
// make a list of spider requests from these urls
2013-09-26 02:04:16 +04:00
SafeBuf listBuf ;
// this returns NULL with g_errno set
2013-10-18 22:53:14 +04:00
bool status = true ;
if ( ! getSpiderRequestMetaList ( seeds ,
& listBuf ,
2013-10-22 04:35:14 +04:00
true , // spiderLinks?
cr ) )
2013-10-18 22:53:14 +04:00
status = false ;
// do not spider links for spots
if ( ! getSpiderRequestMetaList ( spots ,
& listBuf ,
2013-10-22 04:35:14 +04:00
false , // spiderLinks?
NULL ) )
2013-10-18 22:53:14 +04:00
status = false ;
2013-09-26 02:04:16 +04:00
// empty?
long size = listBuf . length ( ) ;
// error?
2013-11-11 21:58:14 +04:00
if ( ! status ) {
// nuke it
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-09-26 03:12:01 +04:00
return sendErrorReply2 ( socket , fmt , mstrerror ( g_errno ) ) ;
2013-11-11 21:58:14 +04:00
}
2013-09-26 02:04:16 +04:00
// if not list
2013-11-11 21:58:14 +04:00
if ( ! size ) {
// nuke it
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-09-26 03:12:01 +04:00
return sendErrorReply2 ( socket , fmt , " no urls found " ) ;
2013-11-11 21:58:14 +04:00
}
2013-09-26 01:37:20 +04:00
// add to spiderdb
2013-09-26 02:04:16 +04:00
if ( ! st - > m_msg4 . addMetaList ( listBuf . getBufStart ( ) ,
listBuf . length ( ) ,
2013-09-26 01:37:20 +04:00
cr - > m_coll ,
st ,
addedUrlsToSpiderdbWrapper ,
0 // niceness
) )
// blocked!
return false ;
// did not block, print page!
addedUrlsToSpiderdbWrapper ( st ) ;
return true ;
}
/////////
//
// handle direct injection of a url. looks at "spiderlinks=1" parm
// and all the other parms in Msg7::inject() in PageInject.cpp.
//
//////////
2013-10-18 22:53:14 +04:00
/*
2013-09-26 01:37:20 +04:00
if ( injectUrl ) {
// a valid collection is required
if ( ! cr )
2013-09-26 03:12:01 +04:00
return sendErrorReply2 ( socket , fmt ,
" invalid collection " ) ;
2013-09-26 01:37:20 +04:00
// begin the injection
if ( ! st - > m_msg7 . inject ( st - > m_socket ,
& st - > m_hr ,
st ,
2013-10-15 22:22:59 +04:00
injectedUrlWrapper ,
2013-10-16 03:31:59 +04:00
1 , // spiderLinks default is on
collName ) ) // coll override
2013-09-26 01:37:20 +04:00
// if blocked, return now
return false ;
// otherwise send back reply
injectedUrlWrapper ( st ) ;
return true ;
}
2013-10-18 22:53:14 +04:00
*/
2013-09-26 01:37:20 +04:00
2013-09-27 20:49:24 +04:00
// we do not need the state i guess
2013-09-27 08:41:05 +04:00
////////////
2013-09-26 01:37:20 +04:00
//
// print the html or json page of all the data
//
2013-09-27 20:49:24 +04:00
printCrawlBotPage2 ( socket , hr , fmt , NULL , NULL , cr - > m_collnum ) ;
// get rid of that state
delete st ;
mdelete ( st , sizeof ( StateCD ) , " stcd " ) ;
2013-11-11 21:58:14 +04:00
//log("mdel4: st=%lx",(long)st);
2013-09-27 20:49:24 +04:00
return true ;
2013-09-26 01:37:20 +04:00
}
2013-12-17 02:35:27 +04:00
/*
2013-10-16 23:12:22 +04:00
bool printUrlFilters ( SafeBuf & sb , CollectionRec * cr , long fmt ) {
if ( fmt = = FMT_JSON )
2013-10-16 23:19:25 +04:00
sb . safePrintf ( " \" urlFilters \" :[ " ) ;
2013-10-15 22:45:23 +04:00
2013-10-21 23:04:08 +04:00
// skip first filters that are:
// 0. ismedia->ignore and
// 1. !isonsamedomain->ignore
// 2. lastspidertime or !isindexed
// 3. errorcount rule
// 4. errorcount rule
long istart = 5 ;
2013-10-18 05:59:00 +04:00
// if respidering then we added an extra filter
// lastspidertime>={roundstart} --> FILTERED
2013-10-22 05:05:45 +04:00
//if ( cr->m_collectiveRespiderFrequency > 0.0 )
// istart++;
2013-10-18 05:59:00 +04:00
for ( long i = istart ; i < cr - > m_numRegExs ; i + + ) {
2013-10-15 22:45:23 +04:00
//sb.safePrintf
2013-10-15 22:50:57 +04:00
char * expression = cr - > m_regExs [ i ] . getBufStart ( ) ;
// do not allow nulls
if ( ! expression ) expression = " " ;
// skip spaces
if ( * expression & & is_wspace_a ( * expression ) ) expression + + ;
if ( strcmp ( expression , " default " ) = = 0 ) expression = " * " ;
char * action = cr - > m_spiderDiffbotApiUrl [ i ] . getBufStart ( ) ;
// do not all nulls
if ( ! action ) action = " " ;
// skip spaces
if ( * action & & is_wspace_a ( * action ) ) action + + ;
// if no diffbot api url specified, do not process
if ( ! * action ) action = " doNotProcess " ;
// if filtered from crawling, do not even spider
long priority = cr - > m_spiderPriorities [ i ] ;
if ( priority = = SPIDER_PRIORITY_FILTERED ) // -3
2013-10-22 05:05:45 +04:00
action = " doNotCrawl " ;
2013-10-16 23:12:22 +04:00
// we add this supplemental expressin/action for every
// one the user adds in order to give manually added
// urls higher spider priority, so skip it
if ( strncmp ( expression , " ismanualadd && " , 15 ) = = 0 )
continue ;
if ( fmt = = FMT_HTML ) {
sb . safePrintf ( " <tr> "
" <td>Expression "
" <input type=text "
" name=expression size=30 "
" value= \" %s \" > "
" </td><td> "
" Action "
" <input type=text name=action size=50 "
" value= \" %s \" > "
" </td> "
" </tr> \n "
, expression
, action
) ;
continue ;
}
2013-10-15 22:50:57 +04:00
// show it
sb . safePrintf ( " { \" expression \" : \" %s \" , " , expression ) ;
sb . safePrintf ( " \" action \" : \" %s \" } " , action ) ;
// more follow?
2013-10-16 23:12:22 +04:00
sb . pushChar ( ' , ' ) ;
2013-10-15 22:50:57 +04:00
sb . pushChar ( ' \n ' ) ;
2013-10-15 22:45:23 +04:00
}
2013-10-16 23:12:22 +04:00
if ( fmt = = FMT_JSON ) {
// remove trailing comma
sb . removeLastChar ( ' \n ' ) ;
sb . removeLastChar ( ' , ' ) ;
2013-10-16 23:19:25 +04:00
sb . safePrintf ( " ] \n " ) ;
2013-10-16 23:12:22 +04:00
}
2013-10-15 22:45:23 +04:00
return true ;
}
2013-12-17 02:35:27 +04:00
*/
2013-10-15 22:45:23 +04:00
2013-11-08 01:59:43 +04:00
bool printCrawlDetailsInJson ( SafeBuf & sb , CollectionRec * cx ) {
SafeBuf tmp ;
long crawlStatus = - 1 ;
getSpiderStatusMsg ( cx , & tmp , & crawlStatus ) ;
CrawlInfo * ci = & cx - > m_localCrawlInfo ;
long sentAlert = ( long ) ci - > m_sentCrawlDoneAlert ;
if ( sentAlert ) sentAlert = 1 ;
2013-11-12 03:52:04 +04:00
char * crawlTypeStr = " crawl " ;
//char *nomen = "crawl";
if ( cx - > m_isCustomCrawl = = 2 ) {
crawlTypeStr = " bulk " ;
//nomen = "job";
}
2013-11-08 01:59:43 +04:00
sb . safePrintf ( " \n \n { "
" \" name \" : \" %s \" , \n "
2013-11-12 03:52:04 +04:00
" \" type \" : \" %s \" , \n "
2013-11-08 01:59:43 +04:00
//"\"alias\":\"%s\",\n"
//"\"crawlingEnabled\":%li,\n"
2013-11-12 03:52:04 +04:00
" \" jobStatus \" :{ " // nomen = jobStatus / crawlStatus
2013-11-08 01:59:43 +04:00
" \" status \" :%li, "
" \" message \" : \" %s \" }, \n "
2013-11-12 03:52:04 +04:00
" \" sentJobDoneNotification \" :%li, \n "
2013-11-08 01:59:43 +04:00
//"\"crawlingPaused\":%li,\n"
" \" objectsFound \" :%lli, \n "
" \" urlsHarvested \" :%lli, \n "
//"\"urlsExamined\":%lli,\n"
2013-11-12 04:10:47 +04:00
" \" pageCrawlAttempts \" :%lli, \n "
" \" pageCrawlSuccesses \" :%lli, \n "
2013-11-08 01:59:43 +04:00
" \" pageProcessAttempts \" :%lli, \n "
" \" pageProcessSuccesses \" :%lli, \n "
2013-11-12 03:52:04 +04:00
" \" maxRounds \" :%li, \n "
" \" repeat \" :%f, \n "
2013-11-08 01:59:43 +04:00
" \" crawlDelay \" :%f, \n "
2013-11-12 03:52:04 +04:00
2013-11-08 01:59:43 +04:00
//,cx->m_coll
, cx - > m_diffbotCrawlName . getBufStart ( )
2013-11-12 03:52:04 +04:00
, crawlTypeStr
2013-11-08 01:59:43 +04:00
//, alias
//, (long)cx->m_spideringEnabled
, crawlStatus
, tmp . getBufStart ( )
, sentAlert
//, (long)paused
, cx - > m_globalCrawlInfo . m_objectsAdded -
cx - > m_globalCrawlInfo . m_objectsDeleted
, cx - > m_globalCrawlInfo . m_urlsHarvested
//,cx->m_globalCrawlInfo.m_urlsConsidered
, cx - > m_globalCrawlInfo . m_pageDownloadAttempts
, cx - > m_globalCrawlInfo . m_pageDownloadSuccesses
, cx - > m_globalCrawlInfo . m_pageProcessAttempts
, cx - > m_globalCrawlInfo . m_pageProcessSuccesses
2013-11-12 03:52:04 +04:00
2013-11-08 01:59:43 +04:00
, ( long ) cx - > m_maxCrawlRounds
, cx - > m_collectiveRespiderFrequency
, cx - > m_collectiveCrawlDelay
) ;
2013-11-12 03:52:04 +04:00
// if not a "bulk" injection, show crawl stats
if ( cx - > m_isCustomCrawl ! = 2 ) {
sb . safePrintf (
// settable parms
" \" maxToCrawl \" :%lli, \n "
" \" maxToProcess \" :%lli, \n "
" \" obeyRobots \" :%li, \n "
" \" restrictDomain \" :%li, \n "
" \" onlyProcessIfNew \" :%li, \n "
, cx - > m_maxToCrawl
, cx - > m_maxToProcess
, ( long ) cx - > m_useRobotsTxt
, ( long ) cx - > m_restrictDomain
, ( long ) cx - > m_diffbotOnlyProcessIfNew
) ;
sb . safePrintf ( " \" seeds \" : \" " ) ;
sb . safeUtf8ToJSON ( cx - > m_diffbotSeeds . getBufStart ( ) ) ;
sb . safePrintf ( " \" , \n " ) ;
}
sb . safePrintf ( " \" roundsCompleted \" :%li, \n " ,
2013-11-08 01:59:43 +04:00
cx - > m_spiderRoundNum ) ;
2013-11-12 03:52:04 +04:00
sb . safePrintf ( " \" roundStartTime \" :%lu, \n " ,
2013-11-08 01:59:43 +04:00
cx - > m_spiderRoundStartTime ) ;
sb . safePrintf ( " \" currentTime \" :%lu, \n " ,
getTimeGlobal ( ) ) ;
2013-11-21 04:41:28 +04:00
sb . safePrintf ( " \" apiUrl \" : \" " ) ;
sb . safeUtf8ToJSON ( cx - > m_diffbotApiUrl . getBufStart ( ) ) ;
sb . safePrintf ( " \" , \n " ) ;
sb . safePrintf ( " \" urlCrawlPattern \" : \" " ) ;
sb . safeUtf8ToJSON ( cx - > m_diffbotUrlCrawlPattern . getBufStart ( ) ) ;
sb . safePrintf ( " \" , \n " ) ;
sb . safePrintf ( " \" urlProcessPattern \" : \" " ) ;
sb . safeUtf8ToJSON ( cx - > m_diffbotUrlProcessPattern . getBufStart ( ) ) ;
sb . safePrintf ( " \" , \n " ) ;
2013-11-08 01:59:43 +04:00
sb . safePrintf ( " \" pageProcessPattern \" : \" " ) ;
2013-11-21 04:41:28 +04:00
sb . safeUtf8ToJSON ( cx - > m_diffbotPageProcessPattern . getBufStart ( ) ) ;
2013-11-08 01:59:43 +04:00
sb . safePrintf ( " \" , \n " ) ;
2013-12-04 04:23:05 +04:00
sb . safePrintf ( " \" urlCrawlRegEx \" : \" " ) ;
sb . safeUtf8ToJSON ( cx - > m_diffbotUrlCrawlRegEx . getBufStart ( ) ) ;
sb . safePrintf ( " \" , \n " ) ;
sb . safePrintf ( " \" urlProcessRegEx \" : \" " ) ;
sb . safeUtf8ToJSON ( cx - > m_diffbotUrlProcessRegEx . getBufStart ( ) ) ;
sb . safePrintf ( " \" , \n " ) ;
2013-11-08 02:07:38 +04:00
char * token = cx - > m_diffbotToken . getBufStart ( ) ;
char * name = cx - > m_diffbotCrawlName . getBufStart ( ) ;
2013-12-04 04:23:05 +04:00
2013-11-14 02:30:51 +04:00
char * mt = " crawl " ;
if ( cx - > m_isCustomCrawl = = 2 ) mt = " bulk " ;
2013-11-08 02:07:38 +04:00
sb . safePrintf ( " \" downloadJson \" : "
2013-11-14 02:30:51 +04:00
" \" http://api.diffbot.com/v2/%s/download/ "
2013-11-08 02:07:38 +04:00
" %s-%s_data.json \" , \n "
2013-11-14 02:30:51 +04:00
, mt
2013-11-08 02:07:38 +04:00
, token
, name
) ;
sb . safePrintf ( " \" downloadUrls \" : "
2013-11-14 02:30:51 +04:00
" \" http://api.diffbot.com/v2/%s/download/ "
2013-11-08 02:07:38 +04:00
" %s-%s_urls.csv \" , \n "
2013-11-14 02:30:51 +04:00
, mt
2013-11-08 02:07:38 +04:00
, token
, name
) ;
2013-11-08 01:59:43 +04:00
sb . safePrintf ( " \" notifyEmail \" : \" " ) ;
sb . safeUtf8ToJSON ( cx - > m_notifyEmail . getBufStart ( ) ) ;
sb . safePrintf ( " \" , \n " ) ;
sb . safePrintf ( " \" notifyWebhook \" : \" " ) ;
sb . safeUtf8ToJSON ( cx - > m_notifyUrl . getBufStart ( ) ) ;
2013-11-26 21:17:38 +04:00
sb . safePrintf ( " \" \n " ) ;
//sb.safePrintf("\",\n");
2013-11-08 01:59:43 +04:00
/////
//
// show url filters table. kinda hacky!!
//
/////
/*
g_parms . sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
NULL ,
& sb ,
cr - > m_coll , // coll override
true // isJSON?
) ;
*/
2013-11-23 05:37:42 +04:00
//printUrlFilters ( sb , cx , FMT_JSON );
2013-11-08 01:59:43 +04:00
// end that collection rec
2013-11-26 21:17:38 +04:00
sb . safePrintf ( " } \n " ) ;
2013-11-08 01:59:43 +04:00
return true ;
}
2013-09-26 01:37:20 +04:00
bool printCrawlBotPage2 ( TcpSocket * socket ,
HttpRequest * hr ,
char fmt , // format
SafeBuf * injectionResponse ,
2013-09-27 20:49:24 +04:00
SafeBuf * urlUploadResponse ,
collnum_t collnum ) {
2013-09-26 01:37:20 +04:00
2013-10-17 01:03:14 +04:00
2013-09-26 01:37:20 +04:00
// store output into here
SafeBuf sb ;
if ( fmt = = FMT_HTML )
sb . safePrintf (
" <html> "
" <title>Crawlbot - "
" Web Data Extraction and Search Made "
" Easy</title> "
" <body> "
) ;
2013-09-27 20:49:24 +04:00
CollectionRec * cr = g_collectiondb . m_recs [ collnum ] ;
2013-09-17 21:43:23 +04:00
2013-10-19 02:21:00 +04:00
// was coll deleted while adding urls to spiderdb?
if ( ! cr ) {
g_errno = EBADREQUEST ;
char * msg = " invalid crawl. crawl was deleted. " ;
return sendErrorReply2 ( socket , fmt , msg ) ;
}
2013-10-16 01:08:55 +04:00
char * token = cr - > m_diffbotToken . getBufStart ( ) ;
char * name = cr - > m_diffbotCrawlName . getBufStart ( ) ;
// this is useful
SafeBuf hb ;
hb . safePrintf ( " <input type=hidden name=name value= \" %s \" > "
" <input type=hidden name=token value= \" %s \" > "
" <input type=hidden name=format value= \" html \" > "
, name
, token ) ;
hb . nullTerm ( ) ;
// and this
SafeBuf lb ;
lb . safePrintf ( " name= " ) ;
lb . urlEncode ( name ) ;
lb . safePrintf ( " &token= " ) ;
lb . urlEncode ( token ) ;
if ( fmt = = FMT_HTML ) lb . safePrintf ( " &format=html " ) ;
lb . nullTerm ( ) ;
2013-09-17 21:43:23 +04:00
// set this to current collection. if only token was provided
// then it will return the first collection owned by token.
// if token has no collections it will be NULL.
2013-09-27 20:49:24 +04:00
//if ( ! cr )
// cr = getCollRecFromHttpRequest ( hr );
//if ( ! cr ) {
// char *msg = "failed to add new collection";
// g_msg = " (error: crawlbot failed to allocate crawl)";
// return sendErrorReply2 ( socket , fmt , msg );
//}
2013-09-17 21:43:23 +04:00
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_HTML ) {
sb . safePrintf ( " <table border=0> "
" <tr><td> "
" <b><font size=+2> "
" <a href=/crawlbot?token=%s> "
" Crawlbot</a></font></b> "
" <br> "
" <font size=-1> "
" Crawl, Datamine and Index the Web "
" </font> "
" </td></tr> "
" </table> "
, token
) ;
sb . safePrintf ( " <center><br> " ) ;
2013-09-26 04:00:16 +04:00
// first print help
2013-09-26 05:41:20 +04:00
sb . safePrintf ( " [ <a href=/crawlbot?help=1> "
" api help</a> ] "
// json output
2013-11-04 23:05:10 +04:00
" [ <a href= \" /crawlbot?token=%s&format=json& "
" name=%s \" > "
2013-09-26 05:41:20 +04:00
" json output "
" </a> ] "
2013-11-04 23:05:10 +04:00
, token
, name ) ;
2013-09-27 20:49:24 +04:00
// random coll name to add
unsigned long r1 = rand ( ) ;
unsigned long r2 = rand ( ) ;
unsigned long long rand64 = ( unsigned long long ) r1 ;
rand64 < < = 32 ;
rand64 | = r2 ;
2013-12-17 03:39:24 +04:00
char newCollName [ MAX_COLL_LEN + 1 ] ;
snprintf ( newCollName , MAX_COLL_LEN , " %s-%016llx " ,
token , rand64 ) ;
2013-09-26 01:37:20 +04:00
// first print "add new collection"
2013-10-15 23:40:56 +04:00
sb . safePrintf ( " [ <a href=/crawlbot?name=%016llx&token=%s& "
2013-12-17 03:39:24 +04:00
" format=html&addcrawl=%s> "
2013-10-16 03:57:34 +04:00
" add new crawl "
2013-09-26 01:37:20 +04:00
" </a> ] "
2013-10-15 23:40:56 +04:00
" [ <a href=/crawlbot?token=%s> "
2013-10-16 03:57:34 +04:00
" show all crawls "
2013-09-26 01:37:20 +04:00
" </a> ] "
2013-09-27 20:49:24 +04:00
, rand64
2013-09-26 01:37:20 +04:00
, token
2013-12-17 03:39:24 +04:00
, newCollName
2013-09-26 01:37:20 +04:00
, token
) ;
}
2013-09-14 03:22:07 +04:00
2013-09-26 01:37:20 +04:00
bool firstOne = true ;
2013-09-18 02:32:28 +04:00
2013-09-14 03:22:07 +04:00
//
2013-09-17 21:43:23 +04:00
// print list of collections controlled by this token
2013-09-14 03:22:07 +04:00
//
2013-09-26 01:37:20 +04:00
for ( long i = 0 ; fmt = = FMT_HTML & & i < g_collectiondb . m_numRecs ; i + + ) {
2013-09-14 03:22:07 +04:00
CollectionRec * cx = g_collectiondb . m_recs [ i ] ;
if ( ! cx ) continue ;
2013-09-18 02:32:28 +04:00
// get its token if any
char * ct = cx - > m_diffbotToken . getBufStart ( ) ;
if ( ! ct ) continue ;
// skip if token does not match
if ( strcmp ( ct , token ) )
continue ;
2013-09-14 03:22:07 +04:00
// highlight the tab if it is what we selected
bool highlight = false ;
2013-09-17 21:43:23 +04:00
if ( cx = = cr ) highlight = true ;
2013-09-14 03:22:07 +04:00
char * style = " " ;
if ( highlight ) {
style = " style=text-decoration:none; " ;
sb . safePrintf ( " <b><font color=red> " ) ;
}
// print the crawl id. collection name minus <TOKEN>-
2013-10-15 23:40:56 +04:00
sb . safePrintf ( " <a %shref=/crawlbot?token= " , style ) ;
sb . urlEncode ( token ) ;
sb . safePrintf ( " &name= " ) ;
sb . urlEncode ( cx - > m_diffbotCrawlName . getBufStart ( ) ) ;
sb . safePrintf ( " &format=html> "
2013-09-14 03:22:07 +04:00
" %s "
" </a> "
2013-10-15 21:54:54 +04:00
, cx - > m_diffbotCrawlName . getBufStart ( )
2013-09-14 03:22:07 +04:00
) ;
if ( highlight )
sb . safePrintf ( " </font></b> " ) ;
}
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_HTML )
sb . safePrintf ( " </center><br/> " ) ;
2013-09-14 03:22:07 +04:00
2013-11-04 23:29:22 +04:00
// the ROOT JSON [
2013-10-17 04:17:28 +04:00
if ( fmt = = FMT_JSON )
sb . safePrintf ( " { \n " ) ;
2013-10-17 01:03:14 +04:00
2013-10-30 02:26:32 +04:00
// injection is currently not in use, so this is an artifact:
2013-10-17 01:03:14 +04:00
if ( fmt = = FMT_JSON & & injectionResponse )
2013-11-14 02:30:51 +04:00
sb . safePrintf ( " \" response \" : \" %s \" , \n \n "
2013-10-17 01:03:14 +04:00
, injectionResponse - > getBufStart ( ) ) ;
if ( fmt = = FMT_JSON & & urlUploadResponse )
2013-11-14 02:30:51 +04:00
sb . safePrintf ( " \" response \" : \" %s \" , \n \n "
2013-10-17 01:03:14 +04:00
, urlUploadResponse - > getBufStart ( ) ) ;
2013-09-25 22:57:07 +04:00
//////
//
// print collection summary page
//
//////
2013-11-12 03:52:04 +04:00
// the items in the array now have type:bulk or type:crawl
// so call them 'jobs'
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_JSON )
2013-11-12 03:52:04 +04:00
sb . safePrintf ( " \" jobs \" :[ " ) ; //\"collections\":");
2013-09-26 01:37:20 +04:00
2013-09-25 22:57:07 +04:00
long summary = hr - > getLong ( " summary " , 0 ) ;
2013-09-26 02:59:31 +04:00
// enter summary mode for json
if ( fmt ! = FMT_HTML ) summary = 1 ;
2013-09-25 22:57:07 +04:00
// start the table
2013-09-26 01:37:20 +04:00
if ( summary & & fmt = = FMT_HTML ) {
2013-09-25 22:57:07 +04:00
sb . safePrintf ( " <table border=1 cellpadding=5> "
" <tr> "
" <td><b>Collection</b></td> "
" <td><b>Objects Found</b></td> "
" <td><b>URLs Harvested</b></td> "
" <td><b>URLs Examined</b></td> "
2013-11-12 03:52:04 +04:00
" <td><b>Page Download Attempts</b></td> "
" <td><b>Page Download Successes</b></td> "
2013-09-25 22:57:07 +04:00
" <td><b>Page Process Attempts</b></td> "
" <td><b>Page Process Successes</b></td> "
" </tr> "
) ;
}
2013-10-17 01:03:14 +04:00
2013-11-04 23:05:10 +04:00
char * name3 = hr - > getString ( " name " ) ;
2013-09-25 22:57:07 +04:00
// scan each coll and get its stats
for ( long i = 0 ; summary & & i < g_collectiondb . m_numRecs ; i + + ) {
CollectionRec * cx = g_collectiondb . m_recs [ i ] ;
if ( ! cx ) continue ;
// must belong to us
if ( strcmp ( cx - > m_diffbotToken . getBufStart ( ) , token ) )
continue ;
2013-09-26 01:37:20 +04:00
2013-11-04 23:05:10 +04:00
// just print out single crawl info for json
if ( fmt ! = FMT_HTML & & cx ! = cr & & name3 )
continue ;
2013-09-26 01:37:20 +04:00
// if json, print each collectionrec
if ( fmt = = FMT_JSON ) {
if ( ! firstOne )
sb . safePrintf ( " , \n \t " ) ;
firstOne = false ;
2013-09-29 00:17:43 +04:00
//char *alias = "";
//if ( cx->m_collectionNameAlias.length() > 0 )
// alias=cx->m_collectionNameAlias.getBufStart();
2013-10-22 05:32:57 +04:00
//long paused = 1;
2013-10-30 00:16:01 +04:00
2013-10-22 05:32:57 +04:00
//if ( cx->m_spideringEnabled ) paused = 0;
2013-11-08 01:59:43 +04:00
printCrawlDetailsInJson ( sb , cx ) ;
2013-09-26 01:37:20 +04:00
// print the next one out
continue ;
}
2013-09-25 22:57:07 +04:00
// print in table
sb . safePrintf ( " <tr> "
" <td>%s</td> "
" <td>%lli</td> "
" <td>%lli</td> "
2013-10-29 09:38:15 +04:00
//"<td>%lli</td>"
2013-09-25 22:57:07 +04:00
" <td>%lli</td> "
" <td>%lli</td> "
" <td>%lli</td> "
" <td>%lli</td> "
" </tr> "
, cx - > m_coll
, cx - > m_globalCrawlInfo . m_objectsAdded -
cx - > m_globalCrawlInfo . m_objectsDeleted
, cx - > m_globalCrawlInfo . m_urlsHarvested
2013-10-29 09:38:15 +04:00
//, cx->m_globalCrawlInfo.m_urlsConsidered
2013-09-25 22:57:07 +04:00
, cx - > m_globalCrawlInfo . m_pageDownloadAttempts
, cx - > m_globalCrawlInfo . m_pageDownloadSuccesses
, cx - > m_globalCrawlInfo . m_pageProcessAttempts
, cx - > m_globalCrawlInfo . m_pageProcessSuccesses
) ;
}
2013-09-26 01:37:20 +04:00
if ( summary & & fmt = = FMT_HTML ) {
2013-09-25 22:57:07 +04:00
sb . safePrintf ( " </table></html> " ) ;
2013-09-26 01:37:20 +04:00
return g_httpServer . sendDynamicPage ( socket ,
2013-09-25 22:57:07 +04:00
sb . getBufStart ( ) ,
sb . length ( ) ,
2013-09-27 20:49:24 +04:00
0 ) ; // cachetime
2013-09-25 22:57:07 +04:00
}
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_JSON )
2013-09-26 03:12:01 +04:00
// end the array of collection objects
2013-10-17 04:17:28 +04:00
sb . safePrintf ( " \n ] \n " ) ;
2013-09-26 01:37:20 +04:00
2013-09-25 22:57:07 +04:00
///////
//
// end print collection summary page
//
///////
2013-09-26 01:37:20 +04:00
2013-09-17 02:33:45 +04:00
//
// show urls being crawled (ajax) (from Spider.cpp)
//
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_HTML ) {
sb . safePrintf ( " <table width=100%% cellpadding=5 "
" style=border-width:1px;border-style:solid; "
" border-color:black;> "
//"bgcolor=#%s>\n"
" <tr><td colspan=50> " // bgcolor=#%s>"
" <b>Last 10 URLs</b> (%li spiders active) "
//,LIGHT_BLUE
//,DARK_BLUE
, ( long ) g_spiderLoop . m_numSpidersOut ) ;
2013-10-15 23:40:56 +04:00
char * str = " <font color=green>Resume Crawl</font> " ;
long pval = 0 ;
if ( cr - > m_spideringEnabled ) {
str = " <font color=red>Pause Crawl</font> " ;
pval = 1 ;
}
sb . safePrintf ( " "
2013-10-16 01:08:55 +04:00
" <a href=/crawlbot?%s "
" &pauseCrawl=%li><b>%s</b></a> "
, lb . getBufStart ( ) // has &name=&token= encoded
2013-10-15 23:40:56 +04:00
, pval
, str
) ;
2013-09-26 01:37:20 +04:00
sb . safePrintf ( " </td></tr> \n " ) ;
// the table headers so SpiderRequest::printToTable() works
if ( ! SpiderRequest : : printTableHeaderSimple ( & sb , true ) )
2013-09-17 02:33:45 +04:00
return false ;
2013-09-26 01:37:20 +04:00
// shortcut
XmlDoc * * docs = g_spiderLoop . m_docs ;
// first print the spider recs we are spidering
for ( long i = 0 ; i < ( long ) MAX_SPIDERS ; i + + ) {
// get it
XmlDoc * xd = docs [ i ] ;
// skip if empty
if ( ! xd ) continue ;
// sanity check
if ( ! xd - > m_oldsrValid ) { char * xx = NULL ; * xx = 0 ; }
// skip if not our coll rec!
2013-10-19 02:21:00 +04:00
//if ( xd->m_cr != cr ) continue;
if ( xd - > m_collnum ! = cr - > m_collnum ) continue ;
2013-09-26 01:37:20 +04:00
// grab it
SpiderRequest * oldsr = & xd - > m_oldsr ;
// get status
char * status = xd - > m_statusMsg ;
// show that
if ( ! oldsr - > printToTableSimple ( & sb , status , xd ) )
return false ;
}
2013-09-17 02:33:45 +04:00
2013-09-26 01:37:20 +04:00
// end the table
sb . safePrintf ( " </table> \n " ) ;
sb . safePrintf ( " <br> \n " ) ;
2013-09-17 02:33:45 +04:00
2013-09-26 01:37:20 +04:00
} // end html format
2013-09-17 02:33:45 +04:00
2013-10-16 23:12:22 +04:00
// this is for making sure the search results are not cached
unsigned long r1 = rand ( ) ;
unsigned long r2 = rand ( ) ;
unsigned long long rand64 = ( unsigned long long ) r1 ;
rand64 < < = 32 ;
rand64 | = r2 ;
if ( fmt = = FMT_HTML ) {
sb . safePrintf ( " <br> "
" <table border=0 cellpadding=5> "
// OBJECT search input box
" <form method=get action=/search> "
" <tr> "
" <td> "
" <b>Search Objects:</b> "
" </td><td> "
" <input type=text name=q size=50> "
2013-11-07 21:40:31 +04:00
// site clustering off
" <input type=hidden name=sc value=0> "
// dup removal off
" <input type=hidden name=dr value=0> "
2013-10-16 23:12:22 +04:00
" <input type=hidden name=c value= \" %s \" > "
" <input type=hidden name=rand value=%lli> "
2013-10-22 06:06:13 +04:00
// bypass ajax, searchbox, logo, etc.
" <input type=hidden name=id value=12345> "
2013-10-16 23:12:22 +04:00
// restrict search to json objects
" <input type=hidden name=prepend "
" value= \" type:json | \" > "
" "
" <input type=submit name=submit value=OK> "
" </tr> "
" </form> "
// PAGE search input box
" <form method=get action=/search> "
" <tr> "
" <td> "
" <b>Search Pages:</b> "
" </td><td> "
" <input type=text name=q size=50> "
2013-11-07 21:40:31 +04:00
// site clustering off
" <input type=hidden name=sc value=0> "
// dup removal off
" <input type=hidden name=dr value=0> "
2013-10-16 23:12:22 +04:00
" <input type=hidden name=c value= \" %s \" > "
" <input type=hidden name=rand value=%lli> "
2013-10-22 06:06:13 +04:00
// bypass ajax, searchbox, logo, etc.
" <input type=hidden name=id value=12345> "
2013-10-16 23:12:22 +04:00
// restrict search to NON json objects
" <input type=hidden "
" name=prepend value= \" -type:json | \" > "
" "
" <input type=submit name=submit value=OK> "
" </tr> "
" </form> "
// add url input box
" <form method=get action=/crawlbot> "
" <tr> "
" <td> "
2013-10-18 22:53:14 +04:00
" <b>Add Seed Urls: </b> "
2013-10-16 23:12:22 +04:00
" </td><td> "
2013-10-18 22:53:14 +04:00
" <input type=text name=seeds size=50> "
2013-10-16 23:12:22 +04:00
" %s " // hidden tags
" "
" <input type=submit name=submit value=OK> "
2013-10-18 22:53:14 +04:00
//" <input type=checkbox "
//"name=spiderLinks value=1 "
//"checked>"
//" <i>crawl links on this page?</i>"
2013-10-16 23:12:22 +04:00
, cr - > m_coll
, rand64
, cr - > m_coll
, rand64
, hb . getBufStart ( ) // hidden tags
) ;
}
if ( injectionResponse & & fmt = = FMT_HTML )
sb . safePrintf ( " <br><font size=-1>%s</font> \n "
, injectionResponse - > getBufStart ( )
) ;
if ( fmt = = FMT_HTML )
sb . safePrintf ( //"<input type=hidden name=c value=\"%s\">"
2013-10-18 22:53:14 +04:00
//"<input type=hidden name=crawlbotapi value=1>"
2013-10-16 23:12:22 +04:00
" </td> "
" </tr> "
2013-10-18 22:53:14 +04:00
//"</form>"
2013-10-16 23:12:22 +04:00
" <tr> "
2013-10-18 22:53:14 +04:00
" <td><b>Add Spot URLs:</b></td> "
2013-10-16 23:12:22 +04:00
" <td> "
// this page will call
// printCrawlbotPage2(uploadResponse) 2display it
2013-10-18 22:53:14 +04:00
//"<form method=post action=/crawlbot>"
//"<input type=file name=spots size=40>"
2013-10-30 03:37:14 +04:00
" <input type=text name=spots size=50> "
2013-10-18 22:53:14 +04:00
" <input type=submit name=submit value=OK> "
" %s " // hidden tags
//" <input type=checkbox "
//"name=spiderLinks value=1 "
//"checked>"
//" <i>crawl links on those pages?</i>"
2013-10-16 23:12:22 +04:00
" </form> "
" </td> "
" </tr> "
" </table> "
" <br> "
//, cr->m_coll
2013-10-18 22:53:14 +04:00
, hb . getBufStart ( )
2013-10-16 23:12:22 +04:00
) ;
2013-09-14 03:22:07 +04:00
//
// show stats
//
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_HTML ) {
2013-11-05 04:35:58 +04:00
SafeBuf tmp ;
long crawlStatus = - 1 ;
getSpiderStatusMsg ( cr , & tmp , & crawlStatus ) ;
CrawlInfo * ci = & cr - > m_localCrawlInfo ;
long sentAlert = ( long ) ci - > m_sentCrawlDoneAlert ;
if ( sentAlert ) sentAlert = 1 ;
2013-10-16 23:12:22 +04:00
sb . safePrintf (
2013-09-16 22:42:04 +04:00
" <form method=get action=/crawlbot> "
2013-10-16 01:08:55 +04:00
" %s "
, hb . getBufStart ( ) // hidden input token/name/..
2013-09-17 21:25:54 +04:00
) ;
sb . safePrintf ( " <TABLE border=0> "
2013-09-17 02:18:55 +04:00
" <TR><TD valign=top> "
2013-09-14 03:22:07 +04:00
" <table border=0 cellpadding=5> "
2013-10-17 03:27:24 +04:00
//
" <tr> "
" <td><b>Crawl Name:</td> "
" <td>%s</td> "
" </tr> "
//"<tr>"
//"<td><b>Collection Alias:</td>"
//"<td>%s%s</td>"
//"</tr>"
" <tr> "
" <td><b>Token:</td> "
" <td>%s</td> "
" </tr> "
2013-11-05 04:35:58 +04:00
" <tr> "
" <td><b>Crawl Status:</td> "
" <td>%li</td> "
" </tr> "
" <tr> "
" <td><b>Crawl Status Msg:</td> "
" <td>%s</td> "
" </tr> "
" <tr> "
" <td><b>Rounds Completed:</td> "
" <td>%li</td> "
" </tr> "
2013-10-17 03:27:24 +04:00
2013-09-14 03:22:07 +04:00
// this will have to be in crawlinfo too!
//"<tr>"
//"<td><b>pages indexed</b>"
//"<td>%lli</td>"
//"</tr>"
2013-09-14 04:34:39 +04:00
" <tr> "
2013-09-16 22:22:07 +04:00
" <td><b>Objects Found</b></td> "
2013-09-14 04:34:39 +04:00
" <td>%lli</td> "
2013-09-17 02:18:55 +04:00
" </tr> "
" <tr> "
2013-10-01 00:25:33 +04:00
" <td><b>URLs Harvested</b> (inc. dups)</td> "
2013-09-17 02:18:55 +04:00
" <td>%lli</td> "
" </tr> "
2013-10-29 09:38:15 +04:00
//"<tr>"
//"<td><b>URLs Examined</b></td>"
//"<td>%lli</td>"
//"</tr>"
2013-09-17 02:18:55 +04:00
" <tr> "
2013-10-31 03:14:30 +04:00
" <td><b>Page Crawl Attempts</b></td> "
2013-09-17 02:18:55 +04:00
" <td>%lli</td> "
" </tr> "
" <tr> "
2013-10-31 03:14:30 +04:00
" <td><b>Page Crawl Successes</b></td> "
2013-09-17 02:18:55 +04:00
" <td>%lli</td> "
" </tr> "
" <tr> "
" <td><b>Page Process Attempts</b></td> "
" <td>%lli</td> "
" </tr> "
" <tr> "
" <td><b>Page Process Successes</b></td> "
" <td>%lli</td> "
" </tr> "
2013-10-17 03:27:24 +04:00
, cr - > m_diffbotCrawlName . getBufStart ( )
, cr - > m_diffbotToken . getBufStart ( )
2013-11-05 04:35:58 +04:00
, crawlStatus
, tmp . getBufStart ( )
, cr - > m_spiderRoundNum
2013-09-17 21:25:54 +04:00
, cr - > m_globalCrawlInfo . m_objectsAdded -
cr - > m_globalCrawlInfo . m_objectsDeleted
, cr - > m_globalCrawlInfo . m_urlsHarvested
2013-10-29 09:38:15 +04:00
//, cr->m_globalCrawlInfo.m_urlsConsidered
2013-09-17 21:25:54 +04:00
, cr - > m_globalCrawlInfo . m_pageDownloadAttempts
, cr - > m_globalCrawlInfo . m_pageDownloadSuccesses
, cr - > m_globalCrawlInfo . m_pageProcessAttempts
, cr - > m_globalCrawlInfo . m_pageProcessSuccesses
) ;
2013-09-17 02:18:55 +04:00
2013-12-04 04:23:05 +04:00
sb . safePrintf ( " <tr> "
" <td><b>Download Objects:</b> "
" </td><td> "
" <a href=/crawlbot/download/%s_data.csv> "
" csv</a> "
" "
" <a href=/crawlbot/download/%s_data.json> "
" json</a> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Download Products:</b> "
" </td><td> "
// make it search.csv so excel opens it
" <a href=/search.csv?icc=1&format=csv&sc=0&dr=0& "
" c=%s&n=10000000&rand=%llu&scores=0&id=1& "
" q=gbrevsortby%%3Aproduct.offerPrice& "
" prepend=type%%3Ajson "
//"+type%%3Aproduct%%7C"
" > "
" csv</a> "
" "
" <a href=/search?icc=1&format=html&sc=0&dr=0& "
" c=%s&n=10000000&rand=%llu&scores=0&id=1& "
" q=gbrevsortby%%3Aproduct.offerPrice& "
" prepend=type%%3Ajson "
" > "
" html</a> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Download Urls:</b> "
" </td><td> "
" <a href=/crawlbot/download/%s_urls.csv> "
" csv</a> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Latest Objects:</b> "
" </td><td> "
" <a href=/search.csv?icc=1&format=csv&sc=0&dr=0& "
" c=%s&n=10&rand=%llu&scores=0&id=1& "
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson "
" > "
" csv</a> "
" "
" <a href=/search?icc=1&format=html&sc=0&dr=0& "
" c=%s&n=10rand=%llu&scores=0&id=1& "
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson "
" > "
" html</a> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Latest Products:</b> "
" </td><td> "
" <a href=/search.csv?icc=1&format=csv&sc=0&dr=0& "
" c=%s&n=10&rand=%llu&scores=0&id=1& "
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson+type%%3Aproduct "
" > "
" csv</a> "
" "
" <a href=/search?icc=1&format=html&sc=0&dr=0& "
" c=%s&n=10&rand=%llu&scores=0&id=1& "
" q=gbsortby%%3Agbspiderdate& "
" prepend=type%%3Ajson+type%%3Aproduct "
" > "
" html</a> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Download Pages:</b> "
" </td><td> "
" <a href=/crawlbot/download/%s_pages.txt> "
" txt</a> "
//
" </td> "
" </tr> "
" </table> "
" </TD> "
, cr - > m_coll
, cr - > m_coll
, cr - > m_coll
, rand64
// download products html
, cr - > m_coll
, rand64
//, cr->m_coll
//, cr->m_coll
//, cr->m_coll
, cr - > m_coll
// latest objects in html
, cr - > m_coll
, rand64
// latest objects in csv
, cr - > m_coll
, rand64
// latest products in html
, cr - > m_coll
, rand64
// latest products in csv
, cr - > m_coll
, rand64
// download pages
, cr - > m_coll
) ;
2013-09-18 02:32:28 +04:00
// spacer column
sb . safePrintf ( " <TD> "
2013-09-17 02:18:55 +04:00
" "
" "
" </TD> "
2013-09-18 02:32:28 +04:00
) ;
2013-09-17 02:18:55 +04:00
2013-09-18 02:32:28 +04:00
// what diffbot api to use?
2013-09-18 22:24:16 +04:00
/*
2013-09-18 02:32:28 +04:00
char * api = cr - > m_diffbotApi . getBufStart ( ) ;
2013-09-18 03:30:57 +04:00
char * s [ 10 ] ;
for ( long i = 0 ; i < 10 ; i + + ) s [ i ] = " " ;
2013-09-18 03:38:56 +04:00
if ( api & & strcmp ( api , " all " ) = = 0 ) s [ 0 ] = " selected " ;
if ( api & & strcmp ( api , " article " ) = = 0 ) s [ 1 ] = " selected " ;
if ( api & & strcmp ( api , " product " ) = = 0 ) s [ 2 ] = " selected " ;
if ( api & & strcmp ( api , " image " ) = = 0 ) s [ 3 ] = " selected " ;
if ( api & & strcmp ( api , " frontpage " ) = = 0 ) s [ 4 ] = " selected " ;
if ( api & & strcmp ( api , " none " ) = = 0 ) s [ 5 ] = " selected " ;
if ( ! api | | ! api [ 0 ] ) s [ 5 ] = " selected " ;
2013-09-18 22:24:16 +04:00
*/
2013-09-18 02:32:28 +04:00
sb . safePrintf ( " <TD valign=top> "
2013-09-17 02:18:55 +04:00
" <table cellpadding=5 border=0> "
2013-09-18 22:24:16 +04:00
/*
2013-09-17 02:18:55 +04:00
" <tr> "
2013-09-18 02:32:28 +04:00
" <td> "
" Diffbot API "
" </td><td> "
" <select name=diffbotapi> "
" <option value=all%s>All</option> "
" <option value=article%s>Article</option> "
" <option value=product%s>Product</option> "
" <option value=image%s>Image</option> "
" <option value=frontpage%s>FrontPage</option> "
2013-09-18 03:30:57 +04:00
" <option value=none%s>None</option> "
2013-09-18 02:32:28 +04:00
" </select> "
" </td> "
2013-09-18 02:59:50 +04:00
" </tr> "
2013-09-18 02:32:28 +04:00
, s [ 0 ]
, s [ 1 ]
, s [ 2 ]
, s [ 3 ]
, s [ 4 ]
2013-09-18 03:30:57 +04:00
, s [ 5 ]
2013-09-18 22:24:16 +04:00
*/
2013-09-18 02:32:28 +04:00
) ;
2013-09-29 00:17:43 +04:00
//char *alias = "";
//if ( cr->m_collectionNameAlias.length() > 0 )
// alias = cr->m_collectionNameAlias.getBufStart();
//char *aliasResponse = "";
//if ( alias && ! isAliasUnique(cr,token,alias) )
// aliasResponse = "<br><font size=1 color=red>"
// "Alias not unique</font>";
2013-09-27 00:50:34 +04:00
2013-09-27 22:17:22 +04:00
char * urtYes = " checked " ;
char * urtNo = " " ;
if ( ! cr - > m_useRobotsTxt ) {
urtYes = " " ;
urtNo = " checked " ;
}
2013-10-10 01:24:35 +04:00
2013-10-29 20:31:57 +04:00
char * rdomYes = " checked " ;
char * rdomNo = " " ;
if ( ! cr - > m_restrictDomain ) {
rdomYes = " " ;
rdomNo = " checked " ;
}
2013-10-25 22:14:56 +04:00
char * isNewYes = " " ;
char * isNewNo = " checked " ;
if ( cr - > m_diffbotOnlyProcessIfNew ) {
isNewYes = " checked " ;
isNewNo = " " ;
}
2013-11-21 04:41:28 +04:00
char * api = cr - > m_diffbotApiUrl . getBufStart ( ) ;
if ( ! api ) api = " " ;
SafeBuf apiUrl ;
apiUrl . htmlEncode ( api , gbstrlen ( api ) , true , 0 ) ;
apiUrl . nullTerm ( ) ;
char * px1 = cr - > m_diffbotUrlCrawlPattern . getBufStart ( ) ;
if ( ! px1 ) px1 = " " ;
SafeBuf ppp1 ;
ppp1 . htmlEncode ( px1 , gbstrlen ( px1 ) , true , 0 ) ;
ppp1 . nullTerm ( ) ;
char * px2 = cr - > m_diffbotUrlProcessPattern . getBufStart ( ) ;
if ( ! px2 ) px2 = " " ;
SafeBuf ppp2 ;
ppp2 . htmlEncode ( px2 , gbstrlen ( px2 ) , true , 0 ) ;
ppp2 . nullTerm ( ) ;
char * px3 = cr - > m_diffbotPageProcessPattern . getBufStart ( ) ;
if ( ! px3 ) px3 = " " ;
SafeBuf ppp3 ;
ppp3 . htmlEncode ( px3 , gbstrlen ( px3 ) , true , 0 ) ;
ppp3 . nullTerm ( ) ;
2013-12-04 04:23:05 +04:00
char * rx1 = cr - > m_diffbotUrlCrawlRegEx . getBufStart ( ) ;
if ( ! rx1 ) rx1 = " " ;
SafeBuf rrr1 ;
rrr1 . htmlEncode ( rx1 , gbstrlen ( rx1 ) , true , 0 ) ;
char * rx2 = cr - > m_diffbotUrlProcessRegEx . getBufStart ( ) ;
if ( ! rx2 ) rx2 = " " ;
SafeBuf rrr2 ;
rrr2 . htmlEncode ( rx2 , gbstrlen ( rx2 ) , true , 0 ) ;
2013-10-10 01:24:35 +04:00
char * notifEmail = cr - > m_notifyEmail . getBufStart ( ) ;
char * notifUrl = cr - > m_notifyUrl . getBufStart ( ) ;
if ( ! notifEmail ) notifEmail = " " ;
if ( ! notifUrl ) notifUrl = " " ;
2013-09-27 22:17:22 +04:00
2013-09-18 02:32:28 +04:00
sb . safePrintf (
2013-09-17 02:18:55 +04:00
2013-09-14 05:10:03 +04:00
//
2013-09-17 02:18:55 +04:00
//
2013-10-16 03:57:34 +04:00
" <tr> "
" <td><b>Repeat Crawl:</b> "
" </td><td> "
2013-11-12 03:52:04 +04:00
" <input type=text name=repeat "
2013-10-22 02:06:23 +04:00
" size=10 value= \" %f \" > "
2013-10-16 03:57:34 +04:00
" <input type=submit name=submit value=OK> "
" days "
" </td> "
" </tr> "
2013-11-21 04:41:28 +04:00
" <tr> "
" <td><b>Diffbot API Url:</b> "
" </td><td> "
" <input type=text name=apiUrl "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Url Crawl Pattern:</b> "
" </td><td> "
" <input type=text name=urlCrawlPattern "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Url Process Pattern:</b> "
" </td><td> "
" <input type=text name=urlProcessPattern "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-10-09 04:08:58 +04:00
" <tr> "
" <td><b>Page Process Pattern:</b> "
" </td><td> "
2013-10-15 04:19:59 +04:00
" <input type=text name=pageProcessPattern "
2013-10-09 04:08:58 +04:00
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-12-04 04:23:05 +04:00
" <tr> "
" <td><b>Url Crawl RegEx:</b> "
" </td><td> "
" <input type=text name=urlCrawlRegEx "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Url Process RegEx:</b> "
" </td><td> "
" <input type=text name=urlProcessRegEx "
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-10-25 22:14:56 +04:00
" <tr> "
" <td><b>Only Process If New:</b> "
" </td><td> "
2013-11-05 01:57:44 +04:00
" <input type=radio name=onlyProcessIfNew "
2013-10-25 22:14:56 +04:00
" value=1%s> yes "
2013-11-05 01:57:44 +04:00
" <input type=radio name=onlyProcessIfNew "
2013-10-25 22:14:56 +04:00
" value=0%s> no "
" </td> "
" </tr> "
" <tr> "
2013-10-29 08:20:44 +04:00
" <td><b>Crawl Delay (seconds):</b> "
2013-10-25 22:14:56 +04:00
" </td><td> "
2013-10-29 08:20:44 +04:00
" <input type=text name=crawlDelay "
" size=9 value=%f> "
2013-10-25 22:14:56 +04:00
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-09-17 02:18:55 +04:00
" <tr> "
2013-10-31 03:14:30 +04:00
" <td><b>Max Page Crawl Successes:</b> "
2013-09-17 02:18:55 +04:00
" </td><td> "
2013-10-15 04:19:59 +04:00
" <input type=text name=maxToCrawl "
2013-09-16 22:42:04 +04:00
" size=9 value=%lli> "
" <input type=submit name=submit value=OK> "
2013-09-17 02:18:55 +04:00
" </td> "
2013-09-14 03:22:07 +04:00
" </tr> "
2013-10-02 03:30:06 +04:00
2013-09-14 04:34:39 +04:00
" <tr> "
2013-09-17 02:18:55 +04:00
" <td><b>Max Page Process Successes:</b> "
" </td><td> "
2013-10-15 04:19:59 +04:00
" <input type=text name=maxToProcess "
2013-09-16 22:42:04 +04:00
" size=9 value=%lli> "
" <input type=submit name=submit value=OK> "
2013-09-17 02:18:55 +04:00
" </td> "
2013-09-14 04:34:39 +04:00
" </tr> "
2013-09-14 05:10:03 +04:00
2013-10-23 22:40:30 +04:00
" <tr> "
2013-11-12 03:52:04 +04:00
" <td><b>Max Rounds:</b> "
2013-10-23 22:40:30 +04:00
" </td><td> "
2013-11-12 03:52:04 +04:00
" <input type=text name=maxRounds "
2013-10-23 22:40:30 +04:00
" size=9 value=%li> "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-10-02 03:30:06 +04:00
" <tr> "
" <td><b>Notification Email:</b> "
" </td><td> "
2013-10-15 04:19:59 +04:00
" <input type=text name=notifyEmail "
2013-10-02 03:30:06 +04:00
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
" <tr> "
" <td><b>Notification URL:</b> "
" </td><td> "
2013-10-31 00:39:10 +04:00
" <input type=text name=notifyWebhook "
2013-10-02 03:30:06 +04:00
" size=20 value= \" %s \" > "
" <input type=submit name=submit value=OK> "
" </td> "
" </tr> "
2013-09-17 02:18:55 +04:00
" <tr><td> "
2013-10-30 03:37:14 +04:00
" <b>Use Robots.txt when crawling?</b> "
2013-09-17 02:18:55 +04:00
" </td><td> "
2013-10-15 21:54:54 +04:00
" <input type=radio name=obeyRobots "
2013-09-27 22:17:22 +04:00
" value=1%s> yes "
2013-10-15 21:54:54 +04:00
" <input type=radio name=obeyRobots "
2013-09-27 22:17:22 +04:00
" value=0%s> no "
2013-09-17 02:18:55 +04:00
" </td> "
" </tr> "
2013-10-29 20:31:57 +04:00
" <tr><td> "
2013-10-30 03:37:14 +04:00
" <b>Restrict domain to seeds?</b> "
2013-10-29 20:31:57 +04:00
" </td><td> "
" <input type=radio name=restrictDomain "
" value=1%s> yes "
" <input type=radio name=restrictDomain "
" value=0%s> no "
" </td> "
" </tr> "
2013-09-27 22:17:22 +04:00
//"<tr><td>"
//"Use spider proxies on AWS? "
//"</td><td>"
//"<input type=checkbox name=usefloaters checked>
//"</td>"
//"</tr>"
2013-09-17 02:18:55 +04:00
2013-09-14 03:22:07 +04:00
" </table> "
2013-09-17 02:18:55 +04:00
" </TD> "
" </TR> "
" </TABLE> "
2013-09-27 00:50:34 +04:00
2013-10-16 03:57:34 +04:00
, cr - > m_collectiveRespiderFrequency
2013-11-21 04:41:28 +04:00
, apiUrl . getBufStart ( )
, ppp1 . getBufStart ( )
, ppp2 . getBufStart ( )
, ppp3 . getBufStart ( )
2013-10-09 04:08:58 +04:00
2013-12-04 04:23:05 +04:00
, rrr1 . getBufStart ( )
, rrr2 . getBufStart ( )
2013-10-25 22:14:56 +04:00
, isNewYes
, isNewNo
2013-10-29 08:20:44 +04:00
, cr - > m_collectiveCrawlDelay
2013-10-25 22:14:56 +04:00
2013-10-23 22:40:30 +04:00
, cr - > m_maxToCrawl
, cr - > m_maxToProcess
, ( long ) cr - > m_maxCrawlRounds
2013-09-14 05:10:03 +04:00
2013-10-10 01:24:35 +04:00
, notifEmail
, notifUrl
2013-10-02 03:30:06 +04:00
2013-09-27 22:17:22 +04:00
, urtYes
, urtNo
2013-10-29 20:31:57 +04:00
, rdomYes
, rdomNo
2013-09-14 03:22:07 +04:00
) ;
}
2013-09-16 21:16:49 +04:00
2013-09-14 03:22:07 +04:00
// xml or json does not show the input boxes
2013-09-17 02:33:45 +04:00
//if ( format != FMT_HTML )
// return g_httpServer.sendDynamicPage ( s,
// sb.getBufStart(),
// sb.length(),
// -1 ); // cachetime
2013-09-14 04:34:39 +04:00
2013-09-17 01:29:01 +04:00
//
// print url filters. use "multimedia" to handle jpg etc.
//
2013-09-17 02:00:43 +04:00
// use "notindexable" for images/movies/css etc.
// add a "process" column to send to diffbot...
//
//
2013-09-18 23:38:05 +04:00
2013-10-16 23:12:22 +04:00
/*
2013-09-18 23:38:05 +04:00
char * s1 = " Show " ;
char * s2 = " none " ;
2013-09-19 00:50:55 +04:00
if ( hr - > getLongFromCookie ( " showtable " , 0 ) ) {
2013-09-18 23:38:05 +04:00
s1 = " Hide " ;
s2 = " " ;
}
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_HTML )
sb . safePrintf (
" <a onclick= "
" \" "
" var e = document.getElementById('filters'); "
" var m = document.getElementById('msg'); "
" if ( e.style.display == 'none' ){ "
" e.style.display = ''; "
" m.innerHTML='Hide URL Filters Table'; "
" document.cookie = 'showtable=1;'; "
" } "
" else { "
" e.style.display = 'none'; "
" m.innerHTML='Show URL Filters Table'; "
" document.cookie = 'showtable=0;'; "
" } "
" \" "
" "
" style= "
" cursor:hand; "
" cursor:pointer; "
" color:blue;> "
" <u><b> "
" <div id=msg> "
" %s URL Filters Table "
" </div> "
" </b></u> "
" </a> "
" <div id=filters style=display:%s;> "
" <form method=get action=/crawlbot> "
" <input type=hidden name=c value= \" %s \" > "
" <input type=hidden name=showtable value=1> "
, s1
, s2
, cr - > m_coll
) ;
2013-09-17 02:33:45 +04:00
2013-09-17 03:27:48 +04:00
//
// print url filters. HACKy...
//
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_HTML )
g_parms . sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
NULL ,
& sb ,
cr - > m_coll , // coll override
false ) ; // isJSON?
2013-09-17 03:27:48 +04:00
//
// end HACKy hack
//
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_HTML )
sb . safePrintf (
" </form> "
" </div> "
" <br> "
" <br> "
) ;
2013-10-16 23:12:22 +04:00
*/
2013-09-17 01:29:01 +04:00
2013-09-14 03:22:07 +04:00
//
// add search box to your site
//
2013-09-17 03:27:48 +04:00
/*
2013-09-14 03:22:07 +04:00
sb . safePrintf ( " <br> "
" <table> "
" <tr> "
" <td><a onclick=unhide();> "
" Add this search box to your site "
" </a> "
" </td> "
" </tr> "
" </table> " ) ;
2013-09-17 03:27:48 +04:00
*/
2013-09-14 03:22:07 +04:00
//
2013-10-16 23:12:22 +04:00
// show simpler url filters table
2013-09-14 03:22:07 +04:00
//
2013-10-16 23:12:22 +04:00
if ( fmt = = FMT_HTML ) {
2013-11-23 05:37:42 +04:00
/*
2013-10-16 23:12:22 +04:00
sb . safePrintf ( " <table> "
2013-10-17 01:13:28 +04:00
" <tr><td colspan=2> "
" <b>URL Filters</b> "
" </td></tr> \n "
2013-10-16 23:12:22 +04:00
) ;
// true means its html input
printUrlFilters ( sb , cr , fmt ) ;
// for adding new rule
sb . safePrintf ( " <tr> "
" <td>Expression "
" <input type=text name=expression size=30 "
" value= \" \" > "
" </td><td> "
" Action <input type=text name=action size=50 "
" value= \" \" > "
2013-10-17 01:13:28 +04:00
" "
" <input type=submit name=submit value=OK> "
2013-10-16 23:12:22 +04:00
" </td> "
" </tr> \n "
) ;
//sb.safePrintf("<tr><td colspan=2><font size=-1><i>U
sb . safePrintf ( " </table> \n " ) ;
2013-11-23 05:37:42 +04:00
*/
2013-10-16 23:12:22 +04:00
//
// END THE BIG FORM
//
sb . safePrintf ( " </form> " ) ;
}
2013-09-14 03:22:07 +04:00
2013-10-16 23:12:22 +04:00
//
// show reset and delete crawl buttons
//
2013-09-26 01:37:20 +04:00
if ( fmt = = FMT_HTML ) {
sb . safePrintf (
" <table cellpadding=5> "
" <tr> "
2013-09-17 23:21:09 +04:00
2013-09-26 01:37:20 +04:00
" <td> "
2013-09-17 22:27:31 +04:00
2013-09-17 23:21:09 +04:00
2013-09-26 01:37:20 +04:00
// reset collection form
" <form method=get action=/crawlbot> "
2013-10-16 01:08:55 +04:00
" %s " // hidden tags
, hb . getBufStart ( )
2013-09-26 01:37:20 +04:00
) ;
2013-10-15 23:40:56 +04:00
sb . safePrintf (
2013-09-26 01:37:20 +04:00
2013-11-15 02:09:05 +04:00
" <input type=hidden name=reset value=1> "
2013-09-26 01:37:20 +04:00
// also show it in the display, so set "c"
" <input type=submit name=button value= \" "
" Reset this collection \" > "
" </form> "
// end reset collection form
" </td> "
2013-09-17 23:21:09 +04:00
2013-09-26 01:37:20 +04:00
" <td> "
2013-09-17 23:21:09 +04:00
2013-09-26 01:37:20 +04:00
// delete collection form
" <form method=get action=/crawlbot> "
2013-10-16 01:08:55 +04:00
" %s "
, hb . getBufStart ( )
2013-09-26 01:37:20 +04:00
) ;
2013-09-17 23:21:09 +04:00
2013-10-15 23:40:56 +04:00
sb . safePrintf (
2013-11-15 02:09:05 +04:00
" <input type=hidden name=delete value=1> "
2013-09-26 01:37:20 +04:00
" <input type=submit name=button value= \" "
" Delete this collection \" > "
" </form> "
// end delete collection form
" </td> "
2013-09-14 03:22:07 +04:00
2013-11-15 02:07:45 +04:00
// restart collection form
" <td> "
" <form method=get action=/crawlbot> "
" %s "
2013-11-15 02:09:05 +04:00
" <input type=hidden name=restart value=1> "
2013-11-15 02:07:45 +04:00
" <input type=submit name=button value= \" "
" Restart this collection \" > "
" </form> "
" </td> "
2013-09-26 01:37:20 +04:00
" </tr> "
" </table> "
2013-11-15 02:07:45 +04:00
, hb . getBufStart ( )
2013-09-26 01:37:20 +04:00
) ;
}
2013-09-14 03:22:07 +04:00
2013-10-16 23:12:22 +04:00
2013-10-17 04:17:28 +04:00
// the ROOT JSON }
if ( fmt = = FMT_JSON )
sb . safePrintf ( " } \n " ) ;
2013-10-16 23:12:22 +04:00
2013-10-15 22:50:57 +04:00
char * ct = " text/html " ;
if ( fmt = = FMT_JSON ) ct = " application/json " ;
if ( fmt = = FMT_XML ) ct = " text/xml " ;
2013-11-13 01:51:52 +04:00
if ( fmt = = FMT_CSV ) ct = " text/csv " ;
2013-10-15 22:50:57 +04:00
2013-09-26 01:37:20 +04:00
// this could be in html json or xml
return g_httpServer . sendDynamicPage ( socket ,
2013-09-14 03:22:07 +04:00
sb . getBufStart ( ) ,
sb . length ( ) ,
2013-10-15 22:50:57 +04:00
- 1 , // cachetime
false ,
ct ) ;
2013-09-14 03:22:07 +04:00
/*
" <h1>API for Diffbot</h1> "
" <form action=/api/diffbot> "
" <input type=text name=url size=100> "
" <input type=submit name=inject value= \" Inject \" > "
" </form> "
" <br> "
" <h1>API for Crawlbot</h1> "
// "<form id=\"addCrawl\" onSubmit=\"addCrawlFromForm(); return false;\">"
" <form action=/api/startcrawl method=get> "
" <div class= \" control-group well \" > "
" <div id= \" apiSelection \" class= \" titleColumn \" > "
" <div class= \" row \" > "
" Token: <input type=text name=token><br><br> "
" API: <input type=text name=api> <i>(article, product)</i><br><br> "
" <div class= \" span2 \" ><label class= \" on-default-hide \" >Page-type</label></div> "
" <div class= \" input-append span7 \" > "
" <select id= \" apiSelect \" name= \" api \" class= \" span2 \" value= \" sds \" > "
" <option value= \" \" disabled= \" disabled \" selected= \" selected \" >Select pages to process and extract</option> "
" <option class= \" automatic \" value= \" article \" >Article</option> "
" <option class= \" automatic \" value= \" frontpage \" >Frontpage</option> "
" <option class= \" automatic \" value= \" image \" >Image</option> "
" <option class= \" automatic \" value= \" product \" >Product</option> "
" </select> "
" <span id= \" formError-apiSelect \" class= \" formError \" >Page-type is required</span> "
" <span class= \" inputNote \" >API calls will be made using your current token.</span> "
" </div> "
" </div> "
" </div> "
" <div id= \" apiQueryString \" class= \" titleColumn \" > "
" <div class= \" row \" > "
" <div class= \" span2 \" ><label class= \" on-default-hide \" >API Querystring</label></div> "
" <div class= \" input-prepend span7 \" > "
" <span class= \" add-on \" >?</span><input class= \" span6 search-input \" name= \" apiQueryString \" size= \" 16 \" type= \" text \" placeholder= \" Enter a querystring to specify Diffbot API parameters \" > "
" </div> "
" </div> "
" </div> "
" <hr> "
" <div id= \" seedUrl \" class= \" titleColumn \" > "
" <div class= \" row \" > "
" <div class= \" span2 \" ><label class= \" on-default-hide \" >Seed URL</label></div> "
" <div class= \" input-append span7 \" > "
" <input class= \" span6 search-input \" name= \" seed \" size= \" 16 \" type= \" text \" placeholder= \" Enter a seed URL \" > "
" <span id= \" formError-seedUrl \" class= \" formError \" ><br>Seed URL is required</span> "
" </div> "
" </div> "
" </div> "
" <hr> "
" <div id= \" headerRow \" class= \" titleColumn \" > "
" <div class= \" row \" > "
" <div class= \" span2 \" ><label class= \" on-default-hide \" ><strong>Crawl Filters</strong></label></div> "
" </div> "
" </div> "
" <div id= \" urlCrawlPattern \" class= \" titleColumn \" > "
" <div class= \" regex-edit row \" > "
" <div class= \" span2 \" ><label class= \" on-default-hide \" >URL Regex</label></div> "
" <div class= \" input-append span7 \" > "
" <input class= \" span6 \" name= \" urlCrawlPattern \" size= \" 16 \" type= \" text \" placeholder= \" Only crawl pages whose URLs match this regex \" value= \" \" > "
" <span class= \" inputNote \" >Diffbot uses <a href= \" http://www.regular-expressions.info/refflavors.html \" target= \" _blank \" >Java regex syntax</a>. Be sure to escape your characters.</span> "
" </div> "
" </div> "
" </div> "
" <div id= \" maxCrawled \" class= \" titleColumn \" > "
" <div class= \" regex-edit row \" ><div class= \" span2 \" ><label class= \" on-default-hide \" >Max Pages Crawled</label></div> <div class= \" input-append span7 \" > <input class= \" span1 \" name= \" maxCrawled \" size= \" \" type= \" text \" value= \" \" > </div> </div> </div> <div id= \" headerRow \" class= \" titleColumn \" > <div class= \" row \" > <div class= \" span2 \" ><label class= \" on-default-hide \" ><strong>Processing Filters</strong></label></div> </div> </div> <div id= \" classify \" class= \" titleColumn \" > <div class= \" row \" > <div class= \" span2 \" id= \" smartProcessLabel \" ><label class= \" on-default-hide \" >Smart Processing</label></div> <div class= \" span7 \" ><label class= \" checkbox \" ><input id= \" smartProcessing \" type= \" checkbox \" name= \" classify \" ><span id= \" smartProcessAutomatic \" >Only process pages that match the selected page-type. Uses <a href= \" /our-apis/classifier \" >Page Classifier API</a>.</span><span id= \" smartProcessCustom \" >Smart Processing only operates with Diffbot <a href= \" /products/automatic \" >Automatic APIs.</a></span></label></div> </div> </div> <div id= \" urlProcessPattern \" class= \" titleColumn \" > <div class= \" regex-edit row \" > <div class= \" span2 \" ><label class= \" on-default-hide \" >URL Regex</label></div> <div class= \" input-append span7 \" > <input class= \" span6 \" name= \" urlProcessPattern \" size= \" 16 \" type= \" text \" placeholder= \" Only process pages whose URLs match this regex \" value= \" \" > </div> </div> </div> <div id= \" pageProcessPattern \" class= \" titleColumn \" > <div class= \" regex-edit row \" > <div class= \" span2 \" ><label class= \" on-default-hide \" >Page-Content Regex</label></div> <div class= \" input-append span7 \" > <input class= \" span6 \" name= \" pageProcessPattern \" size= \" 16 \" type= \" text \" placeholder= \" Only process pages whose content contains a match to this regex \" value= \" \" > </div> </div> </div> 
<div id= \" maxMatches \" class= \" titleColumn \" > <div class= \" regex-edit row \" > <div class= \" span2 \" ><label class= \" on-default-hide \" >Max Pages Processed</label></div> <div class= \" input-append span7 \" > <input class= \" span1 \" name= \" maxProcessed \" size= \" 16 \" type= \" text \" value= \" \" > </div> </div> </div> <hr> <div class= \" controls row \" > <div class= \" span2 \" > </div> <div class= \" span7 \" id= \" startCrawlButtons \" > <button id= \" testButton \" class= \" btn \" type= \" button \" onclick= \" testcrawl(formToData());clicky.log('/dev/crawl#testCrawl','Test Crawl'); \" >Test</button> "
" <!--<button id= \" submitButton \" class= \" btn btn-info \" type= \" button \" onclick= \" addCrawlFromForm() \" >Start Crawl</button>--> "
" <input type=submit name=start value= \" Start Crawl \" > "
" </div> </div> </div> <div id= \" hiddenTestDiv \" style= \" display: none; \" ></div> </form> </div><!-- end Crawler tab --> " ) ;
*/
}
2013-10-22 04:35:14 +04:00
// . do not add dups into m_diffbotSeeds safebuf
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
long isInSeedBuf ( CollectionRec * cr , Url * url ) {
HashTableX * ht = & cr - > m_seedHashTable ;
// if table is empty, populate it
if ( ht - > m_numSlotsUsed < = 0 ) {
// initialize the hash table
if ( ! ht - > set ( 8 , 0 , 1024 , NULL , 0 , false , 1 , " seedtbl " ) )
return - 1 ;
// populate it from list of seed urls
char * p = cr - > m_diffbotSeeds . getBufStart ( ) ;
for ( ; p & & * p ; ) {
// get url
char * purl = p ;
// advance to next
for ( ; * p & & ! is_wspace_a ( * p ) ; p + + ) ;
// make end then
char * end = p ;
// skip possible white space. might be \0.
if ( * p ) p + + ;
// hash it
long long h64 = hash64 ( purl , end - purl ) ;
if ( ! ht - > addKey ( & h64 ) ) return - 1 ;
}
}
// is this url in the hash table?
long long u64 = hash64 ( url - > getUrl ( ) , url - > getUrlLen ( ) ) ;
if ( ht - > isInTable ( & u64 ) ) return 1 ;
// add it to hashtable
if ( ! ht - > addKey ( & u64 ) ) return - 1 ;
// WAS not in table
return 0 ;
}
2013-09-26 02:04:16 +04:00
// just use "fakeips" based on the hash of each url hostname/subdomain
// so we don't waste time doing ip lookups.
2013-09-26 03:51:43 +04:00
bool getSpiderRequestMetaList ( char * doc ,
SafeBuf * listBuf ,
2013-10-22 04:35:14 +04:00
bool spiderLinks ,
CollectionRec * cr ) {
2013-10-18 22:53:14 +04:00
if ( ! doc ) return true ;
2013-09-26 02:04:16 +04:00
// . scan the list of urls
// . assume separated by white space \n \t or space
char * p = doc ;
long now = getTimeGlobal ( ) ;
// a big loop
while ( true ) {
// skip white space (\0 is not a whitespace)
for ( ; is_wspace_a ( * p ) ; p + + ) ;
// all done?
if ( ! * p ) break ;
// save it
char * saved = p ;
// advance to next white space
for ( ; ! is_wspace_a ( * p ) & & * p ; p + + ) ;
// set end
char * end = p ;
// get that url
Url url ;
url . set ( saved , end - saved ) ;
// if not legit skip
if ( url . getUrlLen ( ) < = 0 ) continue ;
// need this
long long probDocId = g_titledb . getProbableDocId ( & url ) ;
// make it
SpiderRequest sreq ;
sreq . reset ( ) ;
sreq . m_firstIp = url . getHostHash32 ( ) ; // fakeip!
sreq . m_hostHash32 = url . getHostHash32 ( ) ;
sreq . m_domHash32 = url . getDomainHash32 ( ) ;
sreq . m_siteHash32 = url . getHostHash32 ( ) ;
sreq . m_probDocId = probDocId ;
sreq . m_hopCount = 0 ; // we're a seed
sreq . m_hopCountValid = true ;
sreq . m_addedTime = now ;
sreq . m_isNewOutlink = 1 ;
sreq . m_isWWWSubdomain = url . isSimpleSubdomain ( ) ;
// treat seed urls as being on same domain and hostname
sreq . m_sameDom = 1 ;
sreq . m_sameHost = 1 ;
sreq . m_sameSite = 1 ;
2013-09-26 03:51:43 +04:00
sreq . m_fakeFirstIp = 1 ;
sreq . m_isAddUrl = 1 ;
// spider links?
if ( ! spiderLinks )
sreq . m_avoidSpiderLinks = 1 ;
2013-09-26 02:04:16 +04:00
// save the url!
strcpy ( sreq . m_url , url . getUrl ( ) ) ;
// finally, we can set the key. isDel = false
sreq . setKey ( sreq . m_firstIp , probDocId , false ) ;
// store rdbid first
if ( ! listBuf - > pushChar ( RDB_SPIDERDB ) )
// return false with g_errno set
return false ;
// store it
if ( ! listBuf - > safeMemcpy ( & sreq , sreq . getRecSize ( ) ) )
// return false with g_errno set
return false ;
2013-10-22 04:35:14 +04:00
if ( ! cr ) continue ;
// do not add dups into m_diffbotSeeds safebuf
long status = isInSeedBuf ( cr , & url ) ;
// error?
if ( status = = - 1 ) {
log ( " crawlbot: error adding seed to table: %s " ,
mstrerror ( g_errno ) ) ;
return true ;
}
// already in buf
if ( status = = 1 ) continue ;
// add url into m_diffbotSeeds, \n separated list
if ( cr - > m_diffbotSeeds . length ( ) )
// make it space not \n so it looks better in the
// json output i guess
cr - > m_diffbotSeeds . pushChar ( ' ' ) ; // \n
cr - > m_diffbotSeeds . safeMemcpy ( url . getUrl ( ) , url . getUrlLen ( ) ) ;
cr - > m_diffbotSeeds . nullTerm ( ) ;
2013-09-26 02:04:16 +04:00
}
// all done
return true ;
}
2013-09-29 00:17:43 +04:00
/*
2013-09-27 00:50:34 +04:00
bool isAliasUnique ( CollectionRec * cr , char * token , char * alias ) {
// scan all collections
for ( long i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) {
CollectionRec * cx = g_collectiondb . m_recs [ i ] ;
if ( ! cx ) continue ;
// must belong to us
if ( strcmp ( cx - > m_diffbotToken . getBufStart ( ) , token ) )
continue ;
// skip if collection we are putting alias on
if ( cx = = cr ) continue ;
// does it match?
if ( cx - > m_collectionNameAlias . length ( ) < = 0 ) continue ;
// return false if it matches! not unique
if ( strcmp ( cx - > m_collectionNameAlias . getBufStart ( ) ,
alias ) = = 0 )
return false ;
}
return true ;
}
2013-09-29 00:17:43 +04:00
*/
2013-10-12 02:14:26 +04:00
// json can be provided via get or post but content type must be
// url-encoded so we can test with a simple html form page.
2013-10-15 04:19:59 +04:00
/*
2013-10-15 03:19:30 +04:00
bool setSpiderParmsFromJSONPost ( TcpSocket * socket ,
HttpRequest * hr ,
CollectionRec * cr ) {
2013-10-12 02:14:26 +04:00
// get the json
char * json = hr - > getString ( " json " ) ;
if ( ! json )
return sendReply2 ( socket ,
FMT_JSON ,
" No &json= provided in request. " ) ;
2013-10-14 23:00:05 +04:00
Json JP ;
bool status = JP . parseJsonStringIntoJsonItems ( json ) ;
2013-10-12 02:14:26 +04:00
2013-10-14 23:00:05 +04:00
// wtf?
if ( ! status )
return sendReply2 ( socket , FMT_JSON ,
" Error with JSON parser. " ) ;
2013-10-12 02:14:26 +04:00
2013-10-14 23:00:05 +04:00
// error adding it?
if ( ! cr )
return sendReply2 ( socket , FMT_JSON ,
" Failed to create new collection. " ) ;
2013-10-15 02:10:48 +04:00
ji = JP . getFirstItem ( ) ;
char * seed = NULL ;
2013-10-12 02:14:26 +04:00
// traverse the json
2013-10-14 23:00:05 +04:00
for ( ; ji ; ji = ji - > m_next ) {
// just get STRINGS or NUMS
if ( ji - > m_type ! = JT_STRING & & ji - > m_type ! = JT_NUMBER )
continue ;
// check name
char * name = ji - > m_name ;
char * val = ji - > getValue ( ) ;
2013-10-15 02:10:48 +04:00
if ( strcmp ( name , " seed " ) = = 0 )
seed = val ;
if ( strcmp ( name , " email " ) = = 0 )
cr - > m_notifyEmail . set ( val ) ;
if ( strcmp ( name , " webhook " ) = = 0 )
cr - > m_notifyUrl . set ( val ) ;
if ( strcmp ( name , " frequency " ) = = 0 )
cr - > m_collectiveRespiderFrequency = atof ( val ) ;
if ( strcmp ( name , " maxToCrawl " ) = = 0 )
2013-10-23 22:40:30 +04:00
cr - > m_maxToCrawl = atoll ( val ) ;
2013-10-15 02:10:48 +04:00
if ( strcmp ( name , " maxToProcess " ) = = 0 )
2013-10-23 22:40:30 +04:00
cr - > m_maxToProcess = atoll ( val ) ;
2013-10-15 02:10:48 +04:00
if ( strcmp ( name , " pageProcessPattern " ) = = 0 )
cr - > m_diffbotPageProcessPattern . set ( val ) ;
if ( strcmp ( name , " obeyRobots " ) = = 0 ) {
if ( val [ 0 ] = = ' t ' | | val [ 0 ] = = ' T ' | | val [ 0 ] = = 1 )
cr - > m_useRobotsTxt = true ;
else
cr - > m_useRobotsTxt = false ;
}
if ( strcmp ( name , " onlyProcessNew " ) = = 0 ) {
if ( val [ 0 ] = = ' t ' | | val [ 0 ] = = ' T ' | | val [ 0 ] = = 1 )
cr - > m_diffbotOnlyProcessIfNew = true ;
else
cr - > m_diffbotOnlyProcessIfNew = false ;
}
if ( strcmp ( name , " pauseCrawl " ) = = 0 ) {
if ( val [ 0 ] = = ' t ' | | val [ 0 ] = = ' T ' | | val [ 0 ] = = 1 )
cr - > m_spideringEnabled = 0 ;
else
cr - > m_spideringEnabled = 1 ;
2013-10-14 23:00:05 +04:00
}
}
2013-10-15 02:10:48 +04:00
// set collective respider in case just that was passed
for ( long i = 0 ; i < MAX_FILTERS ; i + + )
cr - > m_spiderFreqs [ i ] = cr - > m_collectiveRespiderFrequency ;
2013-10-14 23:00:05 +04:00
2013-10-15 02:10:48 +04:00
// if url filters not specified, we are done
if ( ! JP . getItem ( " urlFilters " ) )
return true ;
// reset the url filters here to the default set.
// we will append the client's filters below them below.
resetUrlFilters ( cr ) ;
2013-10-14 23:00:05 +04:00
char * expression = NULL ;
char * action = NULL ;
// start over at top
ji = JP . getFirstItem ( ) ;
// "urlFilters": [
// {
2013-10-15 02:10:48 +04:00
// "value": "*", // MDW - this matches all urls! ("default")
2013-10-14 23:00:05 +04:00
// "action": "http://www.diffbot.com/api/analyze?mode=auto"
// }
// {
// "value": "company",
// "action" : "http://www.diffbot.com/api/article?tags&meta"
// }
// {
// "value": "^http://www",
// "action": "doNotProcess"
// }
// {
// "value": "$.html && category",
// "action": "doNotCrawl"
// }
// {
// "value": "!$.html && $.php",
// "action": "doNotCrawl"
// }
// ]
// how many filters do we have so far?
2013-10-15 02:10:48 +04:00
long nf = cr - > m_numRegExs ;
2013-10-14 23:00:05 +04:00
for ( ; ji ; ji = ji - > m_next ) {
// just get STRINGS only
if ( ji - > m_type ! = JT_STRING ) continue ;
// must be right now
char * name = ji - > m_name ;
char * value = ji - > getValue ( ) ;
if ( strcmp ( name , " value " ) = = 0 )
expression = value ;
if ( strcmp ( name , " action " ) = = 0 )
action = ji - > getValue ( ) ;
// need both
if ( ! action ) continue ;
if ( ! expression ) continue ;
2013-10-15 02:10:48 +04:00
// they use "*" instead of "default" so put that back
if ( expression [ 0 ] = = ' * ' )
expression = " default " ;
2013-10-14 23:00:05 +04:00
// deal with it
cr - > m_regExs [ 1 ] . set ( expression ) ;
2013-10-15 02:10:48 +04:00
cr - > m_numRegExs + + ;
2013-10-14 23:00:05 +04:00
long priority = 50 ;
// default diffbot api call:
2013-10-15 02:10:48 +04:00
char * api = NULL ;
2013-10-14 23:00:05 +04:00
if ( strcasecmp ( action , " donotcrawl " ) = = 0 )
priority = SPIDER_PRIORITY_FILTERED ;
2013-10-15 02:10:48 +04:00
//if ( strcasecmp(action,"donotprocess") == 0 )
// api = NULL;
2013-10-14 23:00:05 +04:00
// a new diffbot url?
if ( strcasecmp ( action , " http " ) = = 0 )
api = action ;
// add the new filter
2013-10-15 02:10:48 +04:00
cr - > m_regExs [ nf ] . set ( expression ) ;
2013-10-14 23:00:05 +04:00
cr - > m_spiderPriorities [ nf ] = priority ;
2013-10-15 02:10:48 +04:00
cr - > m_spiderDiffbotApiUrl [ nf ] . set ( api ) ;
nf + + ;
// add a mirror of that filter but for manually added,
// i.e. injected or via add url,
if ( priority < 0 ) continue ;
// make the priority higher!
cr - > m_regExs [ nf ] . safePrintf ( " ismanualadd && %s " , expression ) ;
cr - > m_spiderPriorities [ nf ] = 70 ;
cr - > m_spiderDiffbotApiUrl [ nf ] . set ( api ) ; // appends \0
nf + + ;
2013-10-14 23:00:05 +04:00
// NULL out again
action = NULL ;
expression = NULL ;
2013-10-15 02:10:48 +04:00
if ( nf < MAX_FILTERS ) continue ;
log ( " crawlbot: too many url filters! " ) ;
break ;
}
// update the counts
cr - > m_numRegExs = nf ;
cr - > m_numRegExs2 = nf ;
cr - > m_numRegExs3 = nf ;
cr - > m_numRegExs10 = nf ;
cr - > m_numRegExs5 = nf ;
cr - > m_numRegExs6 = nf ;
cr - > m_numRegExs7 = nf ;
cr - > m_numRegExs11 = nf ;
// set collective respider
for ( long i = 0 ; i < nf ; i + + )
cr - > m_spiderFreqs [ i ] = cr - > m_collectiveRespiderFrequency ;
return true ;
}
2013-10-15 04:19:59 +04:00
*/
2013-10-15 02:10:48 +04:00
2013-10-15 03:19:30 +04:00
2013-12-11 01:09:55 +04:00
/*
THIS IS NOW AUTOMATIC from new Parms . cpp broadcast logic
2013-10-22 04:35:14 +04:00
2013-10-15 03:19:30 +04:00
bool setSpiderParmsFromHtmlRequest ( TcpSocket * socket ,
HttpRequest * hr ,
CollectionRec * cr ) {
// update the url filters for now since that is complicated
// supply "cr" directly since "c" may not be in the http
// request if addcoll=xxxxxx (just created a new rec)
2013-10-15 04:19:59 +04:00
//long page = PAGE_FILTERS;
//WebPage *pg = g_pages.getPage ( page ) ;
//g_parms.setFromRequest ( hr , socket , pg->m_function, cr );
2013-12-04 04:17:36 +04:00
bool rebuild = false ;
2013-10-15 04:19:59 +04:00
//
// set other diffbot parms for this collection
//
long maxToCrawl = hr - > getLongLong ( " maxToCrawl " , - 1LL ) ;
2013-11-12 03:52:04 +04:00
if ( maxToCrawl = = - 1 )
maxToCrawl = hr - > getLongLong ( " maxToDownload " , - 1LL ) ;
2013-10-15 04:19:59 +04:00
if ( maxToCrawl ! = - 1 ) {
2013-10-23 22:40:30 +04:00
cr - > m_maxToCrawl = maxToCrawl ;
2013-10-15 04:19:59 +04:00
cr - > m_needsSave = 1 ;
}
long maxToProcess = hr - > getLongLong ( " maxToProcess " , - 1LL ) ;
if ( maxToProcess ! = - 1 ) {
2013-10-23 22:40:30 +04:00
cr - > m_maxToProcess = maxToProcess ;
cr - > m_needsSave = 1 ;
}
// -1 means no max, so use -2 as default here
long maxCrawlRounds = hr - > getLongLong ( " maxCrawlRounds " , - 2LL ) ;
2013-11-12 03:52:04 +04:00
if ( maxCrawlRounds = = - 2 )
maxCrawlRounds = hr - > getLongLong ( " maxRounds " , - 2LL ) ;
2013-10-23 22:40:30 +04:00
if ( maxCrawlRounds ! = - 2 ) {
cr - > m_maxCrawlRounds = maxCrawlRounds ;
2013-10-15 04:19:59 +04:00
cr - > m_needsSave = 1 ;
}
char * email = hr - > getString ( " notifyEmail " , NULL , NULL ) ;
if ( email ) {
cr - > m_notifyEmail . set ( email ) ;
cr - > m_needsSave = 1 ;
}
char * url = hr - > getString ( " notifyWebHook " , NULL , NULL ) ;
2013-10-15 22:31:02 +04:00
if ( ! url ) url = hr - > getString ( " notifyWebhook " , NULL , NULL ) ;
2013-10-15 04:19:59 +04:00
if ( url ) {
2013-10-22 04:51:23 +04:00
// assume url is invalid, purge it
cr - > m_notifyUrl . purge ( ) ;
// normalize
Url norm ;
norm . set ( url ) ;
if ( norm . getDomainLen ( ) > 0 & &
norm . getHostLen ( ) > 0 )
// set the ssafebuf to it. will \0 terminate it.
cr - > m_notifyUrl . set ( norm . getUrl ( ) ) ;
// save the collection rec
2013-10-15 04:19:59 +04:00
cr - > m_needsSave = 1 ;
}
long pause = hr - > getLong ( " pauseCrawl " , - 1 ) ;
2013-10-23 05:51:09 +04:00
// /v2/bulk api support
if ( pause = = - 1 ) pause = hr - > getLong ( " pause " , - 1 ) ;
2013-10-15 04:19:59 +04:00
if ( pause = = 0 ) { cr - > m_needsSave = 1 ; cr - > m_spideringEnabled = 1 ; }
if ( pause = = 1 ) { cr - > m_needsSave = 1 ; cr - > m_spideringEnabled = 0 ; }
long obeyRobots = hr - > getLong ( " obeyRobots " , - 1 ) ;
2013-10-23 05:55:19 +04:00
if ( obeyRobots = = - 1 ) obeyRobots = hr - > getLong ( " robots " , - 1 ) ;
2013-10-15 04:19:59 +04:00
if ( obeyRobots ! = - 1 ) {
cr - > m_useRobotsTxt = obeyRobots ;
cr - > m_needsSave = 1 ;
}
2013-10-29 20:31:57 +04:00
long restrictDomain = hr - > getLong ( " restrictDomain " , - 1 ) ;
if ( restrictDomain ! = - 1 ) {
cr - > m_restrictDomain = restrictDomain ;
cr - > m_needsSave = 1 ;
2013-12-04 04:17:36 +04:00
rebuild = true ;
2013-10-29 20:31:57 +04:00
}
2013-11-21 04:41:28 +04:00
char * api = hr - > getString ( " apiUrl " , NULL ) ;
if ( api ) {
cr - > m_diffbotApiUrl . set ( api ) ;
cr - > m_needsSave = 1 ;
}
char * ppp1 = hr - > getString ( " urlCrawlPattern " , NULL ) ;
if ( ppp1 ) {
cr - > m_diffbotUrlCrawlPattern . set ( ppp1 ) ;
cr - > m_needsSave = 1 ;
2013-12-04 04:17:36 +04:00
rebuild = true ;
2013-11-21 04:41:28 +04:00
}
char * ppp2 = hr - > getString ( " urlProcessPattern " , NULL ) ;
if ( ppp2 ) {
cr - > m_diffbotUrlProcessPattern . set ( ppp2 ) ;
cr - > m_needsSave = 1 ;
}
char * ppp3 = hr - > getString ( " pageProcessPattern " , NULL ) ;
if ( ppp3 ) {
cr - > m_diffbotPageProcessPattern . set ( ppp3 ) ;
2013-10-15 04:19:59 +04:00
cr - > m_needsSave = 1 ;
}
2013-12-04 04:23:05 +04:00
// reg ex support
char * rx1 = hr - > getString ( " urlCrawlRegEx " , NULL ) ;
// clear what we had
if ( rx1 & & cr - > m_hasucr ) {
regfree ( & cr - > m_ucr ) ;
cr - > m_hasucr = false ;
cr - > m_diffbotUrlCrawlRegEx . purge ( ) ;
cr - > m_needsSave = 1 ;
2013-12-04 04:23:58 +04:00
rebuild = true ;
2013-12-04 04:23:05 +04:00
}
// add a new one if not blank
if ( rx1 & & rx1 [ 0 ] ) {
cr - > m_diffbotUrlCrawlRegEx . set ( rx1 ) ;
cr - > m_needsSave = 1 ;
// this will store the compiled regular expression into ucr
if ( regcomp ( & cr - > m_ucr ,
// the regular expression to compile
rx1 ,
// some flags
REG_EXTENDED | REG_ICASE |
REG_NEWLINE | REG_NOSUB ) ) {
regfree ( & cr - > m_ucr ) ;
// should never fail!
return log ( " xmldoc: regcomp %s failed: %s. "
" Ignoring. " ,
rx1 , mstrerror ( errno ) ) ;
}
cr - > m_hasucr = true ;
}
char * rx2 = hr - > getString ( " urlProcessRegEx " , NULL ) ;
// clear what we had
if ( rx2 & & cr - > m_hasupr ) {
regfree ( & cr - > m_upr ) ;
cr - > m_hasupr = false ;
cr - > m_diffbotUrlProcessRegEx . purge ( ) ;
cr - > m_needsSave = 1 ;
}
// add a new one if not blank
if ( rx2 & & rx2 [ 0 ] ) {
cr - > m_diffbotUrlProcessRegEx . set ( rx2 ) ;
cr - > m_needsSave = 1 ;
// this will store the compiled regular expression into upr
if ( regcomp ( & cr - > m_upr ,
// the regular expression to compile
rx2 ,
// some flags
REG_EXTENDED | REG_ICASE |
REG_NEWLINE | REG_NOSUB ) ) {
regfree ( & cr - > m_upr ) ;
// error!
return log ( " xmldoc: regcomp %s failed: %s. "
" Ignoring. " ,
rx2 , mstrerror ( errno ) ) ;
}
cr - > m_hasupr = true ;
}
2013-11-12 03:52:04 +04:00
float respider = hr - > getFloat ( " repeatJob " , - 1.0 ) ;
2013-10-23 05:55:19 +04:00
if ( respider = = - 1.0 ) respider = hr - > getFloat ( " repeat " , - 1.0 ) ;
2013-11-12 03:52:04 +04:00
if ( respider = = - 1.0 ) respider = hr - > getFloat ( " repeatCrawl " , - 1.0 ) ;
2013-10-15 04:19:59 +04:00
if ( respider > = 0.0 ) {
2013-10-18 04:17:19 +04:00
// if not 0, then change this by the delta
if ( cr - > m_spiderRoundStartTime ) {
2013-10-25 23:11:40 +04:00
// convert from days into seconds
float rfOld = cr - > m_collectiveRespiderFrequency ;
float rfNew = respider ;
// 86400 seconds in a day
long secondsOld = ( long ) ( rfOld * 86400 ) ;
long secondsNew = ( long ) ( rfNew * 86400 ) ;
2013-10-18 04:17:19 +04:00
// remove old one.
2013-10-25 23:11:40 +04:00
cr - > m_spiderRoundStartTime - = secondsOld ;
2013-10-18 04:17:19 +04:00
// add in new one
2013-10-25 23:11:40 +04:00
cr - > m_spiderRoundStartTime + = secondsNew ;
2013-10-18 04:17:19 +04:00
}
// if 0 that means NO recrawling
2013-10-18 05:59:00 +04:00
if ( respider = = 0.0 ) {
cr - > m_spiderRoundStartTime = 0 ; //getTimeGlobal();
2013-10-18 04:17:19 +04:00
}
2013-10-15 04:19:59 +04:00
cr - > m_collectiveRespiderFrequency = respider ;
cr - > m_needsSave = 1 ;
}
2013-10-25 06:05:57 +04:00
2013-10-29 08:20:44 +04:00
float delay = hr - > getFloat ( " crawlDelay " , - 1.0 ) ;
//long crawlWait = hr->getLong("wait",-1);
2013-12-04 04:17:36 +04:00
if ( delay > = 0.0 ) {
rebuild = true ;
2013-10-29 08:20:44 +04:00
cr - > m_collectiveCrawlDelay = delay ;
2013-12-04 04:17:36 +04:00
}
2013-10-25 06:05:57 +04:00
2013-11-05 01:57:44 +04:00
long onlyProcessNew = hr - > getLong ( " onlyProcessIfNew " , - 1 ) ;
2013-10-15 04:19:59 +04:00
if ( onlyProcessNew ! = - 1 ) {
cr - > m_diffbotOnlyProcessIfNew = onlyProcessNew ;
cr - > m_needsSave = 1 ;
}
// set collective respider
2013-10-22 02:06:23 +04:00
//for ( long i =0 ; i < cr->m_numRegExs ; i++ ) {
// if ( cr->m_collectiveRespiderFrequency == 0.0 )
// cr->m_spiderFreqs[i] = 0.000;
// else
// cr->m_spiderFreqs[i] = 0.001;
// //cr->m_collectiveRespiderFrequency;
//}
2013-10-15 04:19:59 +04:00
2013-10-23 05:51:09 +04:00
char * path = hr - > getPath ( ) ;
bool isBulkApi = false ;
if ( path & & strncmp ( path , " /v2/bulk " , 8 ) = = 0 ) isBulkApi = true ;
2013-10-15 04:19:59 +04:00
// were any url filteres specified? if not, don't reset them
2013-11-21 04:41:28 +04:00
//if ( ! hr->hasField("action") )
// return true;
2013-10-15 04:19:59 +04:00
// reset the url filters here to the default set.
// we will append the client's filters below them below.
resetUrlFilters ( cr ) ;
2013-12-04 04:17:36 +04:00
// if it was not recrawling and we made it start we have
// to repopulate waiting tree because most entries will
// need to be re-added!
// really, anytime we change url filters we have to repopulate
// the waiting tree
SpiderColl * sc = cr - > m_spiderColl ;
if ( sc & & rebuild ) {
// this is causing a bulk job not to complete because
// jenkins keeps checking it every 10 seconds
sc - > m_waitingTreeNeedsRebuild = true ;
}
2013-11-21 04:41:28 +04:00
return true ;
2013-10-15 04:19:59 +04:00
// "urlFilters": [
// {
// "value": "*", // MDW - this matches all urls! ("default")
// "action": "http://www.diffbot.com/api/analyze?mode=auto"
// }
// {
// "value": "company",
// "action" : "http://www.diffbot.com/api/article?tags&meta"
// }
// {
// "value": "^http://www",
// "action": "doNotProcess"
// }
// {
// "value": "$.html && category",
// "action": "doNotCrawl"
// }
// {
// "value": "!$.html && $.php",
// "action": "doNotCrawl"
// }
// ]
char * expression = NULL ;
char * action = NULL ;
// how many filters do we have so far?
long nf = cr - > m_numRegExs ;
2013-10-15 22:22:59 +04:00
// delete the 3rd default filter cuz we should re-add it below
// to the bottom of the list.
if ( nf > = 3 ) nf - - ;
bool addedDefault = false ;
2013-10-15 04:19:59 +04:00
// loop over the cgi parms
for ( long i = 0 ; i < hr - > getNumFields ( ) ; i + + ) {
// get cgi parm name
char * field = hr - > getField ( i ) ;
//long flen = hr->getFieldLen ( i );
if ( strcmp ( field , " expression " ) = = 0 )
expression = hr - > getValue ( i ) ;
if ( strcmp ( field , " action " ) = = 0 )
action = hr - > getValue ( i ) ;
// need both
if ( ! action ) continue ;
2013-11-14 21:54:36 +04:00
// no! the /v2/bulk api just has a single action
if ( isBulkApi ) expression = " * " ;
2013-10-15 04:19:59 +04:00
// action before expresion???? set action to NULL then?
2013-11-14 21:54:36 +04:00
if ( ! expression ) continue ;
//else continue;// { action = NULL; continue; }
2013-10-18 04:17:19 +04:00
// skip whitespace
while ( is_wspace_a ( * expression ) ) expression + + ;
while ( is_wspace_a ( * action ) ) action + + ;
2013-10-16 23:12:22 +04:00
// skip if expression is empty
if ( ! expression [ 0 ] ) {
action = NULL ; expression = NULL ; continue ; }
2013-10-15 04:19:59 +04:00
// they use "*" instead of "default" so put that back
2013-10-15 22:22:59 +04:00
if ( expression [ 0 ] = = ' * ' ) {
2013-10-15 04:19:59 +04:00
expression = " default " ;
2013-10-15 22:22:59 +04:00
addedDefault = true ;
}
2013-10-15 04:19:59 +04:00
// deal with it
long priority = 50 ;
// default diffbot api call:
2013-10-18 04:17:19 +04:00
//char *api = NULL;
2013-10-16 23:12:22 +04:00
if ( strcasecmp ( action , " donotcrawl " ) = = 0 )
2013-10-15 04:19:59 +04:00
priority = SPIDER_PRIORITY_FILTERED ;
//if ( strcasecmp(action,"donotprocess") == 0 )
// api = NULL;
// a new diffbot url?
2013-10-16 23:19:25 +04:00
//if ( strncasecmp(action,"http",4) == 0 )
2013-10-18 04:17:19 +04:00
//api = action;
2013-10-15 04:19:59 +04:00
2013-10-22 00:44:30 +04:00
// add a mirror of that filter but for manually added,
// i.e. injected or via add url,
if ( priority > = 0 ) {
// purge because might have been the last "default"
// filter that we did nf-- above on.
cr - > m_regExs [ nf ] . purge ( ) ;
// make the priority higher!
cr - > m_regExs [ nf ] . safePrintf ( " ismanualadd && %s " ,
expression ) ;
cr - > m_spiderPriorities [ nf ] = 70 ;
cr - > m_spiderDiffbotApiUrl [ nf ] . set ( action ) ; // appends\0
2013-10-22 02:06:23 +04:00
cr - > m_spiderFreqs [ nf ] =
cr - > m_collectiveRespiderFrequency ;
2013-10-22 00:44:30 +04:00
nf + + ;
}
2013-10-15 04:19:59 +04:00
// add the new filter
cr - > m_regExs [ nf ] . set ( expression ) ;
cr - > m_spiderPriorities [ nf ] = priority ;
2013-10-18 04:17:19 +04:00
cr - > m_spiderDiffbotApiUrl [ nf ] . set ( action ) ;
2013-10-22 02:06:23 +04:00
cr - > m_spiderFreqs [ nf ] = cr - > m_collectiveRespiderFrequency ;
2013-10-15 04:19:59 +04:00
nf + + ;
// NULL out again
action = NULL ;
expression = NULL ;
if ( nf < MAX_FILTERS ) continue ;
log ( " crawlbot: too many url filters! " ) ;
break ;
}
2013-10-15 22:22:59 +04:00
// if no '*' line was provided, add it here
if ( ! addedDefault ) {
2013-11-14 06:31:26 +04:00
cr - > m_regExs [ nf ] . set ( " default " ) ;
2013-10-15 22:22:59 +04:00
cr - > m_spiderPriorities [ nf ] = 50 ;
cr - > m_spiderDiffbotApiUrl [ nf ] . set ( NULL ) ;
2013-10-22 02:06:23 +04:00
cr - > m_spiderFreqs [ nf ] = cr - > m_collectiveRespiderFrequency ;
2013-10-15 22:22:59 +04:00
nf + + ;
}
2013-10-15 04:19:59 +04:00
// update the counts
cr - > m_numRegExs = nf ;
cr - > m_numRegExs2 = nf ;
cr - > m_numRegExs3 = nf ;
cr - > m_numRegExs10 = nf ;
cr - > m_numRegExs5 = nf ;
cr - > m_numRegExs6 = nf ;
cr - > m_numRegExs7 = nf ;
cr - > m_numRegExs11 = nf ;
// set collective respider
2013-10-22 02:06:23 +04:00
//for ( long i =0 ; i < nf ; i++ )
// cr->m_spiderFreqs[i] = cr->m_collectiveRespiderFrequency;
2013-10-15 04:19:59 +04:00
return true ;
}
2013-12-11 01:09:55 +04:00
*/
2013-10-24 22:32:41 +04:00
///////////
//
// SUPPORT for getting the last 100 spidered urls
//
// . sends request to each node
// . each node returns top 100 after scanning spiderdb (cache for speed)
// . master node gets top 100 of the top 100s
// . sends pretty html or json back to socket
// . then user can see why their crawl isn't working
// . also since we are scanning spiderdb indicate how many urls are
// ignored because they match "ismedia" or "!isonsamedomain" etc. so
// show each url filter expression then show how many urls matched that.
// when doing this make the spiderReply null, b/c the purpose is to see
// what urls
// . BUT url may never be attempted because it matches "ismedia" so that kind
// of thing might have to be indicated on the spiderdb dump above, not here.
//
//////////
//bool sendPageLast100Urls ( TcpSocket *socket , HttpRequest *hr ) {