more crawlbot fixes

mwells 2013-10-15 12:22:59 -06:00
parent d8835acfef
commit 313eb1209e
3 changed files with 85 additions and 53 deletions

View File

@@ -1857,7 +1857,7 @@ public:
char *m_desc;
};
static class HelpItem s_his[] = {
{"format","Use &format=json to show JSON output."},
{"format","Use &format=html to show HTML output. Default is JSON."},
{"token","Required for all operations below."},
{"name","Name of the crawl. If missing will just show "
@@ -1871,11 +1871,10 @@ static class HelpItem s_his[] = {
""},
{"addUrls","A huge string of whitespace separated URLs to add to "
"spiderdb for crawling."},
{"spiderLinks","Use 0 or 1 to not spider or spider links from "
"the seed url. Pass this along with the seed "
"parameter.A Use "
"this parameter in conjunction with the addUrls parameter as well. "
"The DEFAULT is 0!!! So if seeding make sure to say &spiderLinks=1."},
{"spiderLinks","Use 1 or 0 to spider the links or NOT spider "
"the links, respectively, from "
"the provided seed or addUrls parameters. "
"The default is 1."},
{"maxToCrawl", "Specify max pages to successfully download."},
{"maxToProcess", "Specify max pages to successfully process through "
@@ -1889,7 +1888,9 @@ static class HelpItem s_his[] = {
"contains any of these then we send it to diffbot for processing. "
"If this is empty we send all pages to diffbot for processing."},
{"expression","A pattern to match in a URL. Example expressions:"},
{"expression","A pattern to match in a URL. List up to 100 "
"expression/action pairs in the HTTP request. "
"Example expressions:"},
{"action","Take the appropriate action when preceeding pattern is "
"matched. Specify multiple expression/action pairs to build a "
"table of filters. Each URL being spidered will take the given "
@@ -1920,6 +1921,37 @@ char *getInputString ( char *string , HttpRequest *hr , Json *JS ) {
// . so if no &cast is present we are the original!!!
bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
+// print help
+long help = hr->getLong("help",0);
+if ( help ) {
+SafeBuf sb;
+sb.safePrintf("<html>"
+"<title>Crawlbot API</title>"
+"<h1>Crawlbot API</h1>"
+"<b>Use the parameters below on the "
+"<a href=\"/crawlbot\">/crawlbot</a> page."
+"</b><br><br>"
+"<table>"
+);
+for ( long i = 0 ; i < 1000 ; i++ ) {
+HelpItem *h = &s_his[i];
+if ( ! h->m_parm ) break;
+sb.safePrintf( "<tr>"
+"<td>%s</td>"
+"<td>%s</td>"
+"</tr>"
+, h->m_parm
+, h->m_desc
+);
+}
+sb.safePrintf("</table>"
+"</html>");
+return g_httpServer.sendDynamicPage (socket,
+sb.getBufStart(),
+sb.length(),
+0); // cachetime
+}
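
The loop above walks s_his[] until it hits an entry whose m_parm is NULL, so the table must end with a NULL sentinel entry. A minimal standalone sketch of that sentinel-terminated-table pattern, with two illustrative rows rather than the full table:

    #include <cstdio>

    struct HelpItem { const char *m_parm; const char *m_desc; };

    // illustrative rows; the real table lives in PageCrawlbot.cpp
    static HelpItem s_his[] = {
        {"format", "Use &format=html to show HTML output. Default is JSON."},
        {"token",  "Required for all operations below."},
        {NULL, NULL}   // sentinel: the print loop stops here
    };

    int main() {
        printf("<table>\n");
        for (long i = 0; i < 1000; i++) {
            HelpItem *h = &s_his[i];
            if (!h->m_parm) break;   // hit the sentinel
            printf("<tr><td>%s</td><td>%s</td></tr>\n", h->m_parm, h->m_desc);
        }
        printf("</table>\n");
        return 0;
    }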
// . Pages.cpp by default broadcasts all PageCrawlbot /crawlbot
// requests to every host in the network unless a cast=0 is
// explicitly given
@@ -1936,9 +1968,10 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
-char fmt = FMT_HTML;
+char fmt = FMT_JSON;
char *fs = hr->getString("format",NULL,NULL);
// give john a json api
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
// if we got json as input, give it as output
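
Net effect of this hunk: JSON is now the default output format, and &format=html must be passed explicitly to get HTML back. A standalone sketch of the same negotiation; the Fmt values here are stand-ins for the real FMT_* constants:

    #include <cstdio>
    #include <cstring>

    enum Fmt { FMT_HTML, FMT_JSON, FMT_XML };

    // fs is the value of &format=, or NULL when the parameter is absent
    static Fmt pickFormat(const char *fs) {
        Fmt fmt = FMT_JSON;                            // new default
        if (fs && strcmp(fs, "html") == 0) fmt = FMT_HTML;
        if (fs && strcmp(fs, "json") == 0) fmt = FMT_JSON;
        if (fs && strcmp(fs, "xml")  == 0) fmt = FMT_XML;
        return fmt;
    }

    int main() {
        printf("%d %d\n", pickFormat(NULL), pickFormat("html")); // 1 0
        return 0;
    }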
@@ -1948,11 +1981,6 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
//char *token = getInputString ( "token" );
char *token = hr->getString("token");
-if ( gbstrlen(token) > 32 ) {
-log("crawlbot: token is over 32 chars.");
-return NULL;
-}
if ( ! token && ( cast == 0 || fmt == FMT_JSON ) ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
@@ -1986,6 +2014,12 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
0); // cachetime
}
+if ( gbstrlen(token) > 32 ) {
+log("crawlbot: token is over 32 chars");
+char *msg = "crawlbot: token is over 32 chars";
+return sendErrorReply2 (socket,fmt,msg);
+}
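
The over-length token check lands here, after fmt is known, so the error can be reported through sendErrorReply2 in the caller's requested format rather than as the bare NULL return deleted above. A rough standalone sketch of that flow; sendError and rejectLongToken are hypothetical stand-ins for sendErrorReply2 and the inline check:

    #include <cstdio>
    #include <cstring>

    // stand-in for sendErrorReply2(socket,fmt,msg)
    static bool sendError(bool json, const char *msg) {
        if (json) printf("{\"error\":\"%s\"}\n", msg);
        else      printf("<b>%s</b>\n", msg);
        return true;
    }

    // returns true if an error reply was sent; strlen stands in for gbstrlen
    static bool rejectLongToken(const char *token, bool json) {
        if (token && strlen(token) > 32)
            return sendError(json, "crawlbot: token is over 32 chars");
        return false;
    }

    int main() {
        rejectLongToken("0123456789012345678901234567890123", true); // 34 chars
        return 0;
    }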
char *seed = hr->getString("seed");
char *addUrls = hr->getString("addUrls");
@@ -2140,36 +2174,6 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
//
/////////
-// print help
-long help = hr->getLong("help",0);
-if ( help ) {
-SafeBuf sb;
-sb.safePrintf("<html>"
-"<title>Crawlbot API</title>"
-"<h1>Crawlbot API</h1>"
-"<b>Use the parameters below on the "
-"<a href=\"/crawlbot\">/crawlbot</a> page."
-"</b><br><br>"
-"<table>"
-);
-for ( long i = 0 ; i < 1000 ; i++ ) {
-HelpItem *h = &s_his[i];
-if ( ! h->m_parm ) break;
-sb.safePrintf( "<tr>"
-"<td>%s</td>"
-"<td>%s</td>"
-"</tr>"
-, h->m_parm
-, h->m_desc
-);
-}
-sb.safePrintf("</table>"
-"</html>");
-return g_httpServer.sendDynamicPage (socket,
-sb.getBufStart(),
-sb.length(),
-0); // cachetime
-}
// collectionrec must be non-null at this point. i.e. we added it
if ( ! cr )
@@ -2202,7 +2206,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// . avoid spidering links for these urls? i would say
// . default is to NOT spider the links...
// . support camel case and all lower case
long spiderLinks = hr->getLong("spiderLinks",0);
long spiderLinks = hr->getLong("spiderLinks",1);
spiderLinks = hr->getLong("spiderlinks",spiderLinks);
// make a list of spider requests from these urls
SafeBuf listBuf;
@@ -2248,7 +2252,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
if ( ! st->m_msg7.inject ( st->m_socket,
&st->m_hr,
st ,
-injectedUrlWrapper ) )
+injectedUrlWrapper ,
+1 ) ) // spiderLinks default is on
// if blocked, return now
return false;
// otherwise send back reply
@@ -2958,7 +2963,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"<form method=get action=/crawlbot>"
"<tr>"
"<td>"
"<b>Inject Url: </b>"
"<b>Add Seed Url: </b>"
"</td><td>"
"<input type=text name=seed size=50>"
" "
@@ -3679,18 +3684,26 @@ bool resetUrlFilters ( CollectionRec *cr ) {
long i = 0;
-// 3rd default url filter
+// 1st default url filter
cr->m_regExs[i].set("ismedia");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
cr->m_spiderDiffbotApiUrl[i].purge();
i++;
-// 4th filter
+// 2nd default filter
cr->m_regExs[i].set("!isonsamedomain");
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
cr->m_spiderDiffbotApiUrl[i].purge();
i++;
+// 3rd default filter. BUT if they specified filters this will
+// be deleted and re-added to the bottom
+cr->m_regExs[i].set("default");
+cr->m_spiderPriorities [i] = 50;
+cr->m_spiderDiffbotApiUrl[i].purge();
+i++;
cr->m_numRegExs = i;
cr->m_numRegExs2 = i;
cr->m_numRegExs3 = i;
@@ -3802,6 +3815,12 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
// how many filters do we have so far?
long nf = cr->m_numRegExs;
+// delete the 3rd default filter cuz we should re-add it below
+// to the bottom of the list.
+if ( nf >= 3 ) nf--;
+bool addedDefault = false;
// loop over the cgi parms
for ( long i = 0 ; i < hr->getNumFields() ; i++ ) {
// get cgi parm name
@@ -3816,8 +3835,10 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
// action before expression???? set action to NULL then?
if ( ! expression ) { action = NULL; continue; }
// they use "*" instead of "default" so put that back
-if ( expression[0] == '*' )
+if ( expression[0] == '*' ) {
expression = "default";
+addedDefault = true;
+}
// deal with it
long priority = 50;
// default diffbot api call:
@@ -3855,6 +3876,14 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
break;
}
+// if no '*' line was provided, add it here
+if ( ! addedDefault ) {
+cr->m_regExs [nf].safePrintf("default");
+cr->m_spiderPriorities [nf] = 50;
+cr->m_spiderDiffbotApiUrl[nf].set(NULL);
+nf++;
+}
// update the counts
cr->m_numRegExs = nf;
cr->m_numRegExs2 = nf;

View File

@@ -443,7 +443,8 @@ Msg7::~Msg7 () {
bool Msg7::inject ( TcpSocket *s ,
HttpRequest *r ,
void *state ,
-void (*callback)(void *state)) {
+void (*callback)(void *state) ,
+long spiderLinksDefault ) {
// save socket
// socket is responsible for freeing the HTTP request, which contains
@@ -478,9 +479,10 @@ bool Msg7::inject ( TcpSocket *s ,
char hasMime = r->getLong("hasmime",0);
// do consistency testing?
bool doConsistencyTesting = r->getLong("dct",0);
-// . default spiderlinks to no for injects
+// . default spiderlinks to no for injects, EXCEPT for seed urls
+// from PageCrawlbot.cpp: people expect links to be spidered there.
// . support both camel and all-lower cases
long spiderLinks = r->getLong("spiderLinks",0);
long spiderLinks = r->getLong("spiderLinks",spiderLinksDefault);
spiderLinks = r->getLong("spiderlinks",spiderLinks);
long forcedIp = 0;
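
The new trailing parameter lets each call site pick the spiderLinks fallback: existing callers keep the old default of 0, while the PageCrawlbot.cpp seed path passes 1. A stripped-down sketch of that defaulted-argument plumbing; everything but the new parameter (socket, request, state, callback) is elided:

    #include <cstdio>

    // stripped-down Msg7: the real inject() also takes TcpSocket*,
    // HttpRequest*, a state pointer and a callback, as shown above
    struct Msg7 {
        bool inject(long spiderLinksDefault = 0) {
            // the request's &spiderLinks= parameter, when present,
            // would override this default
            printf("spiderLinks defaults to %ld\n", spiderLinksDefault);
            return true;
        }
    };

    int main() {
        Msg7 m;
        m.inject();   // ordinary inject: links not spidered by default
        m.inject(1);  // crawlbot seed path: links spidered by default
        return 0;
    }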

View File

@@ -36,7 +36,8 @@ public:
bool inject ( TcpSocket *s ,
HttpRequest *r ,
void *state ,
-void (*callback)(void *state));
+void (*callback)(void *state),
+long spiderLinksDefault = 0 );
bool inject ( char *url ,
long forcedIp ,