mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-05 04:37:39 +03:00
more crawlbot fixes
This commit is contained in:
parent
d8835acfef
commit
313eb1209e
127
PageCrawlBot.cpp
127
PageCrawlBot.cpp
@ -1857,7 +1857,7 @@ public:
|
||||
char *m_desc;
|
||||
};
|
||||
static class HelpItem s_his[] = {
|
||||
{"format","Use &format=json to show JSON output."},
|
||||
{"format","Use &format=html to show HTML output. Default is JSON."},
|
||||
{"token","Required for all operations below."},
|
||||
|
||||
{"name","Name of the crawl. If missing will just show "
|
||||
@ -1871,11 +1871,10 @@ static class HelpItem s_his[] = {
|
||||
""},
|
||||
{"addUrls","A huge string of whitespace separated URLs to add to "
|
||||
"spiderdb for crawling."},
|
||||
{"spiderLinks","Use 0 or 1 to not spider or spider links from "
|
||||
"the seed url. Pass this along with the seed "
|
||||
"parameter.A Use "
|
||||
"this parameter in conjunction with the addUrls parameter as well. "
|
||||
"The DEFAULT is 0!!! So if seeding make sure to say &spiderLinks=1."},
|
||||
{"spiderLinks","Use 1 or 0 to spider the links or NOT spider "
|
||||
"the links, respectively, from "
|
||||
"the provided seed or addUrls parameters. "
|
||||
"The default is 1."},
|
||||
|
||||
{"maxToCrawl", "Specify max pages to successfully download."},
|
||||
{"maxToProcess", "Specify max pages to successfully process through "
|
||||
@ -1889,7 +1888,9 @@ static class HelpItem s_his[] = {
|
||||
"contains any of these then we send it to diffbot for processing. "
|
||||
"If this is empty we send all pages to diffbot for processing."},
|
||||
|
||||
{"expression","A pattern to match in a URL. Example expressions:"},
|
||||
{"expression","A pattern to match in a URL. List up to 100 "
|
||||
"expression/action pairs in the HTTP request. "
|
||||
"Example expressions:"},
|
||||
{"action","Take the appropriate action when preceeding pattern is "
|
||||
"matched. Specify multiple expression/action pairs to build a "
|
||||
"table of filters. Each URL being spidered will take the given "
|
||||
@ -1920,6 +1921,37 @@ char *getInputString ( char *string , HttpRequest *hr , Json *JS ) {
|
||||
// . so if no &cast is present we are the original!!!
|
||||
bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
|
||||
// print help
|
||||
long help = hr->getLong("help",0);
|
||||
if ( help ) {
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("<html>"
|
||||
"<title>Crawlbot API</title>"
|
||||
"<h1>Crawlbot API</h1>"
|
||||
"<b>Use the parameters below on the "
|
||||
"<a href=\"/crawlbot\">/crawlbot</a> page."
|
||||
"</b><br><br>"
|
||||
"<table>"
|
||||
);
|
||||
for ( long i = 0 ; i < 1000 ; i++ ) {
|
||||
HelpItem *h = &s_his[i];
|
||||
if ( ! h->m_parm ) break;
|
||||
sb.safePrintf( "<tr>"
|
||||
"<td>%s</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
, h->m_parm
|
||||
, h->m_desc
|
||||
);
|
||||
}
|
||||
sb.safePrintf("</table>"
|
||||
"</html>");
|
||||
return g_httpServer.sendDynamicPage (socket,
|
||||
sb.getBufStart(),
|
||||
sb.length(),
|
||||
0); // cachetime
|
||||
}
|
||||
|
||||
// . Pages.cpp by default broadcasts all PageCrawlbot /crawlbot
|
||||
// requests to every host in the network unless a cast=0 is
|
||||
// explicitly given
|
||||
@ -1936,9 +1968,10 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
// . now show stats for the current crawl
|
||||
// . put in xml or json if format=xml or format=json or
|
||||
// xml=1 or json=1 ...
|
||||
char fmt = FMT_HTML;
|
||||
char fmt = FMT_JSON;
|
||||
char *fs = hr->getString("format",NULL,NULL);
|
||||
// give john a json api
|
||||
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
|
||||
if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
|
||||
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
|
||||
// if we got json as input, give it as output
|
||||
@ -1948,11 +1981,6 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
//char *token = getInputString ( "token" );
|
||||
char *token = hr->getString("token");
|
||||
|
||||
if ( gbstrlen(token) > 32 ) {
|
||||
log("crawlbot: token is over 32 chars.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( ! token && ( cast == 0 || fmt == FMT_JSON ) ) {
|
||||
char *msg = "invalid token";
|
||||
return sendErrorReply2 (socket,fmt,msg);
|
||||
@ -1986,6 +2014,12 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
0); // cachetime
|
||||
}
|
||||
|
||||
if ( gbstrlen(token) > 32 ) {
|
||||
log("crawlbot: token is over 32 chars");
|
||||
char *msg = "crawlbot: token is over 32 chars";
|
||||
return sendErrorReply2 (socket,fmt,msg);
|
||||
}
|
||||
|
||||
char *seed = hr->getString("seed");
|
||||
char *addUrls = hr->getString("addUrls");
|
||||
|
||||
@ -2140,36 +2174,6 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
//
|
||||
/////////
|
||||
|
||||
// print help
|
||||
long help = hr->getLong("help",0);
|
||||
if ( help ) {
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("<html>"
|
||||
"<title>Crawlbot API</title>"
|
||||
"<h1>Crawlbot API</h1>"
|
||||
"<b>Use the parameters below on the "
|
||||
"<a href=\"/crawlbot\">/crawlbot</a> page."
|
||||
"</b><br><br>"
|
||||
"<table>"
|
||||
);
|
||||
for ( long i = 0 ; i < 1000 ; i++ ) {
|
||||
HelpItem *h = &s_his[i];
|
||||
if ( ! h->m_parm ) break;
|
||||
sb.safePrintf( "<tr>"
|
||||
"<td>%s</td>"
|
||||
"<td>%s</td>"
|
||||
"</tr>"
|
||||
, h->m_parm
|
||||
, h->m_desc
|
||||
);
|
||||
}
|
||||
sb.safePrintf("</table>"
|
||||
"</html>");
|
||||
return g_httpServer.sendDynamicPage (socket,
|
||||
sb.getBufStart(),
|
||||
sb.length(),
|
||||
0); // cachetime
|
||||
}
|
||||
|
||||
// collectionrec must be non-null at this point. i.e. we added it
|
||||
if ( ! cr )
|
||||
@ -2202,7 +2206,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
// . avoid spidering links for these urls? i would say
|
||||
// . default is to NOT spider the links...
|
||||
// . support camel case and all lower case
|
||||
long spiderLinks = hr->getLong("spiderLinks",0);
|
||||
long spiderLinks = hr->getLong("spiderLinks",1);
|
||||
spiderLinks = hr->getLong("spiderlinks",spiderLinks);
|
||||
// make a list of spider requests from these urls
|
||||
SafeBuf listBuf;
|
||||
@ -2248,7 +2252,8 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
||||
if ( ! st->m_msg7.inject ( st->m_socket,
|
||||
&st->m_hr,
|
||||
st ,
|
||||
injectedUrlWrapper ) )
|
||||
injectedUrlWrapper ,
|
||||
1 ) ) // spiderLinks default is on
|
||||
// if blocked, return now
|
||||
return false;
|
||||
// otherwise send back reply
|
||||
@ -2958,7 +2963,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
||||
"<form method=get action=/crawlbot>"
|
||||
"<tr>"
|
||||
"<td>"
|
||||
"<b>Inject Url: </b>"
|
||||
"<b>Add Seed Url: </b>"
|
||||
"</td><td>"
|
||||
"<input type=text name=seed size=50>"
|
||||
" "
|
||||
@ -3679,18 +3684,26 @@ bool resetUrlFilters ( CollectionRec *cr ) {
|
||||
|
||||
long i = 0;
|
||||
|
||||
// 3rd default url filter
|
||||
// 1st default url filter
|
||||
cr->m_regExs[i].set("ismedia");
|
||||
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
cr->m_spiderDiffbotApiUrl[i].purge();
|
||||
i++;
|
||||
|
||||
// 4th filter
|
||||
// 2nd default filter
|
||||
cr->m_regExs[i].set("!isonsamedomain");
|
||||
cr->m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
|
||||
cr->m_spiderDiffbotApiUrl[i].purge();
|
||||
i++;
|
||||
|
||||
// 3rd default filter. BUT if they specified filters this will
|
||||
// be deleted and re-added to the bottom
|
||||
cr->m_regExs[i].set("default");
|
||||
cr->m_spiderPriorities [i] = 50;
|
||||
cr->m_spiderDiffbotApiUrl[i].purge();
|
||||
i++;
|
||||
|
||||
|
||||
cr->m_numRegExs = i;
|
||||
cr->m_numRegExs2 = i;
|
||||
cr->m_numRegExs3 = i;
|
||||
@ -3802,6 +3815,12 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
|
||||
// how many filters do we have so far?
|
||||
long nf = cr->m_numRegExs;
|
||||
|
||||
// delete the 3rd default filter cuz we should re-add it below
|
||||
// to the bottom of the list.
|
||||
if ( nf >= 3 ) nf--;
|
||||
|
||||
bool addedDefault = false;
|
||||
|
||||
// loop over the cgi parms
|
||||
for ( long i = 0 ; i < hr->getNumFields() ; i++ ) {
|
||||
// get cgi parm name
|
||||
@ -3816,8 +3835,10 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
|
||||
// action before expresion???? set action to NULL then?
|
||||
if ( ! expression ) { action = NULL; continue; }
|
||||
// they use "*" instead of "default" so put that back
|
||||
if ( expression[0] == '*' )
|
||||
if ( expression[0] == '*' ) {
|
||||
expression = "default";
|
||||
addedDefault = true;
|
||||
}
|
||||
// deal with it
|
||||
long priority = 50;
|
||||
// default diffbot api call:
|
||||
@ -3855,6 +3876,14 @@ bool setSpiderParmsFromHtmlRequest ( TcpSocket *socket ,
|
||||
break;
|
||||
}
|
||||
|
||||
// if no '*' line was provided, add it here
|
||||
if ( ! addedDefault ) {
|
||||
cr->m_regExs [nf].safePrintf("default");
|
||||
cr->m_spiderPriorities [nf] = 50;
|
||||
cr->m_spiderDiffbotApiUrl[nf].set(NULL);
|
||||
nf++;
|
||||
}
|
||||
|
||||
// update the counts
|
||||
cr->m_numRegExs = nf;
|
||||
cr->m_numRegExs2 = nf;
|
||||
|
@ -443,7 +443,8 @@ Msg7::~Msg7 () {
|
||||
bool Msg7::inject ( TcpSocket *s ,
|
||||
HttpRequest *r ,
|
||||
void *state ,
|
||||
void (*callback)(void *state)) {
|
||||
void (*callback)(void *state) ,
|
||||
long spiderLinksDefault ) {
|
||||
|
||||
// save socket
|
||||
// socket is responsible for free the HTTP request, which contains
|
||||
@ -478,9 +479,10 @@ bool Msg7::inject ( TcpSocket *s ,
|
||||
char hasMime = r->getLong("hasmime",0);
|
||||
// do consistency testing?
|
||||
bool doConsistencyTesting = r->getLong("dct",0);
|
||||
// . default spiderlinks to no for injects
|
||||
// . default spiderlinks to no for injects. no, not for
|
||||
// seed urls from PageCrawlbot.cpp, ppl expect links to be spidered.
|
||||
// . support both camel and all-lower cases
|
||||
long spiderLinks = r->getLong("spiderLinks",0);
|
||||
long spiderLinks = r->getLong("spiderLinks",spiderLinksDefault);
|
||||
spiderLinks = r->getLong("spiderlinks",spiderLinks);
|
||||
|
||||
long forcedIp = 0;
|
||||
|
@ -36,7 +36,8 @@ public:
|
||||
bool inject ( TcpSocket *s ,
|
||||
HttpRequest *r ,
|
||||
void *state ,
|
||||
void (*callback)(void *state));
|
||||
void (*callback)(void *state),
|
||||
long spiderLinksDefault = 0 );
|
||||
|
||||
bool inject ( char *url ,
|
||||
long forcedIp ,
|
||||
|
Loading…
Reference in New Issue
Block a user