more misc updates.

This commit is contained in:
mwells 2014-04-05 18:09:04 -07:00
parent bd82145626
commit ac5cf7971b
12 changed files with 435 additions and 680 deletions

View File

@ -138,6 +138,19 @@ bool Collectiondb::loadAllCollRecs ( ) {
if ( ! addExistingColl ( coll , collnum ) )
return false;
}
// if no existing recs added... add coll.main.0 always at startup
if ( m_numRecs == 0 ) {
log("admin: adding main collection.");
addNewColl ( "main",
0 , // customCrawl ,
NULL,
0 ,
true , // bool saveIt ,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
0 );
}
// note it
//log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
// "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
@ -1840,31 +1853,183 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
long n = 0;
//strcpy(m_regExs [n],"default");
/*
m_regExs[n].set("default");
m_regExs[n].nullTerm();
m_numRegExs++;
m_spiderFreqs [n] = 30; // 30 days default
m_numRegExs2++;
m_spiderPriorities[n] = 0;
m_numRegExs3++;
m_maxSpidersPerRule[n] = 99;
m_numRegExs10++;
m_spiderIpWaits[n] = 1000;
m_numRegExs5++;
m_spiderIpMaxSpiders[n] = 7;
m_numRegExs6++;
//m_spidersEnabled[n] = 1;
//m_numRegExs7++;
m_harvestLinks[n] = 1;
m_numRegExs8++;
*/
m_regExs[n].set("isdocidbased");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // 30 days default
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
n++;
m_regExs[n].set("isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 30; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 2;
n++;
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 30; // 30 days default
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
// more rules
//m_spiderDiffbotApiNum[n] = 1;
//m_numRegExs11++;
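These defaults form a parallel-array rule table: each rule fills one slot in m_regExs, m_harvestLinks, m_spiderFreqs, m_maxSpidersPerRule, m_spiderIpMaxSpiders, m_spiderIpWaits and m_spiderPriorities, and the counters m_numRegExs, m_numRegExs2, m_numRegExs3, m_numRegExs10, m_numRegExs5, m_numRegExs6 and m_numRegExs8 must all end up equal to the same n. At spider time a url takes the settings of the first expression it matches (see the url filters description in the deleted coll.conf further down). A minimal sketch of that first-match lookup, assuming a hypothetical matchesExpression() helper:

// hedged sketch; matchesExpression() is illustrative, not a real gb function
long pickUrlFilterRow ( CollectionRec *cr , SpiderRequest *sreq ) {
	for ( long i = 0 ; i < cr->m_numRegExs ; i++ )
		// the first matching expression is the one and only matching row
		if ( matchesExpression ( cr->m_regExs[i].getBufStart() , sreq ) )
			return i;
	// the trailing "default" rule normally matches everything
	return cr->m_numRegExs - 1;
}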

View File

@ -28,10 +28,14 @@
#include "TcpSocket.h"
// values for HttpRequest::m_replyFormat
#define FORMAT_HTML 0
#define FORMAT_XML 1
#define FORMAT_JSON 2
#define FORMAT_CSV 3
#define FORMAT_HTML 1
#define FORMAT_XML 2
#define FORMAT_JSON 3
#define FORMAT_CSV 4
#define FORMAT_TXT 5
#define FORMAT_PROCOG 6
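The reply-format constants are renumbered to start at 1 (presumably so 0 can act as an unspecified value) and FORMAT_TXT and FORMAT_PROCOG are added; the old FMT_* defines in PageCrawlbot.cpp are retired below in favor of these. A minimal sketch of mapping the &format= cgi parameter onto the new constants, mirroring the sendPageCrawlbot logic later in this commit (the helper name and the csv branch are assumptions):

// sketch only; getReplyFormat() is not part of the codebase
char getReplyFormat ( HttpRequest *hr , char def ) {
	char  fmt = def;
	char *fs  = hr->getString ( "format" , NULL , NULL );
	if ( fs && strcmp ( fs , "html" ) == 0 ) fmt = FORMAT_HTML;
	if ( fs && strcmp ( fs , "json" ) == 0 ) fmt = FORMAT_JSON;
	if ( fs && strcmp ( fs , "xml"  ) == 0 ) fmt = FORMAT_XML;
	if ( fs && strcmp ( fs , "csv"  ) == 0 ) fmt = FORMAT_CSV; // assumed; not shown in this diff
	return fmt;
}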
class HttpRequest {

View File

@ -156,7 +156,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
Url u;
for ( ; *pn ; pn++ , lineNum++ ) {
for ( ; *pn ; lineNum++ ) {
// get end
char *s = pn;
@ -169,6 +169,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
char *pe = pn;
for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );
// advance over '\n' for next line
if ( *pn && *pn == '\n' ) pn++;
// make hash of the line
long h32 = hash32 ( s , pe - s );
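The increment moved out of the for header: the line is scanned, trailing whitespace is trimmed, and only then is the '\n' stepped over, so pn advances exactly once per line and the hash covers the trimmed line. A simplified standalone sketch of that pattern (the buffer name is illustrative, and the elided end-of-line scan is assumed to stop at '\n'):

// standalone sketch of the per-line scan; siteListBuf is an illustrative name
char *pn = siteListBuf;                        // NUL-terminated, one site per line
for ( long lineNum = 0 ; *pn ; lineNum++ ) {
	char *s = pn;                          // start of line
	while ( *pn && *pn != '\n' ) pn++;     // assumed: advance to end of line
	char *pe = pn;
	for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- ); // trim trailing whitespace
	if ( *pn == '\n' ) pn++;               // advance over '\n' for next line
	long h32 = hash32 ( s , pe - s );      // hash of the trimmed line
	// ... dedup and seeding continue as in updateSiteList() above, using h32 ...
}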
@ -728,6 +731,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
char buf [ 128000 ];
SafeBuf sb(buf,128000);
sb.reset();
char *fs = hr->getString("format",NULL,NULL);
char fmt = FORMAT_HTML;
@ -761,7 +765,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//
// show stats
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
@ -773,43 +777,17 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
sb.safePrintf(
//sb.safePrintf(
// "<form method=get action=/crawlbot>"
// "%s"
// , sb.getBufStart() // hidden input token/name/..
// );
"<form method=get action=/crawlbot>"
"%s"
, sb.getBufStart() // hidden input token/name/..
);
sb.safePrintf("<TABLE border=0>"
"<TR><TD valign=top>"
"<table border=0 cellpadding=5>"
//
"<tr>"
"<td><b>Crawl Name:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Type:</td>"
"<td>%li</td>"
"</tr>"
//"<tr>"
//"<td><b>Collection Alias:</td>"
//"<td>%s%s</td>"
//"</tr>"
"<tr>"
"<td><b>Token:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Seeds:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Status:</td>"
"<td>%li</td>"
@ -820,10 +798,10 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Rounds Completed:</td>"
"<td>%li</td>"
"</tr>"
//"<tr>"
//"<td><b>Rounds Completed:</td>"
//"<td>%li</td>"
//"</tr>"
"<tr>"
"<td><b>Has Urls Ready to Spider:</td>"
@ -837,11 +815,6 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//"<td>%lli</td>"
//"</tr>"
"<tr>"
"<td><b>Objects Found</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>URLs Harvested</b> (inc. dups)</td>"
"<td>%lli</td>"
@ -868,37 +841,11 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Attempts</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
, (long)cr->m_isCustomCrawl
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_spiderRoundNum
//, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
, cr->m_globalCrawlInfo.m_objectsAdded -
cr->m_globalCrawlInfo.m_objectsDeleted
, cr->m_globalCrawlInfo.m_urlsHarvested
//, cr->m_globalCrawlInfo.m_urlsConsidered
@ -906,16 +853,13 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cr->m_globalCrawlInfo.m_pageProcessAttempts
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
, cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
if ( fmt != FORMAT_JSON )
// wrap up the form, print a submit button
g_pages.printAdminBottom ( &sb );
//if ( fmt != FORMAT_JSON )
// // wrap up the form, print a submit button
// g_pages.printAdminBottom ( &sb );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),

View File

@ -25,11 +25,11 @@
#include "Parms.h"
// so user can specify the format of the reply/output
#define FMT_HTML 1
#define FMT_XML 2
#define FMT_JSON 3
#define FMT_CSV 4
#define FMT_TXT 5
//#define FMT_HTML 1
//#define FMT_XML 2
//#define FMT_JSON 3
//#define FMT_CSV 4
//#define FMT_TXT 5
void doneSendingWrapper ( void *state , TcpSocket *sock ) ;
bool sendBackDump ( TcpSocket *s,HttpRequest *hr );
@ -158,25 +158,25 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
if ( ( xx = strstr ( path , "_data.json" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
fmt = FORMAT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_CSV;
fmt = FORMAT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
fmt = FORMAT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_TXT;
fmt = FORMAT_TXT;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
fmt = FORMAT_TXT;
}
// sanity, must be one of 3 download calls
@ -213,7 +213,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// . if doing download of csv, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) {
if ( fmt == FORMAT_CSV && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
long dr = 1;
@ -247,7 +247,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// . if doing download of json, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_JSON && rdbId == RDB_TITLEDB ) {
if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
long dr = 1;
@ -514,13 +514,13 @@ bool StateCD::sendList ( ) {
//sb.setLabel("dbotdmp");
char *ct = "text/csv";
if ( m_fmt == FMT_JSON )
if ( m_fmt == FORMAT_JSON )
ct = "application/json";
if ( m_fmt == FMT_XML )
if ( m_fmt == FORMAT_XML )
ct = "text/xml";
if ( m_fmt == FMT_TXT )
if ( m_fmt == FORMAT_TXT )
ct = "text/plain";
if ( m_fmt == FMT_CSV )
if ( m_fmt == FORMAT_CSV )
ct = "text/csv";
// . if we haven't yet sent an http mime back to the user
@ -545,13 +545,13 @@ bool StateCD::sendList ( ) {
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! m_printedFirstBracket && m_fmt == FMT_JSON ) {
if ( ! m_printedFirstBracket && m_fmt == FORMAT_JSON ) {
sb.safePrintf("[\n");
m_printedFirstBracket = true;
}
// these are csv files not xls
//if ( ! m_printedFirstBracket && m_fmt == FMT_CSV ) {
//if ( ! m_printedFirstBracket && m_fmt == FORMAT_CSV ) {
// sb.safePrintf("sep=,\n");
// m_printedFirstBracket = true;
//}
@ -638,7 +638,7 @@ bool StateCD::sendList ( ) {
// use this for printing out urls.csv as well...
m_printedEndingBracket = true;
// end array of json objects. might be empty!
if ( m_rdbId == RDB_TITLEDB && m_fmt == FMT_JSON )
if ( m_rdbId == RDB_TITLEDB && m_fmt == FORMAT_JSON )
sb.safePrintf("\n]\n");
//log("adding ]. len=%li",sb.length());
// i'd like to exit streaming mode here. i fixed tcpserver.cpp
@ -853,7 +853,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
}
// "csv" is default if json not specified
if ( m_fmt == FMT_JSON )
if ( m_fmt == FORMAT_JSON )
sb->safePrintf("[{"
"{\"url\":"
"\"%s\"},"
@ -997,7 +997,7 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// if not json, just print the json item out in csv
// moved into PageResults.cpp...
//if ( m_fmt == FMT_CSV ) {
//if ( m_fmt == FORMAT_CSV ) {
// printJsonItemInCsv ( json , sb );
// continue;
//}
@ -1337,7 +1337,7 @@ bool sendReply2 (TcpSocket *socket , long fmt , char *msg ) {
// send this back to browser
SafeBuf sb;
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
sb.safePrintf("{\n\"response\":\"success\",\n"
"\"message\":\"%s\"\n}\n"
, msg );
@ -1368,7 +1368,7 @@ bool sendErrorReply2 ( TcpSocket *socket , long fmt , char *msg ) {
// send this back to browser
SafeBuf sb;
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
sb.safePrintf("{\"error\":\"%s\"}\n"
, msg );
ct = "application/json";
@ -1476,7 +1476,7 @@ void injectedUrlWrapper ( void *state ) {
// send back the html or json response?
SafeBuf *response = &sb;
if ( st->m_fmt == FMT_JSON ) response = &js;
if ( st->m_fmt == FORMAT_JSON ) response = &js;
// . this will call g_httpServer.sendReply()
// . pass it in the injection response, "sb"
@ -1673,7 +1673,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
char fmt = FMT_JSON;
char fmt = FORMAT_JSON;
// token is always required. get from json or html form input
//char *token = getInputString ( "token" );
@ -1693,21 +1693,21 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
name++;
}
// change default formatting to html
fmt = FMT_HTML;
fmt = FORMAT_HTML;
}
char *fs = hr->getString("format",NULL,NULL);
// give john a json api
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
// if we got json as input, give it as output
//if ( JS.getFirstItem() ) fmt = FMT_JSON;
//if ( JS.getFirstItem() ) fmt = FORMAT_JSON;
if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
if ( ! token && fmt == FORMAT_JSON ) { // (cast==0|| fmt == FORMAT_JSON ) ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
}
@ -1772,7 +1772,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
//}
// just send back a list of all the collections after the delete
//if ( delColl && cast && fmt == FMT_JSON ) {
//if ( delColl && cast && fmt == FORMAT_JSON ) {
// char *msg = "Collection deleted.";
// return sendReply2 (socket,fmt,msg);
//}
@ -2263,7 +2263,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
/*
bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("\"urlFilters\":[");
// skip first filters that are:
@ -2303,7 +2303,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
// urls higher spider priority, so skip it
if ( strncmp(expression,"ismanualadd && ",15) == 0 )
continue;
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<tr>"
"<td>Expression "
"<input type=text "
@ -2328,7 +2328,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
sb.pushChar('\n');
}
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
// remove trailing comma
sb.removeLastChar('\n');
sb.removeLastChar(',');
@ -2519,7 +2519,7 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
true // isJSON?
);
*/
//printUrlFilters ( sb , cx , FMT_JSON );
//printUrlFilters ( sb , cx , FORMAT_JSON );
// end that collection rec
sb->safePrintf("}\n");
@ -2537,7 +2537,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// store output into here
SafeBuf sb;
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"<html>"
"<title>Crawlbot - "
@ -2573,7 +2573,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
lb.urlEncode(name);
lb.safePrintf ("&token=");
lb.urlEncode(token);
if ( fmt == FMT_HTML ) lb.safePrintf("&format=html");
if ( fmt == FORMAT_HTML ) lb.safePrintf("&format=html");
lb.nullTerm();
@ -2590,7 +2590,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//}
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<table border=0>"
"<tr><td>"
"<b><font size=+2>"
@ -2645,7 +2645,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// print list of collections controlled by this token
//
for ( long i = 0 ; fmt == FMT_HTML && i<g_collectiondb.m_numRecs;i++ ){
for ( long i = 0 ; fmt == FORMAT_HTML && i<g_collectiondb.m_numRecs;i++ ){
CollectionRec *cx = g_collectiondb.m_recs[i];
if ( ! cx ) continue;
// get its token if any
@ -2677,19 +2677,19 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
sb.safePrintf("</font></b>");
}
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf ( "</center><br/>" );
// the ROOT JSON [
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("{\n");
// injection is currently not in use, so this is an artifact:
if ( fmt == FMT_JSON && injectionResponse )
if ( fmt == FORMAT_JSON && injectionResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, injectionResponse->getBufStart() );
if ( fmt == FMT_JSON && urlUploadResponse )
if ( fmt == FORMAT_JSON && urlUploadResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, urlUploadResponse->getBufStart() );
@ -2702,14 +2702,14 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// the items in the array now have type:bulk or type:crawl
// so call them 'jobs'
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("\"jobs\":[");//\"collections\":");
long summary = hr->getLong("summary",0);
// enter summary mode for json
if ( fmt != FMT_HTML ) summary = 1;
if ( fmt != FORMAT_HTML ) summary = 1;
// start the table
if ( summary && fmt == FMT_HTML ) {
if ( summary && fmt == FORMAT_HTML ) {
sb.safePrintf("<table border=1 cellpadding=5>"
"<tr>"
"<td><b>Collection</b></td>"
@ -2740,11 +2740,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// just print out single crawl info for json
if ( fmt != FMT_HTML && cx != cr && name3 )
if ( fmt != FORMAT_HTML && cx != cr && name3 )
continue;
// if json, print each collectionrec
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
if ( ! firstOne )
sb.safePrintf(",\n\t");
firstOne = false;
@ -2786,7 +2786,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
if ( summary && fmt == FMT_HTML ) {
if ( summary && fmt == FORMAT_HTML ) {
sb.safePrintf("</table></html>" );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
@ -2794,7 +2794,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
0); // cachetime
}
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
// end the array of collection objects
sb.safePrintf("\n]\n");
@ -2808,7 +2808,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show urls being crawled (ajax) (from Spider.cpp)
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf ( "<table width=100%% cellpadding=5 "
"style=border-width:1px;border-style:solid;"
"border-color:black;>"
@ -2879,7 +2879,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
rand64 |= r2;
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<br>"
"<table border=0 cellpadding=5>"
@ -2952,12 +2952,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
);
}
if ( injectionResponse && fmt == FMT_HTML )
if ( injectionResponse && fmt == FORMAT_HTML )
sb.safePrintf("<br><font size=-1>%s</font>\n"
,injectionResponse->getBufStart()
);
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(//"<input type=hidden name=c value=\"%s\">"
//"<input type=hidden name=crawlbotapi value=1>"
"</td>"
@ -2996,7 +2996,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show stats
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
@ -3654,7 +3654,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// xml or json does not show the input boxes
//if ( format != FMT_HTML )
//if ( format != FORMAT_HTML )
// return g_httpServer.sendDynamicPage ( s,
// sb.getBufStart(),
// sb.length(),
@ -3677,7 +3677,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
s2 = "";
}
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"<a onclick="
@ -3721,7 +3721,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// print url filters. HACKy...
//
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
g_parms.sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
@ -3732,7 +3732,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// end HACKy hack
//
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"</form>"
"</div>"
@ -3760,7 +3760,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show simpler url filters table
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
/*
sb.safePrintf ( "<table>"
"<tr><td colspan=2>"
@ -3796,7 +3796,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show reset and delete crawl buttons
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf(
"<table cellpadding=5>"
"<tr>"
@ -3859,13 +3859,13 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// the ROOT JSON }
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("}\n");
char *ct = "text/html";
if ( fmt == FMT_JSON ) ct = "application/json";
if ( fmt == FMT_XML ) ct = "text/xml";
if ( fmt == FMT_CSV ) ct = "text/csv";
if ( fmt == FORMAT_JSON ) ct = "application/json";
if ( fmt == FORMAT_XML ) ct = "text/xml";
if ( fmt == FORMAT_CSV ) ct = "text/csv";
// this could be in html json or xml
return g_httpServer.sendDynamicPage ( socket,
@ -4142,7 +4142,7 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
char *json = hr->getString("json");
if ( ! json )
return sendReply2 ( socket,
FMT_JSON,
FORMAT_JSON,
"No &json= provided in request.");
@ -4151,12 +4151,12 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
// wtf?
if ( ! status )
return sendReply2 ( socket, FMT_JSON,
return sendReply2 ( socket, FORMAT_JSON,
"Error with JSON parser.");
// error adding it?
if ( ! cr )
return sendReply2 ( socket,FMT_JSON,
return sendReply2 ( socket,FORMAT_JSON,
"Failed to create new collection.");
ji = JP.getFirstItem();

View File

@ -169,6 +169,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("</form>\n");
sb.safePrintf("<br>\n");
sb.safePrintf("\n");
// print any red boxes we might need to
if ( printRedBox2 ( &sb , true ) )
sb.safePrintf("<br>\n");
sb.safePrintf("<table cellpadding=3>\n");
sb.safePrintf("\n");

152
Pages.cpp
View File

@ -50,6 +50,9 @@ static WebPage s_pages[] = {
"dummy page - if set in the users row then user will have master=0 and "
" collection links will be highlighted in red",
NULL, 0 },
//{ PAGE_QUALITY , "quality", 0, "quality", 0, 0,
// "dummy page - if set in the users row then \"Quality Control\""
// " will be printed besides the logo for certain pages",
@ -102,12 +105,66 @@ static WebPage s_pages[] = {
// "Basic diffbot page.", sendPageBasicDiffbot , 0 } ,
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 ,
"Basic security page.", sendPageGeneric , 0 } ,
{ PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
"Basic search page.", sendPageRoot , 0 } ,
{ PAGE_MASTER , "admin/master" , 0 , "master controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"master controls page",
sendPageGeneric , 0 } ,
{ PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 } ,
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
//USER_ADMIN | USER_MASTER | USER_PROXY ,
"spider controls page",
sendPageGeneric , 0 } ,
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY,
"log page",
sendPageGeneric , 0 } ,
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"advanced security page",
sendPageGeneric , 0 } ,
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
//USER_MASTER ,
"add a new collection using this page",
sendPageAddColl , 0 } ,
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
//USER_MASTER ,
"delete a collection using this page",
sendPageDelColl , 0 } ,
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
//USER_MASTER ,
"repair page",
sendPageGeneric , 0 },
{ PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
"what sites can be spidered",
sendPageGeneric , 0 } , // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 } ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
sendPageInject , 2 } ,
// this is the addurl page for the admin!

{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
"add url page for admin",
sendPageAddUrl2 , 0 } ,
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
//USER_ADMIN | USER_MASTER,
"reindex url page",
sendPageReindex , 0 } ,
{ PAGE_HOSTS , "admin/hosts" , 0 , "hosts" , 0 , 0 ,
//USER_MASTER | USER_PROXY,
@ -134,10 +191,7 @@ static WebPage s_pages[] = {
//USER_MASTER | USER_PROXY,
"sockets page",
sendPageSockets , 0 } ,
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY,
"log page",
sendPageGeneric , 0 } ,
{ PAGE_LOGVIEW , "admin/logview" , 0 , "log view" , 0 , 0 ,
//USER_MASTER ,
"logview page",
@ -147,18 +201,6 @@ static WebPage s_pages[] = {
// "sync page",
// sendPageGeneric , 0 } ,
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"advanced security page",
sendPageGeneric , 0 } ,
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
//USER_MASTER ,
"add a new collection using this page",
sendPageAddColl , 0 } ,
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
//USER_MASTER ,
"delete a collection using this page",
sendPageDelColl , 0 } ,
{ PAGE_AUTOBAN ,"admin/autoban" , 0 , "autoban" , 1 , 1 ,
//USER_MASTER | USER_PROXY ,
"autobanned ips",
@ -175,10 +217,6 @@ static WebPage s_pages[] = {
//USER_MASTER ,
"threads page",
sendPageThreads , 0 },
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
//USER_MASTER ,
"repair page",
sendPageGeneric , 0 },
//{ PAGE_THESAURUS, "admin/thesaurus", 0 , "thesaurus", 0 , 0 ,
// //USER_MASTER ,
// "thesaurus page",
@ -207,14 +245,6 @@ static WebPage s_pages[] = {
"titledb page",
sendPageTitledb , 2 } ,
// 1 = usePost
{ PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 } ,
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
//USER_ADMIN | USER_MASTER | USER_PROXY ,
"spider controls page",
sendPageGeneric , 0 } ,
{ PAGE_CRAWLBOT , "crawlbot" , 0 , "crawlbot" , 1 , 0,
"simplified spider controls page",
@ -229,30 +259,6 @@ static WebPage s_pages[] = {
// "spider priorities page",
// sendPageGeneric , 0 } ,
{ PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
"what sites can be spidered",
sendPageGeneric , 0 } , // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 } ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
sendPageInject , 2 } ,
// this is the addurl page the the admin!
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
"add url page for admin",
sendPageAddUrl2 , 0 } ,
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
//USER_ADMIN | USER_MASTER,
"reindex url page",
sendPageReindex , 0 } ,
//{ PAGE_KEYWORDS, "admin/queries",0,"queries" , 0 , 1 ,
// "get queries a url matches",
// sendPageMatchingQueries , 2 } ,
@ -893,8 +899,6 @@ bool Pages::getNiceness ( long page ) {
return s_pages[page].m_niceness;
}
bool printRedBox ( SafeBuf *mb ) ;
///////////////////////////////////////////////////////////
//
// Convenient html printing routines
@ -1056,6 +1060,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//if ( page == PAGE_BASIC_DIFFBOT ) isBasic = true;
//if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
if ( page == PAGE_BASIC_SECURITY ) isBasic = true;
if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
//
// print breadcrumb. main > Basic > Settings
@ -1791,7 +1796,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
// is this page basic?
bool pageBasic = false;
if ( i >= PAGE_BASIC_SETTINGS &&
i <= PAGE_BASIC_SECURITY )
i <= PAGE_BASIC_SEARCH )
pageBasic = true;
// print basic pages under the basic menu, advanced pages
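This range test is why the enum in Pages.h is reshuffled below: the basic pages must form a contiguous block that now ends at PAGE_BASIC_SEARCH, and the s_pages[] table in Pages.cpp has to list its entries in the same order as that enum. A one-line restatement of the invariant:

// the basic/advanced menu split assumes a contiguous PAGE_BASIC_* range;
// a new basic page has to land inside that range in both Pages.h and s_pages[]
bool pageBasic = ( i >= PAGE_BASIC_SETTINGS && i <= PAGE_BASIC_SEARCH );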
@ -2627,9 +2632,18 @@ bool sendPageLogin ( TcpSocket *socket , HttpRequest *hr ) {
NULL);// cookie
}
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage ) {
SafeBuf mb;
// return false if no red box
if ( ! printRedBox ( &mb , isRootWebPage ) ) return false;
// otherwise, print it
sb->safeStrcpy ( mb.getBufStart() );
// return true since we printed one
return true;
}
// emergency message box
bool printRedBox ( SafeBuf *mb ) {
bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) {
PingServer *ps = &g_pingServer;
@ -2649,11 +2663,33 @@ bool printRedBox ( SafeBuf *mb ) {
char *boxEnd =
"</td></tr></table>";
bool adds = false;
long adds = 0;
mb->safePrintf("<div style=max-width:500px;>");
// are we just starting off? give them a little help.
CollectionRec *cr = g_collectiondb.getRec("main");
if ( g_collectiondb.m_numRecs == 1 &&
cr &&
isRootWebPage &&
cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
mb->safePrintf("%s",box);
mb->safePrintf("Welcome to Gigablast. The most powerful "
"search engine you can legally download. "
"Please add the websites you want to spider "
"<a href=/admin/settings?c=main>here</a>."
);
mb->safePrintf("%s",boxEnd);
}
if ( isRootWebPage ) {
mb->safePrintf("</div>");
return (bool)adds;
}
if ( g_conf.m_numConnectIps == 0 && g_conf.m_numMasterPwds == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
@ -2738,5 +2774,5 @@ bool printRedBox ( SafeBuf *mb ) {
mb->safePrintf("</div>");
return adds;
return (bool)adds;
}

31
Pages.h
View File

@ -5,6 +5,9 @@
#ifndef _PAGES_H_
#define _PAGES_H_
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage = false ) ;
bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ;
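The defaulted isRootWebPage argument keeps existing printRedBox() call sites compiling unchanged, while the root web page opts in explicitly through the new wrapper; for example (the second call is the one added to PageRoot.cpp in this commit):

printRedBox  ( &mb );         // admin pages: same as printRedBox ( &mb , false )
printRedBox2 ( &sb , true );  // root web page: prints the welcome box when warranted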
// for PageEvents.cpp and Accessdb.cpp
//#define RESULTSWIDTHSTR "550px"
@ -304,25 +307,36 @@ enum {
//PAGE_BASIC_SEARCH , // TODO
//PAGE_BASIC_DIFFBOT , // TODO
PAGE_BASIC_SECURITY ,
PAGE_BASIC_SEARCH ,
// master admin pages
PAGE_MASTER ,
PAGE_SEARCH ,
PAGE_SPIDER ,
PAGE_LOG ,
PAGE_SECURITY ,
PAGE_ADDCOLL ,
PAGE_DELCOLL ,
PAGE_REPAIR ,
PAGE_SITES , // site filters
PAGE_FILTERS ,
PAGE_INJECT ,
PAGE_ADDURL2 ,
PAGE_REINDEX ,
PAGE_HOSTS ,
PAGE_STATS , // 10
PAGE_STATSDB ,
PAGE_PERF ,
PAGE_SOCKETS ,
PAGE_LOG ,
PAGE_LOGVIEW ,
// PAGE_SYNC ,
PAGE_SECURITY ,
PAGE_ADDCOLL ,
PAGE_DELCOLL ,
PAGE_AUTOBAN , // 20
//PAGE_SPIDERLOCKS ,
PAGE_PROFILER ,
PAGE_THREADS ,
PAGE_REPAIR ,
// PAGE_THESAURUS ,
// . non master-admin pages (collection controls)
@ -335,16 +349,9 @@ enum {
PAGE_TITLEDB ,
//PAGE_STATSDB ,
PAGE_SEARCH ,
PAGE_SPIDER ,
PAGE_CRAWLBOT , // 35
PAGE_SPIDERDB ,
//PAGE_PRIORITIES , // priority queue controls
PAGE_SITES , // site filters
PAGE_FILTERS ,
PAGE_INJECT ,
PAGE_ADDURL2 ,
PAGE_REINDEX ,
//PAGE_KEYWORDS ,
PAGE_SEO ,
PAGE_ACCESS , //40

View File

@ -1888,7 +1888,7 @@ bool Parms::printParm ( SafeBuf* sb,
"value=\"%f\" "
// 3 was ok on firefox but need 6
// on chrome
"size=6>",cgi,*(float *)s);
"size=7>",cgi,*(float *)s);
}
else if ( t == TYPE_IP ) {
if ( m->m_max > 0 && j == jend )
@ -1896,7 +1896,7 @@ bool Parms::printParm ( SafeBuf* sb,
"size=12>",cgi);
else
sb->safePrintf ("<input type=text name=%s value=\"%s\" "
"size=6>",cgi,iptoa(*(long *)s));
"size=12>",cgi,iptoa(*(long *)s));
}
else if ( t == TYPE_LONG ) {
// just show the parm name and value if printing in json
@ -7534,6 +7534,7 @@ void Parms::init ( ) {
m->m_flags = PF_TEXTAREA;
m++;
/*
// the new upload post submit button
m->m_title = "upload urls";
m->m_desc = "Upload your file of urls.";
@ -7542,6 +7543,7 @@ void Parms::init ( ) {
m->m_obj = OBJ_NONE;
m->m_type = TYPE_FILEUPLOADBUTTON;
m++;
*/
m->m_title = "strip sessionids";
m->m_desc = "Strip added urls of their session ids.";
@ -7591,6 +7593,7 @@ void Parms::init ( ) {
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"See <a href=#examples>example site list</a> below. "
"Gigablast uses the "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
@ -7599,8 +7602,7 @@ void Parms::init ( ) {
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"See <a href=#examples>example site list</a> below. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
m->m_cgi = "sitelist";
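The description points at an example site list further down the admin page; a short illustrative list of the kind of patterns meant here, one per line (the entries and their exact matching semantics are assumptions, not taken from this commit):

example.com
www.example.org
http://blogs.example.net/tech/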
@ -7629,6 +7631,7 @@ void Parms::init ( ) {
m++;
*/
/*
// the new upload post submit button
m->m_title = "upload site list";
m->m_desc = "Upload your file of site patterns. Completely replaces "
@ -7640,12 +7643,13 @@ void Parms::init ( ) {
m->m_type = TYPE_FILEUPLOADBUTTON;
m->m_flags = PF_NOSAVE | PF_DUP;
m++;
*/
m->m_title = "restart collection";
m->m_desc = "Remove all documents from this collection and starts "
"spidering over again. If you do this accidentally there "
"is a <a href=/admin.html#recover>recovery procedure</a> to "
"get back the trashed data.";
m->m_desc = "Remove all documents from this collection and restart "
"spidering.";// If you do this accidentally there "
//"is a <a href=/admin.html#recover>recovery procedure</a> to "
// "get back the trashed data.";
m->m_cgi = "restart";
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
@ -7659,6 +7663,7 @@ void Parms::init ( ) {
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"See <a href=#examples>example site list</a> below. "
"Gigablast uses the "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
@ -7667,8 +7672,7 @@ void Parms::init ( ) {
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"See <a href=#examples>example site list</a> below. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>addurl"
"</a> interface.";
m->m_cgi = "sitelist";
@ -8762,11 +8766,11 @@ void Parms::init ( ) {
m++;
m->m_title = "max robots.txt cache age";
m->m_desc = "How many second to cache a robots.txt file for. "
m->m_desc = "How many seconds to cache a robots.txt file for. "
"86400 is 1 day. 0 means Gigablast will not read from the "
"cache at all and will download the robots.txt before every "
"page if robots.txt use is enabled above. However, if this is "
"0 then Gigablast will still store robots.txt files into the "
"0 then Gigablast will still store robots.txt files in the "
"cache.";
m->m_cgi = "mrca";
m->m_off = (char *)&cr.m_maxRobotsCacheAge - x;
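A value of 0 is deliberately asymmetric: robots.txt files are still written to the cache but never read back from it. A hedged sketch of that policy (cacheGet, cachePut, downloadRobotsTxt and the RobotsTxt type are illustrative names only; the real logic lives in the spider download path):

// illustrative sketch; the helpers named here are not real gb calls
bool getRobotsTxt ( Url *u , CollectionRec *cr , RobotsTxt *out ) {
	long maxAge = cr->m_maxRobotsCacheAge;              // in seconds; 86400 = 1 day
	// a max age of 0 disables cache reads...
	if ( maxAge > 0 && cacheGet ( u , maxAge , out ) ) return true;
	if ( ! downloadRobotsTxt ( u , out ) ) return false;
	cachePut ( u , out );                               // ...but not cache writes
	return true;
}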
@ -10639,8 +10643,9 @@ void Parms::init ( ) {
m++;
m->m_title = "do query expansion";
m->m_desc = "Query expansion will include word stems and synonyms in "
"its search results.";
m->m_desc = "If enabled, query expansion will expand your query "
"to include word stems and "
"synonyms of the query terms.";
m->m_def = "1";
m->m_off = (char *)&cr.m_queryExpansion - x;
m->m_soff = (char *)&si.m_queryExpansion - y;
@ -10653,7 +10658,7 @@ void Parms::init ( ) {
// more general parameters
m->m_title = "max search results";
m->m_desc = "What is the limit to the total number "
m->m_desc = "What is the maximum total number "
"of returned search results.";
m->m_cgi = "msr";
m->m_off = (char *)&cr.m_maxSearchResults - x;
@ -12457,7 +12462,7 @@ void Parms::init ( ) {
m++;
m->m_title = "max summary line width";
m->m_desc = "<br> tags are inserted to keep the number "
m->m_desc = "&lt;br&gt; tags are inserted to keep the number "
"of chars in the summary per line at or below this width. "
"Strings without spaces that exceed this "
"width are not split.";

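The wrapping rule described above (insert <br> once a line reaches the width limit, but never split a token that contains no spaces) can be sketched as follows; this only illustrates the parameter's semantics and is not the code the summary generator uses:

// illustrative sketch of the max-summary-line-width rule
void wrapSummaryLine ( SafeBuf *out , char *sum , long maxWidth ) {
	long col = 0;
	for ( char *p = sum ; *p ; ) {
		char *e = p;
		while ( *e && ! is_wspace_a(*e) ) e++;        // find end of word
		long wlen = e - p;
		if ( col > 0 && col + 1 + wlen > maxWidth ) { // word would overflow this line
			out->safePrintf ( "<br>" );
			col = 0;
		}
		else if ( col > 0 ) {
			out->pushChar ( ' ' );
			col++;
		}
		out->safeMemcpy ( p , wlen );                 // an over-long word is copied unsplit
		col += wlen;
		for ( p = e ; *p && is_wspace_a(*p) ; p++ );  // skip whitespace between words
	}
}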
View File

@ -15163,7 +15163,7 @@ bool Sections::printVotingInfoInJSON ( SafeBuf *sb ) {
// breathe
QUICKPOLL ( m_niceness );
// print this section
printSectionDiv ( sk , FMT_JSON ); // forProCog );
printSectionDiv ( sk , FORMAT_JSON ); // forProCog );
// advance
long b = sk->m_b;
// stop if last
@ -15190,7 +15190,8 @@ bool Sections::print2 ( SafeBuf *sbuf ,
HashTableX *st2 ,
HashTableX *tt ,
Addresses *aa ,
char format ) { // bool forProCog ){//FMT_PROCOG FMT_JSON HTML
char format ) { // bool forProCog ){
//FORMAT_PROCOG FORMAT_JSON HTML
//sbuf->safePrintf("<b>Sections in Document</b>\n");
@ -15244,7 +15245,7 @@ bool Sections::print2 ( SafeBuf *sbuf ,
sk = m_sectionPtrs[b];
}
if ( format != FMT_HTML ) return true; // forProCog
if ( format != FORMAT_HTML ) return true; // forProCog
// print header
char *hdr =
@ -15553,7 +15554,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
// m_sbuf->safePrintf("A=%li ",sk->m_a);
if ( format == FMT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
if ( format == FORMAT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
// do not count our own site!
m_sbuf->safePrintf("<i>"
"<font size=-1>"
@ -15573,7 +15574,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
m_sbuf->safePrintf("<i>");
if ( format == FMT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
if ( format == FORMAT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
sec_t f = sk->m_flags;
//if ( f & SEC_SENTENCE )
// m_sbuf->safePrintf("sentence " );
@ -15598,7 +15599,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
// m_sbuf->safePrintf("notdupvotes=%li ",
// sk->m_votesForNotDup);
if ( format != FMT_PROCOG ) {
if ( format != FORMAT_PROCOG ) {
// print the flags
m_sbuf->safePrintf("A=%li ",sk->m_a);

View File

@ -12414,7 +12414,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( cx->m_isCustomCrawl )
return msg->safePrintf("Job is in progress.");
else
return true;
return msg->safePrintf("Spider is in progress.");
}
// pattern is a ||-separted list of substrings

View File

@ -1,416 +0,0 @@
# List of sites to spider, one per line. Gigablast uses the <a
# href=/admin/filters#insitelist>insitelist</a> directive on the <a
# href=/admin/filters>url filters</a> page to make sure that the spider only
# indexes urls that match the site patterns you specify here, other than urls
# you add individually via the add urls or inject url tools. See <a
# href=#examples>example site list</a> below. Limit list to 300MB. If you have
# a lot of INDIVIDUAL URLS to add then consider using the <a
# href=/admin/addurl>addurl</a> interface.
<siteList><![CDATA[]]></>
# All <, >, " and # characters that are values for a field contained herein
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
# Controls just the spiders for this collection.
<spideringEnabled>1</>
# What is the maximum number of web pages the spider is allowed to download
# simultaneously PER HOST for THIS collection?
<maxSpiders>100</>
# make each spider wait this many milliseconds before getting the ip and
# downloading the page.
<spiderDelayInMilliseconds>0</>
# If this is true Gigablast will respect the robots.txt convention.
<useRobotstxt>1</>
# How many seconds to cache a robots.txt file for. 86400 is 1 day. 0 means
# Gigablast will not read from the cache at all and will download the
# robots.txt before every page if robots.txt use is enabled above. However, if
# this is 0 then Gigablast will still store robots.txt files into the cache.
<maxRobotstxtCacheAge>86400</>
# Do a tight merge on posdb and titledb at this time every day. This is
# expressed in MINUTES past midnight UTC. UTC is 5 hours ahead of EST and 7
# hours ahead of MST. Leave this as -1 to NOT perform a daily merge. To merge
# at midnight EST use 60*5=300 and midnight MST use 60*7=420.
<dailyMergeTime>-1</>
# Comma separated list of days to merge on. Use 0 for Sunday, 1 for Monday,
# ... 6 for Saturday. Leaving this parameter empty or without any numbers will
# make the daily merge happen every day.
<dailyMergeDays><![CDATA[0]]></>
# When the daily merge was last kicked off. Expressed in UTC in seconds since
# the epoch.
<dailyMergeLastStarted>-1</>
# If this is true, users will have to pass a simple Turing test to add a url.
# This prevents automated url submission.
<turingTestEnabled>0</>
# Maximum number of urls that can be submitted via the addurl interface, per
# IP domain, per 24 hour period. A value less than or equal to zero implies no
# limit.
<maxAddUrls>0</>
# When the spider round started
<spiderRoundStartTime>0</>
# The spider round number.
<spiderRoundNum>0</>
# When enabled, the spider will discard web pages which are identical to other
# web pages that are already in the index. However, root urls, urls that have
# no path, are never discarded. It most likely has to hit disk to do these
# checks so it does cause some slowdown. Only use it if you need it.
<dedupingEnabled>0</>
# When enabled, the spider will discard web pages which, when a www is
# prepended to the page's url, result in a url already in the index.
<dedupingEnabledForWww>1</>
# Detect and do not index pages which have a 200 status code, but are likely
# to be error pages.
<detectCustomErrorPages>1</>
# Should pages be removed from the index if they are no longer accessible on
# the web?
<delete404s>1</>
# If this is true, the spider, when a url redirects to a "simpler" url, will
# add that simpler url into the spider queue and abandon the spidering of the
# current url.
<useSimplifiedRedirects>1</>
# If this is true, the spider, when updating a web page that is already in the
# index, will not even download the whole page if it hasn't been updated since
# the last time Gigablast spidered it. This is primarily a bandwidth saving
# feature. It relies on the remote webserver's returned Last-Modified-Since
# field being accurate.
<useIfModifiedSince>0</>
# If this is true, do not allow spammy inlinks to vote. This check is too
# aggressive for some collections, i.e. it does not allow pages with cgi in
# their urls to vote.
<doLinkSpamChecking>1</>
# If this is true Gigablast will only allow one vote per the top 2 significant
# bytes of the IP address. Otherwise, multiple pages from the same top IP can
# contribute to the link text and link-based quality ratings of a particular
# URL. Furthermore, no votes will be accepted from IPs that have the same top
# 2 significant bytes as the IP of the page being indexed.
<restrictLinkVotingByIp>1</>
# How often should Gigablast recompute the link info for a url. Also applies
# to getting the quality of a site or root url, which is based on the link
# info. In days. Can use decimals. 0 means to update the link info every time
# the url's content is re-indexed. If the content is not reindexed because it
# is unchanged then the link info will not be updated. When getting the link
# info or quality of the root url from an external cluster, Gigablast will
# tell the external cluster to recompute it if its age is this or higher.
<updateLinkInfoFrequency>60.000000</>
# If this is enabled the spider will not allow any docs which are determined to
# be serps.
<doSerpDetection>1</>
# If this is false then the filter will not be used on html or text pages.
<applyFilterToTextPages>0</>
# Program to spawn to filter all HTTP replies the spider receives. Leave blank
# for none.
<filterName><![CDATA[]]></>
# Kill filter shell after this many seconds. Assume it stalled permanently.
<filterTimeout>40</>
# Retrieve pages from the proxy at this IP address.
<proxyIp>0.0.0.0</>
# Retrieve pages from the proxy on this port.
<proxyPort>0</>
# Index the body of the documents so you can search it. Required for searching
# that. You will pretty much always want to keep this enabled.
<indexBody>1</>
# Send every spidered url to this diffbot.com by appending a &url=<url> to it
# before trying to download it. We expect to get back a JSON reply which
# we index. You will need to supply your token to this as well.
<diffbotApiUrl><![CDATA[]]></>
# Get scoring information for each result so you can see how each result is
# scored? You must explicitly request this using &scores=1 for the XML feed
# because it is not included by default.
<getDocidScoringInfo>1</>
# Query expansion will include word stems and synonyms in its search results.
<doQueryExpansion>1</>
# What is the limit to the total number of returned search results.
<maxSearchResults>1000</>
# What is the limit to the total number of returned search results per query?
<maxSearchResultsPerQuery>100</>
# What is the maximum number of characters allowed in titles displayed in the
# search results?
<maxTitleLen>80</>
# Should search results be site clustered by default?
<siteClusterByDefault>1</>
# Hide all clustered results instead of displaying two results from each site.
<hideAllClusteredResults>0</>
# Should duplicate search results be removed by default?
<dedupResultsByDefault>1</>
# Should we dedup URLs with case insensitivity? This is mainly to correct
# duplicate wiki pages.
<dedupURLs>0</>
# If document summary is this percent similar to a document summary above it,
# then remove it from the search results. 100 means only to remove if exactly
# the same. 0 means no summary deduping.
<percentSimilarDedupSummary>90</>
# Sets the number of lines to generate for summary deduping. This is to help
# the deduping process not throw out valid summaries when normally displayed
# summaries are smaller values. Requires percent similar dedup summary to be
# enabled.
<numberOfLinesToUseInSummaryToDedup>4</>
# Default language to use for ranking results. Value should be any language
# abbreviation, for example "en" for English.
<sortLanguagePreference><![CDATA[en]]></>
# Default country to use for ranking results. Value should be any country code
# abbreviation, for example "us" for United States.
<sortCountryPreference><![CDATA[us]]></>
# What is the maximum number of characters displayed in a summary for a search
# result?
<maxSummaryLen>512</>
# What is the maximum number of excerpts displayed in the summary of a search
# result?
<maxSummaryExcerpts>4</>
# What is the maximum number of characters allowed per summary excerpt?
<maxSummaryExcerptLength>300</>
# What is the default number of summary excerpts displayed per search result?
<defaultNumberOfSummaryExcerpts>3</>
# <br> tags are inserted to keep the number of chars in the summary per line
# at or below this width. Strings without spaces that exceed this width are
# not split.
<maxSummaryLineWidth>80</>
# Truncating this will miss out on good summaries, but performance will
# increase.
<bytesOfDocToScanForSummaryGeneration>70000</>
# Front html tag used for highlighting query terms in the summaries displayed
# in the search results.
<frontHighlightTag><![CDATA[&lt;b style=&#34;color:black;background-color:&#035;ffff66&#34;&gt;]]></>
# Back html tag used for highlighting query terms in the summaries displayed
# in the search results.
<backHighlightTag><![CDATA[&lt;/b&gt;]]></>
# How many search results should we scan for related topics (gigabits) per
# query?
<docsToScanForTopics>300</>
# Should Gigablast only get one document per IP domain and per domain for
# topic (gigabit) generation?
<ipRestrictionForTopics>0</>
# Should Gigablast remove overlapping topics (gigabits)?
<removeOverlappingTopics>1</>
# What is the number of related topics (gigabits) displayed per query? Set to
# 0 to save CPU time.
<numberOfRelatedTopics>11</>
# Related topics (gigabits) with scores below this will be excluded. Scores
# range from 0% to over 100%.
<minTopicsScore>5</>
# How many documents must contain the topic (gigabit) for it to be displayed.
<minTopicDocCount>2</>
# If a document is this percent similar to another document with a higher
# score, then it will not contribute to the topic (gigabit) generation.
<dedupDocPercentForTopics>80</>
# Maximum number of words a topic (gigabit) can have. Affects raw feeds, too.
<maxWordsPerTopic>6</>
# Max chars to sample from each doc for topics (gigabits).
<topicMaxSampleSize>4096</>
# If enabled, results in dmoz will display their categories on the results
# page.
<displayDmozCategoriesInResults>1</>
# If enabled, results in dmoz will display their indirect categories on the
# results page.
<displayIndirectDmozCategoriesInResults>0</>
# If enabled, a link will appear next to each category on each result allowing
# the user to perform their query on that entire category.
<displaySearchCategoryLinkToQueryCategoryOfResult>0</>
# Yes to use DMOZ given title when a page is untitled but is in DMOZ.
<useDmozForUntitled>1</>
# Yes to always show DMOZ summaries with search results that are in DMOZ.
<showDmozSummaries>1</>
# Yes to display the Adult category in the Top category
<showAdultCategoryOnTop>0</>
# Before downloading the contents of a URL, Gigablast first chains down this
# list of expressions, starting with expression #0. The first expression
# it matches is the ONE AND ONLY matching row for that url. It then uses the
# respider frequency, spider priority, etc. on the MATCHING ROW when spidering
# that URL. If you specify the <i>expression</i> as <i><b>default</b></i> then
# that MATCHES ALL URLs. URLs with high spider priorities take spidering
# precedence over URLs with lower spider priorities. The respider frequency
# dictates how often a URL will be respidered. See the help table below for
# examples of all the supported expressions. Use the <i>&&</i> operator to
# string multiple expressions together in the same expression text box. A
# <i>spider priority</i> of <i>DELETE</i> will cause the URL to not be
# spidered, or if it has already been indexed, it will be deleted when it is
# respidered.<br><br>
<filterExpression><![CDATA[isdocidbased]]></>
<filterExpression><![CDATA[ismedia]]></>
<filterExpression><![CDATA[errorcount&gt;=3 &amp;&amp; hastmperror]]></>
<filterExpression><![CDATA[errorcount&gt;=1 &amp;&amp; hastmperror]]></>
<filterExpression><![CDATA[isaddurl]]></>
<filterExpression><![CDATA[hopcount==0 &amp;&amp; iswww &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount==0 &amp;&amp; iswww]]></>
<filterExpression><![CDATA[hopcount==0 &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount==0]]></>
<filterExpression><![CDATA[hopcount==1 &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount==1]]></>
<filterExpression><![CDATA[hopcount==2 &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount==2]]></>
<filterExpression><![CDATA[hopcount&gt;=3 &amp;&amp; isnew]]></>
<filterExpression><![CDATA[hopcount&gt;=3]]></>
<filterExpression><![CDATA[isnew]]></>
<filterExpression><![CDATA[default]]></>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>7.000000</>
<filterFrequency>7.000000</>
<filterFrequency>7.000000</>
<filterFrequency>10.000000</>
<filterFrequency>20.000000</>
<filterFrequency>20.000000</>
<filterFrequency>40.000000</>
<filterFrequency>40.000000</>
<filterFrequency>60.000000</>
<filterFrequency>60.000000</>
<filterFrequency>30.000000</>
<filterFrequency>30.000000</>
# Do not allow more than this many outstanding spiders for all urls in this
# priority.
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>4</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
# Allow this many spiders per IP.
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
# Wait at least this long before downloading urls from the same IP address.
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<filterPriority>80</>
<filterPriority>-3</>
<filterPriority>3</>
<filterPriority>45</>
<filterPriority>85</>
<filterPriority>50</>
<filterPriority>48</>
<filterPriority>49</>
<filterPriority>47</>
<filterPriority>40</>
<filterPriority>39</>
<filterPriority>30</>
<filterPriority>29</>
<filterPriority>20</>
<filterPriority>19</>
<filterPriority>1</>
<filterPriority>0</>

View File

@ -127,11 +127,14 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
<td style="padding-bottom:12px">&nbsp;</td>
<td style="padding-bottom:12px">&nbsp;</td>
</tr>
<!--
<tr bgcolor="#006699">
<th><a name="boolean" id="boolean"></a><font color="#FFFFFF">Boolean Search</font></th>
<th><font color="#FFFFFF">Description</font></th>
<tr bgcolor="#0340fd">
<th><font color=33dcff>Boolean Search</font></th>
<th><font color=33dcff>Description</font></th>
</tr>
<tr>
<td colspan="2" bgcolor="#FFFFCC"><center>
Note: boolean operators must be in UPPER CASE.
@ -214,16 +217,17 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
expressions and can be optionally enclosed in parentheses. A NOT
operator can optionally preceed the left or the right operand.</td>
</tr>
-->
</table>
</td></tr>
</table>
<br>
<center>
Copyright &copy; 2013. All rights reserved.
Copyright &copy; 2014. All rights reserved.
</center>
</body>
</html>