"
""
- //
- ""
- "Crawl Name: | "
- "%s | "
- " "
-
- ""
- "Crawl Type: | "
- "%li | "
- " "
-
- //""
- //"Collection Alias: | "
- //"%s%s | "
- //" "
-
- ""
- "Token: | "
- "%s | "
- " "
-
- ""
- "Seeds: | "
- "%s | "
- " "
-
""
"Crawl Status: | "
"%li | "
@@ -820,10 +798,10 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"%s | "
" "
- ""
- "Rounds Completed: | "
- "%li | "
- " "
+ //""
+ //"Rounds Completed: | "
+ //"%li | "
+ //" "
""
"Has Urls Ready to Spider: | "
@@ -837,11 +815,6 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//"%lli | "
//" "
- ""
- "Objects Found | "
- "%lli | "
- " "
-
""
"URLs Harvested (inc. dups) | "
"%lli | "
@@ -868,37 +841,11 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"%lli | "
" "
- ""
- "Page Process Attempts | "
- "%lli | "
- " "
-
- ""
- "Page Process Successes | "
- "%lli | "
- " "
-
- ""
- "Page Process Successes This Round | "
- "%lli | "
- " "
-
-
- , cr->m_diffbotCrawlName.getBufStart()
-
- , (long)cr->m_isCustomCrawl
-
- , cr->m_diffbotToken.getBufStart()
-
- , seedStr
-
, crawlStatus
, tmp.getBufStart()
- , cr->m_spiderRoundNum
+ //, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
- , cr->m_globalCrawlInfo.m_objectsAdded -
- cr->m_globalCrawlInfo.m_objectsDeleted
, cr->m_globalCrawlInfo.m_urlsHarvested
//, cr->m_globalCrawlInfo.m_urlsConsidered
@@ -906,16 +853,13 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
- , cr->m_globalCrawlInfo.m_pageProcessAttempts
- , cr->m_globalCrawlInfo.m_pageProcessSuccesses
- , cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
- if ( fmt != FORMAT_JSON )
- // wrap up the form, print a submit button
- g_pages.printAdminBottom ( &sb );
+ //if ( fmt != FORMAT_JSON )
+ // // wrap up the form, print a submit button
+ // g_pages.printAdminBottom ( &sb );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp
index 5491f045..3f9f3903 100644
--- a/PageCrawlBot.cpp
+++ b/PageCrawlBot.cpp
@@ -25,11 +25,11 @@
#include "Parms.h"
// so user can specify the format of the reply/output
-#define FMT_HTML 1
-#define FMT_XML 2
-#define FMT_JSON 3
-#define FMT_CSV 4
-#define FMT_TXT 5
+//#define FMT_HTML 1
+//#define FMT_XML 2
+//#define FMT_JSON 3
+//#define FMT_CSV 4
+//#define FMT_TXT 5
void doneSendingWrapper ( void *state , TcpSocket *sock ) ;
bool sendBackDump ( TcpSocket *s,HttpRequest *hr );
@@ -158,25 +158,25 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
if ( ( xx = strstr ( path , "_data.json" ) ) ) {
rdbId = RDB_TITLEDB;
- fmt = FMT_JSON;
+ fmt = FORMAT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
- fmt = FMT_CSV;
+ fmt = FORMAT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
- fmt = FMT_CSV;
+ fmt = FORMAT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
rdbId = RDB_SPIDERDB;
- fmt = FMT_TXT;
+ fmt = FORMAT_TXT;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
- fmt = FMT_TXT;
+ fmt = FORMAT_TXT;
}
// sanity, must be one of 3 download calls
@@ -213,7 +213,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// . if doing download of csv, make it search results now!
// . make an httprequest on stack and call it
- if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) {
+ if ( fmt == FORMAT_CSV && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
long dr = 1;
@@ -247,7 +247,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// . if doing download of json, make it search results now!
// . make an httprequest on stack and call it
- if ( fmt == FMT_JSON && rdbId == RDB_TITLEDB ) {
+ if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
long dr = 1;
@@ -514,13 +514,13 @@ bool StateCD::sendList ( ) {
//sb.setLabel("dbotdmp");
char *ct = "text/csv";
- if ( m_fmt == FMT_JSON )
+ if ( m_fmt == FORMAT_JSON )
ct = "application/json";
- if ( m_fmt == FMT_XML )
+ if ( m_fmt == FORMAT_XML )
ct = "text/xml";
- if ( m_fmt == FMT_TXT )
+ if ( m_fmt == FORMAT_TXT )
ct = "text/plain";
- if ( m_fmt == FMT_CSV )
+ if ( m_fmt == FORMAT_CSV )
ct = "text/csv";
// . if we haven't yet sent an http mime back to the user
@@ -545,13 +545,13 @@ bool StateCD::sendList ( ) {
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
- if ( ! m_printedFirstBracket && m_fmt == FMT_JSON ) {
+ if ( ! m_printedFirstBracket && m_fmt == FORMAT_JSON ) {
sb.safePrintf("[\n");
m_printedFirstBracket = true;
}
// these are csv files not xls
- //if ( ! m_printedFirstBracket && m_fmt == FMT_CSV ) {
+ //if ( ! m_printedFirstBracket && m_fmt == FORMAT_CSV ) {
// sb.safePrintf("sep=,\n");
// m_printedFirstBracket = true;
//}
@@ -638,7 +638,7 @@ bool StateCD::sendList ( ) {
// use this for printing out urls.csv as well...
m_printedEndingBracket = true;
// end array of json objects. might be empty!
- if ( m_rdbId == RDB_TITLEDB && m_fmt == FMT_JSON )
+ if ( m_rdbId == RDB_TITLEDB && m_fmt == FORMAT_JSON )
sb.safePrintf("\n]\n");
//log("adding ]. len=%li",sb.length());
// i'd like to exit streaming mode here. i fixed tcpserver.cpp
@@ -853,7 +853,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
}
// "csv" is default if json not specified
- if ( m_fmt == FMT_JSON )
+ if ( m_fmt == FORMAT_JSON )
sb->safePrintf("[{"
"{\"url\":"
"\"%s\"},"
@@ -997,7 +997,7 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// if not json, just print the json item out in csv
// moved into PageResults.cpp...
- //if ( m_fmt == FMT_CSV ) {
+ //if ( m_fmt == FORMAT_CSV ) {
// printJsonItemInCsv ( json , sb );
// continue;
//}
@@ -1337,7 +1337,7 @@ bool sendReply2 (TcpSocket *socket , long fmt , char *msg ) {
// send this back to browser
SafeBuf sb;
- if ( fmt == FMT_JSON ) {
+ if ( fmt == FORMAT_JSON ) {
sb.safePrintf("{\n\"response\":\"success\",\n"
"\"message\":\"%s\"\n}\n"
, msg );
@@ -1368,7 +1368,7 @@ bool sendErrorReply2 ( TcpSocket *socket , long fmt , char *msg ) {
// send this back to browser
SafeBuf sb;
- if ( fmt == FMT_JSON ) {
+ if ( fmt == FORMAT_JSON ) {
sb.safePrintf("{\"error\":\"%s\"}\n"
, msg );
ct = "application/json";
@@ -1476,7 +1476,7 @@ void injectedUrlWrapper ( void *state ) {
// send back the html or json response?
SafeBuf *response = &sb;
- if ( st->m_fmt == FMT_JSON ) response = &js;
+ if ( st->m_fmt == FORMAT_JSON ) response = &js;
// . this will call g_httpServer.sendReply()
// . pass it in the injection response, "sb"
@@ -1673,7 +1673,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
- char fmt = FMT_JSON;
+ char fmt = FORMAT_JSON;
// token is always required. get from json or html form input
//char *token = getInputString ( "token" );
@@ -1693,21 +1693,21 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
name++;
}
// change default formatting to html
- fmt = FMT_HTML;
+ fmt = FORMAT_HTML;
}
char *fs = hr->getString("format",NULL,NULL);
// give john a json api
- if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
- if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
- if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
+ if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
+ if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
+ if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
// if we got json as input, give it as output
- //if ( JS.getFirstItem() ) fmt = FMT_JSON;
+ //if ( JS.getFirstItem() ) fmt = FORMAT_JSON;
- if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
+ if ( ! token && fmt == FORMAT_JSON ) { // (cast==0|| fmt == FORMAT_JSON ) ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
}
@@ -1772,7 +1772,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
//}
// just send back a list of all the collections after the delete
- //if ( delColl && cast && fmt == FMT_JSON ) {
+ //if ( delColl && cast && fmt == FORMAT_JSON ) {
// char *msg = "Collection deleted.";
// return sendReply2 (socket,fmt,msg);
//}
@@ -2263,7 +2263,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
/*
bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
- if ( fmt == FMT_JSON )
+ if ( fmt == FORMAT_JSON )
sb.safePrintf("\"urlFilters\":[");
// skip first filters that are:
@@ -2303,7 +2303,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
// urls higher spider priority, so skip it
if ( strncmp(expression,"ismanualadd && ",15) == 0 )
continue;
- if ( fmt == FMT_HTML ) {
+ if ( fmt == FORMAT_HTML ) {
sb.safePrintf(""
"Expression "
"safePrintf("}\n");
@@ -2537,7 +2537,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// store output into here
SafeBuf sb;
- if ( fmt == FMT_HTML )
+ if ( fmt == FORMAT_HTML )
sb.safePrintf(
""
"Crawlbot - "
@@ -2573,7 +2573,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
lb.urlEncode(name);
lb.safePrintf ("&token=");
lb.urlEncode(token);
- if ( fmt == FMT_HTML ) lb.safePrintf("&format=html");
+ if ( fmt == FORMAT_HTML ) lb.safePrintf("&format=html");
lb.nullTerm();
@@ -2590,7 +2590,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//}
- if ( fmt == FMT_HTML ) {
+ if ( fmt == FORMAT_HTML ) {
sb.safePrintf(""
""
""
@@ -2645,7 +2645,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// print list of collections controlled by this token
//
- for ( long i = 0 ; fmt == FMT_HTML && i");
}
- if ( fmt == FMT_HTML )
+ if ( fmt == FORMAT_HTML )
sb.safePrintf ( " " );
// the ROOT JSON [
- if ( fmt == FMT_JSON )
+ if ( fmt == FORMAT_JSON )
sb.safePrintf("{\n");
// injection is currently not in use, so this is an artifact:
- if ( fmt == FMT_JSON && injectionResponse )
+ if ( fmt == FORMAT_JSON && injectionResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, injectionResponse->getBufStart() );
- if ( fmt == FMT_JSON && urlUploadResponse )
+ if ( fmt == FORMAT_JSON && urlUploadResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, urlUploadResponse->getBufStart() );
@@ -2702,14 +2702,14 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// the items in the array now have type:bulk or type:crawl
// so call them 'jobs'
- if ( fmt == FMT_JSON )
+ if ( fmt == FORMAT_JSON )
sb.safePrintf("\"jobs\":[");//\"collections\":");
long summary = hr->getLong("summary",0);
// enter summary mode for json
- if ( fmt != FMT_HTML ) summary = 1;
+ if ( fmt != FORMAT_HTML ) summary = 1;
// start the table
- if ( summary && fmt == FMT_HTML ) {
+ if ( summary && fmt == FORMAT_HTML ) {
sb.safePrintf(""
""
"Collection | "
@@ -2740,11 +2740,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// just print out single crawl info for json
- if ( fmt != FMT_HTML && cx != cr && name3 )
+ if ( fmt != FORMAT_HTML && cx != cr && name3 )
continue;
// if json, print each collectionrec
- if ( fmt == FMT_JSON ) {
+ if ( fmt == FORMAT_JSON ) {
if ( ! firstOne )
sb.safePrintf(",\n\t");
firstOne = false;
@@ -2786,7 +2786,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
- if ( summary && fmt == FMT_HTML ) {
+ if ( summary && fmt == FORMAT_HTML ) {
sb.safePrintf(" " );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
@@ -2794,7 +2794,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
0); // cachetime
}
- if ( fmt == FMT_JSON )
+ if ( fmt == FORMAT_JSON )
// end the array of collection objects
sb.safePrintf("\n]\n");
@@ -2808,7 +2808,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show urls being crawled (ajax) (from Spider.cpp)
//
- if ( fmt == FMT_HTML ) {
+ if ( fmt == FORMAT_HTML ) {
sb.safePrintf ( ""
@@ -2879,7 +2879,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
rand64 |= r2;
- if ( fmt == FMT_HTML ) {
+ if ( fmt == FORMAT_HTML ) {
sb.safePrintf(" "
""
@@ -2952,12 +2952,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
);
}
- if ( injectionResponse && fmt == FMT_HTML )
+ if ( injectionResponse && fmt == FORMAT_HTML )
sb.safePrintf(" %s\n"
,injectionResponse->getBufStart()
);
- if ( fmt == FMT_HTML )
+ if ( fmt == FORMAT_HTML )
sb.safePrintf(//""
//""
""
@@ -2996,7 +2996,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show stats
//
- if ( fmt == FMT_HTML ) {
+ if ( fmt == FORMAT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
@@ -3654,7 +3654,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// xml or json does not show the input boxes
- //if ( format != FMT_HTML )
+ //if ( format != FORMAT_HTML )
// return g_httpServer.sendDynamicPage ( s,
// sb.getBufStart(),
// sb.length(),
@@ -3677,7 +3677,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
s2 = "";
}
- if ( fmt == FMT_HTML )
+ if ( fmt == FORMAT_HTML )
sb.safePrintf(
""
""
@@ -3760,7 +3760,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show simpler url filters table
//
- if ( fmt == FMT_HTML ) {
+ if ( fmt == FORMAT_HTML ) {
/*
sb.safePrintf ( ""
""
@@ -3796,7 +3796,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show reset and delete crawl buttons
//
- if ( fmt == FMT_HTML ) {
+ if ( fmt == FORMAT_HTML ) {
sb.safePrintf(
""
""
@@ -3859,13 +3859,13 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// the ROOT JSON }
- if ( fmt == FMT_JSON )
+ if ( fmt == FORMAT_JSON )
sb.safePrintf("}\n");
char *ct = "text/html";
- if ( fmt == FMT_JSON ) ct = "application/json";
- if ( fmt == FMT_XML ) ct = "text/xml";
- if ( fmt == FMT_CSV ) ct = "text/csv";
+ if ( fmt == FORMAT_JSON ) ct = "application/json";
+ if ( fmt == FORMAT_XML ) ct = "text/xml";
+ if ( fmt == FORMAT_CSV ) ct = "text/csv";
// this could be in html json or xml
return g_httpServer.sendDynamicPage ( socket,
@@ -4142,7 +4142,7 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
char *json = hr->getString("json");
if ( ! json )
return sendReply2 ( socket,
- FMT_JSON,
+ FORMAT_JSON,
"No &json= provided in request.");
@@ -4151,12 +4151,12 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
// wtf?
if ( ! status )
- return sendReply2 ( socket, FMT_JSON,
+ return sendReply2 ( socket, FORMAT_JSON,
"Error with JSON parser.");
// error adding it?
if ( ! cr )
- return sendReply2 ( socket,FMT_JSON,
+ return sendReply2 ( socket,FORMAT_JSON,
"Failed to create new collection.");
ji = JP.getFirstItem();
diff --git a/PageRoot.cpp b/PageRoot.cpp
index 29058b36..0654f8b2 100644
--- a/PageRoot.cpp
+++ b/PageRoot.cpp
@@ -169,6 +169,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("\n");
sb.safePrintf(" \n");
sb.safePrintf("\n");
+
+ // print any red boxes we might need to
+ if ( printRedBox2 ( &sb , true ) )
+ sb.safePrintf(" \n");
+
sb.safePrintf("\n");
sb.safePrintf("\n");
diff --git a/Pages.cpp b/Pages.cpp
index 9af16001..1773dcbc 100644
--- a/Pages.cpp
+++ b/Pages.cpp
@@ -50,6 +50,9 @@ static WebPage s_pages[] = {
"dummy page - if set in the users row then user will have master=0 and "
" collection links will be highlighted in red",
NULL, 0 },
+
+
+
//{ PAGE_QUALITY , "quality", 0, "quality", 0, 0,
// "dummy page - if set in the users row then \"Quality Control\""
// " will be printed besides the logo for certain pages",
@@ -102,12 +105,66 @@ static WebPage s_pages[] = {
// "Basic diffbot page.", sendPageBasicDiffbot , 0 } ,
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 ,
"Basic security page.", sendPageGeneric , 0 } ,
+ { PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
+ "Basic search page.", sendPageRoot , 0 } ,
+
{ PAGE_MASTER , "admin/master" , 0 , "master controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"master controls page",
sendPageGeneric , 0 } ,
+ { PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
+ //USER_ADMIN | USER_MASTER ,
+ "search controls page",
+ sendPageGeneric , 0 } ,
+ { PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
+ //USER_ADMIN | USER_MASTER | USER_PROXY ,
+ "spider controls page",
+ sendPageGeneric , 0 } ,
+ { PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
+ //USER_MASTER | USER_PROXY,
+ "log page",
+ sendPageGeneric , 0 } ,
+ { PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
+ //USER_MASTER | USER_PROXY ,
+ "advanced security page",
+ sendPageGeneric , 0 } ,
+ { PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
+ //USER_MASTER ,
+ "add a new collection using this page",
+ sendPageAddColl , 0 } ,
+ { PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
+ //USER_MASTER ,
+ "delete a collection using this page",
+ sendPageDelColl , 0 } ,
+ { PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
+ //USER_MASTER ,
+ "repair page",
+ sendPageGeneric , 0 },
+ { PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
+ "what sites can be spidered",
+ sendPageGeneric , 0 } , // sendPageBasicSettings
+ { PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
+ //USER_ADMIN | USER_MASTER ,
+ "prioritize urls for spidering",
+ sendPageGeneric , 0 } ,
+ { PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
+ //USER_ADMIN | USER_MASTER ,
+ "inject url in the index here",
+ sendPageInject , 2 } ,
+ // this is the addurl page for the admin!
+ { PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
+ "add url page for admin",
+ sendPageAddUrl2 , 0 } ,
+ { PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
+ //USER_ADMIN | USER_MASTER,
+ "reindex url page",
+ sendPageReindex , 0 } ,
+
+
+
+
{ PAGE_HOSTS , "admin/hosts" , 0 , "hosts" , 0 , 0 ,
//USER_MASTER | USER_PROXY,
@@ -134,10 +191,7 @@ static WebPage s_pages[] = {
//USER_MASTER | USER_PROXY,
"sockets page",
sendPageSockets , 0 } ,
- { PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
- //USER_MASTER | USER_PROXY,
- "log page",
- sendPageGeneric , 0 } ,
+
{ PAGE_LOGVIEW , "admin/logview" , 0 , "log view" , 0 , 0 ,
//USER_MASTER ,
"logview page",
@@ -147,18 +201,6 @@ static WebPage s_pages[] = {
// "sync page",
// sendPageGeneric , 0 } ,
- { PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
- //USER_MASTER | USER_PROXY ,
- "advanced security page",
- sendPageGeneric , 0 } ,
- { PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
- //USER_MASTER ,
- "add a new collection using this page",
- sendPageAddColl , 0 } ,
- { PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
- //USER_MASTER ,
- "delete a collection using this page",
- sendPageDelColl , 0 } ,
{ PAGE_AUTOBAN ,"admin/autoban" , 0 , "autoban" , 1 , 1 ,
//USER_MASTER | USER_PROXY ,
"autobanned ips",
@@ -175,10 +217,6 @@ static WebPage s_pages[] = {
//USER_MASTER ,
"threads page",
sendPageThreads , 0 },
- { PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
- //USER_MASTER ,
- "repair page",
- sendPageGeneric , 0 },
//{ PAGE_THESAURUS, "admin/thesaurus", 0 , "thesaurus", 0 , 0 ,
// //USER_MASTER ,
// "thesaurus page",
@@ -207,14 +245,6 @@ static WebPage s_pages[] = {
"titledb page",
sendPageTitledb , 2 } ,
// 1 = usePost
- { PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
- //USER_ADMIN | USER_MASTER ,
- "search controls page",
- sendPageGeneric , 0 } ,
- { PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
- //USER_ADMIN | USER_MASTER | USER_PROXY ,
- "spider controls page",
- sendPageGeneric , 0 } ,
{ PAGE_CRAWLBOT , "crawlbot" , 0 , "crawlbot" , 1 , 0,
"simplified spider controls page",
@@ -229,30 +259,6 @@ static WebPage s_pages[] = {
// "spider priorities page",
// sendPageGeneric , 0 } ,
- { PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
- "what sites can be spidered",
- sendPageGeneric , 0 } , // sendPageBasicSettings
-
- { PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
- //USER_ADMIN | USER_MASTER ,
- "prioritize urls for spidering",
- sendPageGeneric , 0 } ,
-
- { PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
- //USER_ADMIN | USER_MASTER ,
- "inject url in the index here",
- sendPageInject , 2 } ,
-
- // this is the addurl page the the admin!
- { PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
- "add url page for admin",
- sendPageAddUrl2 , 0 } ,
-
- { PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
- //USER_ADMIN | USER_MASTER,
- "reindex url page",
- sendPageReindex , 0 } ,
-
//{ PAGE_KEYWORDS, "admin/queries",0,"queries" , 0 , 1 ,
// "get queries a url matches",
// sendPageMatchingQueries , 2 } ,
@@ -893,8 +899,6 @@ bool Pages::getNiceness ( long page ) {
return s_pages[page].m_niceness;
}
-bool printRedBox ( SafeBuf *mb ) ;
-
///////////////////////////////////////////////////////////
//
// Convenient html printing routines
@@ -1056,6 +1060,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//if ( page == PAGE_BASIC_DIFFBOT ) isBasic = true;
//if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
if ( page == PAGE_BASIC_SECURITY ) isBasic = true;
+ if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
//
// print breadcrumb. main > Basic > Settings
@@ -1791,7 +1796,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
// is this page basic?
bool pageBasic = false;
if ( i >= PAGE_BASIC_SETTINGS &&
- i <= PAGE_BASIC_SECURITY )
+ i <= PAGE_BASIC_SEARCH )
pageBasic = true;
// print basic pages under the basic menu, advanced pages
@@ -2627,9 +2632,18 @@ bool sendPageLogin ( TcpSocket *socket , HttpRequest *hr ) {
NULL);// cookie
}
+bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage ) {
+ SafeBuf mb;
+ // return false if no red box
+ if ( ! printRedBox ( &mb , isRootWebPage ) ) return false;
+ // otherwise, print it
+ sb->safeStrcpy ( mb.getBufStart() );
+ // return true since we printed one
+ return true;
+}
// emergency message box
-bool printRedBox ( SafeBuf *mb ) {
+bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) {
PingServer *ps = &g_pingServer;
@@ -2649,11 +2663,33 @@ bool printRedBox ( SafeBuf *mb ) {
char *boxEnd =
" ";
- bool adds = false;
+ long adds = 0;
mb->safePrintf("");
+ // are we just starting off? give them a little help.
+ CollectionRec *cr = g_collectiondb.getRec("main");
+ if ( g_collectiondb.m_numRecs == 1 &&
+ cr &&
+ isRootWebPage &&
+ cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) {
+ if ( adds ) mb->safePrintf(" ");
+ adds++;
+ mb->safePrintf("%s",box);
+ mb->safePrintf("Welcome to Gigablast. The most powerful "
+ "search engine you can legally download. "
+ "Please add the websites you want to spider "
+ " here."
+ );
+ mb->safePrintf("%s",boxEnd);
+ }
+
+ if ( isRootWebPage ) {
+ mb->safePrintf(" ");
+ return (bool)adds;
+ }
+
if ( g_conf.m_numConnectIps == 0 && g_conf.m_numMasterPwds == 0 ) {
if ( adds ) mb->safePrintf(" ");
adds++;
@@ -2738,5 +2774,5 @@ bool printRedBox ( SafeBuf *mb ) {
mb->safePrintf("");
- return adds;
+ return (bool)adds;
}
diff --git a/Pages.h b/Pages.h
index 281fe1cc..4dde73af 100644
--- a/Pages.h
+++ b/Pages.h
@@ -5,6 +5,9 @@
#ifndef _PAGES_H_
#define _PAGES_H_
+bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage = false ) ;
+bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ;
+
// for PageEvents.cpp and Accessdb.cpp
//#define RESULTSWIDTHSTR "550px"
@@ -304,25 +307,36 @@ enum {
//PAGE_BASIC_SEARCH , // TODO
//PAGE_BASIC_DIFFBOT , // TODO
PAGE_BASIC_SECURITY ,
+ PAGE_BASIC_SEARCH ,
// master admin pages
PAGE_MASTER ,
+ PAGE_SEARCH ,
+ PAGE_SPIDER ,
+ PAGE_LOG ,
+ PAGE_SECURITY ,
+ PAGE_ADDCOLL ,
+ PAGE_DELCOLL ,
+ PAGE_REPAIR ,
+ PAGE_SITES , // site filters
+ PAGE_FILTERS ,
+ PAGE_INJECT ,
+ PAGE_ADDURL2 ,
+ PAGE_REINDEX ,
+
PAGE_HOSTS ,
PAGE_STATS , // 10
PAGE_STATSDB ,
PAGE_PERF ,
PAGE_SOCKETS ,
- PAGE_LOG ,
+
PAGE_LOGVIEW ,
// PAGE_SYNC ,
- PAGE_SECURITY ,
- PAGE_ADDCOLL ,
- PAGE_DELCOLL ,
PAGE_AUTOBAN , // 20
//PAGE_SPIDERLOCKS ,
PAGE_PROFILER ,
PAGE_THREADS ,
- PAGE_REPAIR ,
+
// PAGE_THESAURUS ,
// . non master-admin pages (collection controls)
@@ -335,16 +349,9 @@ enum {
PAGE_TITLEDB ,
//PAGE_STATSDB ,
- PAGE_SEARCH ,
- PAGE_SPIDER ,
PAGE_CRAWLBOT , // 35
PAGE_SPIDERDB ,
//PAGE_PRIORITIES , // priority queue controls
- PAGE_SITES , // site filters
- PAGE_FILTERS ,
- PAGE_INJECT ,
- PAGE_ADDURL2 ,
- PAGE_REINDEX ,
//PAGE_KEYWORDS ,
PAGE_SEO ,
PAGE_ACCESS , //40
diff --git a/Parms.cpp b/Parms.cpp
index 962003aa..2260bee3 100644
--- a/Parms.cpp
+++ b/Parms.cpp
@@ -1888,7 +1888,7 @@ bool Parms::printParm ( SafeBuf* sb,
"value=\"%f\" "
// 3 was ok on firefox but need 6
// on chrome
- "size=6>",cgi,*(float *)s);
+ "size=7>",cgi,*(float *)s);
}
else if ( t == TYPE_IP ) {
if ( m->m_max > 0 && j == jend )
@@ -1896,7 +1896,7 @@ bool Parms::printParm ( SafeBuf* sb,
"size=12>",cgi);
else
sb->safePrintf ("",cgi,iptoa(*(long *)s));
+ "size=12>",cgi,iptoa(*(long *)s));
}
else if ( t == TYPE_LONG ) {
// just show the parm name and value if printing in json
@@ -7534,6 +7534,7 @@ void Parms::init ( ) {
m->m_flags = PF_TEXTAREA;
m++;
+ /*
// the new upload post submit button
m->m_title = "upload urls";
m->m_desc = "Upload your file of urls.";
@@ -7542,6 +7543,7 @@ void Parms::init ( ) {
m->m_obj = OBJ_NONE;
m->m_type = TYPE_FILEUPLOADBUTTON;
m++;
+ */
m->m_title = "strip sessionids";
m->m_desc = "Strip added urls of their session ids.";
@@ -7591,6 +7593,7 @@ void Parms::init ( ) {
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
+ "See example site list below. "
"Gigablast uses the "
"insitelist "
"directive on "
@@ -7599,8 +7602,7 @@ void Parms::init ( ) {
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
- "See example site list below. "
- "Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
+ "Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the add "
"urls interface.";
m->m_cgi = "sitelist";
@@ -7629,6 +7631,7 @@ void Parms::init ( ) {
m++;
*/
+ /*
// the new upload post submit button
m->m_title = "upload site list";
m->m_desc = "Upload your file of site patterns. Completely replaces "
@@ -7640,12 +7643,13 @@ void Parms::init ( ) {
m->m_type = TYPE_FILEUPLOADBUTTON;
m->m_flags = PF_NOSAVE | PF_DUP;
m++;
+ */
m->m_title = "restart collection";
- m->m_desc = "Remove all documents from this collection and starts "
- "spidering over again. If you do this accidentally there "
- "is a recovery procedure to "
- "get back the trashed data.";
+ m->m_desc = "Remove all documents from this collection and restart "
+ "spidering.";// If you do this accidentally there "
+ //"is a recovery procedure to "
+ // "get back the trashed data.";
m->m_cgi = "restart";
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
@@ -7659,6 +7663,7 @@ void Parms::init ( ) {
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
+ "See example site list below. "
"Gigablast uses the "
"insitelist "
"directive on "
@@ -7667,8 +7672,7 @@ void Parms::init ( ) {
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
- "See example site list below. "
- "Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
+ "Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the addurl"
" interface.";
m->m_cgi = "sitelist";
@@ -8762,11 +8766,11 @@ void Parms::init ( ) {
m++;
m->m_title = "max robots.txt cache age";
- m->m_desc = "How many second to cache a robots.txt file for. "
+ m->m_desc = "How many seconds to cache a robots.txt file for. "
"86400 is 1 day. 0 means Gigablast will not read from the "
"cache at all and will download the robots.txt before every "
"page if robots.txt use is enabled above. However, if this is "
- "0 then Gigablast will still store robots.txt files into the "
+ "0 then Gigablast will still store robots.txt files in the "
"cache.";
m->m_cgi = "mrca";
m->m_off = (char *)&cr.m_maxRobotsCacheAge - x;
@@ -10639,8 +10643,9 @@ void Parms::init ( ) {
m++;
m->m_title = "do query expansion";
- m->m_desc = "Query expansion will include word stems and synonyms in "
- "its search results.";
+ m->m_desc = "If enabled, query expansion will expand your query "
+ "to include word stems and "
+ "synonyms of the query terms.";
m->m_def = "1";
m->m_off = (char *)&cr.m_queryExpansion - x;
m->m_soff = (char *)&si.m_queryExpansion - y;
@@ -10653,7 +10658,7 @@ void Parms::init ( ) {
// more general parameters
m->m_title = "max search results";
- m->m_desc = "What is the limit to the total number "
+ m->m_desc = "What is the maximum total number "
"of returned search results.";
m->m_cgi = "msr";
m->m_off = (char *)&cr.m_maxSearchResults - x;
@@ -12457,7 +12462,7 @@ void Parms::init ( ) {
m++;
m->m_title = "max summary line width";
- m->m_desc = " tags are inserted to keep the number "
+ m->m_desc = "<br> tags are inserted to keep the number "
"of chars in the summary per line at or below this width. "
"Strings without spaces that exceed this "
"width are not split.";
diff --git a/Sections.cpp b/Sections.cpp
index 6a373db5..f675e964 100644
--- a/Sections.cpp
+++ b/Sections.cpp
@@ -15163,7 +15163,7 @@ bool Sections::printVotingInfoInJSON ( SafeBuf *sb ) {
// breathe
QUICKPOLL ( m_niceness );
// print this section
- printSectionDiv ( sk , FMT_JSON ); // forProCog );
+ printSectionDiv ( sk , FORMAT_JSON ); // forProCog );
// advance
long b = sk->m_b;
// stop if last
@@ -15190,7 +15190,8 @@ bool Sections::print2 ( SafeBuf *sbuf ,
HashTableX *st2 ,
HashTableX *tt ,
Addresses *aa ,
- char format ) { // bool forProCog ){//FMT_PROCOG FMT_JSON HTML
+ char format ) { // bool forProCog ){
+ //FORMAT_PROCOG FORMAT_JSON HTML
//sbuf->safePrintf("Sections in Document\n");
@@ -15244,7 +15245,7 @@ bool Sections::print2 ( SafeBuf *sbuf ,
sk = m_sectionPtrs[b];
}
- if ( format != FMT_HTML ) return true; // forProCog
+ if ( format != FORMAT_HTML ) return true; // forProCog
// print header
char *hdr =
@@ -15553,7 +15554,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
// m_sbuf->safePrintf("A=%li ",sk->m_a);
- if ( format == FMT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
+ if ( format == FORMAT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
// do not count our own site!
m_sbuf->safePrintf(""
""
@@ -15573,7 +15574,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
m_sbuf->safePrintf("");
- if ( format == FMT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
+ if ( format == FORMAT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
sec_t f = sk->m_flags;
//if ( f & SEC_SENTENCE )
// m_sbuf->safePrintf("sentence " );
@@ -15598,7 +15599,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
// m_sbuf->safePrintf("notdupvotes=%li ",
// sk->m_votesForNotDup);
- if ( format != FMT_PROCOG ) {
+ if ( format != FORMAT_PROCOG ) {
// print the flags
m_sbuf->safePrintf("A=%li ",sk->m_a);
diff --git a/Spider.cpp b/Spider.cpp
index e513ee8b..635bc820 100644
--- a/Spider.cpp
+++ b/Spider.cpp
@@ -12414,7 +12414,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( cx->m_isCustomCrawl )
return msg->safePrintf("Job is in progress.");
else
- return true;
+ return msg->safePrintf("Spider is in progress.");
}
// pattern is a ||-separted list of substrings
diff --git a/coll.main.0/coll.conf b/coll.main.0/coll.conf
deleted file mode 100644
index 6d1eb7a3..00000000
--- a/coll.main.0/coll.conf
+++ /dev/null
@@ -1,416 +0,0 @@
-# List of sites to spider, one per line. Gigablast uses the insitelist directive on the url filters page to make sure that the spider only
-# indexes urls that match the site patterns you specify here, other than urls
-# you add individually via the add urls or inject url tools. See example site list below. Limit list to 300MB. If you have
-# a lot of INDIVIDUAL URLS to add then consider using the addurl interface.
->
-
-# All <, >, " and # characters that are values for a field contained herein
-# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
-
-# Controls just the spiders for this collection.
-1>
-
-# What is the maximum number of web pages the spider is allowed to download
-# simultaneously PER HOST for THIS collection?
-100>
-
-# make each spider wait this many milliseconds before getting the ip and
-# downloading the page.
-0>
-
-# If this is true Gigablast will respect the robots.txt convention.
-1>
-
-# How many second to cache a robots.txt file for. 86400 is 1 day. 0 means
-# Gigablast will not read from the cache at all and will download the
-# robots.txt before every page if robots.txt use is enabled above. However, if
-# this is 0 then Gigablast will still store robots.txt files into the cache.
-86400>
-
-# Do a tight merge on posdb and titledb at this time every day. This is
-# expressed in MINUTES past midnight UTC. UTC is 5 hours ahead of EST and 7
-# hours ahead of MST. Leave this as -1 to NOT perform a daily merge. To merge
-# at midnight EST use 60*5=300 and midnight MST use 60*7=420.
--1>
-
-# Comma separated list of days to merge on. Use 0 for Sunday, 1 for Monday,
-# ... 6 for Saturday. Leaving this parmaeter empty or without any numbers will
-# make the daily merge happen every day
->
-
-# When the daily merge was last kicked off. Expressed in UTC in seconds since
-# the epoch.
--1>
-
-# If this is true, users will have to pass a simple Turing test to add a url.
-# This prevents automated url submission.
-0>
-
-# Maximum number of urls that can be submitted via the addurl interface, per
-# IP domain, per 24 hour period. A value less than or equal to zero implies no
-# limit.
-0>
-
-# When the spider round started
-0>
-
-# The spider round number.
-0>
-
-# When enabled, the spider will discard web pages which are identical to other
-# web pages that are already in the index. However, root urls, urls that have
-# no path, are never discarded. It most likely has to hit disk to do these
-# checks so it does cause some slow down. Only use it if you need it.
-0>
-
-# When enabled, the spider will discard web pages which, when a www is
-# prepended to the page's url, result in a url already in the index.
-1>
-
-# Detect and do not index pages which have a 200 status code, but are likely
-# to be error pages.
-1>
-
-# Should pages be removed from the index if they are no longer accessible on
-# the web?
-1>
-
-# If this is true, the spider, when a url redirects to a "simpler" url, will
-# add that simpler url into the spider queue and abandon the spidering of the
-# current url.
-1>
-
-# If this is true, the spider, when updating a web page that is already in the
-# index, will not even download the whole page if it hasn't been updated since
-# the last time Gigablast spidered it. This is primarily a bandwidth saving
-# feature. It relies on the remote webserver's returned Last-Modified-Since
-# field being accurate.
-0>
-
-# If this is true, do not allow spammy inlinks to vote. This check is too
-# aggressive for some collections, i.e. it does not allow pages with cgi in
-# their urls to vote.
-1>
-
-# If this is true Gigablast will only allow one vote per the top 2 significant
-# bytes of the IP address. Otherwise, multiple pages from the same top IP can
-# contribute to the link text and link-based quality ratings of a particular
-# URL. Furthermore, no votes will be accepted from IPs that have the same top
-# 2 significant bytes as the IP of the page being indexed.
-1>
-
-# How often should Gigablast recompute the link info for a url. Also applies
-# to getting the quality of a site or root url, which is based on the link
-# info. In days. Can use decimals. 0 means to update the link info every time
-# the url's content is re-indexed. If the content is not reindexed because it
-# is unchanged then the link info will not be updated. When getting the link
-# info or quality of the root url from an external cluster, Gigablast will
-# tell the external cluster to recompute it if its age is this or higher.
-60.000000>
-
-# If this is eabled the spider will not allow any docs which are determined to
-# be serps.
-1>
-
-# If this is false then the filter will not be used on html or text pages.
-0>
-
-# Program to spawn to filter all HTTP replies the spider receives. Leave blank
-# for none.
->
-
-# Kill filter shell after this many seconds. Assume it stalled permanently.
-40>
-
-# Retrieve pages from the proxy at this IP address.
-0.0.0.0>
-
-# Retrieve pages from the proxy on this port.
-0>
-
-# Index the body of the documents so you can search it. Required for searching
-# that. You wil pretty much always want to keep this enabled.
-1>
-
-# Send every spidered url to this diffbot.com by appending a &url= to it
-# before trinyg to downloading it. We expect get get back a JSON reply which
-# we index. You will need to supply your token to this as well.
->
-
-# Get scoring information for each result so you can see how each result is
-# scored? You must explicitly request this using &scores=1 for the XML feed
-# because it is not included by default.
-1>
-
-# Query expansion will include word stems and synonyms in its search results.
-1>
-
-# What is the limit to the total number of returned search results.
-1000>
-
-# What is the limit to the total number of returned search results per query?
-100>
-
-# What is the maximum number of characters allowed in titles displayed in the
-# search results?
-80>
-
-# Should search results be site clustered by default?
-1>
-
-# Hide all clustered results instead of displaying two results from each site.
-0>
-
-# Should duplicate search results be removed by default?
-1>
-
-# Should we dedup URLs with case insensitivity? This is mainly to correct
-# duplicate wiki pages.
-0>
-
-# If document summary is this percent similar to a document summary above it,
-# then remove it from the search results. 100 means only to remove if exactly
-# the same. 0 means no summary deduping.
-90>
-
-# Sets the number of lines to generate for summary deduping. This is to help
-# the deduping process not thorw out valid summaries when normally displayed
-# summaries are smaller values. Requires percent similar dedup summary to be
-# enabled.
-4>
-
-# Default language to use for ranking results. Value should be any language
-# abbreviation, for example "en" for English.
->
-
-# Default country to use for ranking results. Value should be any country code
-# abbreviation, for example "us" for United States.
->
-
-# What is the maximum number of characters displayed in a summary for a search
-# result?
-512>
-
-# What is the maximum number of excerpts displayed in the summary of a search
-# result?
-4>
-
-# What is the maximum number of characters allowed per summary excerpt?
-300>
-
-# What is the default number of summary excerpts displayed per search result?
-3>
-
-# <br> tags are inserted to keep the number of chars in the summary per line
-# at or below this width. Strings without spaces that exceed this width are
-# not split.
-80>
-
-# Truncating this will miss out on good summaries, but performance will
-# increase.
-70000>
-
-# Front html tag used for highlightig query terms in the summaries displated
-# in the search results.
->
-
-# Front html tag used for highlightig query terms in the summaries displated
-# in the search results.
->
-
-# How many search results should we scan for related topics (gigabits) per
-# query?
-300>
-
-# Should Gigablast only get one document per IP domain and per domain for
-# topic (gigabit) generation?
-0>
-
-# Should Gigablast remove overlapping topics (gigabits)?
-1>
-
-# What is the number of related topics (gigabits) displayed per query? Set to
-# 0 to save CPU time.
-11>
-
-# Related topics (gigabits) with scores below this will be excluded. Scores
-# range from 0% to over 100%.
-5>
-
-# How many documents must contain the topic (gigabit) for it to be displayed.
-2>
-
-# If a document is this percent similar to another document with a higher
-# score, then it will not contribute to the topic (gigabit) generation.
-80>
-
-# Maximum number of words a topic (gigabit) can have. Affects raw feeds, too.
-6>
-
-# Max chars to sample from each doc for topics (gigabits).
-4096>
-
-# If enabled, results in dmoz will display their categories on the results
-# page.
-1>
-
-# If enabled, results in dmoz will display their indirect categories on the
-# results page.
-0>
-
-# If enabled, a link will appear next to each category on each result allowing
-# the user to perform their query on that entire category.
-0>
-
-# Yes to use DMOZ given title when a page is untitled but is in DMOZ.
-1>
-
-# Yes to always show DMOZ summaries with search results that are in DMOZ.
-1>
-
-# Yes to display the Adult category in the Top category
-0>
-
-# Before downloading the contents of a URL, Gigablast first chains down this
-# list of expressions, starting with expression #0. The first expression
-# it matches is the ONE AND ONLY matching row for that url. It then uses the
-# respider frequency, spider priority, etc. on the MATCHING ROW when spidering
-# that URL. If you specify the expression as default then
-# that MATCHES ALL URLs. URLs with high spider priorities take spidering
-# precedence over URLs with lower spider priorities. The respider frequency
-# dictates how often a URL will be respidered. See the help table below for
-# examples of all the supported expressions. Use the && operator to
-# string multiple expressions together in the same expression text box. A
-# spider priority of DELETE will cause the URL to not be
-# spidered, or if it has already been indexed, it will be deleted when it is
-# respidered.
->
->
->
->
->
->
->
->
->
->
->
->
->
->
->
->
->
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-0.000000>
-0.000000>
-1.000000>
-1.000000>
-1.000000>
-7.000000>
-7.000000>
-7.000000>
-10.000000>
-20.000000>
-20.000000>
-40.000000>
-40.000000>
-60.000000>
-60.000000>
-30.000000>
-30.000000>
-
-# Do not allow more than this many outstanding spiders for all urls in this
-# priority.
-99>
-99>
-1>
-1>
-99>
-4>
-2>
-1>
-2>
-99>
-1>
-99>
-1>
-99>
-1>
-99>
-99>
-
-# Allow this many spiders per IP.
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-1>
-
-# Wait at least this long before downloading urls from the same IP address.
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-1000>
-80>
--3>
-3>
-45>
-85>
-50>
-48>
-49>
-47>
-40>
-39>
-30>
-29>
-20>
-19>
-1>
-0>
diff --git a/html/help.html b/html/help.html
index a011a321..af2b9180 100644
--- a/html/help.html
+++ b/html/help.html
@@ -127,11 +127,14 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
|
|
-
+
|
+
-Copyright © 2013. All rights reserved.
+Copyright © 2014. All rights reserved.
| | |