diff --git a/Collectiondb.cpp b/Collectiondb.cpp index 3f3b7770..eea3b3f9 100644 --- a/Collectiondb.cpp +++ b/Collectiondb.cpp @@ -138,6 +138,19 @@ bool Collectiondb::loadAllCollRecs ( ) { if ( ! addExistingColl ( coll , collnum ) ) return false; } + // if no existing recs added... add coll.main.0 always at startup + if ( m_numRecs == 0 ) { + log("admin: adding main collection."); + addNewColl ( "main", + 0 , // customCrawl , + NULL, + 0 , + true , // bool saveIt , + // Parms.cpp reserves this so it can be sure + // to add the same collnum to every shard + 0 ); + } + // note it //log(LOG_INFO,"db: Loaded data for %li collections. Ranging from " // "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1); @@ -1840,31 +1853,183 @@ void CollectionRec::setUrlFiltersToDefaults ( ) { long n = 0; - //strcpy(m_regExs [n],"default"); + /* m_regExs[n].set("default"); m_regExs[n].nullTerm(); - m_numRegExs++; - m_spiderFreqs [n] = 30; // 30 days default - m_numRegExs2++; - m_spiderPriorities[n] = 0; - m_numRegExs3++; - m_maxSpidersPerRule[n] = 99; - m_numRegExs10++; - m_spiderIpWaits[n] = 1000; - m_numRegExs5++; - m_spiderIpMaxSpiders[n] = 7; - m_numRegExs6++; - - //m_spidersEnabled[n] = 1; - //m_numRegExs7++; - m_harvestLinks[n] = 1; - m_numRegExs8++; + */ + + m_regExs[n].set("isdocidbased"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 0; // 30 days default + m_maxSpidersPerRule [n] = 99; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 80; + n++; + + m_regExs[n].set("ismedia"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 0; // 30 days default + m_maxSpidersPerRule [n] = 99; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = -3; // delete! 
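The long run of assignments above and below fills a set of parallel arrays, one slot per URL-filter rule, and the function finishes by copying the final rule count n into every m_numRegExs* counter. As a reading aid, here is a minimal, self-contained C++ sketch of that same pattern in table-driven form; the UrlFilterRule struct, the FilterArrays stand-in, and applyDefaults() are illustrative only and not part of the Gigablast source, and only a few of the rules from the patch are shown.

// Sketch only: a table-driven equivalent of the hand-rolled parallel-array
// assignments in CollectionRec::setUrlFiltersToDefaults(). The struct and
// helper below are hypothetical; member names mirror a subset of the real arrays.
#include <string>

struct UrlFilterRule {
    const char *expression;    // first matching expression wins
    float       respiderDays;  // -> m_spiderFreqs
    int         maxSpiders;    // -> m_maxSpidersPerRule
    int         maxPerIp;      // -> m_spiderIpMaxSpiders
    int         ipWaitMs;      // -> m_spiderIpWaits
    int         priority;      // -> m_spiderPriorities (-3 means delete)
};

// First few defaults from the patch; the full list ends with "default".
static const UrlFilterRule kDefaults[] = {
    { "isdocidbased",                 0.0f, 99, 1, 1000, 80 },
    { "ismedia",                      0.0f, 99, 1, 1000, -3 },
    { "errorcount>=3 && hastmperror", 1.0f,  1, 1, 1000,  3 },
    { "default",                     30.0f,  9, 1, 1000,  1 },
};

struct FilterArrays {              // stand-in for the CollectionRec members
    std::string expressions[64];
    float       spiderFreqs[64];
    int         maxSpidersPerRule[64];
    int         spiderIpMaxSpiders[64];
    int         spiderIpWaits[64];
    int         spiderPriorities[64];
    long        numRegExs;         // the real code sets m_numRegExs,
};                                 // m_numRegExs2, ... all to the same n

static void applyDefaults(FilterArrays &f) {
    long n = 0;
    for (const UrlFilterRule &r : kDefaults) {
        f.expressions[n]        = r.expression;
        f.spiderFreqs[n]        = r.respiderDays;
        f.maxSpidersPerRule[n]  = r.maxSpiders;
        f.spiderIpMaxSpiders[n] = r.maxPerIp;
        f.spiderIpWaits[n]      = r.ipWaitMs;
        f.spiderPriorities[n]   = r.priority;
        n++;
    }
    f.numRegExs = n;               // one count covers every parallel column
}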
+ n++; + + m_regExs[n].set("errorcount>=3 && hastmperror"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 1; // 30 days default + m_maxSpidersPerRule [n] = 1; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 3; + n++; + + m_regExs[n].set("errorcount>=1 && hastmperror"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 1; // 30 days default + m_maxSpidersPerRule [n] = 1; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 45; + n++; + + m_regExs[n].set("isaddurl"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 7; // 30 days default + m_maxSpidersPerRule [n] = 99; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 85; + n++; + + m_regExs[n].set("hopcount==0 && iswww && isnew"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 7; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 50; + n++; + + m_regExs[n].set("hopcount==0 && iswww"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 7; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 48; + n++; + + m_regExs[n].set("hopcount==0 && isnew"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 7; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 49; + n++; + + m_regExs[n].set("hopcount==0"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 10; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 47; + n++; + + m_regExs[n].set("hopcount==1 && isnew"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 20; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 40; + n++; + + m_regExs[n].set("hopcount==1"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 20; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 39; + n++; + + m_regExs[n].set("hopcount==2 && isnew"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 40; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 30; + n++; + + m_regExs[n].set("hopcount==2"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 40; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 29; + n++; + + m_regExs[n].set("hopcount>=3 && isnew"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 60; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + 
m_spiderPriorities [n] = 20; + n++; + + m_regExs[n].set("hopcount>=3"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 60; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 19; + n++; + + m_regExs[n].set("isnew"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 30; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 2; + n++; + + m_regExs[n].set("default"); + m_harvestLinks [n] = 1; + m_spiderFreqs [n] = 30; // 30 days default + m_maxSpidersPerRule [n] = 9; // max spiders + m_spiderIpMaxSpiders [n] = 1; // max spiders per ip + m_spiderIpWaits [n] = 1000; // same ip wait + m_spiderPriorities [n] = 1; + n++; + + + m_numRegExs = n; + m_numRegExs2 = n; + m_numRegExs3 = n; + m_numRegExs10 = n; + m_numRegExs5 = n; + m_numRegExs6 = n; + m_numRegExs8 = n; + + // more rules + + + //m_spiderDiffbotApiNum[n] = 1; //m_numRegExs11++; diff --git a/HttpRequest.h b/HttpRequest.h index 9a425da1..14902d93 100644 --- a/HttpRequest.h +++ b/HttpRequest.h @@ -28,10 +28,14 @@ #include "TcpSocket.h" // values for HttpRequest::m_replyFormat -#define FORMAT_HTML 0 -#define FORMAT_XML 1 -#define FORMAT_JSON 2 -#define FORMAT_CSV 3 +#define FORMAT_HTML 1 +#define FORMAT_XML 2 +#define FORMAT_JSON 3 +#define FORMAT_CSV 4 +#define FORMAT_TXT 5 +#define FORMAT_PROCOG 6 + + class HttpRequest { diff --git a/PageBasic.cpp b/PageBasic.cpp index 2fff37a8..0632aa3a 100644 --- a/PageBasic.cpp +++ b/PageBasic.cpp @@ -156,7 +156,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) { Url u; - for ( ; *pn ; pn++ , lineNum++ ) { + for ( ; *pn ; lineNum++ ) { // get end char *s = pn; @@ -169,6 +169,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) { char *pe = pn; for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- ); + // advance over '\n' for next line + if ( *pn && *pn == '\n' ) pn++; + // make hash of the line long h32 = hash32 ( s , pe - s ); @@ -728,6 +731,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) { char buf [ 128000 ]; SafeBuf sb(buf,128000); + sb.reset(); char *fs = hr->getString("format",NULL,NULL); char fmt = FORMAT_HTML; @@ -761,7 +765,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) { // // show stats // - if ( fmt == FMT_HTML ) { + if ( fmt == FORMAT_HTML ) { char *seedStr = cr->m_diffbotSeeds.getBufStart(); if ( ! seedStr ) seedStr = ""; @@ -773,43 +777,17 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) { long sentAlert = (long)ci->m_sentCrawlDoneAlert; if ( sentAlert ) sentAlert = 1; - sb.safePrintf( + //sb.safePrintf( + // "
" + // "%s" + // , sb.getBufStart() // hidden input token/name/.. + // ); - "" - "%s" - , sb.getBufStart() // hidden input token/name/.. - ); sb.safePrintf("" "
" "" - // - "" - "" - "" - "" - - "" - "" - "" - "" - - //"" - //"" - //"" - //"" - - "" - "" - "" - "" - - "" - "" - "" - "" - "" "" "" @@ -820,10 +798,10 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) { "" "" - "" - "" - "" - "" + //"" + //"" + //"" + //"" "" "" @@ -837,11 +815,6 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) { //"" //"" - "" - "" - "" - "" - "" "" "" @@ -868,37 +841,11 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) { "" "" - "" - "" - "" - "" - - "" - "" - "" - "" - - "" - "" - "" - "" - - - , cr->m_diffbotCrawlName.getBufStart() - - , (long)cr->m_isCustomCrawl - - , cr->m_diffbotToken.getBufStart() - - , seedStr - , crawlStatus , tmp.getBufStart() - , cr->m_spiderRoundNum + //, cr->m_spiderRoundNum , cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider - , cr->m_globalCrawlInfo.m_objectsAdded - - cr->m_globalCrawlInfo.m_objectsDeleted , cr->m_globalCrawlInfo.m_urlsHarvested //, cr->m_globalCrawlInfo.m_urlsConsidered @@ -906,16 +853,13 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) { , cr->m_globalCrawlInfo.m_pageDownloadSuccesses , cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound - , cr->m_globalCrawlInfo.m_pageProcessAttempts - , cr->m_globalCrawlInfo.m_pageProcessSuccesses - , cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound ); } - if ( fmt != FORMAT_JSON ) - // wrap up the form, print a submit button - g_pages.printAdminBottom ( &sb ); + //if ( fmt != FORMAT_JSON ) + // // wrap up the form, print a submit button + // g_pages.printAdminBottom ( &sb ); return g_httpServer.sendDynamicPage (socket, sb.getBufStart(), diff --git a/PageCrawlBot.cpp b/PageCrawlBot.cpp index 5491f045..3f9f3903 100644 --- a/PageCrawlBot.cpp +++ b/PageCrawlBot.cpp @@ -25,11 +25,11 @@ #include "Parms.h" // so user can specify the format of the reply/output -#define FMT_HTML 1 -#define FMT_XML 2 -#define FMT_JSON 3 -#define FMT_CSV 4 -#define FMT_TXT 5 +//#define FMT_HTML 1 +//#define FMT_XML 2 +//#define FMT_JSON 3 +//#define FMT_CSV 4 +//#define FMT_TXT 5 void doneSendingWrapper ( void *state , TcpSocket *sock ) ; bool sendBackDump ( TcpSocket *s,HttpRequest *hr ); @@ -158,25 +158,25 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) { if ( ( xx = strstr ( path , "_data.json" ) ) ) { rdbId = RDB_TITLEDB; - fmt = FMT_JSON; + fmt = FORMAT_JSON; downloadJSON = true; } else if ( ( xx = strstr ( path , "_data.csv" ) ) ) { rdbId = RDB_TITLEDB; downloadJSON = true; - fmt = FMT_CSV; + fmt = FORMAT_CSV; } else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) { rdbId = RDB_SPIDERDB; - fmt = FMT_CSV; + fmt = FORMAT_CSV; } else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) { rdbId = RDB_SPIDERDB; - fmt = FMT_TXT; + fmt = FORMAT_TXT; } else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) { rdbId = RDB_TITLEDB; - fmt = FMT_TXT; + fmt = FORMAT_TXT; } // sanity, must be one of 3 download calls @@ -213,7 +213,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) { // . if doing download of csv, make it search results now! // . make an httprequest on stack and call it - if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) { + if ( fmt == FORMAT_CSV && rdbId == RDB_TITLEDB ) { char tmp2[5000]; SafeBuf sb2(tmp2,5000); long dr = 1; @@ -247,7 +247,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) { // . if doing download of json, make it search results now! // . 
make an httprequest on stack and call it - if ( fmt == FMT_JSON && rdbId == RDB_TITLEDB ) { + if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB ) { char tmp2[5000]; SafeBuf sb2(tmp2,5000); long dr = 1; @@ -514,13 +514,13 @@ bool StateCD::sendList ( ) { //sb.setLabel("dbotdmp"); char *ct = "text/csv"; - if ( m_fmt == FMT_JSON ) + if ( m_fmt == FORMAT_JSON ) ct = "application/json"; - if ( m_fmt == FMT_XML ) + if ( m_fmt == FORMAT_XML ) ct = "text/xml"; - if ( m_fmt == FMT_TXT ) + if ( m_fmt == FORMAT_TXT ) ct = "text/plain"; - if ( m_fmt == FMT_CSV ) + if ( m_fmt == FORMAT_CSV ) ct = "text/csv"; // . if we haven't yet sent an http mime back to the user @@ -545,13 +545,13 @@ bool StateCD::sendList ( ) { //CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); - if ( ! m_printedFirstBracket && m_fmt == FMT_JSON ) { + if ( ! m_printedFirstBracket && m_fmt == FORMAT_JSON ) { sb.safePrintf("[\n"); m_printedFirstBracket = true; } // these are csv files not xls - //if ( ! m_printedFirstBracket && m_fmt == FMT_CSV ) { + //if ( ! m_printedFirstBracket && m_fmt == FORMAT_CSV ) { // sb.safePrintf("sep=,\n"); // m_printedFirstBracket = true; //} @@ -638,7 +638,7 @@ bool StateCD::sendList ( ) { // use this for printing out urls.csv as well... m_printedEndingBracket = true; // end array of json objects. might be empty! - if ( m_rdbId == RDB_TITLEDB && m_fmt == FMT_JSON ) + if ( m_rdbId == RDB_TITLEDB && m_fmt == FORMAT_JSON ) sb.safePrintf("\n]\n"); //log("adding ]. len=%li",sb.length()); // i'd like to exit streaming mode here. i fixed tcpserver.cpp @@ -853,7 +853,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ } // "csv" is default if json not specified - if ( m_fmt == FMT_JSON ) + if ( m_fmt == FORMAT_JSON ) sb->safePrintf("[{" "{\"url\":" "\"%s\"}," @@ -997,7 +997,7 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){ // if not json, just print the json item out in csv // moved into PageResults.cpp... - //if ( m_fmt == FMT_CSV ) { + //if ( m_fmt == FORMAT_CSV ) { // printJsonItemInCsv ( json , sb ); // continue; //} @@ -1337,7 +1337,7 @@ bool sendReply2 (TcpSocket *socket , long fmt , char *msg ) { // send this back to browser SafeBuf sb; - if ( fmt == FMT_JSON ) { + if ( fmt == FORMAT_JSON ) { sb.safePrintf("{\n\"response\":\"success\",\n" "\"message\":\"%s\"\n}\n" , msg ); @@ -1368,7 +1368,7 @@ bool sendErrorReply2 ( TcpSocket *socket , long fmt , char *msg ) { // send this back to browser SafeBuf sb; - if ( fmt == FMT_JSON ) { + if ( fmt == FORMAT_JSON ) { sb.safePrintf("{\"error\":\"%s\"}\n" , msg ); ct = "application/json"; @@ -1476,7 +1476,7 @@ void injectedUrlWrapper ( void *state ) { // send back the html or json response? SafeBuf *response = &sb; - if ( st->m_fmt == FMT_JSON ) response = &js; + if ( st->m_fmt == FORMAT_JSON ) response = &js; // . this will call g_httpServer.sendReply() // . pass it in the injection response, "sb" @@ -1673,7 +1673,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) { // . now show stats for the current crawl // . put in xml or json if format=xml or format=json or // xml=1 or json=1 ... - char fmt = FMT_JSON; + char fmt = FORMAT_JSON; // token is always required. 
get from json or html form input //char *token = getInputString ( "token" ); @@ -1693,21 +1693,21 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) { name++; } // change default formatting to html - fmt = FMT_HTML; + fmt = FORMAT_HTML; } char *fs = hr->getString("format",NULL,NULL); // give john a json api - if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML; - if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON; - if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML; + if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML; + if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON; + if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML; // if we got json as input, give it as output - //if ( JS.getFirstItem() ) fmt = FMT_JSON; + //if ( JS.getFirstItem() ) fmt = FORMAT_JSON; - if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) { + if ( ! token && fmt == FORMAT_JSON ) { // (cast==0|| fmt == FORMAT_JSON ) ) { char *msg = "invalid token"; return sendErrorReply2 (socket,fmt,msg); } @@ -1772,7 +1772,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) { //} // just send back a list of all the collections after the delete - //if ( delColl && cast && fmt == FMT_JSON ) { + //if ( delColl && cast && fmt == FORMAT_JSON ) { // char *msg = "Collection deleted."; // return sendReply2 (socket,fmt,msg); //} @@ -2263,7 +2263,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) { /* bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) { - if ( fmt == FMT_JSON ) + if ( fmt == FORMAT_JSON ) sb.safePrintf("\"urlFilters\":["); // skip first filters that are: @@ -2303,7 +2303,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) { // urls higher spider priority, so skip it if ( strncmp(expression,"ismanualadd && ",15) == 0 ) continue; - if ( fmt == FMT_HTML ) { + if ( fmt == FORMAT_HTML ) { sb.safePrintf("" "
(status table rows; HTML markup stripped)
Crawl Name: %s
Crawl Type: %li
Collection Alias: %s%s
Token: %s
Seeds: %s
Crawl Status: %li %s
Rounds Completed: %li
Has Urls Ready to Spider: %lli
Objects Found: %lli
URLs Harvested (inc. dups): %lli %lli
Page Process Attempts: %lli
Page Process Successes: %lli
Page Process Successes This Round: %lli
Expression " "safePrintf("}\n"); @@ -2537,7 +2537,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // store output into here SafeBuf sb; - if ( fmt == FMT_HTML ) + if ( fmt == FORMAT_HTML ) sb.safePrintf( "" "Crawlbot - " @@ -2573,7 +2573,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , lb.urlEncode(name); lb.safePrintf ("&token="); lb.urlEncode(token); - if ( fmt == FMT_HTML ) lb.safePrintf("&format=html"); + if ( fmt == FORMAT_HTML ) lb.safePrintf("&format=html"); lb.nullTerm(); @@ -2590,7 +2590,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , //} - if ( fmt == FMT_HTML ) { + if ( fmt == FORMAT_HTML ) { sb.safePrintf("<table border=0>" "<tr><td>" "<b><font size=+2>" @@ -2645,7 +2645,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // // print list of collections controlled by this token // - for ( long i = 0 ; fmt == FMT_HTML && i<g_collectiondb.m_numRecs;i++ ){ + for ( long i = 0 ; fmt == FORMAT_HTML && i<g_collectiondb.m_numRecs;i++ ){ CollectionRec *cx = g_collectiondb.m_recs[i]; if ( ! cx ) continue; // get its token if any @@ -2677,19 +2677,19 @@ bool printCrawlBotPage2 ( TcpSocket *socket , sb.safePrintf("</font></b>"); } - if ( fmt == FMT_HTML ) + if ( fmt == FORMAT_HTML ) sb.safePrintf ( "</center><br/>" ); // the ROOT JSON [ - if ( fmt == FMT_JSON ) + if ( fmt == FORMAT_JSON ) sb.safePrintf("{\n"); // injection is currently not in use, so this is an artifact: - if ( fmt == FMT_JSON && injectionResponse ) + if ( fmt == FORMAT_JSON && injectionResponse ) sb.safePrintf("\"response\":\"%s\",\n\n" , injectionResponse->getBufStart() ); - if ( fmt == FMT_JSON && urlUploadResponse ) + if ( fmt == FORMAT_JSON && urlUploadResponse ) sb.safePrintf("\"response\":\"%s\",\n\n" , urlUploadResponse->getBufStart() ); @@ -2702,14 +2702,14 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // the items in the array now have type:bulk or type:crawl // so call them 'jobs' - if ( fmt == FMT_JSON ) + if ( fmt == FORMAT_JSON ) sb.safePrintf("\"jobs\":[");//\"collections\":"); long summary = hr->getLong("summary",0); // enter summary mode for json - if ( fmt != FMT_HTML ) summary = 1; + if ( fmt != FORMAT_HTML ) summary = 1; // start the table - if ( summary && fmt == FMT_HTML ) { + if ( summary && fmt == FORMAT_HTML ) { sb.safePrintf("<table border=1 cellpadding=5>" "<tr>" "<td><b>Collection</b></td>" @@ -2740,11 +2740,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // just print out single crawl info for json - if ( fmt != FMT_HTML && cx != cr && name3 ) + if ( fmt != FORMAT_HTML && cx != cr && name3 ) continue; // if json, print each collectionrec - if ( fmt == FMT_JSON ) { + if ( fmt == FORMAT_JSON ) { if ( ! 
firstOne ) sb.safePrintf(",\n\t"); firstOne = false; @@ -2786,7 +2786,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , , cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound ); } - if ( summary && fmt == FMT_HTML ) { + if ( summary && fmt == FORMAT_HTML ) { sb.safePrintf("</table></html>" ); return g_httpServer.sendDynamicPage (socket, sb.getBufStart(), @@ -2794,7 +2794,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , 0); // cachetime } - if ( fmt == FMT_JSON ) + if ( fmt == FORMAT_JSON ) // end the array of collection objects sb.safePrintf("\n]\n"); @@ -2808,7 +2808,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // // show urls being crawled (ajax) (from Spider.cpp) // - if ( fmt == FMT_HTML ) { + if ( fmt == FORMAT_HTML ) { sb.safePrintf ( "<table width=100%% cellpadding=5 " "style=border-width:1px;border-style:solid;" "border-color:black;>" @@ -2879,7 +2879,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , rand64 |= r2; - if ( fmt == FMT_HTML ) { + if ( fmt == FORMAT_HTML ) { sb.safePrintf("<br>" "<table border=0 cellpadding=5>" @@ -2952,12 +2952,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket , ); } - if ( injectionResponse && fmt == FMT_HTML ) + if ( injectionResponse && fmt == FORMAT_HTML ) sb.safePrintf("<br><font size=-1>%s</font>\n" ,injectionResponse->getBufStart() ); - if ( fmt == FMT_HTML ) + if ( fmt == FORMAT_HTML ) sb.safePrintf(//"<input type=hidden name=c value=\"%s\">" //"<input type=hidden name=crawlbotapi value=1>" "</td>" @@ -2996,7 +2996,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // // show stats // - if ( fmt == FMT_HTML ) { + if ( fmt == FORMAT_HTML ) { char *seedStr = cr->m_diffbotSeeds.getBufStart(); if ( ! seedStr ) seedStr = ""; @@ -3654,7 +3654,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // xml or json does not show the input boxes - //if ( format != FMT_HTML ) + //if ( format != FORMAT_HTML ) // return g_httpServer.sendDynamicPage ( s, // sb.getBufStart(), // sb.length(), @@ -3677,7 +3677,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , s2 = ""; } - if ( fmt == FMT_HTML ) + if ( fmt == FORMAT_HTML ) sb.safePrintf( "<a onclick=" @@ -3721,7 +3721,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // // print url filters. HACKy... 
// - if ( fmt == FMT_HTML ) + if ( fmt == FORMAT_HTML ) g_parms.sendPageGeneric ( socket , hr , PAGE_FILTERS , @@ -3732,7 +3732,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // // end HACKy hack // - if ( fmt == FMT_HTML ) + if ( fmt == FORMAT_HTML ) sb.safePrintf( "</form>" "</div>" @@ -3760,7 +3760,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // // show simpler url filters table // - if ( fmt == FMT_HTML ) { + if ( fmt == FORMAT_HTML ) { /* sb.safePrintf ( "<table>" "<tr><td colspan=2>" @@ -3796,7 +3796,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // // show reset and delete crawl buttons // - if ( fmt == FMT_HTML ) { + if ( fmt == FORMAT_HTML ) { sb.safePrintf( "<table cellpadding=5>" "<tr>" @@ -3859,13 +3859,13 @@ bool printCrawlBotPage2 ( TcpSocket *socket , // the ROOT JSON } - if ( fmt == FMT_JSON ) + if ( fmt == FORMAT_JSON ) sb.safePrintf("}\n"); char *ct = "text/html"; - if ( fmt == FMT_JSON ) ct = "application/json"; - if ( fmt == FMT_XML ) ct = "text/xml"; - if ( fmt == FMT_CSV ) ct = "text/csv"; + if ( fmt == FORMAT_JSON ) ct = "application/json"; + if ( fmt == FORMAT_XML ) ct = "text/xml"; + if ( fmt == FORMAT_CSV ) ct = "text/csv"; // this could be in html json or xml return g_httpServer.sendDynamicPage ( socket, @@ -4142,7 +4142,7 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket , char *json = hr->getString("json"); if ( ! json ) return sendReply2 ( socket, - FMT_JSON, + FORMAT_JSON, "No &json= provided in request."); @@ -4151,12 +4151,12 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket , // wtf? if ( ! status ) - return sendReply2 ( socket, FMT_JSON, + return sendReply2 ( socket, FORMAT_JSON, "Error with JSON parser."); // error adding it? if ( ! cr ) - return sendReply2 ( socket,FMT_JSON, + return sendReply2 ( socket,FORMAT_JSON, "Failed to create new collection."); ji = JP.getFirstItem(); diff --git a/PageRoot.cpp b/PageRoot.cpp index 29058b36..0654f8b2 100644 --- a/PageRoot.cpp +++ b/PageRoot.cpp @@ -169,6 +169,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) { sb.safePrintf("</form>\n"); sb.safePrintf("<br>\n"); sb.safePrintf("\n"); + + // print any red boxes we might need to + if ( printRedBox2 ( &sb , true ) ) + sb.safePrintf("<br>\n"); + sb.safePrintf("<table cellpadding=3>\n"); sb.safePrintf("\n"); diff --git a/Pages.cpp b/Pages.cpp index 9af16001..1773dcbc 100644 --- a/Pages.cpp +++ b/Pages.cpp @@ -50,6 +50,9 @@ static WebPage s_pages[] = { "dummy page - if set in the users row then user will have master=0 and " " collection links will be highlighted in red", NULL, 0 }, + + + //{ PAGE_QUALITY , "quality", 0, "quality", 0, 0, // "dummy page - if set in the users row then \"Quality Control\"" // " will be printed besides the logo for certain pages", @@ -102,12 +105,66 @@ static WebPage s_pages[] = { // "Basic diffbot page.", sendPageBasicDiffbot , 0 } , { PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 , "Basic security page.", sendPageGeneric , 0 } , + { PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 , + "Basic search page.", sendPageRoot , 0 } , + { PAGE_MASTER , "admin/master" , 0 , "master controls" , 1 , 0 , //USER_MASTER | USER_PROXY , "master controls page", sendPageGeneric , 0 } , + { PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1, + //USER_ADMIN | USER_MASTER , + "search controls page", + sendPageGeneric , 0 } , + { PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0, + //USER_ADMIN | USER_MASTER | USER_PROXY , + "spider controls page", + sendPageGeneric , 0 } , + { PAGE_LOG , "admin/log" , 0 , 
"log controls" , 1 , 0 , + //USER_MASTER | USER_PROXY, + "log page", + sendPageGeneric , 0 } , + { PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 , + //USER_MASTER | USER_PROXY , + "advanced security page", + sendPageGeneric , 0 } , + { PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 , + //USER_MASTER , + "add a new collection using this page", + sendPageAddColl , 0 } , + { PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0, + //USER_MASTER , + "delete a collection using this page", + sendPageDelColl , 0 } , + { PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 , + //USER_MASTER , + "repair page", + sendPageGeneric , 0 }, + { PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1, + "what sites can be spidered", + sendPageGeneric , 0 } , // sendPageBasicSettings + { PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1, + //USER_ADMIN | USER_MASTER , + "prioritize urls for spidering", + sendPageGeneric , 0 } , + { PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 , + //USER_ADMIN | USER_MASTER , + "inject url in the index here", + sendPageInject , 2 } , + // this is the addurl page the the admin! + { PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 , + "add url page for admin", + sendPageAddUrl2 , 0 } , + { PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 , + //USER_ADMIN | USER_MASTER, + "reindex url page", + sendPageReindex , 0 } , + + + + { PAGE_HOSTS , "admin/hosts" , 0 , "hosts" , 0 , 0 , //USER_MASTER | USER_PROXY, @@ -134,10 +191,7 @@ static WebPage s_pages[] = { //USER_MASTER | USER_PROXY, "sockets page", sendPageSockets , 0 } , - { PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 , - //USER_MASTER | USER_PROXY, - "log page", - sendPageGeneric , 0 } , + { PAGE_LOGVIEW , "admin/logview" , 0 , "log view" , 0 , 0 , //USER_MASTER , "logview page", @@ -147,18 +201,6 @@ static WebPage s_pages[] = { // "sync page", // sendPageGeneric , 0 } , - { PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 , - //USER_MASTER | USER_PROXY , - "advanced security page", - sendPageGeneric , 0 } , - { PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 , - //USER_MASTER , - "add a new collection using this page", - sendPageAddColl , 0 } , - { PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0, - //USER_MASTER , - "delete a collection using this page", - sendPageDelColl , 0 } , { PAGE_AUTOBAN ,"admin/autoban" , 0 , "autoban" , 1 , 1 , //USER_MASTER | USER_PROXY , "autobanned ips", @@ -175,10 +217,6 @@ static WebPage s_pages[] = { //USER_MASTER , "threads page", sendPageThreads , 0 }, - { PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 , - //USER_MASTER , - "repair page", - sendPageGeneric , 0 }, //{ PAGE_THESAURUS, "admin/thesaurus", 0 , "thesaurus", 0 , 0 , // //USER_MASTER , // "thesaurus page", @@ -207,14 +245,6 @@ static WebPage s_pages[] = { "titledb page", sendPageTitledb , 2 } , // 1 = usePost - { PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1, - //USER_ADMIN | USER_MASTER , - "search controls page", - sendPageGeneric , 0 } , - { PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0, - //USER_ADMIN | USER_MASTER | USER_PROXY , - "spider controls page", - sendPageGeneric , 0 } , { PAGE_CRAWLBOT , "crawlbot" , 0 , "crawlbot" , 1 , 0, "simplified spider controls page", @@ -229,30 +259,6 @@ static WebPage s_pages[] = { // "spider priorities page", // sendPageGeneric , 0 } , - { PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1, - "what sites can be spidered", - sendPageGeneric , 0 } , // 
sendPageBasicSettings - - { PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1, - //USER_ADMIN | USER_MASTER , - "prioritize urls for spidering", - sendPageGeneric , 0 } , - - { PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 , - //USER_ADMIN | USER_MASTER , - "inject url in the index here", - sendPageInject , 2 } , - - // this is the addurl page the the admin! - { PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 , - "add url page for admin", - sendPageAddUrl2 , 0 } , - - { PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 , - //USER_ADMIN | USER_MASTER, - "reindex url page", - sendPageReindex , 0 } , - //{ PAGE_KEYWORDS, "admin/queries",0,"queries" , 0 , 1 , // "get queries a url matches", // sendPageMatchingQueries , 2 } , @@ -893,8 +899,6 @@ bool Pages::getNiceness ( long page ) { return s_pages[page].m_niceness; } -bool printRedBox ( SafeBuf *mb ) ; - /////////////////////////////////////////////////////////// // // Convenient html printing routines @@ -1056,6 +1060,7 @@ bool Pages::printAdminTop (SafeBuf *sb , //if ( page == PAGE_BASIC_DIFFBOT ) isBasic = true; //if ( page == PAGE_BASIC_SEARCH ) isBasic = true; if ( page == PAGE_BASIC_SECURITY ) isBasic = true; + if ( page == PAGE_BASIC_SEARCH ) isBasic = true; // // print breadcrumb. main > Basic > Settings @@ -1791,7 +1796,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb, // is this page basic? bool pageBasic = false; if ( i >= PAGE_BASIC_SETTINGS && - i <= PAGE_BASIC_SECURITY ) + i <= PAGE_BASIC_SEARCH ) pageBasic = true; // print basic pages under the basic menu, advanced pages @@ -2627,9 +2632,18 @@ bool sendPageLogin ( TcpSocket *socket , HttpRequest *hr ) { NULL);// cookie } +bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage ) { + SafeBuf mb; + // return false if no red box + if ( ! printRedBox ( &mb , isRootWebPage ) ) return false; + // otherwise, print it + sb->safeStrcpy ( mb.getBufStart() ); + // return true since we printed one + return true; +} // emergency message box -bool printRedBox ( SafeBuf *mb ) { +bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) { PingServer *ps = &g_pingServer; @@ -2649,11 +2663,33 @@ bool printRedBox ( SafeBuf *mb ) { char *boxEnd = "</td></tr></table>"; - bool adds = false; + long adds = 0; mb->safePrintf("<div style=max-width:500px;>"); + // are we just starting off? give them a little help. + CollectionRec *cr = g_collectiondb.getRec("main"); + if ( g_collectiondb.m_numRecs == 1 && + cr && + isRootWebPage && + cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) { + if ( adds ) mb->safePrintf("<br>"); + adds++; + mb->safePrintf("%s",box); + mb->safePrintf("Welcome to Gigablast. The most powerful " + "search engine you can legally download. " + "Please add the websites you want to spider " + "<a href=/admin/settings?c=main>here</a>." 
+ ); + mb->safePrintf("%s",boxEnd); + } + + if ( isRootWebPage ) { + mb->safePrintf("</div>"); + return (bool)adds; + } + if ( g_conf.m_numConnectIps == 0 && g_conf.m_numMasterPwds == 0 ) { if ( adds ) mb->safePrintf("<br>"); adds++; @@ -2738,5 +2774,5 @@ bool printRedBox ( SafeBuf *mb ) { mb->safePrintf("</div>"); - return adds; + return (bool)adds; } diff --git a/Pages.h b/Pages.h index 281fe1cc..4dde73af 100644 --- a/Pages.h +++ b/Pages.h @@ -5,6 +5,9 @@ #ifndef _PAGES_H_ #define _PAGES_H_ +bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage = false ) ; +bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ; + // for PageEvents.cpp and Accessdb.cpp //#define RESULTSWIDTHSTR "550px" @@ -304,25 +307,36 @@ enum { //PAGE_BASIC_SEARCH , // TODO //PAGE_BASIC_DIFFBOT , // TODO PAGE_BASIC_SECURITY , + PAGE_BASIC_SEARCH , // master admin pages PAGE_MASTER , + PAGE_SEARCH , + PAGE_SPIDER , + PAGE_LOG , + PAGE_SECURITY , + PAGE_ADDCOLL , + PAGE_DELCOLL , + PAGE_REPAIR , + PAGE_SITES , // site filters + PAGE_FILTERS , + PAGE_INJECT , + PAGE_ADDURL2 , + PAGE_REINDEX , + PAGE_HOSTS , PAGE_STATS , // 10 PAGE_STATSDB , PAGE_PERF , PAGE_SOCKETS , - PAGE_LOG , + PAGE_LOGVIEW , // PAGE_SYNC , - PAGE_SECURITY , - PAGE_ADDCOLL , - PAGE_DELCOLL , PAGE_AUTOBAN , // 20 //PAGE_SPIDERLOCKS , PAGE_PROFILER , PAGE_THREADS , - PAGE_REPAIR , + // PAGE_THESAURUS , // . non master-admin pages (collection controls) @@ -335,16 +349,9 @@ enum { PAGE_TITLEDB , //PAGE_STATSDB , - PAGE_SEARCH , - PAGE_SPIDER , PAGE_CRAWLBOT , // 35 PAGE_SPIDERDB , //PAGE_PRIORITIES , // priority queue controls - PAGE_SITES , // site filters - PAGE_FILTERS , - PAGE_INJECT , - PAGE_ADDURL2 , - PAGE_REINDEX , //PAGE_KEYWORDS , PAGE_SEO , PAGE_ACCESS , //40 diff --git a/Parms.cpp b/Parms.cpp index 962003aa..2260bee3 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -1888,7 +1888,7 @@ bool Parms::printParm ( SafeBuf* sb, "value=\"%f\" " // 3 was ok on firefox but need 6 // on chrome - "size=6>",cgi,*(float *)s); + "size=7>",cgi,*(float *)s); } else if ( t == TYPE_IP ) { if ( m->m_max > 0 && j == jend ) @@ -1896,7 +1896,7 @@ bool Parms::printParm ( SafeBuf* sb, "size=12>",cgi); else sb->safePrintf ("<input type=text name=%s value=\"%s\" " - "size=6>",cgi,iptoa(*(long *)s)); + "size=12>",cgi,iptoa(*(long *)s)); } else if ( t == TYPE_LONG ) { // just show the parm name and value if printing in json @@ -7534,6 +7534,7 @@ void Parms::init ( ) { m->m_flags = PF_TEXTAREA; m++; + /* // the new upload post submit button m->m_title = "upload urls"; m->m_desc = "Upload your file of urls."; @@ -7542,6 +7543,7 @@ void Parms::init ( ) { m->m_obj = OBJ_NONE; m->m_type = TYPE_FILEUPLOADBUTTON; m++; + */ m->m_title = "strip sessionids"; m->m_desc = "Strip added urls of their session ids."; @@ -7591,6 +7593,7 @@ void Parms::init ( ) { m->m_title = "site list"; m->m_xml = "siteList"; m->m_desc = "List of sites to spider, one per line. " + "See <a href=#examples>example site list</a> below. " "Gigablast uses the " "<a href=/admin/filters#insitelist>insitelist</a> " "directive on " @@ -7599,8 +7602,7 @@ void Parms::init ( ) { "that match the site patterns you specify here, other than " "urls you add individually via the add urls or inject url " "tools. " - "See <a href=#examples>example site list</a> below. " - "Limit list to 300MB. If you have a lot of INDIVIDUAL URLS " + "Limit list to 300MB. 
If you have a lot of INDIVIDUAL urls " "to add then consider using the <a href=/admin/addurl>add " "urls</a> interface."; m->m_cgi = "sitelist"; @@ -7629,6 +7631,7 @@ void Parms::init ( ) { m++; */ + /* // the new upload post submit button m->m_title = "upload site list"; m->m_desc = "Upload your file of site patterns. Completely replaces " @@ -7640,12 +7643,13 @@ void Parms::init ( ) { m->m_type = TYPE_FILEUPLOADBUTTON; m->m_flags = PF_NOSAVE | PF_DUP; m++; + */ m->m_title = "restart collection"; - m->m_desc = "Remove all documents from this collection and starts " - "spidering over again. If you do this accidentally there " - "is a <a href=/admin.html#recover>recovery procedure</a> to " - "get back the trashed data."; + m->m_desc = "Remove all documents from this collection and restart " + "spidering.";// If you do this accidentally there " + //"is a <a href=/admin.html#recover>recovery procedure</a> to " + // "get back the trashed data."; m->m_cgi = "restart"; m->m_page = PAGE_BASIC_SETTINGS; m->m_obj = OBJ_COLL; @@ -7659,6 +7663,7 @@ void Parms::init ( ) { m->m_title = "site list"; m->m_xml = "siteList"; m->m_desc = "List of sites to spider, one per line. " + "See <a href=#examples>example site list</a> below. " "Gigablast uses the " "<a href=/admin/filters#insitelist>insitelist</a> " "directive on " @@ -7667,8 +7672,7 @@ void Parms::init ( ) { "that match the site patterns you specify here, other than " "urls you add individually via the add urls or inject url " "tools. " - "See <a href=#examples>example site list</a> below. " - "Limit list to 300MB. If you have a lot of INDIVIDUAL URLS " + "Limit list to 300MB. If you have a lot of INDIVIDUAL urls " "to add then consider using the <a href=/admin/addurl>addurl" "</a> interface."; m->m_cgi = "sitelist"; @@ -8762,11 +8766,11 @@ void Parms::init ( ) { m++; m->m_title = "max robots.txt cache age"; - m->m_desc = "How many second to cache a robots.txt file for. " + m->m_desc = "How many seconds to cache a robots.txt file for. " "86400 is 1 day. 0 means Gigablast will not read from the " "cache at all and will download the robots.txt before every " "page if robots.txt use is enabled above. However, if this is " - "0 then Gigablast will still store robots.txt files into the " + "0 then Gigablast will still store robots.txt files in the " "cache."; m->m_cgi = "mrca"; m->m_off = (char *)&cr.m_maxRobotsCacheAge - x; @@ -10639,8 +10643,9 @@ void Parms::init ( ) { m++; m->m_title = "do query expansion"; - m->m_desc = "Query expansion will include word stems and synonyms in " - "its search results."; + m->m_desc = "If enabled, query expansion will expand your query " + "to include word stems and " + "synonyms of the query terms."; m->m_def = "1"; m->m_off = (char *)&cr.m_queryExpansion - x; m->m_soff = (char *)&si.m_queryExpansion - y; @@ -10653,7 +10658,7 @@ void Parms::init ( ) { // more general parameters m->m_title = "max search results"; - m->m_desc = "What is the limit to the total number " + m->m_desc = "What is the maximum total number " "of returned search results."; m->m_cgi = "msr"; m->m_off = (char *)&cr.m_maxSearchResults - x; @@ -12457,7 +12462,7 @@ void Parms::init ( ) { m++; m->m_title = "max summary line width"; - m->m_desc = "<br> tags are inserted to keep the number " + m->m_desc = "<br> tags are inserted to keep the number " "of chars in the summary per line at or below this width. 
" "Strings without spaces that exceed this " "width are not split."; diff --git a/Sections.cpp b/Sections.cpp index 6a373db5..f675e964 100644 --- a/Sections.cpp +++ b/Sections.cpp @@ -15163,7 +15163,7 @@ bool Sections::printVotingInfoInJSON ( SafeBuf *sb ) { // breathe QUICKPOLL ( m_niceness ); // print this section - printSectionDiv ( sk , FMT_JSON ); // forProCog ); + printSectionDiv ( sk , FORMAT_JSON ); // forProCog ); // advance long b = sk->m_b; // stop if last @@ -15190,7 +15190,8 @@ bool Sections::print2 ( SafeBuf *sbuf , HashTableX *st2 , HashTableX *tt , Addresses *aa , - char format ) { // bool forProCog ){//FMT_PROCOG FMT_JSON HTML + char format ) { // bool forProCog ){ + //FORMAT_PROCOG FORMAT_JSON HTML //sbuf->safePrintf("<b>Sections in Document</b>\n"); @@ -15244,7 +15245,7 @@ bool Sections::print2 ( SafeBuf *sbuf , sk = m_sectionPtrs[b]; } - if ( format != FMT_HTML ) return true; // forProCog + if ( format != FORMAT_HTML ) return true; // forProCog // print header char *hdr = @@ -15553,7 +15554,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog // m_sbuf->safePrintf("A=%li ",sk->m_a); - if ( format == FMT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) { + if ( format == FORMAT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) { // do not count our own site! m_sbuf->safePrintf("<i>" "<font size=-1>" @@ -15573,7 +15574,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog m_sbuf->safePrintf("<i>"); - if ( format == FMT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) { + if ( format == FORMAT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) { sec_t f = sk->m_flags; //if ( f & SEC_SENTENCE ) // m_sbuf->safePrintf("sentence " ); @@ -15598,7 +15599,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog // m_sbuf->safePrintf("notdupvotes=%li ", // sk->m_votesForNotDup); - if ( format != FMT_PROCOG ) { + if ( format != FORMAT_PROCOG ) { // print the flags m_sbuf->safePrintf("A=%li ",sk->m_a); diff --git a/Spider.cpp b/Spider.cpp index e513ee8b..635bc820 100644 --- a/Spider.cpp +++ b/Spider.cpp @@ -12414,7 +12414,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) { if ( cx->m_isCustomCrawl ) return msg->safePrintf("Job is in progress."); else - return true; + return msg->safePrintf("Spider is in progress."); } // pattern is a ||-separted list of substrings diff --git a/coll.main.0/coll.conf b/coll.main.0/coll.conf deleted file mode 100644 index 6d1eb7a3..00000000 --- a/coll.main.0/coll.conf +++ /dev/null @@ -1,416 +0,0 @@ -# List of sites to spider, one per line. Gigablast uses the <a -# href=/admin/filters#insitelist>insitelist</a> directive on the <a -# href=/admin/filters>url filters</a> page to make sure that the spider only -# indexes urls that match the site patterns you specify here, other than urls -# you add individually via the add urls or inject url tools. See <a -# href=#examples>example site list</a> below. Limit list to 300MB. If you have -# a lot of INDIVIDUAL URLS to add then consider using the <a -# href=/admin/addurl>addurl</a> interface. -<siteList><![CDATA[]]></> - -# All <, >, " and # characters that are values for a field contained herein -# must be represented as <, >, " and # respectively. - -# Controls just the spiders for this collection. -<spideringEnabled>1</> - -# What is the maximum number of web pages the spider is allowed to download -# simultaneously PER HOST for THIS collection? 
-<maxSpiders>100</> - -# make each spider wait this many milliseconds before getting the ip and -# downloading the page. -<spiderDelayInMilliseconds>0</> - -# If this is true Gigablast will respect the robots.txt convention. -<useRobotstxt>1</> - -# How many second to cache a robots.txt file for. 86400 is 1 day. 0 means -# Gigablast will not read from the cache at all and will download the -# robots.txt before every page if robots.txt use is enabled above. However, if -# this is 0 then Gigablast will still store robots.txt files into the cache. -<maxRobotstxtCacheAge>86400</> - -# Do a tight merge on posdb and titledb at this time every day. This is -# expressed in MINUTES past midnight UTC. UTC is 5 hours ahead of EST and 7 -# hours ahead of MST. Leave this as -1 to NOT perform a daily merge. To merge -# at midnight EST use 60*5=300 and midnight MST use 60*7=420. -<dailyMergeTime>-1</> - -# Comma separated list of days to merge on. Use 0 for Sunday, 1 for Monday, -# ... 6 for Saturday. Leaving this parmaeter empty or without any numbers will -# make the daily merge happen every day -<dailyMergeDays><![CDATA[0]]></> - -# When the daily merge was last kicked off. Expressed in UTC in seconds since -# the epoch. -<dailyMergeLastStarted>-1</> - -# If this is true, users will have to pass a simple Turing test to add a url. -# This prevents automated url submission. -<turingTestEnabled>0</> - -# Maximum number of urls that can be submitted via the addurl interface, per -# IP domain, per 24 hour period. A value less than or equal to zero implies no -# limit. -<maxAddUrls>0</> - -# When the spider round started -<spiderRoundStartTime>0</> - -# The spider round number. -<spiderRoundNum>0</> - -# When enabled, the spider will discard web pages which are identical to other -# web pages that are already in the index. However, root urls, urls that have -# no path, are never discarded. It most likely has to hit disk to do these -# checks so it does cause some slow down. Only use it if you need it. -<dedupingEnabled>0</> - -# When enabled, the spider will discard web pages which, when a www is -# prepended to the page's url, result in a url already in the index. -<dedupingEnabledForWww>1</> - -# Detect and do not index pages which have a 200 status code, but are likely -# to be error pages. -<detectCustomErrorPages>1</> - -# Should pages be removed from the index if they are no longer accessible on -# the web? -<delete404s>1</> - -# If this is true, the spider, when a url redirects to a "simpler" url, will -# add that simpler url into the spider queue and abandon the spidering of the -# current url. -<useSimplifiedRedirects>1</> - -# If this is true, the spider, when updating a web page that is already in the -# index, will not even download the whole page if it hasn't been updated since -# the last time Gigablast spidered it. This is primarily a bandwidth saving -# feature. It relies on the remote webserver's returned Last-Modified-Since -# field being accurate. -<useIfModifiedSince>0</> - -# If this is true, do not allow spammy inlinks to vote. This check is too -# aggressive for some collections, i.e. it does not allow pages with cgi in -# their urls to vote. -<doLinkSpamChecking>1</> - -# If this is true Gigablast will only allow one vote per the top 2 significant -# bytes of the IP address. Otherwise, multiple pages from the same top IP can -# contribute to the link text and link-based quality ratings of a particular -# URL. 
Furthermore, no votes will be accepted from IPs that have the same top -# 2 significant bytes as the IP of the page being indexed. -<restrictLinkVotingByIp>1</> - -# How often should Gigablast recompute the link info for a url. Also applies -# to getting the quality of a site or root url, which is based on the link -# info. In days. Can use decimals. 0 means to update the link info every time -# the url's content is re-indexed. If the content is not reindexed because it -# is unchanged then the link info will not be updated. When getting the link -# info or quality of the root url from an external cluster, Gigablast will -# tell the external cluster to recompute it if its age is this or higher. -<updateLinkInfoFrequency>60.000000</> - -# If this is eabled the spider will not allow any docs which are determined to -# be serps. -<doSerpDetection>1</> - -# If this is false then the filter will not be used on html or text pages. -<applyFilterToTextPages>0</> - -# Program to spawn to filter all HTTP replies the spider receives. Leave blank -# for none. -<filterName><![CDATA[]]></> - -# Kill filter shell after this many seconds. Assume it stalled permanently. -<filterTimeout>40</> - -# Retrieve pages from the proxy at this IP address. -<proxyIp>0.0.0.0</> - -# Retrieve pages from the proxy on this port. -<proxyPort>0</> - -# Index the body of the documents so you can search it. Required for searching -# that. You wil pretty much always want to keep this enabled. -<indexBody>1</> - -# Send every spidered url to this diffbot.com by appending a &url=<url> to it -# before trinyg to downloading it. We expect get get back a JSON reply which -# we index. You will need to supply your token to this as well. -<diffbotApiUrl><![CDATA[]]></> - -# Get scoring information for each result so you can see how each result is -# scored? You must explicitly request this using &scores=1 for the XML feed -# because it is not included by default. -<getDocidScoringInfo>1</> - -# Query expansion will include word stems and synonyms in its search results. -<doQueryExpansion>1</> - -# What is the limit to the total number of returned search results. -<maxSearchResults>1000</> - -# What is the limit to the total number of returned search results per query? -<maxSearchResultsPerQuery>100</> - -# What is the maximum number of characters allowed in titles displayed in the -# search results? -<maxTitleLen>80</> - -# Should search results be site clustered by default? -<siteClusterByDefault>1</> - -# Hide all clustered results instead of displaying two results from each site. -<hideAllClusteredResults>0</> - -# Should duplicate search results be removed by default? -<dedupResultsByDefault>1</> - -# Should we dedup URLs with case insensitivity? This is mainly to correct -# duplicate wiki pages. -<dedupURLs>0</> - -# If document summary is this percent similar to a document summary above it, -# then remove it from the search results. 100 means only to remove if exactly -# the same. 0 means no summary deduping. -<percentSimilarDedupSummary>90</> - -# Sets the number of lines to generate for summary deduping. This is to help -# the deduping process not thorw out valid summaries when normally displayed -# summaries are smaller values. Requires percent similar dedup summary to be -# enabled. -<numberOfLinesToUseInSummaryToDedup>4</> - -# Default language to use for ranking results. Value should be any language -# abbreviation, for example "en" for English. 
-<sortLanguagePreference><![CDATA[en]]></> - -# Default country to use for ranking results. Value should be any country code -# abbreviation, for example "us" for United States. -<sortCountryPreference><![CDATA[us]]></> - -# What is the maximum number of characters displayed in a summary for a search -# result? -<maxSummaryLen>512</> - -# What is the maximum number of excerpts displayed in the summary of a search -# result? -<maxSummaryExcerpts>4</> - -# What is the maximum number of characters allowed per summary excerpt? -<maxSummaryExcerptLength>300</> - -# What is the default number of summary excerpts displayed per search result? -<defaultNumberOfSummaryExcerpts>3</> - -# <br> tags are inserted to keep the number of chars in the summary per line -# at or below this width. Strings without spaces that exceed this width are -# not split. -<maxSummaryLineWidth>80</> - -# Truncating this will miss out on good summaries, but performance will -# increase. -<bytesOfDocToScanForSummaryGeneration>70000</> - -# Front html tag used for highlightig query terms in the summaries displated -# in the search results. -<frontHighlightTag><![CDATA[<b style="color:black;background-color:#ffff66">]]></> - -# Front html tag used for highlightig query terms in the summaries displated -# in the search results. -<backHighlightTag><![CDATA[</b>]]></> - -# How many search results should we scan for related topics (gigabits) per -# query? -<docsToScanForTopics>300</> - -# Should Gigablast only get one document per IP domain and per domain for -# topic (gigabit) generation? -<ipRestrictionForTopics>0</> - -# Should Gigablast remove overlapping topics (gigabits)? -<removeOverlappingTopics>1</> - -# What is the number of related topics (gigabits) displayed per query? Set to -# 0 to save CPU time. -<numberOfRelatedTopics>11</> - -# Related topics (gigabits) with scores below this will be excluded. Scores -# range from 0% to over 100%. -<minTopicsScore>5</> - -# How many documents must contain the topic (gigabit) for it to be displayed. -<minTopicDocCount>2</> - -# If a document is this percent similar to another document with a higher -# score, then it will not contribute to the topic (gigabit) generation. -<dedupDocPercentForTopics>80</> - -# Maximum number of words a topic (gigabit) can have. Affects raw feeds, too. -<maxWordsPerTopic>6</> - -# Max chars to sample from each doc for topics (gigabits). -<topicMaxSampleSize>4096</> - -# If enabled, results in dmoz will display their categories on the results -# page. -<displayDmozCategoriesInResults>1</> - -# If enabled, results in dmoz will display their indirect categories on the -# results page. -<displayIndirectDmozCategoriesInResults>0</> - -# If enabled, a link will appear next to each category on each result allowing -# the user to perform their query on that entire category. -<displaySearchCategoryLinkToQueryCategoryOfResult>0</> - -# Yes to use DMOZ given title when a page is untitled but is in DMOZ. -<useDmozForUntitled>1</> - -# Yes to always show DMOZ summaries with search results that are in DMOZ. -<showDmozSummaries>1</> - -# Yes to display the Adult category in the Top category -<showAdultCategoryOnTop>0</> - -# Before downloading the contents of a URL, Gigablast first chains down this -# list of expressions</a>, starting with expression #0. The first expression -# it matches is the ONE AND ONLY matching row for that url. It then uses the -# respider frequency, spider priority, etc. on the MATCHING ROW when spidering -# that URL. 
If you specify the <i>expression</i> as <i><b>default</b></i> then -# that MATCHES ALL URLs. URLs with high spider priorities take spidering -# precedence over URLs with lower spider priorities. The respider frequency -# dictates how often a URL will be respidered. See the help table below for -# examples of all the supported expressions. Use the <i>&&</i> operator to -# string multiple expressions together in the same expression text box. A -# <i>spider priority</i> of <i>DELETE</i> will cause the URL to not be -# spidered, or if it has already been indexed, it will be deleted when it is -# respidered.<br><br> -<filterExpression><![CDATA[isdocidbased]]></> -<filterExpression><![CDATA[ismedia]]></> -<filterExpression><![CDATA[errorcount>=3 && hastmperror]]></> -<filterExpression><![CDATA[errorcount>=1 && hastmperror]]></> -<filterExpression><![CDATA[isaddurl]]></> -<filterExpression><![CDATA[hopcount==0 && iswww && isnew]]></> -<filterExpression><![CDATA[hopcount==0 && iswww]]></> -<filterExpression><![CDATA[hopcount==0 && isnew]]></> -<filterExpression><![CDATA[hopcount==0]]></> -<filterExpression><![CDATA[hopcount==1 && isnew]]></> -<filterExpression><![CDATA[hopcount==1]]></> -<filterExpression><![CDATA[hopcount==2 && isnew]]></> -<filterExpression><![CDATA[hopcount==2]]></> -<filterExpression><![CDATA[hopcount>=3 && isnew]]></> -<filterExpression><![CDATA[hopcount>=3]]></> -<filterExpression><![CDATA[isnew]]></> -<filterExpression><![CDATA[default]]></> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<harvestLinks>1</> -<filterFrequency>0.000000</> -<filterFrequency>0.000000</> -<filterFrequency>1.000000</> -<filterFrequency>1.000000</> -<filterFrequency>1.000000</> -<filterFrequency>7.000000</> -<filterFrequency>7.000000</> -<filterFrequency>7.000000</> -<filterFrequency>10.000000</> -<filterFrequency>20.000000</> -<filterFrequency>20.000000</> -<filterFrequency>40.000000</> -<filterFrequency>40.000000</> -<filterFrequency>60.000000</> -<filterFrequency>60.000000</> -<filterFrequency>30.000000</> -<filterFrequency>30.000000</> - -# Do not allow more than this many outstanding spiders for all urls in this -# priority. -<maxSpidersPerRule>99</> -<maxSpidersPerRule>99</> -<maxSpidersPerRule>1</> -<maxSpidersPerRule>1</> -<maxSpidersPerRule>99</> -<maxSpidersPerRule>4</> -<maxSpidersPerRule>2</> -<maxSpidersPerRule>1</> -<maxSpidersPerRule>2</> -<maxSpidersPerRule>99</> -<maxSpidersPerRule>1</> -<maxSpidersPerRule>99</> -<maxSpidersPerRule>1</> -<maxSpidersPerRule>99</> -<maxSpidersPerRule>1</> -<maxSpidersPerRule>99</> -<maxSpidersPerRule>99</> - -# Allow this many spiders per IP. -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> -<maxSpidersPerIp>1</> - -# Wait at least this long before downloading urls from the same IP address. 
-<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<spiderIpWait>1000</> -<filterPriority>80</> -<filterPriority>-3</> -<filterPriority>3</> -<filterPriority>45</> -<filterPriority>85</> -<filterPriority>50</> -<filterPriority>48</> -<filterPriority>49</> -<filterPriority>47</> -<filterPriority>40</> -<filterPriority>39</> -<filterPriority>30</> -<filterPriority>29</> -<filterPriority>20</> -<filterPriority>19</> -<filterPriority>1</> -<filterPriority>0</> diff --git a/html/help.html b/html/help.html index a011a321..af2b9180 100644 --- a/html/help.html +++ b/html/help.html @@ -127,11 +127,14 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;} <td style="padding-bottom:12px"> </td> <td style="padding-bottom:12px"> </td> </tr> -<!-- - <tr bgcolor="#006699"> - <th><a name="boolean" id="boolean"></a><font color="#FFFFFF">Boolean Search</font></th> - <th><font color="#FFFFFF">Description</font></th> + +<tr bgcolor="#0340fd"> + + <th><font color=33dcff>Boolean Search</font></th> + <th><font color=33dcff>Description</font></th> + </tr> + <tr> <td colspan="2" bgcolor="#FFFFCC"><center> Note: boolean operators must be in UPPER CASE. @@ -214,16 +217,17 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;} expressions and can be optionally enclosed in parentheses. A NOT operator can optionally preceed the left or the right operand.</td> </tr> ---> + </table> </td></tr> </table> +<br> <center> -Copyright © 2013. All rights reserved. +Copyright © 2014. All rights reserved. </center> </body> </html>
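Most of the mechanical churn in this patch is the switch from the per-file FMT_* constants in PageCrawlBot.cpp to the shared FORMAT_* constants in HttpRequest.h (note that FORMAT_HTML is now 1, leaving 0 free, presumably as an "unspecified" sentinel). The helper below is not part of the patch; it is a minimal sketch that condenses the repeated strcmp() and Content-Type chains the diff touches, using the FORMAT_* values as defined above.

// Illustrative only: condensed form of the &format= parsing and content-type
// selection that the patch repeats across PageCrawlBot.cpp / PageBasic.cpp.
#include <cstring>

#define FORMAT_HTML   1   // values as defined in HttpRequest.h by this patch
#define FORMAT_XML    2
#define FORMAT_JSON   3
#define FORMAT_CSV    4
#define FORMAT_TXT    5
#define FORMAT_PROCOG 6

// Map the &format= CGI string to a FORMAT_* value; callers in the patch
// default to HTML for the browser UI and JSON for the crawlbot API.
static char getReplyFormat(const char *fs, char def) {
    if (!fs)                     return def;
    if (strcmp(fs, "html") == 0) return FORMAT_HTML;
    if (strcmp(fs, "json") == 0) return FORMAT_JSON;
    if (strcmp(fs, "xml")  == 0) return FORMAT_XML;
    if (strcmp(fs, "csv")  == 0) return FORMAT_CSV;
    return def;
}

// Content-Type to send back for each format, as in StateCD::sendList().
static const char *getContentType(char fmt) {
    switch (fmt) {
    case FORMAT_JSON: return "application/json";
    case FORMAT_XML:  return "text/xml";
    case FORMAT_CSV:  return "text/csv";
    case FORMAT_TXT:  return "text/plain";
    default:          return "text/html";
    }
}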