mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
4092177e5f
to describe how to inject a file of concatenated HTML documents into gb. Still have to find out how to do that in SOLR and elasticsearch for comparison.
1628 lines
51 KiB
C++
1628 lines
51 KiB
C++
#include "gb-include.h"
|
|
|
|
#include "Indexdb.h" // makeKey(long long docId)
|
|
#include "Titledb.h"
|
|
#include "Spider.h"
|
|
#include "Tagdb.h"
|
|
#include "Dns.h"
|
|
//#include "PageResults.h" // for query buf, g_qbuf
|
|
#include "Collectiondb.h"
|
|
#include "CollectionRec.h"
|
|
#include "Clusterdb.h" // for getting # of docs indexed
|
|
//#include "Checksumdb.h" // should migrate to this one, though
|
|
#include "Pages.h"
|
|
#include "Query.h" // MAX_QUERY_LEN
|
|
#include "SafeBuf.h"
|
|
#include "LanguageIdentifier.h"
|
|
#include "LanguagePages.h"
|
|
#include "Users.h"
|
|
#include "Address.h" // getIPLocation
|
|
#include "Proxy.h"
|
|
|
|
//char *printNumResultsDropDown ( char *p, long n, bool *printedDropDown);
|
|
bool printNumResultsDropDown ( SafeBuf& sb, long n, bool *printedDropDown);
|
|
//static char *printTopDirectory ( char *p, char *pend );
|
|
static bool printTopDirectory ( SafeBuf& sb );
|
|
|
|
// this prints the last five queries
|
|
//static long printLastQueries ( char *p , char *pend ) ;
|
|
|
|
//static char *expandRootHtml ( char *p , long plen ,
|
|
/*
|
|
static bool expandRootHtml ( SafeBuf& sb,
|
|
uint8_t *html , long htmlLen ,
|
|
char *q , long qlen ,
|
|
HttpRequest *r ,
|
|
TcpSocket *s ,
|
|
long long docsInColl ,
|
|
CollectionRec *cr ) ;
|
|
*/
|
|
|
|
// . convenience overload for callers that have no cookie to supply
// . forwards to the three-argument sendPageRoot() with a NULL cookie and
//   returns whatever that call returns
bool sendPageRoot ( TcpSocket *s, HttpRequest *r ){
	char *noCookie = NULL;
	return sendPageRoot ( s , r , noCookie );
}
|
|
|
|
// . appends the footer navigation bar to "sb" and then closes the
//   <body> and <html> elements, so it must be the last thing printed
// . the "Admin" link is only printed when r->isLocal() is true
// . always returns true; the results of safePrintf() are not checked
bool printNav ( SafeBuf &sb , HttpRequest *r ) {
	// links every visitor gets
	sb.safePrintf("<center><b><p class=nav>"
		      "<a href=\"/about.html\">About</a>"
		      " <a href=\"/contact.html\">Contact</a>"
		      " <a href=\"/help.html\">Help</a>"
		      " <a href=/privacy.html>Privacy Policy</a>"
		      " <a href=\"/searchfeed.html\">Search API</a>"
		      " <a href=/seoapi.html>SEO API</a>"
		      " <a href=/account>My Account</a> ");

	// link to the admin pages, local requests only
	bool showAdminLink = r->isLocal();
	if ( showAdminLink ) {
		sb.safePrintf(" [<a href=\"/master?\">Admin</a>]");
	}

	// close the nav paragraph and the document itself
	sb.safePrintf("</p></b></center></body></html>");

	return true;
}
|
|
|
|
bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
|
|
|
sb.safePrintf("<html>\n");
|
|
sb.safePrintf("<head>\n");
|
|
//sb.safePrintf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf8\">");
|
|
sb.safePrintf("<meta name=\"description\" content=\"A powerful, new search engine that does real-time indexing!\">\n");
|
|
sb.safePrintf("<meta name=\"keywords\" content=\"search, search engine, search engines, search the web, fresh index, green search engine, green search, clean search engine, clean search\">\n");
|
|
sb.safePrintf("<title>Gigablast - "
|
|
"An Alternative Open Source Search Engine</title>\n");
|
|
sb.safePrintf("<style><!--\n");
|
|
sb.safePrintf("body {\n");
|
|
sb.safePrintf("font-family:Arial, Helvetica, sans-serif;\n");
|
|
sb.safePrintf("color: #000000;\n");
|
|
sb.safePrintf("font-size: 12px;\n");
|
|
sb.safePrintf("margin: 20px 5px;\n");
|
|
sb.safePrintf("letter-spacing: 0.04em;\n");
|
|
sb.safePrintf("}\n");
|
|
sb.safePrintf("a:link {color:#00c}\n");
|
|
sb.safePrintf("a:visited {color:#551a8b}\n");
|
|
sb.safePrintf("a:active {color:#f00}\n");
|
|
sb.safePrintf(".bold {font-weight: bold;}\n");
|
|
sb.safePrintf(".bluetable {background:#d1e1ff;margin-bottom:15px;font-size:12px;}\n");
|
|
sb.safePrintf(".url {color:#008000;}\n");
|
|
sb.safePrintf(".cached, .cached a {font-size: 10px;color: #666666;\n");
|
|
sb.safePrintf("}\n");
|
|
sb.safePrintf("table {\n");
|
|
sb.safePrintf("font-family:Arial, Helvetica, sans-serif;\n");
|
|
sb.safePrintf("color: #000000;\n");
|
|
sb.safePrintf("font-size: 12px;\n");
|
|
sb.safePrintf("}\n");
|
|
sb.safePrintf(".directory {font-size: 16px;}\n");
|
|
sb.safePrintf("-->\n");
|
|
sb.safePrintf("</style>\n");
|
|
sb.safePrintf("\n");
|
|
sb.safePrintf("</head>\n");
|
|
sb.safePrintf("<script>\n");
|
|
sb.safePrintf("<!--\n");
|
|
sb.safePrintf("function x(){document.f.q.focus();}\n");
|
|
sb.safePrintf("// --></script>\n");
|
|
sb.safePrintf("<body onload=\"x()\">\n");
|
|
//sb.safePrintf("<body>\n");
|
|
//g_proxy.insertLoginBarDirective ( &sb );
|
|
sb.safePrintf("<br><br>\n");
|
|
sb.safePrintf("<center><a href=/><img border=0 width=500 height=122 src=/logo-med.jpg></a>\n");
|
|
sb.safePrintf("<br><br>\n");
|
|
sb.safePrintf("<br><br><br>\n");
|
|
sb.safePrintf("<b>web</b> <a href=/seo>seo</a> <a href=\"http://www.gigablast.com/?c=dmoz3\">directory</a> \n");
|
|
sb.safePrintf("<a href=/adv.html>advanced search</a>");
|
|
sb.safePrintf(" ");
|
|
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
|
|
"Gigablast's index\">add url</a>");
|
|
sb.safePrintf("\n");
|
|
sb.safePrintf("<br><br>\n");
|
|
// submit to https now
|
|
sb.safePrintf("<form method=get "
|
|
"action=/search name=f>\n");
|
|
sb.safePrintf("<input name=q type=text size=60 value=\"\"> <input type=\"submit\" value=\"Search Green\">\n");
|
|
sb.safePrintf("\n");
|
|
sb.safePrintf("</form>\n");
|
|
sb.safePrintf("<br>\n");
|
|
sb.safePrintf("\n");
|
|
sb.safePrintf("<table cellpadding=3>\n");
|
|
sb.safePrintf("\n");
|
|
|
|
|
|
sb.safePrintf("<tr valign=top>\n");
|
|
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:red;></td>\n");
|
|
sb.safePrintf("<td><font size=+1><b>Open Source!</b>"
|
|
"</font><br>\n");
|
|
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=/admin.html#features>Features.</a> Limited support available for free."
|
|
,80);
|
|
sb.safePrintf("<br><br>");
|
|
sb.safePrintf("</td></tr>\n");
|
|
|
|
|
|
|
|
sb.safePrintf("<tr valign=top>\n");
|
|
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:green;></td>\n");
|
|
sb.safePrintf("<td><font size=+1><b>The Green Search Engine</b></font><br>\n");
|
|
sb.brify2("Gigablast is the only clean-powered web search engine. 90% of its power usage comes from wind energy. Astoundingly, Gigablast is one of ONLY four search engines in the United States indexing over a billion pages.",80);
|
|
sb.safePrintf("<br><br></td></tr>\n");
|
|
sb.safePrintf("\n");
|
|
sb.safePrintf("\n");
|
|
|
|
|
|
|
|
sb.safePrintf("<tr valign=top>\n");
|
|
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:0040fe;></td>\n");
|
|
sb.safePrintf("<td><font size=+1><b>The Transparent Search Engine</b></font><br>\n");
|
|
sb.brify2("Gigablast is the first truly transparent search engine. It tells you exactly why the search results are ranked the way they are. There is nothing left to the imagination.",85);
|
|
sb.safePrintf("<br><br>");
|
|
sb.safePrintf("</td></tr>\n");
|
|
sb.safePrintf("\n");
|
|
sb.safePrintf("\n");
|
|
|
|
sb.safePrintf("<tr valign=top>\n");
|
|
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:f2b629;></td>\n");
|
|
sb.safePrintf("<td><font size=+1><b>The SEO Search Engine</b></font><br>\n");
|
|
sb.brify2("When it comes to search-engine based SEO, Gigablast is the place to be. With a frothy set of unique and effective <a href=/seo>SEO tools</a>, you will find all you need to execute a simple yet effective SEO strategy. Stop the guesswork, and let a search engine tell you how to SEO it.",85);
|
|
sb.safePrintf("</td></tr>\n");
|
|
|
|
|
|
/*
|
|
sb.safePrintf("<tr valign=top>\n");
|
|
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:ff3030;></td>\n");
|
|
sb.safePrintf("<td><font size=+1><b>Xml Search Feed</b></font><br>\n");
|
|
sb.brify2("Utilize Gigablast's results on your own site or product by connecting with Gigablast's <a href=/searchfeed.html>XML search feed</a>. It's now simpler than ever to setup and use. You can also add the web pages you want into the index in near real-time.",85);
|
|
sb.safePrintf("</td></tr>\n");
|
|
*/
|
|
|
|
/*
|
|
sb.safePrintf("<tr valign=top>\n");
|
|
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:black;></td>\n");
|
|
sb.safePrintf("<td><font size=+1><b>The Private Search Engine</b>"
|
|
"</font><br>\n");
|
|
sb.brify2("Gigablast does not allow the NSA or any third party "
|
|
"to spy on the queries your IP address is doing, "
|
|
"unlike "
|
|
"<a href=http://www.guardian.co.uk/world/2013/jun/"
|
|
"06/us-tech-giants-nsa-data>"
|
|
"other large search engines</a>. "
|
|
"Gigablast is the only "
|
|
"<a href=/privacy.html>truly private search engine</a> "
|
|
"in the United States."
|
|
//" Everyone else has fundamental "
|
|
//"gaps in their "
|
|
//"security as explained by the above link."
|
|
//"Tell Congress "
|
|
//"to <a href=https://optin.stopwatching.us/>stop spying "
|
|
//"on you</a>."
|
|
,85);
|
|
sb.safePrintf("</td></tr>\n");
|
|
*/
|
|
|
|
/*
|
|
sb.safePrintf("<tr valign=top>\n");
|
|
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:black;></td>\n");
|
|
sb.safePrintf("<td><font size=+1><b>No Tax Dodging</b></font><br>\n");
|
|
sb.brify2("Gigablast pays its taxes when it makes a profit. "
|
|
"Google and Bing <a href=http://www.bloomberg.com/news/"
|
|
"2010-10-21/google-2-4-rate-shows-how-60-billion-u-s-"
|
|
"revenue-lost-to-tax-loopholes.html>do not</a>. They "
|
|
"stash their profits in "
|
|
"offshore tax havens to avoid paying taxes. "
|
|
//"The end result is that taxes are higher for you. "
|
|
"You may think Google and Bing are free to use, but in "
|
|
"reality, <u>you</u> pay for it in increased taxes."
|
|
,85);
|
|
sb.safePrintf("</td></tr>\n");
|
|
*/
|
|
|
|
|
|
sb.safePrintf("\n");
|
|
sb.safePrintf("\n");
|
|
sb.safePrintf("</table>\n");
|
|
sb.safePrintf("<br><br>\n");
|
|
printNav ( sb , r );
|
|
return true;
|
|
}
|
|
|
|
// . prints the "add url" home page into "sb"
// . "url" is the url the user submitted, or NULL if the form is being
//   shown for the first time. when NULL the text box is pre-filled with
//   "http://" and no ajax request is emitted.
// . when "url" is given the page contains a "msgbox" div with an inline
//   script that does a GET on /addurl?u=<url>&id=<hash>&rand=<time>;
//   handler() then copies the response text into that div
// . "r" is only passed through to printNav()
// . always returns true
bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {

	// document head and title
	sb.safePrintf("<html>\n"
		      "<head>\n"
		      "<title>Gigablast - Add Url</title>\n");

	// inline style sheet, then close the head
	sb.safePrintf("<style><!--\n"
		      "body {\n"
		      "font-family:Arial, Helvetica, sans-serif;\n"
		      "color: #000000;\n"
		      "font-size: 12px;\n"
		      "margin: 20px 5px;\n"
		      "letter-spacing: 0.04em;\n"
		      "}\n"
		      "a:link {color:#00c}\n"
		      "a:visited {color:#551a8b}\n"
		      "a:active {color:#f00}\n"
		      ".bold {font-weight: bold;}\n"
		      ".bluetable {background:#d1e1ff;margin-bottom:15px;font-size:12px;}\n"
		      ".url {color:#008000;}\n"
		      ".cached, .cached a {font-size: 10px;color: #666666;\n"
		      "}\n"
		      "table {\n"
		      "font-family:Arial, Helvetica, sans-serif;\n"
		      "color: #000000;\n"
		      "font-size: 12px;\n"
		      "}\n"
		      ".directory {font-size: 16px;}\n"
		      "-->\n"
		      "</style>\n"
		      "\n"
		      "</head>\n");

	// . x() is printed but nothing on this page calls it (the body has
	//   no onload) and this page's form has no "q" input anyway
	// . kept so the emitted page is unchanged
	sb.safePrintf("<script>\n"
		      "<!--\n"
		      "function x(){document.f.q.focus();}\n"
		      "// --></script>\n");

	sb.safePrintf("<body>");

	// . ajax callback used by the request emitted further below
	// . once the request is done (readyState 4) the reply replaces the
	//   contents of the "msgbox" div
	sb.safePrintf("<script type=\"text/javascript\">\n"
		      "function handler() {\n"
		      "if(this.readyState == 4 ) {\n"
		      "document.getElementById('msgbox').innerHTML="
		      "this.responseText;\n"
		      "}}\n"
		      "</script>\n");

	// logo
	sb.safePrintf("\n"
		      "<br><br>\n"
		      "<center><a href=/><img border=0 width=500 height=122 src=/logo-med.jpg></a>\n"
		      "<br><br>\n"
		      "<br><br><br>\n");

	// tab links, "add url" is the current tab so it is bold and not a link
	sb.safePrintf("<a href=/>web</a> <a href=/seo>seo</a> <a href=\"http://www.gigablast.com/?c=dmoz3\">directory</a> \n"
		      "<a href=/adv.html>advanced search</a>"
		      " "
		      "<b title=\"Instantly add your url to Gigablast's "
		      "index\">"
		      "add url</b>"
		      "\n"
		      "<br><br>\n");

	// the form, the url text box is named "u"
	sb.safePrintf("<form method=get action=/addurl name=f>\n"
		      "<input name=u type=text size=60 value=\"");
	if ( url ) {
		SafeBuf tmp;
		tmp.safePrintf("%s",url);
		// don't let double quotes in the url close our val attribute
		tmp.replace("\"","%22");
		sb.safeMemcpy(&tmp);
	}
	else {
		sb.safePrintf("http://");
	}
	sb.safePrintf("\"> <input type=\"submit\" value=\"Add Url\">\n"
		      "\n");

	// if addurl is turned off, just print "disabled" msg
	char *msg = NULL;
	if ( ! g_conf.m_addUrlEnabled )
		msg = "Add url is temporarily disabled";
	// . can also be turned off in the collection rec
	// . getRec() gives back NULL when the collection does not exist, so
	//   only look at the rec if we actually got one. dereferencing it
	//   unconditionally would crash when there is no "main" collection.
	CollectionRec *cr = g_collectiondb.getRec ( "main" );
	if ( cr && ! cr->m_addUrlEnabled )
		msg = "Add url is temporarily disabled";
	// or if in read-only mode
	if ( g_conf.m_readOnlyMode )
		msg = "Add url is temporarily disabled";

	// if url is non-empty the ajax will receive this identical msg
	// and display it in the div, so do not duplicate the msg!
	if ( msg && ! url )
		sb.safePrintf("<br><br>%s",msg);

	// . the ajax msgbox div
	// . it initially shows a "busy" image and its inline script fires
	//   the /addurl request as soon as it is parsed; handler() then
	//   replaces the div's content with the reply
	if ( url ) {
		sb.safePrintf("<br>"
			      "<br>"
			      "<div id=msgbox>"
			      "<center>"
			      "<img src=/gears.gif width=50 height=50>"
			      "</center>"
			      "<script type=text/javascript>"
			      "var client = new XMLHttpRequest();\n"
			      "client.onreadystatechange = handler;\n"
			      "var url='/addurl?u="
			      );
		sb.urlEncode ( url );
		// . "id" is a hash of the url so clients can't just pass in
		//   a bogus id; it is never sent as 0
		// . "rand" is the local time in milliseconds
		//   (presumably to make each request url unique, TODO confirm)
		unsigned long h32 = hash32n(url);
		if ( h32 == 0 ) h32 = 1;
		unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
		sb.safePrintf("&id=%lu&rand=%llu';\n"
			      "client.open('GET', url );\n"
			      "client.send();\n"
			      "</script>\n"
			      , h32
			      , rand64
			      );
		sb.safePrintf("</div>\n");
	}

	sb.safePrintf("</form>\n"
		      "<br>\n"
		      "\n"
		      "<br><br>\n");

	// footer links, this also closes <body> and <html>
	printNav ( sb , r );

	return true;
}
|
|
|
|
|
|
// . prints the directory home page into "sb": head, style sheet, logo,
//   tab links, the search form, the top level directory categories from
//   printTopDirectory() and finally the nav footer from printNav() which
//   also closes the document
// . "r" is only passed through to printNav()
// . always returns true
bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {

	// document head with meta tags and title
	sb.safePrintf("<html>\n"
		      "<head>\n"
		      "<meta name=\"description\" content=\"A powerful, new search engine that does real-time indexing!\">\n"
		      "<meta name=\"keywords\" content=\"search, search engine, search engines, search the web, fresh index, green search engine, green search, clean search engine, clean search\">\n"
		      "<title>Gigablast</title>\n");

	// inline style sheet, then close the head
	sb.safePrintf("<style><!--\n"
		      "body {\n"
		      "font-family:Arial, Helvetica, sans-serif;\n"
		      "color: #000000;\n"
		      "font-size: 12px;\n"
		      "margin: 20px 5px;\n"
		      "letter-spacing: 0.04em;\n"
		      "}\n"
		      "a:link {color:#00c}\n"
		      "a:visited {color:#551a8b}\n"
		      "a:active {color:#f00}\n"
		      ".bold {font-weight: bold;}\n"
		      ".bluetable {background:#d1e1ff;margin-bottom:15px;font-size:12px;}\n"
		      ".url {color:#008000;}\n"
		      ".cached, .cached a {font-size: 10px;color: #666666;\n"
		      "}\n"
		      "table {\n"
		      "font-family:Arial, Helvetica, sans-serif;\n"
		      "color: #000000;\n"
		      "font-size: 12px;\n"
		      "}\n"
		      ".directory {font-size: 16px;}\n"
		      "-->\n"
		      "</style>\n"
		      "\n"
		      "</head>\n");

	// . x() puts the cursor in the query box; the body runs it on load
	// . print exactly ONE <body> tag. a second bare "<body>" used to be
	//   printed right after this one, which is invalid html since a
	//   document may only have a single body element.
	sb.safePrintf("<script>\n"
		      "<!--\n"
		      "function x(){document.f.q.focus();}\n"
		      "// --></script>\n"
		      "<body onload=\"x()\">\n");

	// logo
	sb.safePrintf("<br><br>\n"
		      "<center><a href=/><img border=0 width=500 height=122 src=/logo-med.jpg></a>\n"
		      "<br><br>\n"
		      "<br><br><br>\n");

	// tab links, "directory" is the current tab so it is bold and not a link
	sb.safePrintf("<a href=/>web</a> <a href=/seo>seo</a> <b>directory</b> \n"
		      "<a href=/adv.html>advanced search</a>"
		      " "
		      "<a href=/addurl title=\"Instantly add your url to "
		      "Gigablast's index\">add url</a>"
		      "\n"
		      "<br><br>\n");

	// the search form, named "f" with query box "q" so x() can find it
	sb.safePrintf("<form method=get "
		      "action=/search name=f>\n"
		      "<input name=q type=text size=60 value=\"\"> <input type=\"submit\" value=\"Search Green\">\n"
		      "\n"
		      "</form>\n"
		      "<br>\n"
		      "\n");

	// table of the top level directory categories
	printTopDirectory ( sb );

	sb.safePrintf("<br><br>\n");

	// footer links, this also closes <body> and <html>
	printNav ( sb , r);

	return true;
}
|
|
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets errno on error
|
|
// . make the root (home) web page for the requested collection
|
|
// . call g_httpServer.sendDynamicPage() to send it
|
|
// . builds the home page and sends it with g_httpServer.sendDynamicPage()
// . the collection comes from the "c" cgi parm; if that is missing or
//   empty the default collection for the request's host is used
// . collection "dmoz3" gets the directory home page, every other
//   collection gets the web search home page
// . sends a 500 error reply (ECOLLTOOBIG) if the collection name is
//   MAX_COLL_LEN bytes or longer
// . returns whatever sendErrorReply() or sendDynamicPage() returns
// . "cookie" is not used by this function; NULL is what gets passed to
//   sendDynamicPage() as the cookie
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) {
	// stack buffer that the SafeBuf holding the page is constructed on
	const long bufSize = 10*1024 + MAX_QUERY_LEN;
	char buf [ bufSize ];
	SafeBuf sb ( buf , bufSize );

	// which collection is this for?
	long  collLen;
	char *coll = r->getString("c",&collLen);
	bool  collGiven = ( coll && coll[0] );
	if ( ! collGiven ) {
		// none in the request, fall back to the default for this host
		coll    = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
		collLen = gbstrlen(coll);
	}

	// ensure collection not too big
	if ( collLen >= MAX_COLL_LEN ) {
		g_errno = ECOLLTOOBIG;
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}

	// the dmoz3 collection is the directory, everything else is web
	bool isDirectory = ( strcmp ( coll , "dmoz3" ) == 0 );
	if ( isDirectory ) {
		printDirHomePage ( sb , r );
	}
	else {
		printWebHomePage ( sb , r );
	}

	// . send it off
	// . cachetime is 0 now (it used to be 120 seconds); per the
	//   original note caching was dropped because of the login bar at
	//   the top of the page
	return g_httpServer.sendDynamicPage ( s ,
					      (char*) sb.getBufStart() ,
					      sb.length() ,
					      0 ,           // cachetime
					      false ,       // post?
					      "text/html" ,
					      200 ,
					      NULL ,        // cookie
					      "UTF-8" ,
					      r );
}
|
|
|
|
/*
|
|
//char *expandRootHtml ( char *p , long plen ,
|
|
bool expandRootHtml ( SafeBuf& sb,
|
|
uint8_t *head , long hlen ,
|
|
char *q , long qlen ,
|
|
HttpRequest *r ,
|
|
TcpSocket *s ,
|
|
long long docsInColl ,
|
|
CollectionRec *cr ) {
|
|
//char *pend = p + plen;
|
|
// store custom header into buf now
|
|
//for ( long i = 0 ; i < hlen && p+10 < pend ; i++ ) {
|
|
for ( long i = 0 ; i < hlen; i++ ) {
|
|
if ( head[i] != '%' ) {
|
|
// *p++ = head[i];
|
|
sb.safeMemcpy((char*)&head[i], 1);
|
|
continue;
|
|
}
|
|
if ( i + 1 >= hlen ) {
|
|
// *p++ = head[i];
|
|
sb.safeMemcpy((char*)&head[i], 1);
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'S' ) {
|
|
// now we got the %S, insert "spiders are [on/off]"
|
|
bool spidersOn = true;
|
|
if ( ! g_conf.m_spideringEnabled ) spidersOn = false;
|
|
if ( ! cr->m_spideringEnabled ) spidersOn = false;
|
|
if ( spidersOn )
|
|
sb.safePrintf("Spiders are on");
|
|
else
|
|
sb.safePrintf("Spiders are off");
|
|
// skip over %S
|
|
i += 1;
|
|
continue;
|
|
}
|
|
|
|
if ( head[i+1] == 'q' ) {
|
|
// now we got the %q, insert the query
|
|
char *p = (char*) sb.getBuf();
|
|
char *pend = (char*) sb.getBufEnd();
|
|
long eqlen = dequote ( p , pend , q , qlen );
|
|
//p += eqlen;
|
|
sb.incrementLength(eqlen);
|
|
// skip over %q
|
|
i += 1;
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'w' &&
|
|
head[i+2] == 'h' &&
|
|
head[i+3] == 'e' &&
|
|
head[i+4] == 'r' &&
|
|
head[i+5] == 'e' ) {
|
|
// insert the location
|
|
long whereLen;
|
|
char *where = r->getString("where",&whereLen);
|
|
// get it from cookie as well!
|
|
if ( ! where )
|
|
where = r->getStringFromCookie("where",
|
|
&whereLen);
|
|
// fix for getStringFromCookie
|
|
if ( where && ! where[0] ) where = NULL;
|
|
// skip over the %where
|
|
i += 5;
|
|
// if empty, base it on IP
|
|
if ( ! where ) {
|
|
double lat;
|
|
double lon;
|
|
double radius;
|
|
char *city,*state,*ctry;
|
|
// use this by default
|
|
long ip = r->m_userIP;
|
|
// ip for testing?
|
|
long iplen;
|
|
char *ips = r->getString("uip",&iplen);
|
|
if ( ips ) ip = atoip(ips);
|
|
// returns true if found in db
|
|
char buf[128];
|
|
getIPLocation ( ip ,
|
|
&lat ,
|
|
&lon ,
|
|
&radius,
|
|
&city ,
|
|
&state ,
|
|
&ctry ,
|
|
buf ,
|
|
128 ) ;
|
|
if ( city && state )
|
|
sb.safePrintf("%s, %s",city,state);
|
|
}
|
|
else
|
|
sb.dequote (where,whereLen);
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'w' &&
|
|
head[i+2] == 'h' &&
|
|
head[i+3] == 'e' &&
|
|
head[i+4] == 'n' ) {
|
|
// insert the "when" value from the request
|
|
long whenLen;
|
|
char *when = r->getString("when",&whenLen);
|
|
// skip over the %when
|
|
i += 4;
|
|
if ( ! when ) continue;
|
|
sb.dequote (when,whenLen);
|
|
continue;
|
|
}
|
|
// %sortby
|
|
if ( head[i+1] == 's' &&
|
|
head[i+2] == 'o' &&
|
|
head[i+3] == 'r' &&
|
|
head[i+4] == 't' &&
|
|
head[i+5] == 'b' &&
|
|
head[i+6] == 'y' ) {
|
|
// get the requested sort order, defaults to 1
|
|
long sortBy = r->getLong("sortby",1);
|
|
// print the radio buttons
|
|
char *cs[5];
|
|
cs[0]="";
|
|
cs[1]="";
|
|
cs[2]="";
|
|
cs[3]="";
|
|
cs[4]="";
|
|
if ( sortBy >=1 && sortBy <=4 )
|
|
cs[sortBy] = " checked";
|
|
sb.safePrintf(
|
|
"<input type=radio name=sortby value=1%s>date "
|
|
"<input type=radio name=sortby value=2%s>distance "
|
|
"<input type=radio name=sortby value=3%s>relevancy "
|
|
"<input type=radio name=sortby value=4%s>popularity",
|
|
cs[1],cs[2],cs[3],cs[4]);
|
|
// skip over the %sortby
|
|
i += 6;
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'e' ) {
|
|
// now we got the %e, insert the query
|
|
char *p = (char*) sb.getBuf();
|
|
long plen = sb.getAvail();
|
|
long eqlen = urlEncode ( p , plen , q , qlen );
|
|
//p += eqlen;
|
|
sb.incrementLength(eqlen);
|
|
// skip over %e
|
|
i += 1;
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'N' ) {
|
|
// now we got the %N, insert the global doc count
|
|
//long long c=g_checksumdb.getRdb()->getNumGlobalRecs();
|
|
//now each host tells us how many docs it has in its ping
|
|
long long c = g_hostdb.getNumGlobalRecs();
|
|
c += g_conf.m_docCountAdjustment;
|
|
// never allow to go negative
|
|
if ( c < 0 ) c = 0;
|
|
//p+=ulltoa(p,c);
|
|
char *p = (char*) sb.getBuf();
|
|
sb.reserve2x(16);
|
|
long len = ulltoa(p, c);
|
|
sb.incrementLength(len);
|
|
// skip over %N
|
|
i += 1;
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'E' ) {
|
|
// now each host tells us how many docs it has in its
|
|
// ping request
|
|
long long c = g_hostdb.getNumGlobalEvents();
|
|
char *p = (char*) sb.getBuf();
|
|
sb.reserve2x(16);
|
|
long len = ulltoa(p, c);
|
|
sb.incrementLength(len);
|
|
// skip over %E
|
|
i += 1;
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'n' ) {
|
|
// now we got the %n, insert the collection doc count
|
|
//p+=ulltoa(p,docsInColl);
|
|
char *p = (char*) sb.getBuf();
|
|
sb.reserve2x(16);
|
|
long len = ulltoa(p, docsInColl);
|
|
sb.incrementLength(len);
|
|
// skip over %n
|
|
i += 1;
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'T' ) {
|
|
// . print the final tail
|
|
// . only print admin link if we're local
|
|
//long user = g_pages.getUserType ( s , r );
|
|
//char *username = g_users.getUsername(r);
|
|
//char *pwd = r->getString ( "pwd" );
|
|
char *p = (char*) sb.getBuf();
|
|
long plen = sb.getAvail();
|
|
//p = g_pages.printTail ( p , p + plen , user , pwd );
|
|
char *n = g_pages.printTail(p , p + plen ,
|
|
r->isLocal());
|
|
sb.incrementLength(n - p);
|
|
// skip over %T
|
|
i += 1;
|
|
continue;
|
|
}
|
|
// print the drop down menu for selecting the # of results
|
|
if ( head[i+1] == 'D' ) {
|
|
// skip over %D
|
|
i += 1;
|
|
// skip if not enough buffer
|
|
//if ( p + 1000 >= pend ) continue;
|
|
// # results
|
|
//long n = r->getLong("n",10);
|
|
//bool printedDropDown;
|
|
//p = printNumResultsDropDown(p,n,&printedDropDown);
|
|
//printNumResultsDropDown(sb,n,&printedDropDown);
|
|
continue;
|
|
}
|
|
if ( head[i+1] == 'H' ) {
|
|
// . insert the secret key here, to stop seo bots
|
|
// . TODO: randomize its position to make parsing more
|
|
// difficult
|
|
// . this secret key is for submitting a new query
|
|
long key;
|
|
char kname[4];
|
|
g_httpServer.getKey (&key,kname,NULL,0,time(NULL),0,
|
|
10);
|
|
//sprintf ( p , "<input type=hidden name=%s value=%li>",
|
|
// kname,key);
|
|
//p += gbstrlen ( p );
|
|
sb.safePrintf( "<input type=hidden name=%s value=%li>",
|
|
kname,key);
|
|
|
|
//adds param for default screen size
|
|
//if(cr)
|
|
// sb.safePrintf("<input type=hidden id='screenWidth' name='ws' value=%li>", cr->m_screenWidth);
|
|
|
|
// insert collection name too
|
|
long collLen;
|
|
char *coll = r->getString ( "c" , &collLen );
|
|
if ( collLen > 0 && collLen < MAX_COLL_LEN ) {
|
|
//sprintf (p,"<input type=hidden name=c "
|
|
// "value=\"");
|
|
//p += gbstrlen ( p );
|
|
sb.safePrintf("<input type=hidden name=c "
|
|
"value=\"");
|
|
//memcpy ( p , coll , collLen );
|
|
//p += collLen;
|
|
sb.safeMemcpy(coll, collLen);
|
|
//sprintf ( p , "\">\n");
|
|
//p += gbstrlen ( p );
|
|
sb.safePrintf("\">\n");
|
|
}
|
|
|
|
// pass this crap on so zak can do searches
|
|
char *username = g_users.getUsername(r);
|
|
// this is null because not in the cookie and we are
|
|
// logged in
|
|
//char *pwd = r->getString ( "pwd" );
|
|
//sb.safePrintf("<input type=hidden name=pwd value=\"%s\">\n",
|
|
//pwd);
|
|
sb.safePrintf("<input type=hidden name=username "
|
|
"value=\"%s\">\n",username);
|
|
|
|
// skip over %H
|
|
i += 1;
|
|
continue;
|
|
}
|
|
// %t, print Top Directory section
|
|
if ( head[i+1] == 't' ) {
|
|
i += 1;
|
|
//p = printTopDirectory ( p, pend );
|
|
printTopDirectory ( sb );
|
|
continue;
|
|
}
|
|
|
|
// *p++ = head[i];
|
|
sb.safeMemcpy((char*)&head[i], 1);
|
|
continue;
|
|
}
|
|
//return p;
|
|
return true;
|
|
}
|
|
*/
|
|
|
|
// . store into "p"
|
|
// . returns bytes stored into "p"
|
|
// . used for entertainment purposes
|
|
/*
|
|
long printLastQueries ( char *p , char *pend ) {
|
|
// if not 512 bytes left, bail
|
|
if ( pend - p < 512 ) return 0;
|
|
// return w/ no table if no queries have been added to g_qbuf yet
|
|
if ( ! g_nextq == -1 ) return 0;
|
|
// remember start for returning # of bytes stored
|
|
char *start = p;
|
|
// begin table (no border)
|
|
sprintf (p,"<br><table border=0><tr><td><center>Last %li queries:"
|
|
"</td></tr>", (long)QBUF_NUMQUERIES );
|
|
p += gbstrlen ( p );
|
|
// point to last query added
|
|
long n = g_nextq - 1;
|
|
// . wrap it if we need to
|
|
// . QBUF_NUMQUERIES is defined to be 5 in PageResults.h
|
|
if ( n < 0 ) n = QBUF_NUMQUERIES - 1;
|
|
// . print up to five queries
|
|
// . queries are stored by advancing g_nextq, so "i" should go backward
|
|
long count = 0;
|
|
for ( long i = n ; count < QBUF_NUMQUERIES ; count++ , i-- ) {
|
|
// wrap i if we need to
|
|
if ( i == -1 ) i = QBUF_NUMQUERIES - 1;
|
|
// if this query is empty, skip it (might be uninitialized)
|
|
if ( g_qbuf[i][0] == '\0' ) continue;
|
|
// point to the query (these are NULL terminated)
|
|
char *q = g_qbuf[i];
|
|
long qlen = gbstrlen(q);
|
|
// bail if too big
|
|
if ( p + qlen + 32 + 1024 >= pend ) return p - start;
|
|
// otherwise, print this query to the page
|
|
sprintf ( p , "<tr><td><a href=/cgi/0.cgi?q=" );
|
|
p += gbstrlen ( p );
|
|
// store encoded query as cgi parm
|
|
p += urlEncode ( p , q , qlen );
|
|
// end a href tag
|
|
*p++ = '>';
|
|
// . then print the actual query to the page
|
|
// . use htmlEncode so nobody can abuse it
|
|
p += saftenTags ( p , pend - p , q , qlen );
|
|
// wrap it up
|
|
sprintf ( p , "</a></td></tr>" );
|
|
p += gbstrlen ( p );
|
|
}
|
|
// end the table
|
|
sprintf ( p , "</table>");
|
|
p += gbstrlen ( p );
|
|
// return bytes written
|
|
return p - start;
|
|
}
|
|
*/
|
|
|
|
|
|
//char *printTopDirectory ( char *p, char *pend ) {
|
|
// . appends the static "top directory" html table to "sb": one cell per
//   top-level category, each with a few sub-category links, laid out three
//   cells per row plus a final full-width "World" row
// . the whole table is written by ONE safePrintf() call and this function
//   returns that call's return value
bool printTopDirectory ( SafeBuf& sb ) {
	// . the two accented labels (Espanol/Francais with n-tilde and
	//   c-cedilla) are printed through "%c%c" so this file stays ASCII
	// . these are the same byte values that appear percent-encoded
	//   (%c3%b1 and %c3%a7, written with doubled %% below) in the hrefs
	const int nTildeByte1   = 195; // 0xc3
	const int nTildeByte2   = 177; // 0xb1
	const int cCedillaByte1 = 195; // 0xc3
	const int cCedillaByte2 = 167; // 0xa7

	const bool printed = sb.safePrintf (
		// ---- row 1: Arts | Business | Computers
		"<center>"
		"<table cellspacing=\"4\" cellpadding=\"4\"><tr><td valign=top>\n"
		"<b><a href=\"/Arts/\">Arts</a></b><br><small>"
		"<a href=\"/Arts/Movies/\">Movies</a>, "
		"<a href=\"/Arts/Television/\">Television</a>, "
		"<a href=\"/Arts/Music/\">Music</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Business/\">Business</a></b><br><small>"
		"<a href=\"/Business/Employment/\">Jobs</a>, "
		"<a href=\"/Business/Real_Estate/\">Real Estate</a>, "
		"<a href=\"/Business/Investing/\">Investing</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Computers/\">Computers</a></b><br><small>"
		"<a href=\"/Computers/Internet/\">Internet</a>, "
		"<a href=\"/Computers/Software/\">Software</a>, "
		"<a href=\"/Computers/Hardware/\">Hardware</a>..."
		"</small>\n</td></tr>"
		// ---- row 2: Games | Health | Home
		"<tr><td valign=top>"
		"<b><a href=\"/Games/\">Games</a></b><br><small>"
		"<a href=\"/Games/Video_Games/\">Video Games</a>, "
		"<a href=\"/Games/Roleplaying/\">RPGs</a>, "
		"<a href=\"/Games/Gambling/\">Gambling</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Health/\">Health</a></b><br><small>"
		"<a href=\"/Health/Fitness/\">Fitness</a>, "
		"<a href=\"/Health/Medicine/\">Medicine</a>, "
		"<a href=\"/Health/Alternative/\">Alternative</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Home/\">Home</a></b><br><small>"
		"<a href=\"/Home/Family/\">Family</a>, "
		"<a href=\"/Home/Consumer_Information/\">Consumers</a>, "
		"<a href=\"/Home/Cooking/\">Cooking</a>..."
		"</small>\n</td></tr>"
		// ---- row 3: Kids and Teens | News | Recreation
		"<tr><td valign=top>"
		"<b><a href=\"/Kids_and_Teens/\">Kids and Teens</a></b><br><small>"
		"<a href=\"/Kids_and_Teens/Arts/\">Arts</a>, "
		"<a href=\"/Kids_and_Teens/School_Time/\">School Time</a>, "
		"<a href=\"/Kids_and_Teens/Teen_Life/\">Teen Life</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/News/\">News</a></b><br><small>"
		"<a href=\"/News/Media/\">Media</a>, "
		"<a href=\"/News/Newspapers/\">Newspapers</a>, "
		"<a href=\"/News/Weather/\">Weather</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Recreation/\">Recreation</a></b><br><small>"
		"<a href=\"/Recreation/Travel/\">Travel</a>, "
		"<a href=\"/Recreation/Food/\">Food</a>, "
		"<a href=\"/Recreation/Outdoors/\">Outdoors</a>, "
		"<a href=\"/Recreation/Humor/\">Humor</a>..."
		"</small>\n</td></tr>"
		// ---- row 4: Reference | Regional | Science
		"<tr><td valign=top>"
		"<b><a href=\"/Reference/\">Reference</a></b><br><small>"
		"<a href=\"/Reference/Maps/\">Maps</a>, "
		"<a href=\"/Reference/Education/\">Education</a>, "
		"<a href=\"/Reference/Libraries/\">Libraries</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Regional/\">Regional</a></b><br><small>"
		"<a href=\"/Regional/North_America/United_States/\">US</a>, "
		"<a href=\"/Regional/North_America/Canada/\">Canada</a>, "
		"<a href=\"/Regional/Europe/United_Kingdom/\">UK</a>, "
		"<a href=\"/Regional/Europe/\">Europe</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Science/\">Science</a></b><br><small>"
		"<a href=\"/Science/Biology/\">Biology</a>, "
		"<a href=\"/Science/Social_Sciences/Psychology/\">Psychology</a>, "
		"<a href=\"/Science/Physics/\">Physics</a>..."
		"</small>\n</td></tr>"
		// ---- row 5: Shopping | Society | Sports
		"<tr><td valign=top>"
		"<b><a href=\"/Shopping/\">Shopping</a></b><br><small>"
		"<a href=\"/Shopping/Vehicles/Autos/\">Autos</a>, "
		"<a href=\"/Shopping/Clothing/\">Clothing</a>, "
		"<a href=\"/Shopping/Gifts/\">Gifts</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Society/\">Society</a></b><br><small>"
		"<a href=\"/Society/People/\">People</a>, "
		"<a href=\"/Society/Religion_and_Spirituality/\">Religion</a>, "
		"<a href=\"/Society/Issues/\">Issues</a>..."
		"</small>\n</td><td valign=top>"
		"<b><a href=\"/Sports/\">Sports</a></b><br><small>"
		"<a href=\"/Sports/Baseball/\">Baseball</a>, "
		"<a href=\"/Sports/Soccer/\">Soccer</a>, "
		"<a href=\"/Sports/Basketball/\">Basketball</a>..."
		"</small>\n</td></tr>"
		// ---- row 6: World (spans all three columns)
		"<tr><td colspan=3 valign=top>"
		"<b><a href=\"/World/\">World</a></b><br><small>"
		"<a href=\"/World/Deutsch/\">Deutsch</a>, "
		"<a href=\"/World/Espa%%c3%%b1ol/\">Espa%c%col</a>, "
		"<a href=\"/World/Fran%%c3%%a7ais/\">Fran%c%cais</a>, "
		"<a href=\"/World/Italiano/\">Italiano</a>, "
		"<a href=\"/World/Japanese/\">Japanese</a>, "
		"<a href=\"/World/Nederlands/\">Nederlands</a>, "
		"<a href=\"/World/Polska/\">Polska</a>, "
		"<a href=\"/World/Dansk/\">Dansk</a>, "
		"<a href=\"/World/Svenska/\">Svenska</a>..."
		"</small>\n"
		"</td></tr></table></center>\n",
		nTildeByte1 , nTildeByte2 , cCedillaByte1 , cCedillaByte2 );

	return printed;
}
|
|
|
|
/////////////////
|
|
//
|
|
// ADD URL PAGE
|
|
//
|
|
/////////////////
|
|
|
|
#include "PageInject.h"
|
|
#include "TuringTest.h"
|
|
#include "AutoBan.h"
|
|
#include "CollectionRec.h"
|
|
#include "Users.h"
|
|
#include "Spider.h"
|
|
|
|
//static bool sendReply ( void *state , bool addUrlEnabled );
|
|
static bool canSubmit (unsigned long h, long now, long maxUrlsPerIpDom);
|
|
|
|
//static void addedStuff ( void *state );
|
|
|
|
void resetPageAddUrl ( ) ;
|
|
|
|
/*
|
|
class State2 {
|
|
public:
|
|
Url m_url;
|
|
//char *m_buf;
|
|
//long m_bufLen;
|
|
//long m_bufMaxLen;
|
|
};
|
|
*/
|
|
|
|
class State1 {
|
|
public:
|
|
//Msg4 m_msg4;
|
|
Msg7 m_msg7;
|
|
TcpSocket *m_socket;
|
|
bool m_isAdmin;
|
|
char m_coll[MAX_COLL_LEN+1];
|
|
bool m_goodAnswer;
|
|
bool m_doTuringTest;
|
|
long m_ufuLen;
|
|
char m_ufu[MAX_URL_LEN];
|
|
|
|
//long m_urlLen;
|
|
//char m_url[MAX_URL_LEN];
|
|
|
|
//char m_username[MAX_USER_SIZE];
|
|
bool m_strip;
|
|
bool m_spiderLinks;
|
|
bool m_forceRespider;
|
|
// buf filled by the links coming from google, msn, yahoo, etc
|
|
//State2 m_state2[5]; // gb, goog, yahoo, msn, ask
|
|
long m_numSent;
|
|
long m_numReceived;
|
|
//long m_raw;
|
|
//SpiderRequest m_sreq;
|
|
};
|
|
|
|
static void doneInjectingWrapper3 ( void *st1 ) ;
|
|
|
|
// only allow up to 1 Msg10's to be in progress at a time
|
|
static bool s_inprogress = false;
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno on error
|
|
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
|
|
// . get fields from cgi field of the requested url
|
|
// . get the search query
|
|
long urlLen = 0;
|
|
char *url = r->getString ( "u" , &urlLen , NULL /*default*/);
|
|
|
|
// see if they provided a url of a file of urls if they did not
|
|
// provide a url to add directly
|
|
bool isAdmin = g_collectiondb.isAdmin ( r , s );
|
|
long ufuLen = 0;
|
|
char *ufu = NULL;
|
|
if ( isAdmin )
|
|
// get the url of a file of urls (ufu)
|
|
ufu = r->getString ( "ufu" , &ufuLen , NULL );
|
|
|
|
// can't be too long, that's obnoxious
|
|
if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
|
|
g_errno = EBUFTOOSMALL;
|
|
g_msg = " (error: url too long)";
|
|
return g_httpServer.sendErrorReply(s,500,"url too long");
|
|
}
|
|
// get the collection
|
|
long collLen = 0;
|
|
char *coll = r->getString("c",&collLen);
|
|
if ( ! coll || ! coll[0] ) {
|
|
//coll = g_conf.m_defaultColl;
|
|
coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
|
|
collLen = gbstrlen(coll);
|
|
}
|
|
// get collection rec
|
|
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
|
// bitch if no collection rec found
|
|
if ( ! cr ) {
|
|
g_errno = ENOCOLLREC;
|
|
g_msg = " (error: no collection)";
|
|
return g_httpServer.sendErrorReply(s,500,"no coll rec");
|
|
}
|
|
// . make sure the ip is not banned
|
|
// . we may also have an exclusive list of IPs for private collections
|
|
if ( ! cr->hasSearchPermission ( s ) ) {
|
|
g_errno = ENOPERM;
|
|
g_msg = " (error: permission denied)";
|
|
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
|
}
|
|
|
|
|
|
//
|
|
// if no url, print the main homepage page
|
|
//
|
|
if ( ! url ) {
|
|
SafeBuf sb;
|
|
printAddUrlHomePage ( sb , NULL , r );
|
|
return g_httpServer.sendDynamicPage(s,
|
|
sb.getBufStart(),
|
|
sb.length(),
|
|
// 120 secs cachetime
|
|
// don't cache any more
|
|
// since we have the
|
|
// login bar at top of page
|
|
0,//120 ,// cachetime
|
|
false,// post?
|
|
"text/html",
|
|
200,
|
|
NULL, // cookie
|
|
"UTF-8",
|
|
r);
|
|
}
|
|
|
|
//
|
|
// run the ajax script on load to submit the url now
|
|
//
|
|
long id = r->getLong("id",0);
|
|
// if we are not being called by the ajax loader, the put the
|
|
// ajax loader script into the html now
|
|
if ( id == 0 ) {
|
|
SafeBuf sb;
|
|
printAddUrlHomePage ( sb , url , r );
|
|
return g_httpServer.sendDynamicPage ( s,
|
|
sb.getBufStart(),
|
|
sb.length(),
|
|
// don't cache any more
|
|
// since we have the
|
|
// login bar at top of
|
|
//page
|
|
0,//3600,// cachetime
|
|
false,// post?
|
|
"text/html",
|
|
200,
|
|
NULL, // cookie
|
|
"UTF-8",
|
|
r);
|
|
}
|
|
|
|
//
|
|
// ok, inject the provided url!!
|
|
//
|
|
|
|
//
|
|
// check for errors first
|
|
//
|
|
|
|
// if addurl is turned off, just print "disabled" msg
|
|
char *msg = NULL;
|
|
if ( ! g_conf.m_addUrlEnabled )
|
|
msg = "Add url is temporarily disabled";
|
|
// can also be turned off in the collection rec
|
|
if ( ! cr->m_addUrlEnabled )
|
|
msg = "Add url is temporarily disabled";
|
|
// or if in read-only mode
|
|
if ( g_conf.m_readOnlyMode )
|
|
msg = "Add url is temporarily disabled";
|
|
// cannot add if another Msg10 from here is still in progress
|
|
if ( s_inprogress )
|
|
msg = "Add url is currently busy! Try again in a second.";
|
|
|
|
// . send msg back to the ajax request
|
|
// . use cachetime of 3600 so it does not re-inject if you hit the
|
|
// back button!
|
|
if ( msg ) {
|
|
SafeBuf sb;
|
|
sb.safePrintf("%s",msg);
|
|
g_httpServer.sendDynamicPage (s,
|
|
sb.getBufStart(),
|
|
sb.length(),
|
|
3600,//-1, // cachetime
|
|
false,// post?
|
|
"text/html",
|
|
200, // http status
|
|
NULL, // cookie
|
|
"UTF-8");
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
|
|
// make a new state
|
|
State1 *st1 ;
|
|
try { st1 = new (State1); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("PageAddUrl: new(%i): %s",
|
|
sizeof(State1),mstrerror(g_errno));
|
|
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
|
|
mnew ( st1 , sizeof(State1) , "PageAddUrl" );
|
|
// save socket and isAdmin
|
|
st1->m_socket = s;
|
|
st1->m_isAdmin = isAdmin;
|
|
|
|
/*
|
|
// save the url
|
|
st1->m_url[0] = '\0';
|
|
if ( url ) {
|
|
// normalize and add www. if it needs it
|
|
Url uu;
|
|
uu.set ( url , gbstrlen(url) , true );
|
|
// remove >'s i guess and store in st1->m_url[] buffer
|
|
st1->m_urlLen=cleanInput ( st1->m_url,
|
|
MAX_URL_LEN,
|
|
uu.getUrl(),
|
|
uu.getUrlLen() );
|
|
}
|
|
*/
|
|
|
|
// save the "ufu" (url of file of urls)
|
|
st1->m_ufu[0] = '\0';
|
|
st1->m_ufuLen = ufuLen;
|
|
memcpy ( st1->m_ufu , ufu , ufuLen );
|
|
st1->m_ufu[ufuLen] = '\0';
|
|
|
|
st1->m_doTuringTest = cr->m_doTuringTest;
|
|
st1->m_spiderLinks = true;
|
|
st1->m_strip = true;
|
|
|
|
// save the collection name in the State1 class
|
|
if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
|
|
strncpy ( st1->m_coll , coll , collLen );
|
|
st1->m_coll [ collLen ] = '\0';
|
|
|
|
// assume they answered turing test correctly
|
|
st1->m_goodAnswer = true;
|
|
|
|
// get ip of submitter
|
|
//unsigned long h = ipdom ( s->m_ip );
|
|
// . use top 2 bytes now, some isps have large blocks
|
|
// . if this causes problems, then they can do pay for inclusion
|
|
unsigned long h = iptop ( s->m_ip );
|
|
long codeLen;
|
|
char* code = r->getString("code", &codeLen);
|
|
if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
|
|
long uipLen = 0;
|
|
char* uip = r->getString("uip",&uipLen);
|
|
long hip = 0;
|
|
//use the uip when we have a raw query to test if
|
|
//we can submit
|
|
if(uip) {
|
|
hip = atoip(uip, uipLen);
|
|
h = iptop( hip );
|
|
}
|
|
}
|
|
|
|
|
|
st1->m_strip = r->getLong("strip",0);
|
|
// Remember, for cgi, if the box is not checked, then it is not
|
|
// reported in the request, so set default return value to 0
|
|
st1->m_spiderLinks = r->getLong("spiderLinks",0);
|
|
|
|
// . should we force it into spiderdb even if already in there
|
|
// . use to manually update spider times for a url
|
|
// . however, will not remove old scheduled spider times
|
|
// . mdw: made force on the default
|
|
st1->m_forceRespider = r->getLong("force",1); // 0);
|
|
|
|
long now = getTimeGlobal();
|
|
// . allow 1 submit every 1 hour
|
|
// . restrict by submitter domain ip
|
|
if ( ! st1->m_isAdmin &&
|
|
! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
|
|
// return error page
|
|
//g_errno = ETOOEARLY;
|
|
SafeBuf sb;
|
|
sb.safePrintf("You breached your add url quota.");
|
|
mdelete ( st1 , sizeof(State1) , "PageAddUrl" );
|
|
delete (st1);
|
|
// use cachetime of 3600 so it does not re-inject if you hit
|
|
// the back button!
|
|
g_httpServer.sendDynamicPage (s,
|
|
sb.getBufStart(),
|
|
sb.length(),
|
|
3600,//-1, // cachetime
|
|
false,// post?
|
|
"text/html",
|
|
200, // http status
|
|
NULL, // cookie
|
|
"UTF-8");
|
|
return true;
|
|
}
|
|
|
|
//st1->m_query = r->getString( "qts", &st1->m_queryLen );
|
|
|
|
// check it, if turing test is enabled for this collection
|
|
/*
|
|
if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
|
|
! g_turingTest.isHuman(r) ) {
|
|
// log note so we know it didn't make it
|
|
g_msg = " (error: bad answer)";
|
|
//log("PageAddUrl:: addurl failed for %s : bad answer",
|
|
// iptoa(s->m_ip));
|
|
st1->m_goodAnswer = false;
|
|
return sendReply ( st1 , true ); // addUrl enabled?
|
|
}
|
|
*/
|
|
|
|
|
|
//
|
|
// inject using msg7
|
|
//
|
|
|
|
// . pass in the cleaned url
|
|
// . returns false if blocked, true otherwise
|
|
if ( ! st1->m_msg7.inject ( s ,
|
|
r ,
|
|
st1 ,
|
|
doneInjectingWrapper3 ) )
|
|
return false;
|
|
|
|
// some kinda error, g_errno should be set i guess
|
|
doneInjectingWrapper3 ( st1 );
|
|
// we did not block
|
|
return true;
|
|
}
|
|
|
|
|
|
void doneInjectingWrapper3 ( void *st ) {
|
|
State1 *st1 = (State1 *)st;
|
|
// allow others to add now
|
|
s_inprogress = false;
|
|
// get the state properly
|
|
//State1 *st1 = (State1 *) state;
|
|
// in order to see what sites are being added log it, then we can
|
|
// more easily remove sites from sitesearch.gigablast.com that are
|
|
// being added but not being searched
|
|
char *url = st1->m_msg7.m_xd.m_firstUrl.m_url;
|
|
log(LOG_INFO,"http: add url %s (%s)",url ,mstrerror(g_errno));
|
|
// extract info from state
|
|
TcpSocket *s = st1->m_socket;
|
|
//bool isAdmin = st1->m_isAdmin;
|
|
//char *url = NULL;
|
|
//if ( st1->m_urlLen ) url = st1->m_url;
|
|
// re-null it out if just http://
|
|
//bool printUrl = true;
|
|
//if ( st1->m_urlLen == 0 ) printUrl = false;
|
|
//if ( ! st1->m_url ) printUrl = false;
|
|
//if(st1->m_urlLen==7&&st1->m_url&&!strncasecmp(st1->m_url,"http://",7)
|
|
// printUrl = false;
|
|
|
|
// page is not more than 32k
|
|
char buf[1024*32+MAX_URL_LEN*2];
|
|
SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);
|
|
|
|
//char rawbuf[1024*8];
|
|
//SafeBuf rb(rawbuf, 1024*8);
|
|
//rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
|
|
//rb.safePrintf("<status>\n");
|
|
//CollectionRec *cr = g_collectiondb.getRec ( st1->m_coll );
|
|
|
|
// collection name
|
|
|
|
//char tt [ 128 ];
|
|
//tt[0] = '\0';
|
|
//if ( st1->m_coll[0] != '\0' && ! isAdmin )
|
|
// sprintf ( tt , " for %s", st1->m_coll );
|
|
|
|
|
|
//
|
|
// what we print here will just be the error msg, because the
|
|
// ajax will fill the text we print here into the div below
|
|
// the add url box
|
|
//
|
|
|
|
// if there was an error let them know
|
|
//char msg[MAX_URL_LEN + 1024];
|
|
char *pm = "";
|
|
if ( g_errno ) {
|
|
if ( g_errno == ETOOEARLY ) {
|
|
pm = "Error. 100 urls have "
|
|
"already been submitted by "
|
|
"this IP address for the last 24 hours. "
|
|
"<a href=/addurlerror.html>Explanation</a>.";
|
|
log("addurls: Failed for user at %s: "
|
|
"quota breeched.", iptoa(s->m_ip));
|
|
|
|
//rb.safePrintf("Error. %li urls have "
|
|
// "already been submitted by "
|
|
// "this IP address for the "
|
|
// "last 24 hours. ",
|
|
// cr->m_maxAddUrlsPerIpDomPerDay);
|
|
sb.safePrintf("%s",pm);
|
|
}
|
|
else {
|
|
sb.safePrintf("Error adding url(s): <b>%s[%i]</b>",
|
|
mstrerror(g_errno) , g_errno);
|
|
//pm = msg;
|
|
//rb.safePrintf("Error adding url(s): %s[%i]",
|
|
// mstrerror(g_errno) , g_errno);
|
|
//sb.safePrintf("%s",pm);
|
|
}
|
|
}
|
|
else {
|
|
if ( ! g_conf.m_addUrlEnabled ) {
|
|
pm = "<font color=#ff0000>"
|
|
"Sorry, this feature is temporarily disabled. "
|
|
"Please try again later.</font>";
|
|
if ( url )
|
|
log("addurls: failed for user at %s: "
|
|
"add url is disabled. "
|
|
"Enable add url on the "
|
|
"Master Controls page and "
|
|
"on the Spider Controls page for "
|
|
"this collection.",
|
|
iptoa(s->m_ip));
|
|
|
|
sb.safePrintf("%s",pm);
|
|
//rb.safePrintf("Sorry, this feature is temporarily "
|
|
// "disabled. Please try again later.");
|
|
}
|
|
else if ( s_inprogress ) {
|
|
pm = "Add url busy. Try again later.";
|
|
log("addurls: Failed for user at %s: "
|
|
"busy adding another.", iptoa(s->m_ip));
|
|
//rb.safePrintf("Add url busy. Try again later.");
|
|
sb.safePrintf("%s",pm);
|
|
}
|
|
// did they fail the turing test?
|
|
else if ( ! st1->m_goodAnswer ) {
|
|
pm = "<font color=#ff0000>"
|
|
"Oops, you did not enter the 4 large letters "
|
|
"you see below. Please try again.</font>";
|
|
//rb.safePrintf("could not add the url"
|
|
// " because the turing test"
|
|
// " is enabled.");
|
|
sb.safePrintf("%s",pm);
|
|
}
|
|
else if ( st1->m_msg7.m_xd.m_indexCodeValid &&
|
|
st1->m_msg7.m_xd.m_indexCode ) {
|
|
long ic = st1->m_msg7.m_xd.m_indexCode;
|
|
sb.safePrintf("<b>Had error injecting url: %s</b>",
|
|
mstrerror(ic));
|
|
}
|
|
/*
|
|
if ( url && ! st1->m_ufu[0] && url[0] && printUrl ) {
|
|
sprintf ( msg ,"<u>%s</u> added to spider "
|
|
"queue "
|
|
"successfully", url );
|
|
//rb.safePrintf("%s added to spider "
|
|
// "queue successfully", url );
|
|
}
|
|
else if ( st1->m_ufu[0] ) {
|
|
sprintf ( msg ,"urls in <u>%s</u> "
|
|
"added to spider queue "
|
|
"successfully", st1->m_ufu );
|
|
|
|
//rb.safePrintf("urls in %s added to spider "
|
|
// "queue successfully", url );
|
|
|
|
}
|
|
*/
|
|
else {
|
|
//rb.safePrintf("Add the url you want:");
|
|
// avoid hitting browser page cache
|
|
unsigned long rand32 = rand();
|
|
// in the mime to 0 seconds!
|
|
sb.safePrintf("<b>Url successfully added. "
|
|
"<a href=/search?rand=%lu&q=url%%3A",
|
|
rand32);
|
|
sb.urlEncode(url);
|
|
sb.safePrintf(">Check it</a> or "
|
|
"<a href=/seo?u=");
|
|
sb.urlEncode(url);
|
|
sb.safePrintf(">SEO it</a>"
|
|
".</b>");
|
|
}
|
|
|
|
//pm = msg;
|
|
//url = "http://";
|
|
//else
|
|
// pm = "Don't forget to <a href=/gigaboost.html>"
|
|
// "Gigaboost</a> your URL.";
|
|
}
|
|
|
|
// store it
|
|
sb.safePrintf("<b>%s</b>",pm );
|
|
|
|
// clear g_errno, if any, so our reply send goes through
|
|
g_errno = 0;
|
|
|
|
|
|
// nuke state
|
|
mdelete ( st1 , sizeof(State1) , "PageAddUrl" );
|
|
delete (st1);
|
|
|
|
// this reply should be loaded from the ajax loader so use a cache
|
|
// time of 1 hour so it does not re-inject the url if you hit the
|
|
// back button
|
|
g_httpServer.sendDynamicPage (s,
|
|
sb.getBufStart(),
|
|
sb.length(),
|
|
3600, // cachetime
|
|
false,// post?
|
|
"text/html",
|
|
200, // http status
|
|
NULL, // cookie
|
|
"UTF-8");
|
|
}
|
|
|
|
|
|
// we get like 100k submissions a day!!!
|
|
static HashTable s_htable;
|
|
static bool s_init = false;
|
|
static long s_lastTime = 0;
|
|
bool canSubmit ( unsigned long h , long now , long maxAddUrlsPerIpDomPerDay ) {
|
|
// . sometimes no limit
|
|
// . 0 means no limit because if they don't want any submission they
|
|
// can just turn off add url and we want to avoid excess
|
|
// troubleshooting for why a url can't be added
|
|
if ( maxAddUrlsPerIpDomPerDay <= 0 ) return true;
|
|
// init the table
|
|
if ( ! s_init ) {
|
|
s_htable.set ( 50000 );
|
|
s_init = true;
|
|
}
|
|
// clean out table every 24 hours
|
|
if ( now - s_lastTime > 24*60*60 ) {
|
|
s_lastTime = now;
|
|
s_htable.clear();
|
|
}
|
|
// . if table almost full clean out ALL slots
|
|
// . TODO: just clean out oldest slots
|
|
if ( s_htable.getNumSlotsUsed() > 47000 ) s_htable.clear ();
|
|
// . how many times has this IP domain submitted?
|
|
// . allow 10 times per day
|
|
long n = s_htable.getValue ( h );
|
|
// if over 24hr limit then bail
|
|
if ( n >= maxAddUrlsPerIpDomPerDay ) return false;
|
|
// otherwise, inc it
|
|
n++;
|
|
// add to table, will replace old values
|
|
s_htable.addKey ( h , n );
|
|
return true;
|
|
}
|
|
|
|
|
|
// . calls reset() on s_htable, the table canSubmit() uses to count add url
//   submissions per submitter
// . NOTE(review): s_init is left set, so canSubmit() will not call
//   s_htable.set() again afterwards -- presumably fine if this only runs
//   at shutdown, confirm against the caller
void resetPageAddUrl ( ) {
	s_htable.reset ( );
}
|
|
|