#include "gb-include.h" #include "Indexdb.h" // makeKey(long long docId) #include "Titledb.h" #include "Spider.h" #include "Tagdb.h" #include "Dns.h" //#include "PageResults.h" // for query buf, g_qbuf #include "Collectiondb.h" //#include "CollectionRec.h" #include "Clusterdb.h" // for getting # of docs indexed //#include "Checksumdb.h" // should migrate to this one, though #include "Pages.h" #include "Query.h" // MAX_QUERY_LEN #include "SafeBuf.h" #include "LanguageIdentifier.h" #include "LanguagePages.h" #include "Users.h" #include "Address.h" // getIPLocation #include "Proxy.h" //char *printNumResultsDropDown ( char *p, long n, bool *printedDropDown); bool printNumResultsDropDown ( SafeBuf& sb, long n, bool *printedDropDown); //static char *printTopDirectory ( char *p, char *pend ); static bool printTopDirectory ( SafeBuf& sb , char format ); // this prints the last five queries //static long printLastQueries ( char *p , char *pend ) ; //static char *expandRootHtml ( char *p , long plen , /* static bool expandRootHtml ( SafeBuf& sb, uint8_t *html , long htmlLen , char *q , long qlen , HttpRequest *r , TcpSocket *s , long long docsInColl , CollectionRec *cr ) ; */ bool sendPageRoot ( TcpSocket *s, HttpRequest *r ){ return sendPageRoot ( s, r, NULL ); } bool printNav ( SafeBuf &sb , HttpRequest *r ) { /* char *root = ""; char *rootSecure = ""; if ( g_conf.m_isMattWells ) { root = "http://www.gigablast.com"; rootSecure = "https://www.gigablast.com"; } sb.safePrintf("
"); */ sb.safePrintf("" ""); return true; } ////////////// // // BEGIN expandHtml() helper functions // ////////////// bool printFamilyFilter ( SafeBuf& sb , bool familyFilterOn ) { char *s1 = ""; char *s2 = ""; if ( familyFilterOn ) s1 = " checked"; else s2 = " checked"; //p += sprintf ( p , return sb.safePrintf ( "Family filter: " "On   " "Off   " , s1 , s2 ); //return p; } //char *printNumResultsDropDown ( char *p , long n , bool *printedDropDown ) { bool printNumResultsDropDown ( SafeBuf& sb , long n , bool *printedDropDown ) { if ( n!=10 && n!=20 && n!=30 && n!=50 && n!=100 ) //return p; return true; *printedDropDown = true; char *d1 = ""; char *d2 = ""; char *d3 = ""; char *d4 = ""; char *d5 = ""; if ( n == 10 ) d1 = " selected"; if ( n == 20 ) d2 = " selected"; if ( n == 30 ) d3 = " selected"; if ( n == 50 ) d4 = " selected"; if ( n ==100 ) d5 = " selected"; //p += sprintf ( p , return sb.safePrintf ( "", d1,d2,d3,d4,d5); //return p; } //char *printDirectorySearchType ( char *p, long sdirt ) { bool printDirectorySearchType ( SafeBuf& sb, long sdirt ) { // default to entire directory if (sdirt < 1 || sdirt > 4) sdirt = 3; // by default search the whole thing sb.safePrintf(""); else sb.safePrintf(">"); sb.safePrintf("Entire Directory
\n"); // entire category sb.safePrintf(""); else sb.safePrintf(">"); sb.safePrintf("Entire Category
\n"); // base category only sb.safePrintf(""); else sb.safePrintf(">"); sb.safePrintf("Pages in Base Category
\n"); // sites in base category sb.safePrintf(""); else sb.safePrintf(">"); sb.safePrintf("Sites in Base Category
\n"); // sites in entire category sb.safePrintf(""); else sb.safePrintf(">"); sb.safePrintf("Sites in Entire Category
\n"); // end it return true; } #include "SearchInput.h" bool printRadioButtons ( SafeBuf& sb , SearchInput *si ) { // don't display this for directory search // look it up. returns catId <= 0 if dmoz not setup yet. // From PageDirectory.cpp //long catId= g_categories->getIdFromPath(decodedPath, decodedPathLen); // if /Top print the directory homepage //if ( catId == 1 || catId <= 0 ) // return true; // site /* if ( si->m_siteLen > 0 ) { // . print rest of search box etc. // . print cobranding radio buttons //if ( p + si->m_siteLen + 1 >= pend ) return p; //p += sprintf ( p , return sb.safePrintf ( //"   " //"" //"" //"Powered by Gigablast" //"
" //"" "" "Search the Web " "Search %s" , //"
" //"" //"" , si->m_site , si->m_site ); } else if ( si->m_sitesLen > 0 ) { */ if ( si->m_sites && si->m_sites[0] ) { // . print rest of search box etc. // . print cobranding radio buttons //if ( p + si->m_sitesLen + 1 >= pend ) return p; // if not explicitly instructed to print all sites // and they are a long list, do not print all /* char tmp[1000]; char *x = si->m_sites; if ( si->m_sitesLen > 255){//&&!st->m_printAllSites){ // copy what's there strncpy ( tmp , si->m_sites , 255 ); x = tmp + 254 ; // do not hack off in the middle of a site while ( is_alnum(*x) && x > tmp ) x--; // overwrite it with [more] link //x += sprintf ( x , "= xend ) goto skipit; sprintf ( x , " ..." ); x = tmp; } */ //p += sprintf ( p , sb.safePrintf ( //"   " //"" //"" //"Powered by Gigablast" //"
" //"" "" "Search the Web " "Search ", //"
" //"" //"" , si->m_sites ); sb.safeTruncateEllipsis ( si->m_sites, 255 ); } return true; } bool printLogo ( SafeBuf& sb , SearchInput *si ) { // if an image was provided... if ( ! si->m_imgUrl || ! si->m_imgUrl[0] ) { // no, now we default to our logo //return true; //p += sprintf ( p , return sb.safePrintf ( "" "" ); //return p; } // do we have a link? if ( si->m_imgLink && si->m_imgLink[0]) //p += sprintf ( p , "",si->m_imgLink); sb.safePrintf ( "", si->m_imgLink ); // print image width and length if ( si->m_imgWidth >= 0 && si->m_imgHeight >= 0 ) //p += sprintf ( p , "m_imgWidth , si->m_imgHeight ); else //p += sprintf ( p , "", sb.safePrintf( "border=0 src=\"%s\">", si->m_imgUrl ); // end the link if we had one if ( si->m_imgLink && si->m_imgLink[0] ) //p += sprintf ( p , ""); sb.safePrintf ( ""); return true; } ///////////// // // END expandHtml() helper functions // ///////////// bool expandHtml ( SafeBuf& sb, char *head , long hlen , char *q , long qlen , HttpRequest *r , SearchInput *si, char *method , CollectionRec *cr ) { //char *pend = p + plen; // store custom header into buf now //for ( long i = 0 ; i < hlen && p+10 < pend ; i++ ) { for ( long i = 0 ; i < hlen; i++ ) { if ( head[i] != '%' ) { // *p++ = head[i]; sb.safeMemcpy((char*)&head[i], 1); continue; } if ( i + 1 >= hlen ) { // *p++ = head[i]; sb.safeMemcpy((char*)&head[i], 1); continue; } if ( head[i+1] == 'S' ) { // now we got the %S, insert "spiders are [on/off]" bool spidersOn = true; if ( ! g_conf.m_spideringEnabled ) spidersOn = false; if ( ! cr->m_spideringEnabled ) spidersOn = false; if ( spidersOn ) sb.safePrintf("Spiders are on"); else sb.safePrintf("Spiders are off"); // skip over %S i += 1; continue; } if ( head[i+1] == 'q' ) { // now we got the %q, insert the query char *p = (char*) sb.getBuf(); char *pend = (char*) sb.getBufEnd(); long eqlen = dequote ( p , pend , q , qlen ); //p += eqlen; sb.incrementLength(eqlen); // skip over %q i += 1; continue; } if ( head[i+1] == 'c' ) { // now we got the %q, insert the query if ( cr ) sb.safeStrcpy(cr->m_coll); // skip over %c i += 1; continue; } if ( head[i+1] == 'w' && head[i+2] == 'h' && head[i+3] == 'e' && head[i+4] == 'r' && head[i+5] == 'e' ) { // insert the location long whereLen; char *where = r->getString("where",&whereLen); // get it from cookie as well! if ( ! where ) where = r->getStringFromCookie("where", &whereLen); // fix for getStringFromCookie if ( where && ! where[0] ) where = NULL; // skip over the %where i += 5; // if empty, base it on IP if ( ! where ) { double lat; double lon; double radius; char *city,*state,*ctry; // use this by default long ip = r->m_userIP; // ip for testing? long iplen; char *ips = r->getString("uip",&iplen); if ( ips ) ip = atoip(ips); // returns true if found in db char buf[128]; getIPLocation ( ip , &lat , &lon , &radius, &city , &state , &ctry , buf , 128 ) ; if ( city && state ) sb.safePrintf("%s, %s",city,state); } else sb.dequote (where,whereLen); continue; } if ( head[i+1] == 'w' && head[i+2] == 'h' && head[i+3] == 'e' && head[i+4] == 'n' ) { // insert the location long whenLen; char *when = r->getString("when",&whenLen); // skip over the %when i += 4; if ( ! 
when ) continue; sb.dequote (when,whenLen); continue; } // %sortby if ( head[i+1] == 's' && head[i+2] == 'o' && head[i+3] == 'r' && head[i+4] == 't' && head[i+5] == 'b' && head[i+6] == 'y' ) { // insert the location long sortBy = r->getLong("sortby",1); // print the radio buttons char *cs[5]; cs[0]=""; cs[1]=""; cs[2]=""; cs[3]=""; cs[4]=""; if ( sortBy >=1 && sortBy <=4 ) cs[sortBy] = " checked"; sb.safePrintf( "date " "distance " "relevancy " "popularity", cs[1],cs[2],cs[3],cs[4]); // skip over the %sortby i += 6; continue; } if ( head[i+1] == 'e' ) { // now we got the %e, insert the query char *p = (char*) sb.getBuf(); long plen = sb.getAvail(); long eqlen = urlEncode ( p , plen , q , qlen ); //p += eqlen; sb.incrementLength(eqlen); // skip over %e i += 1; continue; } if ( head[i+1] == 'N' ) { // now we got the %N, insert the global doc count //long long c=g_checksumdb.getRdb()->getNumGlobalRecs(); //now each host tells us how many docs it has in itsping long long c = g_hostdb.getNumGlobalRecs(); c += g_conf.m_docCountAdjustment; // never allow to go negative if ( c < 0 ) c = 0; //p+=ulltoa(p,c); char *p = (char*) sb.getBuf(); sb.reserve2x(16); long len = ulltoa(p, c); sb.incrementLength(len); // skip over %N i += 1; continue; } /* if ( head[i+1] == 'E' ) { // now each host tells us how many docs it has in its // ping request long long c = g_hostdb.getNumGlobalEvents(); char *p = (char*) sb.getBuf(); sb.reserve2x(16); long len = ulltoa(p, c); sb.incrementLength(len); // skip over %E i += 1; continue; } */ if ( head[i+1] == 'n' ) { // now we got the %n, insert the collection doc count //p+=ulltoa(p,docsInColl); char *p = (char*) sb.getBuf(); sb.reserve2x(16); long long docsInColl = 0; if ( cr ) docsInColl = cr->getNumDocsIndexed(); long len = ulltoa(p, docsInColl); sb.incrementLength(len); // skip over %n i += 1; continue; } /* if ( head[i+1] == 'T' ) { // . print the final tail // . only print admin link if we're local //long user = g_pages.getUserType ( s , r ); //char *username = g_users.getUsername(r); //char *pwd = r->getString ( "pwd" ); char *p = (char*) sb.getBuf(); long plen = sb.getAvail(); //p = g_pages.printTail ( p , p + plen , user , pwd ); char *n = g_pages.printTail(p , p + plen , r->isLocal()); sb.incrementLength(n - p); // skip over %T i += 1; continue; } */ // print the drop down menu for selecting the # of reslts if ( head[i+1] == 'D' ) { // skip over %D i += 1; // skip if not enough buffer //if ( p + 1000 >= pend ) continue; // # results //long n = r->getLong("n",10); //bool printedDropDown; //p = printNumResultsDropDown(p,n,&printedDropDown); //printNumResultsDropDown(sb,n,&printedDropDown); continue; } if ( head[i+1] == 'H' ) { // . insert the secret key here, to stop seo bots // . TODO: randomize its position to make parsing more // difficult // . 
this secret key is for submitting a new query // long key; // char kname[4]; // g_httpServer.getKey (&key,kname,NULL,0,time(NULL),0, // 10); //sprintf (p , "", // kname,key); //p += gbstrlen ( p ); // sb.safePrintf( "", // kname,key); //adds param for default screen size //if(cr) // sb.safePrintf("", //cr->m_screenWidth); // insert collection name too long collLen; char *coll = r->getString ( "c" , &collLen ); if ( collLen > 0 && collLen < MAX_COLL_LEN ) { //sprintf (p,"\n"); //p += gbstrlen ( p ); sb.safePrintf("\">\n"); } // pass this crap on so zak can do searches //char *username = g_users.getUsername(r); // this is null because not in the cookie and we are // logged in //char *pwd = r->getString ( "pwd" ); //sb.safePrintf("\n", //pwd); //sb.safePrintf("\n",username); // skip over %H i += 1; continue; } // %t, print Top Directory section if ( head[i+1] == 't' ) { i += 1; //p = printTopDirectory ( p, pend ); printTopDirectory ( sb , FORMAT_HTML ); continue; } // MDW if ( head[i+1] == 'F' ) { i += 1; //p = printTopDirectory ( p, pend ); if ( ! method ) method = "GET"; sb.safePrintf("
\n",method); continue; } if ( head[i+1] == 'L' ) { i += 1; //p = printTopDirectory ( p, pend ); printLogo ( sb , si ); continue; } if ( head[i+1] == 'f' ) { i += 1; //p = printTopDirectory ( p, pend ); printFamilyFilter ( sb , si->m_familyFilter ); continue; } if ( head[i+1] == 'R' ) { i += 1; //p = printTopDirectory ( p, pend ); printRadioButtons ( sb , si ); continue; } // MDW // *p++ = head[i]; sb.safeMemcpy((char*)&head[i], 1); continue; } //return p; return true; } bool printFrontPageShell ( SafeBuf &sb , long pageNum ) { sb.safePrintf("\n"); sb.safePrintf("\n"); //sb.safePrintf(""); sb.safePrintf("\n"); sb.safePrintf("\n"); char *title = "An Alternative Open Source Search Engine"; if ( pageNum == 1 ) title = "Directory"; if ( pageNum == 2 ) title = "Advanced"; if ( pageNum == 3 ) title = "Add Url"; if ( pageNum == 4 ) title = "About"; if ( pageNum == 5 ) title = "Help"; if ( pageNum == 6 ) title = "API"; sb.safePrintf("Gigablast - %s\n",title); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); //sb.safePrintf("\n"); //g_proxy.insertLoginBarDirective ( &sb ); // // DIVIDE INTO TWO PANES, LEFT COLUMN and MAIN COLUMN // sb.safePrintf("" "\n\n"); class MenuItem { public: char *m_text; char *m_url; }; static MenuItem mi[] = { {"SEARCH","/"}, {"DIRECTORY","/Top"}, {"ADVANCED","/adv.html"}, {"ADD URL","/addurl"}, {"ABOUT","/about.html"}, {"HELP","/help.html"}, {"API","/api"} }; // // first the nav column // sb.safePrintf("" "
" "
" "
" "" "
" "
" "" "
" "
" "
" "
" "
" ); long n = sizeof(mi) / sizeof(MenuItem); for ( long i = 0 ; i < n ; i++ ) { sb.safePrintf( "" "
" // make button wider "" "            " "%s    " , mi[i].m_text ); // // begin hack: white out the blue border line!! // if ( i == pageNum ) sb.safePrintf( "
" " " "
" ); // end hack sb.safePrintf( "
" "
" "
" ); } // admin link sb.safePrintf( "" "
" "ADMIN    " "
" "
" "
" ); // // now the MAIN column // sb.safePrintf("\n
\n"); sb.safePrintf("

"); sb.safePrintf("\n"); // sb.safePrintf("
" // "\n"); return true; } bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) { SearchInput si; si.set ( sock , r ); // if there's a ton of sites use the post method otherwise // they won't fit into the http request, the browser will reject // sending such a large request with "GET" char *method = "GET"; if ( si.m_sites && gbstrlen(si.m_sites)>800 ) method = "POST"; // if the provided their own CollectionRec *cr = g_collectiondb.getRec ( r ); if ( cr && cr->m_htmlRoot.length() ) { return expandHtml ( sb , cr->m_htmlRoot.getBufStart(), cr->m_htmlRoot.length(), NULL, 0, r , &si, //TcpSocket *s , method , // "GET" or "POST" cr );//CollectionRec *cr ) { } printFrontPageShell ( sb ,0 ); //sb.safePrintf("

\n"); // try to avoid using https for images. it is like 10ms slower. // if ( g_conf.m_isMattWells ) // sb.safePrintf("
\n"); // else sb.safePrintf("

\n"); sb.safePrintf("


\n"); /* sb.safePrintf("web      "); if ( g_conf.m_isMattWells ) sb.safePrintf("seo " "     " ); sb.safePrintf( "directory " "     \n"); sb.safePrintf("advanced search"); sb.safePrintf("      "); sb.safePrintf("add url"); sb.safePrintf("\n"); sb.safePrintf("

\n"); */ // submit to https now sb.safePrintf("\n", method); if ( cr ) sb.safePrintf("", cr->m_coll); // put search box in a box sb.safePrintf("
"); sb.safePrintf("   " //"" "
" "GO" "
" "\n" ); sb.safePrintf("
\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("
\n"); sb.safePrintf("\n"); if ( cr && cr->m_coll ) { // && strcmp(cr->m_coll,"main") ) { sb.safePrintf("
" "Searching the %s collection." "
", cr->m_coll); sb.safePrintf("
\n"); sb.safePrintf("\n"); } // print any red boxes we might need to if ( printRedBox2 ( &sb , true ) ) sb.safePrintf("
\n"); /* do not show table for open source installs sb.safePrintf("\n"); sb.safePrintf("\n"); char *root = ""; if ( g_conf.m_isMattWells ) root = "http://www.gigablast.com"; sb.safePrintf("\n"); //sb.safePrintf("\n"); sb.safePrintf("\n" , root ); sb.safePrintf("\n"); // donate with paypal sb.safePrintf("\n"); sb.safePrintf("\n" ); sb.safePrintf("\n"); */ /* sb.safePrintf("\n"); // 204x143 sb.safePrintf("\n" , root ); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); */ /* sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); */ /* sb.safePrintf("\n"); sb.safePrintf("\n" , root ); sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); */ /* if ( g_conf.m_isMattWells ) { sb.safePrintf("\n"); sb.safePrintf("\n" , root ); sb.safePrintf("\n"); } */ /* sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); */ /* sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); */ /* sb.safePrintf("\n"); sb.safePrintf("\n"); sb.safePrintf("\n"); */ sb.safePrintf("\n"); sb.safePrintf("\n"); //sb.safePrintf("
Open Source!" "
\n"); sb.brify2("Gigablast is now available as an open source search engine on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. Features. Limited support available for free." ,80); sb.safePrintf("

"); sb.safePrintf("
" // BEGIN PAYPAL DONATE BUTTON "
" "" "" "" "" "" "" "" "\"\"" "
" // END PAYPAY BUTTON "
" "Support Gigablast" "
\n" ); sb.brify2( "Donations of $100 or more receive a black " "Gigablast T-shirt " "with embroidered logo while quantities last. " "State your address and size " "in an email. " "PayPal accepted. " "Help Gigablast continue " "to grow and add new features." , 80 ); sb.safePrintf("
Event Guru Returns
\n"); sb.brify2("Event Guru datamines events from the web. It identifies events on a web page, or even plain text, using the same rules of deduction used by the human mind. It also has Facebook integration and lots of other cool things.",80); sb.safePrintf("

The Green Search Engine
\n"); sb.brify2("Gigablast is the only clean-powered web search engine. 90% of its power usage comes from wind energy. Astoundingly, Gigablast is one of ONLY four search engines in the United States indexing over a billion pages.",80); sb.safePrintf("

The Transparent Search Engine
\n"); sb.brify2("Gigablast is the first truly transparent search engine. It tells you exactly why the search results are ranked the way they are. There is nothing left to the imagination.",85); sb.safePrintf("

"); sb.safePrintf("
The SEO Search Engine
\n"); sb.brify2("When it comes to search-engine based SEO, Gigablast is the place to be. With a frothy set of unique and effective SEO tools, you will find all you need to execute a simple yet effective SEO strategy. Stop the guesswork, and let a search engine tell you how to SEO it.",85); sb.safePrintf("
Xml Search Feed
\n"); sb.brify2("Utilize Gigablast's results on your own site or product by connecting with Gigablast's XML search feed. It's now simpler than ever to setup and use. You can also add the web pages you want into the index in near real-time.",85); sb.safePrintf("
The Private Search Engine" "
\n"); sb.brify2("Gigablast does not allow the NSA or any third party " "to spy on the queries your IP address is doing, " "unlike " "" "other large search engines. " "Gigablast is the only " "truly private search engine " "in the United States." //" Everyone else has fundamental " //"gaps in their " //"security as explained by the above link." //"Tell Congress " //"to stop spying " //"on you." ,85); sb.safePrintf("
No Tax Dodging
\n"); sb.brify2("Gigablast pays its taxes when it makes a profit. " "Google and Bing do not. They " "stash their profits in " "offshore tax havens to avoid paying taxes. " //"The end result is that taxes are higher for you. " "You may think Google and Bing are free to use, but in " "reality, you pay for it in increased taxes." ,85); sb.safePrintf("
\n"); sb.safePrintf("

\n"); printNav ( sb , r ); return true; } bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) { printFrontPageShell ( sb , 3 ); sb.safePrintf("\n"); sb.safePrintf("

\n"); sb.safePrintf("


\n"); /* sb.safePrintf("web      "); if ( g_conf.m_isMattWells ) sb.safePrintf("seo " "     " ); sb.safePrintf( "directory " "     \n"); sb.safePrintf("advanced search"); sb.safePrintf("      "); sb.safePrintf("add url"); sb.safePrintf("\n"); sb.safePrintf("

\n"); */ // submit to https now sb.safePrintf("
\n" ); CollectionRec *cr = g_collectiondb.getRec ( r ); char *coll = ""; if ( cr ) coll = cr->m_coll; if ( cr ) sb.safePrintf("", cr->m_coll); // put search box in a box sb.safePrintf("
"); sb.safePrintf("   " //"\n" "
" "GO" "
" "\n" ); sb.safePrintf("\n"); sb.safePrintf("
\n"); sb.safePrintf("\n"); sb.safePrintf("
\n"); sb.safePrintf("\n"); // if addurl is turned off, just print "disabled" msg char *msg = NULL; if ( ! g_conf.m_addUrlEnabled ) msg = "Add url is temporarily disabled"; // can also be turned off in the collection rec //if ( ! cr->m_addUrlEnabled ) // msg = "Add url is temporarily disabled"; // or if in read-only mode if ( g_conf.m_readOnlyMode ) msg = "Add url is temporarily disabled"; sb.safePrintf("
" "Add a url to the %s collection
",coll); // if url is non-empty the ajax will receive this identical msg // and display it in the div, so do not duplicate the msg! if ( msg && ! url ) sb.safePrintf("

%s",msg); // . the ajax msgbox div // . when loaded with the main page for the first time it will // immediately replace its content... if ( url ) { char *root = ""; if ( g_conf.m_isMattWells ) root = "http://www.gigablast.com"; sb.safePrintf("
" "
" "
" //"Injecting your url. Please wait..." "
" "" "
" "\n" , h32 , coll , rand64 ); sb.safePrintf("
\n"); } sb.safePrintf("
\n"); sb.safePrintf("
\n"); sb.safePrintf("\n"); sb.safePrintf("

\n"); printNav ( sb , r ); return true; } bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) { char format = r->getReplyFormat(); if ( format != FORMAT_HTML ) return printTopDirectory ( sb , format ); printFrontPageShell ( sb , 1 ); sb.safePrintf("

\n"); sb.safePrintf("


\n"); // submit to https now sb.safePrintf("
\n"); CollectionRec *cr = g_collectiondb.getRec ( r ); if ( cr ) sb.safePrintf("", cr->m_coll); // put search box in a box sb.safePrintf("
"); sb.safePrintf("   " //"\n"); "
" "GO" "
" "\n" ); sb.safePrintf("
\n"); sb.safePrintf("\n"); sb.safePrintf("
\n"); sb.safePrintf("
\n"); sb.safePrintf("\n"); printTopDirectory ( sb , FORMAT_HTML ); sb.safePrintf("

\n"); printNav ( sb , r); return true; } // . returns false if blocked, true otherwise // . sets errno on error // . make a web page displaying the config of this host // . call g_httpServer.sendDynamicPage() to send it bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie ) { // don't allow pages bigger than 128k in cache char buf [ 10*1024 + MAX_QUERY_LEN ]; // a ptr into "buf" //char *p = buf; //char *pend = buf + 10*1024 + MAX_QUERY_LEN - 100 ; SafeBuf sb(buf, 10*1024 + MAX_QUERY_LEN); // print bgcolors, set focus, set font style //p = g_httpServer.printFocus ( p , pend ); //p = g_httpServer.printColors ( p , pend ); //long qlen; //char *q = r->getString ( "q" , &qlen , NULL ); // insert collection name too CollectionRec *cr = g_collectiondb.getRec(r); if ( ! cr ) { g_errno = ENOCOLLREC; return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); } // get the collection rec /* CollectionRec *cr = g_collectiondb.getRec ( coll ); uint8_t *hp = NULL; long hpLen; long long docsInColl = -1; if ( ! cr ) { // use the default Parm *pp = g_parms.getParm ( "hp" ); if ( ! pp ) { g_errno = ENOTFOUND; g_msg = " (error: no such collection)"; return g_httpServer.sendErrorReply(s,500, mstrerror(g_errno)); } hp = (uint8_t*)pp->m_def; if ( hp ) hpLen = uint8strlen ( hp ); if ( hpLen <= 0 || ! hp ) log(LOG_INFO,"http: No root page html present."); } else { if(cr->m_useLanguagePages) { uint8_t lang = g_langId.guessGBLanguageFromUrl(r->getHost()); if(lang && (hp = g_languagePages.getLanguagePage(lang)) != NULL) { hpLen = uint8strlen(hp); // Set sort language as well // This might not be a good idea, as it // overrides any other setting. May be // better to let the user agent string // tell us what the user wants. strcpy(cr->m_defaultSortLanguage, getLanguageAbbr(lang)); } } if(!hp) { hp = (uint8_t*)cr->m_htmlRoot; hpLen = cr->m_htmlRootLen; } //RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , coll ); RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , coll ); if ( base ) docsInColl = base->getNumGlobalRecs(); } */ // print the page out /* expandRootHtml ( sb, hp , hpLen , q , qlen , r , s , docsInColl , cr ); */ //if ( ! strcmp(coll,"dmoz" ) ) // printDirHomePage(sb,r); //else printWebHomePage(sb,r,s); // . print last 5 queries // . put 'em in a table // . disable for now, impossible to monitor/control //p += printLastQueries ( p , pend ); // are we the admin? //bool isAdmin = g_collectiondb.isAdmin ( r , s ); // calculate bufLen //long bufLen = p - buf; // . now encapsulate it in html head/tail and send it off // . the 0 means browser caches for however long it's set for // . but we don't use 0 anymore, use -2 so it never gets cached so // our display of the # of pages in the index is fresh // . no, but that will piss people off, its faster to keep it cached //return g_httpServer.sendDynamicPage ( s , buf , bufLen , -1 ); return g_httpServer.sendDynamicPage ( s, (char*) sb.getBufStart(), sb.length(), // 120 seconds cachetime // don't cache anymore since // we have the login bar at // the top of the page 0,//120, // cachetime false,// post? "text/html", 200, NULL, // cookie "UTF-8", r); } // . store into "p" // . returns bytes stored into "p" // . used for entertainment purposes /* long printLastQueries ( char *p , char *pend ) { // if not 512 bytes left, bail if ( pend - p < 512 ) return 0; // return w/ no table if no queries have been added to g_qbuf yet if ( ! g_nextq == -1 ) return 0; // remember start for returning # of bytes stored char *start = p; // begin table (no border) sprintf (p,"
", (long)QBUF_NUMQUERIES ); p += gbstrlen ( p ); // point to last query added long n = g_nextq - 1; // . wrap it if we need to // . QBUF_NUMQUERIES is defined to be 5 in PageResults.h if ( n < 0 ) n = QBUF_NUMQUERIES - 1; // . print up to five queries // . queries are stored by advancing g_nextq, so "i" should go backward long count = 0; for ( long i = n ; count < QBUF_NUMQUERIES ; count++ , i-- ) { // wrap i if we need to if ( i == -1 ) i = QBUF_NUMQUERIES - 1; // if this query is empty, skip it (might be uninitialized) if ( g_qbuf[i][0] == '\0' ) continue; // point to the query (these are NULL terminated) char *q = g_qbuf[i]; long qlen = gbstrlen(q); // bail if too big if ( p + qlen + 32 + 1024 >= pend ) return p - start; // otherwise, print this query to the page sprintf ( p , "" ); p += gbstrlen ( p ); } // end the table sprintf ( p , "
Last %li queries:" "
"); p += gbstrlen ( p ); // return bytes written return p - start; } */ //char *printTopDirectory ( char *p, char *pend ) { bool printTopDirectory ( SafeBuf& sb , char format ) { long nr = g_catdb.getRdb()->getNumTotalRecs(); // if no recs in catdb, print instructions if ( nr == 0 && format == FORMAT_HTML) return sb.safePrintf("
" "DMOZ functionality is not set up." "
" "
" "" "Please follow the set up " "" "instructions" "." "" "
"); // send back an xml/json error reply if ( nr == 0 && format != FORMAT_HTML ) { g_errno = EDMOZNOTREADY; return false; } //char topList[4096]; //sprintf(topList, return sb.safePrintf ( "
" "" "
\n" "Arts
" "" "Movies, " "Television, " "Music..." "\n" "
" "Business
" "" "Jobs, " "Real Estate, " "Investing..." "\n" "
" "Computers
" "" "Internet, " "Software, " "Hardware..." "\n" "
" "Games
" "" "Video Games, " "RPGs, " "Gambling..." "\n" "
" "Health
" "" "Fitness, " "Medicine, " "Alternative..." "\n" "
" "Home
" "" "Family, " "Consumers, " "Cooking..." "\n" "
" //"" //"K" //"i" //"d" //"s" //" and Teens
" "Kids and Teens
" "" "Arts, " "School Time, " "Teen Life..." "\n" "
" "News
" "" "Media, " "Newspapers, " "Weather..." "\n" "
" "Recreation
" "" "Travel, " "Food, " "Outdoors, " "Humor..." "\n" "
" "Reference
" "" "Maps, " "Education, " "Libraries..." "\n" "
" "Regional
" "" "US, " "Canada, " "UK, " "Europe..." "\n" "
" "Science
" "" "Biology, " "Psychology, " "Physics..." "\n" "
" "Shopping
" "" "Autos, " "Clothing, " "Gifts..." "\n" "
" "Society
" "" "People, " "Religion, " "Issues..." "\n" "
" "Sports
" "" "Baseball, " "Soccer, " "Basketball..." "\n" "
" "World
" "" "Deutsch, " "Espa%c%col, " "Fran%c%cais, " "Italiano, " "Japanese, " "Nederlands, " "Polska, " "Dansk, " "Svenska..." "\n" "
\n", 195, 177, 195, 167); // make sure there's room //long topListLen = gbstrlen(topList); //if (pend - p <= topListLen+1) // return p; // copy it in //memcpy(p, topList, topListLen); //p += topListLen; //*p = '\0'; //return p; } ///////////////// // // ADD URL PAGE // ///////////////// #include "PageInject.h" #include "TuringTest.h" #include "AutoBan.h" //#include "CollectionRec.h" #include "Users.h" #include "Spider.h" //static bool sendReply ( void *state , bool addUrlEnabled ); static bool canSubmit (unsigned long h, long now, long maxUrlsPerIpDom); //static void addedStuff ( void *state ); void resetPageAddUrl ( ) ; /* class State2 { public: Url m_url; //char *m_buf; //long m_bufLen; //long m_bufMaxLen; }; */ class State1i { public: //Msg4 m_msg4; Msg7 m_msg7; TcpSocket *m_socket; bool m_isAdmin; char m_coll[MAX_COLL_LEN+1]; bool m_goodAnswer; bool m_doTuringTest; long m_ufuLen; char m_ufu[MAX_URL_LEN]; //long m_urlLen; //char m_url[MAX_URL_LEN]; //char m_username[MAX_USER_SIZE]; bool m_strip; bool m_spiderLinks; bool m_forceRespider; // buf filled by the links coming from google, msn, yahoo, etc //State2 m_state2[5]; // gb, goog, yahoo, msn, ask long m_numSent; long m_numReceived; //long m_raw; //SpiderRequest m_sreq; }; static void doneInjectingWrapper3 ( void *st1 ) ; // only allow up to 1 Msg10's to be in progress at a time static bool s_inprogress = false; // . returns false if blocked, true otherwise // . sets g_errno on error bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) { // . get fields from cgi field of the requested url // . get the search query long urlLen = 0; char *url = hr->getString ( "urls" , &urlLen , NULL /*default*/); // see if they provided a url of a file of urls if they did not // provide a url to add directly bool isAdmin = g_conf.isCollAdmin ( sock , hr ); long ufuLen = 0; char *ufu = NULL; //if ( isAdmin ) // // get the url of a file of urls (ufu) // ufu = hr->getString ( "ufu" , &ufuLen , NULL ); // can't be too long, that's obnoxious if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) { g_errno = EBUFTOOSMALL; g_msg = " (error: url too long)"; return g_httpServer.sendErrorReply(sock,500,"url too long"); } // get the collection //long collLen = 0; //char *coll9 = r->getString("c",NULL);//&collLen); //if ( ! coll || ! coll[0] ) { // //coll = g_conf.m_defaultColl; // coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() ); // collLen = gbstrlen(coll); //} // get collection rec CollectionRec *cr = g_collectiondb.getRec ( hr ); // bitch if no collection rec found if ( ! cr ) { g_errno = ENOCOLLREC; g_msg = " (error: no collection)"; return g_httpServer.sendErrorReply(sock,500,"no coll rec"); } // . make sure the ip is not banned // . we may also have an exclusive list of IPs for private collections if ( ! cr->hasSearchPermission ( sock ) ) { g_errno = ENOPERM; g_msg = " (error: permission denied)"; return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno)); } // // if no url, print the main homepage page // if ( ! url ) { SafeBuf sb; printAddUrlHomePage ( sb , NULL , hr ); return g_httpServer.sendDynamicPage(sock, sb.getBufStart(), sb.length(), // 120 secs cachetime // don't cache any more // since we have the // login bar at top of page 0,//120 ,// cachetime false,// post? 
"text/html", 200, NULL, // cookie "UTF-8", hr); } // // run the ajax script on load to submit the url now // long id = hr->getLong("id",0); // if we are not being called by the ajax loader, the put the // ajax loader script into the html now if ( id == 0 ) { SafeBuf sb; printAddUrlHomePage ( sb , url , hr ); return g_httpServer.sendDynamicPage ( sock, sb.getBufStart(), sb.length(), // don't cache any more // since we have the // login bar at top of //page 0,//3600,// cachetime false,// post? "text/html", 200, NULL, // cookie "UTF-8", hr); } // // ok, inject the provided url!! // // // check for errors first // // if addurl is turned off, just print "disabled" msg char *msg = NULL; if ( ! g_conf.m_addUrlEnabled ) msg = "Add url is temporarily disabled"; // can also be turned off in the collection rec //if ( ! cr->m_addUrlEnabled ) // msg = "Add url is temporarily disabled"; // or if in read-only mode if ( g_conf.m_readOnlyMode ) msg = "Add url is temporarily disabled"; // cannot add if another Msg10 from here is still in progress if ( s_inprogress ) msg = "Add url is currently busy! Try again in a second."; // . send msg back to the ajax request // . use cachetime of 3600 so it does not re-inject if you hit the // back button! if ( msg ) { SafeBuf sb; sb.safePrintf("%s",msg); g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), 3600,//-1, // cachetime false,// post? "text/html", 200, // http status NULL, // cookie "UTF-8"); return true; } // make a new state State1i *st1 ; try { st1 = new (State1i); } catch ( ... ) { g_errno = ENOMEM; log("PageAddUrl: new(%i): %s", sizeof(State1i),mstrerror(g_errno)); return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno)); } mnew ( st1 , sizeof(State1i) , "PageAddUrl" ); // save socket and isAdmin st1->m_socket = sock; st1->m_isAdmin = isAdmin; /* // save the url st1->m_url[0] = '\0'; if ( url ) { // normalize and add www. if it needs it Url uu; uu.set ( url , gbstrlen(url) , true ); // remove >'s i guess and store in st1->m_url[] buffer st1->m_urlLen=cleanInput ( st1->m_url, MAX_URL_LEN, uu.getUrl(), uu.getUrlLen() ); } */ // save the "ufu" (url of file of urls) st1->m_ufu[0] = '\0'; st1->m_ufuLen = ufuLen; memcpy ( st1->m_ufu , ufu , ufuLen ); st1->m_ufu[ufuLen] = '\0'; st1->m_doTuringTest = cr->m_doTuringTest; st1->m_spiderLinks = true; st1->m_strip = true; // save the collection name in the State1i class //if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN; //strncpy ( st1->m_coll , coll , collLen ); //st1->m_coll [ collLen ] = '\0'; strcpy ( st1->m_coll , cr->m_coll ); // assume they answered turing test correctly st1->m_goodAnswer = true; // get ip of submitter //unsigned long h = ipdom ( s->m_ip ); // . use top 2 bytes now, some isps have large blocks // . if this causes problems, then they can do pay for inclusion unsigned long h = iptop ( sock->m_ip ); long codeLen; char* code = hr->getString("code", &codeLen); if(g_autoBan.hasCode(code, codeLen, sock->m_ip)) { long uipLen = 0; char* uip = hr->getString("uip",&uipLen); long hip = 0; //use the uip when we have a raw query to test if //we can submit if(uip) { hip = atoip(uip, uipLen); h = iptop( hip ); } } st1->m_strip = hr->getLong("strip",0); // . Remember, for cgi, if the box is not checked, then it is not // reported in the request, so set default return value to 0 // . support both camel case and all lower-cases st1->m_spiderLinks = hr->getLong("spiderLinks",0); st1->m_spiderLinks = hr->getLong("spiderlinks",st1->m_spiderLinks); // . 
should we force it into spiderdb even if already in there // . use to manually update spider times for a url // . however, will not remove old scheduled spider times // . mdw: made force on the default st1->m_forceRespider = hr->getLong("force",1); // 0); long now = getTimeGlobal(); // . allow 1 submit every 1 hour // . restrict by submitter domain ip if ( ! st1->m_isAdmin && ! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) { // return error page //g_errno = ETOOEARLY; SafeBuf sb; sb.safePrintf("You breached your add url quota."); mdelete ( st1 , sizeof(State1i) , "PageAddUrl" ); delete (st1); // use cachetime of 3600 so it does not re-inject if you hit // the back button! g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), 3600,//-1, // cachetime false,// post? "text/html", 200, // http status NULL, // cookie "UTF-8"); return true; } //st1->m_query = r->getString( "qts", &st1->m_queryLen ); // check it, if turing test is enabled for this collection /* if ( ! st1->m_isAdmin && cr->m_doTuringTest && ! g_turingTest.isHuman(r) ) { // log note so we know it didn't make it g_msg = " (error: bad answer)"; //log("PageAddUrl:: addurl failed for %s : bad answer", // iptoa(sock->m_ip)); st1->m_goodAnswer = false; return sendReply ( st1 , true ); // addUrl enabled? } */ // set this. also sets gr->m_hr GigablastRequest *gr = &st1->m_msg7.m_gr; // this will fill in GigablastRequest so all the parms we need are set g_parms.setGigablastRequest ( sock , hr , gr ); // this is really an injection, not add url, so make // GigablastRequest::m_url point to Gigablast::m_urlsBuf because // the PAGE_ADDURLS2 parms in Parms.cpp fill in the m_urlsBuf. // HACK! gr->m_url = gr->m_urlsBuf; // // inject using msg7 // // . pass in the cleaned url // . returns false if blocked, true otherwise if ( ! st1->m_msg7.inject ( //s , //r , st1 , doneInjectingWrapper3 ) ) return false; // some kinda error, g_errno should be set i guess doneInjectingWrapper3 ( st1 ); // we did not block return true; } void doneInjectingWrapper3 ( void *st ) { State1i *st1 = (State1i *)st; // allow others to add now s_inprogress = false; // get the state properly //State1i *st1 = (State1i *) state; // in order to see what sites are being added log it, then we can // more easily remove sites from sitesearch.gigablast.com that are // being added but not being searched char *url = st1->m_msg7.m_xd.m_firstUrl.m_url; log(LOG_INFO,"http: add url %s (%s)",url ,mstrerror(g_errno)); // extract info from state TcpSocket *sock = st1->m_socket; //bool isAdmin = st1->m_isAdmin; //char *url = NULL; //if ( st1->m_urlLen ) url = st1->m_url; // re-null it out if just http:// //bool printUrl = true; //if ( st1->m_urlLen == 0 ) printUrl = false; //if ( ! st1->m_url ) printUrl = false; //if(st1->m_urlLen==7&&st1->m_url&&!strncasecmp(st1->m_url,"http://",7) // printUrl = false; // page is not more than 32k char buf[1024*32+MAX_URL_LEN*2]; SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2); //char rawbuf[1024*8]; //SafeBuf rb(rawbuf, 1024*8); //rb.safePrintf("\n"); //rb.safePrintf("\n"); //CollectionRec *cr = g_collectiondb.getRec ( st1->m_coll ); // collection name char *coll = st1->m_coll; if ( ! coll ) coll = ""; //char tt [ 128 ]; //tt[0] = '\0'; //if ( st1->m_coll[0] != '\0' && ! 
isAdmin ) // sprintf ( tt , " for %s", st1->m_coll ); // // what we print here will just be the error msg, because the // ajax will fill the text we print here into the div below // the add url box // // if there was an error let them know //char msg[MAX_URL_LEN + 1024]; char *pm = ""; if ( g_errno ) { if ( g_errno == ETOOEARLY ) { pm = "Error. 100 urls have " "already been submitted by " "this IP address for the last 24 hours. " "Explanation."; log("addurls: Failed for user at %s: " "quota breeched.", iptoa(sock->m_ip)); //rb.safePrintf("Error. %li urls have " // "already been submitted by " // "this IP address for the " // "last 24 hours. ", // cr->m_maxAddUrlsPerIpDomPerDay); sb.safePrintf("%s",pm); } else { sb.safePrintf("Error adding url(s): %s[%i]", mstrerror(g_errno) , g_errno); //pm = msg; //rb.safePrintf("Error adding url(s): %s[%i]", // mstrerror(g_errno) , g_errno); //sb.safePrintf("%s",pm); } } else { if ( ! g_conf.m_addUrlEnabled ) { pm = "" "Sorry, this feature is temporarily disabled. " "Please try again later."; if ( url ) log("addurls: failed for user at %s: " "add url is disabled. " "Enable add url on the " "Master Controls page and " "on the Spider Controls page for " "this collection.", iptoa(sock->m_ip)); sb.safePrintf("%s",pm); //rb.safePrintf("Sorry, this feature is temporarily " // "disabled. Please try again later."); } else if ( s_inprogress ) { pm = "Add url busy. Try again later."; log("addurls: Failed for user at %s: " "busy adding another.", iptoa(sock->m_ip)); //rb.safePrintf("Add url busy. Try again later."); sb.safePrintf("%s",pm); } // did they fail the turing test? else if ( ! st1->m_goodAnswer ) { pm = "" "Oops, you did not enter the 4 large letters " "you see below. Please try again."; //rb.safePrintf("could not add the url" // " because the turing test" // " is enabled."); sb.safePrintf("%s",pm); } else if ( st1->m_msg7.m_xd.m_indexCodeValid && st1->m_msg7.m_xd.m_indexCode ) { long ic = st1->m_msg7.m_xd.m_indexCode; sb.safePrintf("Had error injecting url: %s", mstrerror(ic)); } /* if ( url && ! st1->m_ufu[0] && url[0] && printUrl ) { sprintf ( msg ,"%s added to spider " "queue " "successfully", url ); //rb.safePrintf("%s added to spider " // "queue successfully", url ); } else if ( st1->m_ufu[0] ) { sprintf ( msg ,"urls in %s " "added to spider queue " "successfully", st1->m_ufu ); //rb.safePrintf("urls in %s added to spider " // "queue successfully", url ); } */ else { //rb.safePrintf("Add the url you want:"); // avoid hitting browser page cache unsigned long rand32 = rand(); // in the mime to 0 seconds! sb.safePrintf("Url successfully added. " "Check it"// or " //"SEO it" "." ""); } //pm = msg; //url = "http://"; //else // pm = "Don't forget to " // "Gigaboost your URL."; } // store it sb.safePrintf("%s",pm ); // clear g_errno, if any, so our reply send goes through g_errno = 0; // nuke state mdelete ( st1 , sizeof(State1i) , "PageAddUrl" ); delete (st1); // this reply should be loaded from the ajax loader so use a cache // time of 1 hour so it does not re-inject the url if you hit the // back button g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), 3600, // cachetime false,// post? "text/html", 200, // http status NULL, // cookie "UTF-8"); } // we get like 100k submissions a day!!! static HashTable s_htable; static bool s_init = false; static long s_lastTime = 0; bool canSubmit ( unsigned long h , long now , long maxAddUrlsPerIpDomPerDay ) { // . sometimes no limit // . 
0 means no limit because if they don't want any submission they // can just turn off add url and we want to avoid excess // troubleshooting for why a url can't be added if ( maxAddUrlsPerIpDomPerDay <= 0 ) return true; // init the table if ( ! s_init ) { s_htable.set ( 50000 ); s_init = true; } // clean out table every 24 hours if ( now - s_lastTime > 24*60*60 ) { s_lastTime = now; s_htable.clear(); } // . if table almost full clean out ALL slots // . TODO: just clean out oldest slots if ( s_htable.getNumSlotsUsed() > 47000 ) s_htable.clear (); // . how many times has this IP domain submitted? // . allow 10 times per day long n = s_htable.getValue ( h ); // if over 24hr limit then bail if ( n >= maxAddUrlsPerIpDomPerDay ) return false; // otherwise, inc it n++; // add to table, will replace old values s_htable.addKey ( h , n ); return true; } void resetPageAddUrl ( ) { s_htable.reset(); } bool sendPageAdvanced ( TcpSocket *sock , HttpRequest *hr ) { SafeBuf sb; printFrontPageShell ( sb , 2 ); sb.safePrintf("

\n"); sb.safePrintf("


\n"); // submit to https now sb.safePrintf("
\n" ); CollectionRec *cr = g_collectiondb.getRec ( hr ); char *coll = ""; if ( cr ) coll = cr->m_coll; if ( cr ) sb.safePrintf("", cr->m_coll); sb.safePrintf( "" "" "" "" //"" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "" " " " " " " " " "" "" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "
Search for...
all of these words" "" "
" "GO" "
" "
this exact phrase
and this exact phrase
any of these words
none of these words
In this language:" " " " " "
Restrict to this URL
Pages that link to this URL
Site Clusteringyes   no
Number of summary excerpts0   1   2   3   4   5
Results per Page10  20  30  40  50  100
Restrict to these Sites
" ); sb.safePrintf("
\n"); sb.safePrintf("
\n"); sb.safePrintf("\n"); sb.safePrintf("

\n"); printNav ( sb , hr ); g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), 3600, // cachetime false,// post? "text/html", 200, // http status NULL, // cookie "UTF-8"); return true; } bool sendPageAbout ( TcpSocket *sock , HttpRequest *hr ) { SafeBuf sb; printFrontPageShell ( sb , 4 ); sb.safePrintf("
\n"); //sb.safePrintf("


\n"); // submit to https now //sb.safePrintf("
\n" ); CollectionRec *cr = g_collectiondb.getRec ( hr ); char *coll = ""; if ( cr ) coll = cr->m_coll; if ( cr ) sb.safePrintf("", cr->m_coll); sb.safePrintf( "" "" "" "
" "
" "" "
" "" "" "
" "" "

As of 2013, Gigablast is one of the remaining four search engines in the United States that maintains its own searchable index of over a billion pages." "" "

" "" "

Founded in 2000 by Matt Wells, Gigablast was built to index up to 200 billion pages" " with the least amount of hardware possible. Gigablast provides large-scale," " high-performance, real-time information retrieval technology for partner" " sites. The company offers a variety of features including topic generation" " and the ability to index multiple document formats. This search delivery" " mechanism gives a partner \"turn key\" search capability and" " the capacity to instantly offer search at maximum scalability with minimum" " cost. " " Clients range from NASDAQ 100 listed corporations to boutique" " companies." "

" "" "

" "Matt Wells is currently the sole maintainer and programmer of Gigablast and is open for consulting work. For more information, contact us at
" "

" "
" "
" "" "
" "
" "
" "
" ); sb.safePrintf("
\n"); sb.safePrintf("
\n"); sb.safePrintf("\n"); sb.safePrintf("

\n"); printNav ( sb , hr ); g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), 3600, // cachetime false,// post? "text/html", 200, // http status NULL, // cookie "UTF-8"); return true; } bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) { SafeBuf sb; printFrontPageShell ( sb , 5 ); sb.safePrintf("

\n"); sb.safePrintf("


\n"); // submit to https now //sb.safePrintf("
\n" ); // CollectionRec *cr = g_collectiondb.getRec ( hr ); // char *coll = ""; // if ( cr ) coll = cr->m_coll; // if ( cr ) // sb.safePrintf("", // cr->m_coll); sb.safePrintf( "
" " " "" "" "" " " " " " " " " " " " " " " "" "" " " " " " " " " "" "" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "" "" " " " " " " " " "" " " " " " " " " "" " " " " " " " " "" "" " " " " " " " " "" "" "" " " " " "" " " "" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "" "
SearchDescription
cat " " dogSearch results have the word cat and the word dog " " in them. They could also have cats and dogs.
+catSearch results have the word cat in them. If the search results has the word cats then it will not be included. The plus sign indicates an exact match and not to use synonyms, hypernyms or hyponyms or any other form of the word.
mp3 \"take five\"Search results have the word mp3 and the exact phrase take " " five in them.
\"john smith\" -\"bob dole\"Search results have the phrase john smith but NOT the " " phrase bob dole in them.
bmx -gameSearch results have the word bmx but not game.
inurl:edu title:universitySearch results have university in their title and edu " " in their url.
site:www.ibm.com \"big blue\"Search results are from the site www.ibm.com and have the phrase " " big blue in them.
url:www.yahoo.comSearch result is the single URL www.yahoo.com, if it is indexed.
title:\"the " " news\" -\"weather report\"Search results have the phrase the news in their title, " " and do NOT have the phrase weather report anywhere in their " " content.
ip:216.32.120Search results are from the ip 216.32.120.*.
type:pdf nutritionSearch results are PDF (Portable Document Format) documents that " " contain the word nutrition.
type:docSearch results are Microsoft Word documents.
type:xlsSearch results are Microsoft Excel documents.
type:pptSearch results are Microsoft Power Point documents.
type:psSearch results are Postscript documents.
type:textSearch results are plain text documents.
filetype:pdfSearch results are PDF documents.
link:www.yahoo.comAll the pages that link to www.yahoo.com.
sitelink:www.yahoo.comAll the pages that link to any page on www.yahoo.com.
ext:txtAll the pages whose url ends in the .txt extension.
  
Boolean SearchDescription
" " Note: boolean operators must be in UPPER CASE. " "
cat AND dogSearch results have the word cat AND the word dog " " in them.
cat OR dogSearch results have the word cat OR the word dog " " in them, but preference is given to results that have both words.
cat dog OR pigSearch results have the two words cat and dog " " OR search results have the word pig, but preference is " " given to results that have all three words. This illustrates how " " the individual words of one operand are all required for that operand " " to be true.
\"cat dog\" OR pigSearch results have the phrase \"cat dog\" in them OR they " " have the word pig, but preference is given to results that " " have both.
title:\"cat " " dog\" OR pigSearch results have the phrase \"cat dog\" in their title " " OR they have the word pig, but preference is given to results " " that have both.
cat OR dog OR pigSearch results need only have one word, cat or dog " " or pig, but preference is given to results that have the " " most of the words.
cat OR dog AND pigSearch results have dog and pig, but they may " " or may not have cat. Preference is given to results that " " have all three. To evaluate expressions with more than two operands, " " as in this case where we have three, you can divide the expression " " up into sub-expressions that consist of only one operator each. " " In this case we would have the following two sub-expressions: cat " " OR dog and dog AND pig. Then, for the original expression " " to be true, at least one of the sub-expressions that have an OR " " operator must be true, and, in addition, all of the sub-expressions " " that have AND operators must be true. Using this logic you can evaluate " " expressions with more than one boolean operator.
cat AND NOT dogSearch results have cat but do not have dog.
cat AND NOT (dog OR pig)Search results have cat but do not have dog " " and do not have pig. When evaluating a boolean expression " " that contains ()'s you can evaluate the sub-expression in the ()'s " " first. So if a document has dog or it has pig " " or it has both, then the expression, (dog OR pig) would " " be true. So you could, in this case, substitute true for " " that expression to get the following: cat AND NOT (true) = cat " " AND false = false. Does anyone actually read this far?
(cat OR dog) AND NOT (cat AND dog)Search results have cat or dog but not both.
left-operand  OPERATOR  right-operandThis is the general format of a boolean expression. The possible " " operators are: OR and AND. The operands can themselves be boolean " " expressions and can be optionally enclosed in parentheses. A NOT " " operator can optionally precede the left or the right operand.
" "" "" "" "
" "
" ); //sb.safePrintf("\n"); sb.safePrintf("
\n"); sb.safePrintf("\n"); sb.safePrintf("

\n"); printNav ( sb , hr ); g_httpServer.sendDynamicPage (sock, sb.getBufStart(), sb.length(), 3600, // cachetime false,// post? "text/html", 200, // http status NULL, // cookie "UTF-8"); return true; }