Merge branch 'testing' into diffbot-testing

This commit is contained in:
Matt Wells 2014-09-03 20:00:04 -07:00
commit 217fb1f1e9
53 changed files with 2421 additions and 1031 deletions

View File

@ -410,6 +410,12 @@ class CollectionRec {
char m_dailyMergeDOWList[48];
//
// CLOUD SEARCH ENGINE SUPPORT
//
// ip of user adding the collection
char m_userIp[16];
// spider controls for this collection
//char m_oldSpideringEnabled ;
//char m_newSpideringEnabled ;

2
Conf.h
View File

@ -466,6 +466,8 @@ class Conf {
// redhat 9's NPTL doesn't like our async signals
bool m_allowAsyncSignals;
bool m_allowCloudUsers;
// if in read-only mode we do no spidering and load no saved trees
// so we can use all mem for caching index lists
bool m_readOnlyMode;

View File

@ -184,7 +184,7 @@ case EDIFFBOTTOKENEXPIRED: return "Diffbot token expired";
case EDIFFBOTUNKNOWNERROR: return "Diffbot unknown error";
case EMISSINGINPUT: return "Missing required input parms";
case EDMOZNOTREADY: return "Dmoz is not setup, follow instructions in "
"admin.html to setup";
"faq.html to setup";
case EPROXYSSLCONNECTFAILED: return "SSL tunnel through HTTP proxy failed";
case EINLINESECTIONS: return "Error generating section votes";
case EREADONLYMODE: return "In read only mode. Failed.";

View File

@ -701,7 +701,7 @@ bool HttpRequest::set ( char *origReq , long origReqLen , TcpSocket *sock ) {
// reset our hostname
m_hostLen = 0;
// assume request is NOT from local network
//m_isAdmin = false;
//m_isRootAdmin = false;
m_isLocal = false;
// get the virtual hostname they want to use
char *s = strstr ( req ,"Host:" );
@ -844,9 +844,9 @@ bool HttpRequest::set ( char *origReq , long origReqLen , TcpSocket *sock ) {
m_plen = i - filenameStart;
// we're local if hostname is 192.168.[0|1].y
//if ( strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) {
// m_isAdmin = true; m_isLocal = true; }
// m_isRootAdmin = true; m_isLocal = true; }
//if ( strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) {
// m_isAdmin = true; m_isLocal = true; }
// m_isRootAdmin = true; m_isLocal = true; }
//if(strncmp(iptoa(sock->m_ip),"192.168.1.",10) == 0) m_isLocal = true;
//if(strncmp(iptoa(sock->m_ip),"192.168.0.",10) == 0) m_isLocal = true;
if ( sock && strncmp(iptoa(sock->m_ip),"192.168.",8) == 0)
@ -921,7 +921,7 @@ bool HttpRequest::set ( char *origReq , long origReqLen , TcpSocket *sock ) {
// . also if we're coming from lenny at my house consider it local
// . this is a security risk, however... TODO: FIX!!!
//if ( sock->m_ip == atoip ("68.35.105.199" , 13 ) ) m_isAdmin = true;
//if ( sock->m_ip == atoip ("68.35.105.199" , 13 ) ) m_isRootAdmin = true;
// . TODO: now add any cgi data from a POST.....
// . look after the mime
//char *d = NULL;
@ -941,7 +941,7 @@ bool HttpRequest::set ( char *origReq , long origReqLen , TcpSocket *sock ) {
if ( ! addCgi ( post , postLen ) ) return false;
}
// sometimes i don't want to be admin
//if ( getLong ( "admin" , 1 ) == 0 ) m_isAdmin = false;
//if ( getLong ( "admin" , 1 ) == 0 ) m_isRootAdmin = false;
// success
/////
@ -1650,3 +1650,32 @@ int getVersionFromRequest ( HttpRequest *r ) {
return v;
}
// . returns true if this request should be treated as coming from a
//   "guest admin", false if it should not
// . if user is NOT the root admin, and the collection is NOT main/dmoz/demo
//   then assume they are a guest admin
// . a request from a local ip (see isLocal()) is the root admin and is
//   therefore never a guest admin
bool HttpRequest::isGuestAdmin ( ) {
	// if we are coming from a local ip then we are the root admin, not
	// a guest user.
	if ( isLocal() )
		return false;
	// NOTE(review): 2nd arg presumably means "do not fall back to the
	// default collection" -- confirm against Collectiondb::getRec()
	CollectionRec *cr = g_collectiondb.getRec ( this , false );
	// if collection does not exist, meaning &c=xxxxx was not specified,
	// in the GET request, then assume they want to create a new one.
	if ( ! cr ) return true;
	// main, dmoz and demo colls are off limits. those are never guest
	// admin collection names.
	// strcmp() returns 0 when the strings MATCH, so it must be negated
	// here. without the '!' every existing collection was rejected and
	// the "return true" below could never be reached.
	if ( ! strcmp(cr->m_coll,"main") ) return false;
	if ( ! strcmp(cr->m_coll,"dmoz") ) return false;
	if ( ! strcmp(cr->m_coll,"demo") ) return false;
	// if they know the collection name then they are logged in
	return true;
}

View File

@ -73,6 +73,7 @@ class HttpRequest {
};
// FORMAT_HTML FORMAT_JSON FORMAT_XML
char getFormat() { return getReplyFormat(); };
char getReplyFormat();
bool m_replyFormatValid;
char m_replyFormat;
@ -120,7 +121,7 @@ class HttpRequest {
char *getHost () { return m_host; };
long getHostLen () { return m_hostLen; };
//bool isLocal () { return m_isLocal; };
//bool isAdmin () { return m_isAdmin; };
//bool isAdmin () { return m_isRootAdmin; };
bool isLocal () { return m_isLocal; };
// is this the admin of a collection?
@ -153,6 +154,8 @@ class HttpRequest {
bool hasField ( char *field );
bool isGuestAdmin ( ) ;
// are we a redir? if so return non-NULL
char *getRedir ( ) { return m_redir; };
long getRedirLen ( ) { return m_redirLen; };
@ -230,7 +233,7 @@ class HttpRequest {
bool m_isMSIE;
// does the connecting machine have admin privileges?
//bool m_isAdmin;
//bool m_isRootAdmin;
// . decoded cgi data stored here
// . this just points into TcpSocket::m_readBuf

View File

@ -18,6 +18,8 @@ HttpServer g_httpServer;
//bool sendPageSiteMap ( TcpSocket *s , HttpRequest *r ) ;
//bool sendPageApi ( TcpSocket *s , HttpRequest *r ) ;
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;
bool sendPagePretty ( TcpSocket *s , HttpRequest *r , char *filename ,
char *tabName ) ;
// we get like 100k submissions a day!!!
static HashTable s_htable;
@ -1200,6 +1202,42 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
if ( ! strncmp ( path ,"/help.html", pathLen ) )
return sendPageHelp ( s , r );
if ( ! strncmp ( path ,"/syntax.html", pathLen ) )
return sendPageHelp ( s , r );
if ( ! strncmp ( path ,"/widgets.html", pathLen ) )
return sendPageWidgets ( s , r );
// who uses gigablast?
if ( ! strncmp ( path ,"/users.html", pathLen ) )
return sendPagePretty ( s , r,"users.html","users"); // special
// decorate the plain html page, news.html, with our nav chrome
if ( ! strncmp ( path ,"/news.html", pathLen ) )
return sendPagePretty ( s , r , "news.html", "news");
// decorate the plain html page, faq.html, with our nav chrome
if ( ! strncmp ( path ,"/faq.html", pathLen ) )
return sendPagePretty ( s , r , "faq.html", "faq");
if ( ! strncmp ( path ,"/admin.html", pathLen ) )
return sendPagePretty ( s , r , "faq.html", "faq");
// decorate the plain html pages with our nav chrome
if ( ! strncmp ( path ,"/developer.html", pathLen ) )
return sendPagePretty ( s , r , "developer.html", "developer");
if ( ! strncmp ( path ,"/compare.html", pathLen ) )
return sendPagePretty ( s , r , "compare.html", "compare");
if ( ! strncmp ( path ,"/contact.html", pathLen ) )
return sendPagePretty ( s , r , "contact.html", "contact");
if ( ! strncmp ( path ,"/bio.html", pathLen ) )
return sendPagePretty ( s , r , "bio.html", "bio");
if ( ! strncmp ( path ,"/appliance.html", pathLen ) )
return sendPagePretty ( s , r , "appliance.html", "appliance");
if ( ! strncmp ( path ,"/api.html", pathLen ) )
return sendPageAPI ( s , r );
@ -2872,6 +2910,50 @@ TcpSocket *HttpServer::unzipReply(TcpSocket* s) {
return s;
}
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr );
// . send the file html/<filename>, loaded relative to g_hostdb.m_dir,
//   preceded by the chrome that printFrontPageShell() prints
// . "tabName" is handed to printFrontPageShell() unchanged
// . the collection is looked up from the request "r"
// . always returns true. the return value of sendDynamicPage() is not
//   examined.
bool sendPagePretty ( TcpSocket *s ,
		      HttpRequest *r ,
		      char *filename ,
		      char *tabName ) {
	SafeBuf sb;
	CollectionRec *cr = g_collectiondb.getRec ( r );
	// print the chrome
	printFrontPageShell ( &sb , tabName , cr );
	// build the path of the page body, "html/<filename>"
	SafeBuf ff;
	ff.safePrintf("html/%s",filename);
	// load the page body from disk
	SafeBuf tmp;
	tmp.fillFromFile ( g_hostdb.m_dir , ff.getBufStart() );
	// . only append the body if there is a buffer to append
	// . NOTE(review): if the file could not be loaded the buffer is
	//   presumably NULL and we just send the chrome -- confirm that is
	//   preferable to an error reply
	char *body = tmp.getBufStart();
	if ( body ) sb.safeStrcpy ( body );
	// done
	sb.safePrintf("\n</html>");
	char *charset = "utf-8";
	char *ct = "text/html";
	g_httpServer.sendDynamicPage ( s ,
				       sb.getBufStart(),
				       sb.length(),
				       25 , // cachetime in secs
				       // pick up key changes
				       // this was 0 before
				       false , // POSTREply?
				       ct , // content type
				       -1 , // http status -1->200
				       NULL, // cookiePtr ,
				       charset );
	return true;
}
/*
bool sendPageApi ( TcpSocket *s , HttpRequest *r ) {

View File

@ -3999,7 +3999,7 @@ bool StateWik::getSummary( ){
0,//m_si->m_displayMetasLen ,
0,//bigSampleRadius ,
0,//bigSampleMaxLen ,
true,//m_si->m_isAdmin ,
true,//m_si->m_isRootAdmin ,
true , //requireallterms
false , //count links
0,

775
Loop.cpp

File diff suppressed because it is too large Load Diff

7
Loop.h
View File

@ -89,8 +89,14 @@ extern long long g_nowApprox;
// count of how many SIGVTALRM signals we had so far
extern long g_numAlarms;
extern long g_numVTAlarms;
extern long g_numQuickPolls;
extern long g_numSigChlds;
extern long g_numSigQueues;
extern long g_numSigOthers;
extern char g_niceness ;
// we make sure the same callback/handler is not hogging the cpu when it is
@ -165,6 +171,7 @@ class Loop {
bool m_needsToQuickPoll;
bool m_canQuickPoll;
itimerval m_quickInterrupt;
itimerval m_realInterrupt;
itimerval m_noInterrupt;
// call this when you don't want to be interrupted
void interruptsOff ( ) ;

View File

@ -178,7 +178,7 @@ class Msg20Request {
char m_getTurkForm :1;
char m_showTurkInstructions :1;
char m_isTurkSpecialQuery :1;
char m_isAdmin :1;
char m_isRootAdmin :1;
// . this is for buzz.
// . this says to compute the <absScore2> tag in their xml feed.
// . the document receives a score of 0 if it does not match the query

3
Msg4.h
View File

@ -78,7 +78,8 @@ class Msg4 {
bool addMetaList2 ( );
Msg4() { m_inUse = false; };
~Msg4() { if ( m_inUse ) { char *xx=NULL;*xx=0; } };
// why wasn't this saved in addsinprogress.dat file?
~Msg4() { if ( m_inUse ) log("BAD: MSG4 in use!!!!!!"); };
// injecting into the "test" collection likes to flush the buffers
// after each injection to make sure the data is available for

View File

@ -1465,7 +1465,7 @@ bool Msg40::launchMsg20s ( bool recalled ) {
//req.ptr_q2buf = m_si->m_sbuf3.getBufStart();
//req.size_q2buf = q3size;
req.m_isAdmin = m_si->m_isAdmin;
req.m_isRootAdmin = m_si->m_isRootAdmin;
//req.m_rulesetFilter = m_si->m_ruleset;
@ -1536,7 +1536,7 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// let "ns" parm override
req.m_numSummaryLines = m_si->m_numLinesInSummary;
if(m_si->m_isAdmin && m_si->m_format == FORMAT_HTML )
if(m_si->m_isRootAdmin && m_si->m_format == FORMAT_HTML )
req.m_getGigabitVector = true;
else req.m_getGigabitVector = false;
req.m_flags = 0;

View File

@ -6,6 +6,7 @@
#include "Collectiondb.h"
//#include "CollectionRec.h"
#include "Users.h"
#include "Parms.h"
bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) ;
@ -76,8 +77,35 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
return g_httpServer.sendSuccessReply(s,format);
}
char buf [ 64*1024 ];
SafeBuf p(buf, 64*1024);
//
// CLOUD SEARCH ENGINE SUPPORT
//
// if added the coll successfully, do not print same page, jump to
// printing the basic settings page so they can add sites to it.
// crap, this GET request, "r", is missing the "c" parm sometimes.
// we need to use the "addcoll" parm anyway. maybe print a meta
// redirect then?
char *action = r->getString("action",NULL);
char guide = r->getLong("guide",0);
if ( action && ! msg && format == FORMAT_HTML && guide ) {
//return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
// just redirect to it
char *addColl = r->getString("addcoll",NULL);
if ( addColl )
p.safePrintf("<meta http-equiv=Refresh "
"content=\"0; URL=/admin/settings"
"?guide=1&c=%s\">",
addColl);
return g_httpServer.sendDynamicPage (s,
p.getBufStart(),
p.length());
}
// print standard header
g_pages.printAdminTop ( &p , s , r );
@ -105,6 +133,14 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
"</center><br>\n",cc,msg);
}
//
// CLOUD SEARCH ENGINE SUPPORT
//
if ( add && guide )
printGigabotAdvice ( &p , PAGE_ADDCOLL , r );
// print the add collection box
if ( add /*&& (! nc[0] || g_errno ) */ ) {
p.safePrintf (
@ -139,6 +175,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
// "<td><input type=text name=cpc value=\"%s\" size=30>"
// "</td></tr>\n",coll);
p.safePrintf ( "</table></center><br>\n");
// wrap up the form started by printAdminTop
g_pages.printAdminBottom ( &p );
long bufLen = p.length();

View File

@ -7,6 +7,8 @@
#include "PageResults.h" // for RESULT_HEIGHT
#include "Stats.h"
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ) ;
// 5 seconds
#define DEFAULT_WIDGET_RELOAD 1000
@ -883,79 +885,9 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
return true;
}
bool printScrollingWidget ( SafeBuf *sb , CollectionRec *cr ) {
// from pagecrawlbot.cpp for printCrawlDetails()
#include "PageCrawlBot.h"
///////////
//
// main > Basic > Status
//
///////////
bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
char buf [ 128000 ];
SafeBuf sb(buf,128000);
sb.reset();
// char *fs = hr->getString("format",NULL,NULL);
// char format = FORMAT_HTML;
// if ( fs && strcmp(fs,"html") == 0 ) format = FORMAT_HTML;
// if ( fs && strcmp(fs,"json") == 0 ) format = FORMAT_JSON;
// if ( fs && strcmp(fs,"xml") == 0 ) format = FORMAT_XML;
char format = hr->getReplyFormat();
// true = usedefault coll?
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
if ( ! cr ) {
g_httpServer.sendErrorReply(socket,500,"invalid collection");
return true;
}
if ( format == FORMAT_JSON || format == FORMAT_XML) {
// this is in PageCrawlBot.cpp
printCrawlDetails2 ( &sb , cr , format );
char *ct = "text/xml";
if ( format == FORMAT_JSON ) ct = "application/json";
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0, // cachetime
false,//POSTReply ,
ct);
}
// print standard header
if ( format == FORMAT_HTML )
// this prints the <form tag as well
g_pages.printAdminTop ( &sb , socket , hr );
// table to split between widget and stats in left and right panes
if ( format == FORMAT_HTML ) {
sb.safePrintf("<TABLE id=pane>"
"<TR><TD valign=top>");
}
long savedLen1, savedLen2;
//
// widget
//
// put the widget in here, just sort results by spidered date
//
// the scripts do "infinite" scrolling both up and down.
// but if you are at the top then new results will load above
// you and we try to maintain your current visual state even though
// the scrollbar position will change.
//
if ( format == FORMAT_HTML ) {
// save position so we can output the widget code
// so user can embed it into their own web page
savedLen1 = sb.length();
sb.safePrintf("<script type=\"text/javascript\">\n\n");
sb->safePrintf("<script type=\"text/javascript\">\n\n");
// if user has the scrollbar at the top
// in the widget we do a search every 15 secs
@ -964,7 +896,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// top docid and 10 results below it. that way
// no matter which of the 10 results you were
// viewing your view should remain unchanged.
sb.safePrintf(
sb->safePrintf(
// global var
"var forcing;"
@ -984,8 +916,9 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"var cd;"
"if ( sd ) cd=sd.firstChild;"
"var fd=0;"
"if(cd) fd=cd.getAttribute('docid');"
// if nodetype is 3 that means it says
// 'No results. Waiting for spider to kick in.'
"if(cd && cd.nodeType==1) fd=cd.getAttribute('docid');"
// if the searchbox has the focus then do not
// update the content just yet...
@ -993,9 +926,13 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"if(qb&&qb==document.activeElement)"
"return;"
//"alert(this.responseText);"
// or if not forced and they scrolled down
// don't jerk them back up again
"if(!forcing&&sd&&sd.scrollTop!=0)return;"
// don't jerk them back up again. unless
// the inner html starts with 'No results'!
"if(!forcing&&sd&&sd.scrollTop!=0&&cd&&cd.nodeType==1)"
"return;"
// just set the widget content to the reply
@ -1079,7 +1016,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// scroll the widget up until we hit the 0 position
sb.safePrintf(
sb->safePrintf(
"function widget123_scroll() {"
// only scroll if at the top of the widget
// and not scrolled down so we do not
@ -1111,7 +1048,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// this function appends the search results to what is
// already in the widget.
sb.safePrintf(
sb->safePrintf(
"function widget123_handler_append() {"
// return if reply is not fully ready
"if(this.readyState != 4 )return;"
@ -1130,7 +1067,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
);
//sb.safePrintf ( "</script>\n\n" );
//sb->safePrintf ( "</script>\n\n" );
long widgetWidth = 300;
long widgetHeight = 500;
@ -1157,7 +1094,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// get the search results from neo as soon as this div is
// being rendered, and set its contents to them
sb.safePrintf(//"<script type=text/javascript>"
sb->safePrintf(//"<script type=text/javascript>"
"function widget123_reload(force) {"
@ -1258,7 +1195,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// . call this when scrollbar gets 5 up from bottom
// . but if < 10 new results are appended, then stop!
//
sb.safePrintf(
sb->safePrintf(
"var outstanding=0;\n\n"
"function widget123_append() {"
@ -1306,6 +1243,10 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"var d=cd.lastChild.previousSibling;"
// must be there
"if(!d)return;"
// now that we added <hr> tags after each div do this!
"d=d.previousSibling;"
// must be there
"if(!d)return;"
// get docid/score
"u=u+\"&maxserpscore=\"+d.getAttribute('score');"
"u=u+\"&minserpdocid=\"+d.getAttribute('docid');"
@ -1358,7 +1299,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
// more search results if we are near the bottom of the
// widget.
sb.safePrintf("<div id=widget123 "
sb->safePrintf("<div id=widget123 "
"style=\"border:2px solid black;"
"position:relative;border-radius:10px;"
"width:%lipx;height:%lipx;\">"
@ -1366,16 +1307,123 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
, widgetHeight
);
//sb.safePrintf("<style>"
//sb->safePrintf("<style>"
// "a{color:white;}"
// "</style>");
sb.safePrintf("Waiting for Server...");
sb->safePrintf("Waiting for Server Response...");
// end the containing div
sb.safePrintf("</div>");
sb->safePrintf("</div>");
return true;
}
// . send the widgets page: the front page shell for the "widgets" tab,
//   then the gigabot advice, then the scrolling widget
// . sends a 500 error reply and returns true if no collection could be
//   found for the request
// . otherwise returns whatever sendDynamicPage() returns
bool sendPageWidgets ( TcpSocket *socket , HttpRequest *hr ) {
	// which collection is this for? true = usedefault coll?
	CollectionRec *coll = g_collectiondb.getRec ( hr , true );
	if ( coll == NULL ) {
		g_httpServer.sendErrorReply(socket,500,"invalid collection");
		return true;
	}

	// the page is assembled in a SafeBuf handed this stack buffer
	char stackBuf [ 128000 ];
	SafeBuf page(stackBuf,128000);

	// nav chrome first, with the "widgets" tab name
	printFrontPageShell ( &page, "widgets", coll );

	// two line breaks between the chrome and the content
	page.safePrintf("<br>");
	page.safePrintf("<br>");

	// the advice is printed regardless of the reply format
	printGigabotAdvice ( &page , PAGE_BASIC_STATUS , hr );

	// and finally the widget itself
	printScrollingWidget ( &page , coll );

	// ship it with a cachetime of 0
	bool status = g_httpServer.sendDynamicPage (socket,
						    page.getBufStart(),
						    page.length(),
						    0);
	return status;
}
// from pagecrawlbot.cpp for printCrawlDetails()
#include "PageCrawlBot.h"
///////////
//
// main > Basic > Status
//
///////////
bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
char buf [ 128000 ];
SafeBuf sb(buf,128000);
sb.reset();
// char *fs = hr->getString("format",NULL,NULL);
// char format = FORMAT_HTML;
// if ( fs && strcmp(fs,"html") == 0 ) format = FORMAT_HTML;
// if ( fs && strcmp(fs,"json") == 0 ) format = FORMAT_JSON;
// if ( fs && strcmp(fs,"xml") == 0 ) format = FORMAT_XML;
char format = hr->getReplyFormat();
// true = usedefault coll?
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
if ( ! cr ) {
g_httpServer.sendErrorReply(socket,500,"invalid collection");
return true;
}
if ( format == FORMAT_JSON || format == FORMAT_XML) {
// this is in PageCrawlBot.cpp
printCrawlDetails2 ( &sb , cr , format );
char *ct = "text/xml";
if ( format == FORMAT_JSON ) ct = "application/json";
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0, // cachetime
false,//POSTReply ,
ct);
}
// print standard header
if ( format == FORMAT_HTML )
// this prints the <form tag as well
g_pages.printAdminTop ( &sb , socket , hr );
// table to split between widget and stats in left and right panes
if ( format == FORMAT_HTML ) {
sb.safePrintf("<TABLE id=pane>"
"<TR><TD valign=top>");
}
long savedLen1, savedLen2;
//
// widget
//
// put the widget in here, just sort results by spidered date
//
// the scripts do "infinite" scrolling both up and down.
// but if you are at the top then new results will load above
// you and we try to maintain your current visual state even though
// the scrollbar position will change.
//
if ( format == FORMAT_HTML ) {
// save position so we can output the widget code
// so user can embed it into their own web page
savedLen1 = sb.length();
printScrollingWidget ( &sb , cr );
savedLen2 = sb.length();

View File

@ -641,7 +641,7 @@ bool getResults ( void *state ) {
// if they don't have the ever changing key, they're probably a bot
/*
if ( (!si->m_isAssassin||si->m_isAdmin) &&
if ( (!si->m_isAssassin||si->m_isRootAdmin) &&
si->m_raw == 0 && si->m_siteLen <= 0 &&
si->m_sitesLen <= 0 ) {
// if there and robot checking on, check it
@ -7541,7 +7541,7 @@ bool printAdminLinks ( SafeBuf &sb , State7 *st ) {
SearchInput *si = &st->m_si;
if ( ! si->m_isAdmin ) return true;
if ( ! si->m_isRootAdmin ) return true;
Msg40 *msg40 = &(st->m_msg40);
// how many results were requested?
@ -8023,7 +8023,7 @@ static bool printResult ( CollectionRec *cr,
// the score if admin
if ( si->m_isAdmin && !si->m_isFriend ) {
if ( si->m_isRootAdmin && !si->m_isFriend ) {
long level = (long)msg40->getClusterLevel(ix);
char evs[1024];
sprintf(evs,"eventhash=%llu eventid=%li "
@ -8129,7 +8129,7 @@ static bool printResult ( CollectionRec *cr,
evf &= ~(evflags_t)EV_DESERIALIZED;
// print the event flags first
if ( evf && si->m_isAdmin ) {
if ( evf && si->m_isRootAdmin ) {
// color in red
if ( ! sb.safePrintf("<b><font color=red>[") )
return false;
@ -8528,7 +8528,7 @@ static bool printResult ( CollectionRec *cr,
sb.safePrintf ( " - indexed: %li days ago",days);
// do not show if more than 1 wk old! we want to seem as
// fresh as possible
else if ( ts > 0 && si->m_isAdmin && !si->m_isFriend ) {
else if ( ts > 0 && si->m_isRootAdmin && !si->m_isFriend ) {
char tbuf[100];
strftime ( tbuf , 100 , " - indexed: %b %d %Y",timeStruct);
sb.safePrintf ( "%s", tbuf );
@ -8539,7 +8539,7 @@ static bool printResult ( CollectionRec *cr,
sb.safePrintf("\n");
// this stuff is secret just for local guys!
if ( si->m_isAdmin ) { // Assassin && !si->m_isFriend ) {
if ( si->m_isRootAdmin ) { // Assassin && !si->m_isFriend ) {
// now the ip of url
//long urlip = msg40->getIp(i);
// don't combine this with the sprintf above cuz
@ -8556,7 +8556,7 @@ static bool printResult ( CollectionRec *cr,
(long)us[0],(long)us[1],(long)us[2],
(long)us[0],(long)us[1],(long)us[2]);
//if ( si->m_isAdmin && !si->m_isFriend ) {
//if ( si->m_isRootAdmin && !si->m_isFriend ) {
// . now the info link
// . if it's local, don't put the hostname/port in
// there cuz it will mess up Global Spec's machine
@ -10967,7 +10967,7 @@ bool printAllResults ( SafeBuf &sb , State7 *st , Query &qq ) {
if ( si->m_collLen2 == 4 && strncmp ( si->m_coll2, "main", 4) == 0 )
isMain = true;
// print "in collection ***" if we had a collection
if ( si->m_collLen2 >0 && ! isMain && si->m_isAdmin && printMenuJunk) {
if ( si->m_collLen2 >0 && ! isMain && si->m_isRootAdmin && printMenuJunk) {
sb.safePrintf (" in collection '<b>");
sb.safeMemcpy ( si->m_coll2 , si->m_collLen2 );
sb.safeMemcpy ( "</b>'" , 5 );
@ -17171,7 +17171,7 @@ static bool canSubmit (unsigned long h, long now, long maxUrlsPerIpDom);
class State9 {
public:
TcpSocket *m_socket;
bool m_isAdmin;
bool m_isRootAdmin;
char m_coll[MAX_COLL_LEN+1];
HttpRequest m_hr;
@ -17295,7 +17295,7 @@ bool sendPageAddEvent ( TcpSocket *s , HttpRequest *r ) {
// save socket and isAdmin
st9->m_socket = s;
st9->m_isAdmin = isAdmin;
st9->m_isRootAdmin = isAdmin;
st9->m_hr.copy ( r );
@ -17430,7 +17430,7 @@ bool gotCaptchaReply ( State9 *st9 , TcpSocket *s ) {
bool passed = true;
// turn off captcha for now
//if ( s ) passed = isCaptchaReplyCorrect ( s );
if ( ! passed && ! st9->m_isAdmin ) {
if ( ! passed && ! st9->m_isRootAdmin ) {
sb->safePrintf("<br><b>Captcha had incorrect answer</b><br>");
//return sendErrorReply9 ( st9 );
}
@ -17464,7 +17464,7 @@ bool gotCaptchaReply ( State9 *st9 , TcpSocket *s ) {
unsigned long h = iptop ( sip );
// . allow 1 submit every 1 hour
// . restrict by submitter domain ip
if ( ! st9->m_isAdmin &&
if ( ! st9->m_isRootAdmin &&
! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
// print error
sb->safePrintf("<br><b>Exceed quota</b><br>");
@ -17613,7 +17613,7 @@ bool gotCaptchaReply ( State9 *st9 , TcpSocket *s ) {
}
// log that xml so we can test it out on page parser
if ( st9->m_isAdmin && 1 == 2) {
if ( st9->m_isRootAdmin && 1 == 2) {
SafeBuf ttt;
ttt.safePrintf("<br>"
"<a href=/admin/parser?"
@ -17669,7 +17669,7 @@ bool sendReply ( void *state ) {
// extract info from state
//TcpSocket *s = st9->m_socket;
//bool isAdmin = st9->m_isAdmin;
//bool isAdmin = st9->m_isRootAdmin;
//char *url = NULL;
//if ( st9->m_urlLen ) url = st9->m_url;

View File

@ -36,7 +36,7 @@ public:
HttpRequest m_r;
char m_coll[MAX_COLL_LEN+2];
//CollectionRec *m_cr;
bool m_isAdmin;
bool m_isRootAdmin;
bool m_isLocal;
//bool m_seq;
bool m_rtq;
@ -131,7 +131,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
mnew ( st , sizeof(State2) , "PageGet1" );
// save the socket and if Host: is local in the Http request Mime
st->m_socket = s;
st->m_isAdmin = g_conf.isCollAdmin ( s , r );
st->m_isRootAdmin = g_conf.isCollAdmin ( s , r );
st->m_isLocal = r->isLocal();
st->m_docId = docId;
st->m_printed = false;
@ -284,7 +284,7 @@ bool processLoop ( void *state ) {
// error?
if ( ! na ) return sendErrorReply ( st , g_errno );
// forbidden? allow turkeys through though...
if ( ! st->m_isAdmin && *na )
if ( ! st->m_isRootAdmin && *na )
return sendErrorReply ( st , ENOCACHE );
SafeBuf *sb = &st->m_sb;
@ -332,7 +332,7 @@ bool processLoop ( void *state ) {
// error?
if ( ! vi ) return sendErrorReply ( st , g_errno );
// banned?
if ( ! st->m_isAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
if ( ! st->m_isRootAdmin && ! *vi ) return sendErrorReply (st,EDOCBANNED);
*/
// get the utf8 content

View File

@ -45,7 +45,7 @@ public:
long m_numRecs;
TcpSocket *m_socket;
HttpRequest m_r;
bool m_isAdmin;
bool m_isRootAdmin;
bool m_isLocal;
Msg36 m_msg36; // term freqs (term popularity)
long long m_termFreq;
@ -130,7 +130,7 @@ bool sendPageIndexdb ( TcpSocket *s , HttpRequest *r ) {
// save the TcpSocket
st->m_socket = s;
// and if the request is local/internal or not
st->m_isAdmin = g_conf.isCollAdmin ( s , r );
st->m_isRootAdmin = g_conf.isCollAdmin ( s , r );
st->m_isLocal = r->isLocal();
st->m_r.copy ( r );
// . check for add/delete request

View File

@ -777,7 +777,7 @@ static bool printGigabitContainingSentences ( State0 *st,
sb->safePrintf(" ");//,gi->m_numPages);
sb->safePrintf("</font>");
sb->safePrintf("</b>");
if ( si->m_isAdmin && 1 == 2 )
if ( si->m_isRootAdmin && 1 == 2 )
sb->safePrintf("[%.0f]{%li}",
gi->m_gbscore,
gi->m_minPop);
@ -1069,7 +1069,7 @@ static bool printGigabit ( State0 *st,
sb->safePrintf("(%li)",gi->m_numPages);
sb->safePrintf("</font>");
sb->safePrintf("</b>");
if ( si->m_isAdmin )
if ( si->m_isRootAdmin )
sb->safePrintf("[%.0f]{%li}",
gi->m_gbscore,
gi->m_minPop);
@ -2513,7 +2513,7 @@ bool printSearchResultsHeader ( State0 *st ) {
Query qq3;
Query *qq2;
bool firstIgnored;
bool isAdmin = si->m_isAdmin;
bool isAdmin = si->m_isRootAdmin;
if ( si->m_format != FORMAT_HTML ) isAdmin = false;
// otherwise, we had no error
@ -2971,9 +2971,9 @@ bool printSearchResultsTail ( State0 *st ) {
char abuf[300];
SafeBuf args(abuf,300);
// show banned?
if ( si->m_showBanned && ! si->m_isAdmin )
if ( si->m_showBanned && ! si->m_isRootAdmin )
args.safePrintf("&sb=1");
if ( ! si->m_showBanned && si->m_isAdmin )
if ( ! si->m_showBanned && si->m_isRootAdmin )
args.safePrintf("&sb=0");
//HttpRequest *hr = &st->m_hr;
@ -3089,7 +3089,7 @@ bool printSearchResultsTail ( State0 *st ) {
sb->safePrintf("<input name=c type=hidden value=\"%s\">",coll);
}
bool isAdmin = si->m_isAdmin;
bool isAdmin = si->m_isRootAdmin;
if ( si->m_format != FORMAT_HTML ) isAdmin = false;
if ( isAdmin && banSites.length() > 0 )
@ -3228,7 +3228,7 @@ bool printTimeAgo ( SafeBuf *sb , long ts , char *prefix , SearchInput *si ) {
else if (days< 7 )sb->safePrintf ( " - %s: %li days ago",prefix,days);
// do not show if more than 1 wk old! we want to seem as
// fresh as possible
else if ( ts > 0 ) { // && si->m_isAdmin ) {
else if ( ts > 0 ) { // && si->m_isRootAdmin ) {
struct tm *timeStruct = localtime ( &ts );
sb->safePrintf(" - %s: ",prefix);
char tmp[100];
@ -3429,8 +3429,7 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
char format = si->m_format;
// these are handled in the <dmozEntry> logic below now
if ( format == FORMAT_XML ) return true;
if ( format == FORMAT_JSON ) return true;
if ( format != FORMAT_HTML ) return true;
// if ( format == FORMAT_XML ) {
// sb->safePrintf("\t\t<dmozCat>\n"
@ -3694,7 +3693,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// indent it if level is 2
bool indent = false;
bool isAdmin = si->m_isAdmin;
bool isAdmin = si->m_isRootAdmin;
if ( si->m_format == FORMAT_XML ) isAdmin = false;
//unsigned long long lastSiteHash = siteHash;
@ -3773,8 +3772,9 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
url,mr->ptr_imgUrl);
// if we have a thumbnail show it next to the search result,
// base64 encoded
if ( //(si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) &&
// base64 encoded. do NOT do this for the WIDGET, only for search
// results in html/xml.
if ( (si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) &&
//! mr->ptr_imgUrl &&
si->m_showImages && mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
@ -3818,6 +3818,9 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
}
}
bool isWide = false;
long newdx = 0;
// print image for widget
if ( //mr->ptr_imgUrl &&
( si->m_format == FORMAT_WIDGET_IFRAME ||
@ -3829,6 +3832,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// prevent coring
if ( widgetWidth < 1 ) widgetWidth = 1;
// char *bg1 = "lightgray";
// char *bg2 = "white";
// char *bgcolor = bg1;
// if ( (ix % 1) == 1 ) bgcolor = bg2;
// each search result in widget has a div around it
sb->safePrintf("<div "
"class=result "
@ -3844,7 +3852,13 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
"min-height:%lipx;"//140px;"
"height:%lipx;"//140px;"
"padding:%lipx;"
//"padding-right:40px;"
"position:relative;"
// summary overflows w/o this!
"overflow-y:hidden;"
"overflow-x:hidden;"
// alternate bg color to separate results!
//"background-color:%s;"
//"display:table-cell;"
//"vertical-align:bottom;"
"\""
@ -3853,10 +3867,12 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// this is a double now. this won't work
// for streaming...
, msg40->m_msg3a.m_scores[ix]
, widgetWidth - 2*8 // padding is 8px
// subtract 8 for scrollbar on right
, widgetWidth - 2*8 - 8 // padding is 8px
, (long)RESULT_HEIGHT
, (long)RESULT_HEIGHT
, (long)PADDING
//, bgcolor
);
// if ( mr->ptr_imgUrl )
// sb->safePrintf("background-repeat:no-repeat;"
@ -3864,7 +3880,6 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// "background-image:url('%s');"
// , widgetwidth - 2*8 // padding is 8px
// , mr->ptr_imgUrl);
long newdx = 0;
if ( mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
ThumbnailInfo *ti = ta->getThumbnailInfo(0);
@ -3892,13 +3907,15 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// if thumbnail is wide enough put text on top of it, otherwise
// image is to the left and text is to the right of image
if ( newdx > .5 * widgetWidth )
if ( newdx > .5 * widgetWidth ) {
isWide = true;
sb->safePrintf("position:absolute;"
"bottom:%li;"
"left:%li;"
, (long) PADDING
, (long) PADDING
);
}
// to align the text vertically we gotta make a textbox div
// otherwise it wraps below image! mdw
//else
@ -4279,12 +4296,28 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
bool printSummary = true;
// do not print summaries for widgets by default unless overridden
// with &summary=1
long defSum = 0;
// if no image then default the summary to on
if ( ! mr->ptr_imgData ) defSum = 1;
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) &&
hr->getLong("summaries",0) == 0 )
hr->getLong("summaries",defSum) == 0 )
printSummary = false;
if ( printSummary &&
(si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) ) {
long sumLen = strLen;
if ( sumLen > 150 ) sumLen = 150;
if ( sumLen ) {
sb->safePrintf("<br>");
sb->safeTruncateEllipsis ( str , sumLen );
}
}
if ( printSummary && si->m_format == FORMAT_HTML )
sb->brify ( str , strLen, 0 , cols ); // niceness = 0
@ -4421,6 +4454,31 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// turn off the color
sb->safePrintf ( "</font>\n" );
}
// print url for widgets now
if ( (si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX ) ) {
//sb->safePrintf ("<br><font color=gray size=-1>" );
// print url for widgets in top left if we have a wide image
// otherwise it gets truncated below the title for some reason
if ( isWide )
sb->safePrintf ("<br><font color=white size=-1 "
"style=position:absolute;left:10px;"
"top:10px;background-color:black;>" );
else if ( mr->ptr_imgData )
sb->safePrintf ("<br><font color=gray size=-1 "
"style=position:absolute;left:%lipx;"
"top:10px;>"
, (long) PADDING + newdx + 10 );
else
sb->safePrintf ("<br><font color=gray size=-1>");
// print the url now, truncated to 50 chars
sb->safeTruncateEllipsis ( url , 50 ); // cols - 30 );
sb->safePrintf ( "</font>\n" );
}
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<url><![CDATA[");
sb->safeMemcpy ( url , urlLen );
@ -5029,7 +5087,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("</div>");
sb->safePrintf("</div><hr>");
if ( si->m_format == FORMAT_HTML )
@ -6991,7 +7049,7 @@ bool printDMOZCrumb ( SafeBuf *sb , long catId , bool xml ) {
// dirIndex = g_categories->getIndexFromId(si->m_cat_sdir);
if (dirIndex < 0) dirIndex = 0;
// display the directory bread crumb
//if( (si->m_cat_dirId > 0 && si->m_isAdmin && !si->m_isFriend)
//if( (si->m_cat_dirId > 0 && si->m_isRootAdmin && !si->m_isFriend)
// || (si->m_cat_sdir > 0 && si->m_cat_sdirt != 0) )
// sb->safePrintf("<br><br>");
// shortcut. rtl=Right To Left language format.
@ -7028,6 +7086,8 @@ bool printDMOZCrumb ( SafeBuf *sb , long catId , bool xml ) {
bool printDmozRadioButtons ( SafeBuf *sb , long catId ) ;
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr) ;
// if catId >= 1 then print the dmoz radio button
bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
SearchInput *si ) {
@ -7039,6 +7099,7 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
// now make a TABLE, left PANE contains gigabits and stuff
/*
sb->safePrintf(
// logo and menu table
"<table border=0 cellspacing=5>"
@ -7058,6 +7119,13 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
"<td>"
//, root
);
*/
if ( catId >= 0 ) {
CollectionRec *cr = g_collectiondb.getRec ( hr );
printFrontPageShell ( sb , "directory",cr); // PAGE_DIRECTORY
}
/*
// menu above search box
@ -7169,7 +7237,11 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
// put search box in a box
sb->safePrintf("<div style="
sb->safePrintf(
"<br>"
"<br>"
"<br>"
"<div style="
"background-color:#fcc714;"
"border-style:solid;"
"border-width:3px;"
@ -7179,6 +7251,7 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
"border-radius:20px;"
">");
sb->safePrintf (
//"<div style=margin-left:5px;margin-right:5px;>
"<input size=40 type=text name=q "
@ -7232,15 +7305,20 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
"<b style=margin-left:-5px;font-size:18px;"
">GO</b>"
"</div>"
);
"</div>"
// print "Search [ ] sites [ ] pages in this topic or below"
if ( catId >= 0 ) {
sb->safePrintf("<br>");
printDmozRadioButtons(sb,catId);
}
sb->safePrintf( "</div>"
"<br>"
"<br>"
);
if ( catId >= 0 ) {
printDmozRadioButtons(sb,catId);
}
/*
else {
sb->safePrintf("Try your search on: "
@ -7257,13 +7335,15 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
}
*/
// do not print filter bar if showing a dmoz topic
if ( catId < 0 )
printSearchFiltersBar ( sb , hr );
sb->safePrintf( "</form>\n"
"</td>"
"</tr>"
"</table>\n"
// "</td>"
// "</tr>"
// "</table>\n"
);
return true;
}
@ -8368,6 +8448,11 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
s_mi[n].m_cgi = "filetype=ps";
n++;
s_mi[n].m_menuNum = 3;
s_mi[n].m_title = "Spider Status";
s_mi[n].m_cgi = "filetype=status";
n++;
// facets
s_mi[n].m_menuNum = 4;

View File

@ -639,56 +639,58 @@ bool expandHtml ( SafeBuf& sb,
}
bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ) {
sb.safePrintf("<html>\n");
sb.safePrintf("<head>\n");
//sb.safePrintf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf8\">");
sb.safePrintf("<meta name=\"description\" content=\"A powerful, new search engine that does real-time indexing!\">\n");
sb.safePrintf("<meta name=\"keywords\" content=\"search, search engine, search engines, search the web, fresh index, green search engine, green search, clean search engine, clean search\">\n");
sb->safePrintf("<html>\n");
sb->safePrintf("<head>\n");
//sb->safePrintf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf8\">");
sb->safePrintf("<meta name=\"description\" content=\"A powerful, new search engine that does real-time indexing!\">\n");
sb->safePrintf("<meta name=\"keywords\" content=\"search, search engine, search engines, search the web, fresh index, green search engine, green search, clean search engine, clean search\">\n");
//char *title = "An Alternative Open Source Search Engine";
char *title = "An Alternative Open Source Search Engine";
if ( pageNum == 1 ) title = "Directory";
if ( pageNum == 2 ) title = "Advanced";
if ( pageNum == 3 ) title = "Add Url";
if ( pageNum == 4 ) title = "About";
if ( pageNum == 5 ) title = "Help";
if ( pageNum == 6 ) title = "API";
sb.safePrintf("<title>Gigablast - %s</title>\n",title);
sb.safePrintf("<style><!--\n");
sb.safePrintf("body {\n");
sb.safePrintf("font-family:Arial, Helvetica, sans-serif;\n");
sb.safePrintf("color: #000000;\n");
sb.safePrintf("font-size: 12px;\n");
sb.safePrintf("margin: 0px 0px;\n");
sb.safePrintf("letter-spacing: 0.04em;\n");
sb.safePrintf("}\n");
sb.safePrintf("a {text-decoration:none;}\n");
//sb.safePrintf("a:link {color:#00c}\n");
//sb.safePrintf("a:visited {color:#551a8b}\n");
//sb.safePrintf("a:active {color:#f00}\n");
sb.safePrintf(".bold {font-weight: bold;}\n");
sb.safePrintf(".bluetable {background:#d1e1ff;margin-bottom:15px;font-size:12px;}\n");
sb.safePrintf(".url {color:#008000;}\n");
sb.safePrintf(".cached, .cached a {font-size: 10px;color: #666666;\n");
sb.safePrintf("}\n");
sb.safePrintf("table {\n");
sb.safePrintf("font-family:Arial, Helvetica, sans-serif;\n");
sb.safePrintf("color: #000000;\n");
sb.safePrintf("font-size: 12px;\n");
sb.safePrintf("}\n");
sb.safePrintf(".directory {font-size: 16px;}\n"
if ( strcasecmp(tabName,"search") ) title = tabName;
// if ( pageNum == 1 ) title = "Directory";
// if ( pageNum == 2 ) title = "Advanced";
// if ( pageNum == 3 ) title = "Add Url";
// if ( pageNum == 4 ) title = "About";
// if ( pageNum == 5 ) title = "Help";
// if ( pageNum == 6 ) title = "API";
sb->safePrintf("<title>Gigablast - %s</title>\n",title);
sb->safePrintf("<style><!--\n");
sb->safePrintf("body {\n");
sb->safePrintf("font-family:Arial, Helvetica, sans-serif;\n");
sb->safePrintf("color: #000000;\n");
sb->safePrintf("font-size: 12px;\n");
sb->safePrintf("margin: 0px 0px;\n");
sb->safePrintf("letter-spacing: 0.04em;\n");
sb->safePrintf("}\n");
sb->safePrintf("a {text-decoration:none;}\n");
//sb->safePrintf("a:link {color:#00c}\n");
//sb->safePrintf("a:visited {color:#551a8b}\n");
//sb->safePrintf("a:active {color:#f00}\n");
sb->safePrintf(".bold {font-weight: bold;}\n");
sb->safePrintf(".bluetable {background:#d1e1ff;margin-bottom:15px;font-size:12px;}\n");
sb->safePrintf(".url {color:#008000;}\n");
sb->safePrintf(".cached, .cached a {font-size: 10px;color: #666666;\n");
sb->safePrintf("}\n");
sb->safePrintf("table {\n");
sb->safePrintf("font-family:Arial, Helvetica, sans-serif;\n");
sb->safePrintf("color: #000000;\n");
sb->safePrintf("font-size: 12px;\n");
sb->safePrintf("}\n");
sb->safePrintf(".directory {font-size: 16px;}\n"
".nav {font-size:20px;align:right;}\n"
);
sb.safePrintf("-->\n");
sb.safePrintf("</style>\n");
sb.safePrintf("\n");
sb.safePrintf("</head>\n");
sb.safePrintf("<script>\n");
sb.safePrintf("<!--\n");
sb.safePrintf("function x(){document.f.q.focus();}\n");
sb.safePrintf("// --></script>\n");
sb.safePrintf("<body onload=\"x()\">\n");
//sb.safePrintf("<body>\n");
sb->safePrintf("-->\n");
sb->safePrintf("</style>\n");
sb->safePrintf("\n");
sb->safePrintf("</head>\n");
sb->safePrintf("<script>\n");
sb->safePrintf("<!--\n");
sb->safePrintf("function x(){document.f.q.focus();}\n");
sb->safePrintf("// --></script>\n");
sb->safePrintf("<body onload=\"x()\">\n");
//sb->safePrintf("<body>\n");
//g_proxy.insertLoginBarDirective ( &sb );
//
@ -696,7 +698,7 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
//
sb.safePrintf("<TABLE border=0 height=100%% cellspacing=0 "
sb->safePrintf("<TABLE border=0 height=100%% cellspacing=0 "
"cellpadding=0>"
"\n<TR>\n");
@ -713,15 +715,19 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
{"DIRECTORY","/Top"},
{"ADVANCED","/adv.html"},
{"ADD URL","/addurl"},
{"WIDGETS","/widgets.html"},
{"SYNTAX","/syntax.html"},
{"USERS","/users.html"},
{"ABOUT","/about.html"},
{"HELP","/help.html"},
{"API","/api"}
{"NEWS","/news.html"},
{"FAQ","/faq.html"},
{"API","/api.html"}
};
//
// first the nav column
//
sb.safePrintf("<TD bgcolor=#f3c714 " // yellow/gold
sb->safePrintf("<TD bgcolor=#f3c714 " // yellow/gold
"valign=top "
"style=\"width:210px;"
"border-right:3px solid blue;"
@ -741,24 +747,33 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
"width:100px;"
"height:100px;"
"\">"
"<br style=line-height:10px;>"
);
if ( strcmp(tabName,"appliance") == 0 )
sb->safePrintf("<img style=margin-top:21px; width=90 "
"height=57 src=/computer2.png>");
else
sb->safePrintf("<br style=line-height:10px;>"
"<img width=54 height=79 src=/rocket.jpg>"
"</div>"
);
sb->safePrintf ( "</div>"
"</a>"
"</center>"
"<br>"
"<br>"
);
long n = sizeof(mi) / sizeof(MenuItem);
char *coll = "";
if ( cr ) coll = cr->m_coll;
for ( long i = 0 ; i < n ; i++ ) {
sb.safePrintf(
"<a href=%s>"
sb->safePrintf(
"<a href=%s?c=%s>"
"<div style=\""
"padding:5px;"
"position:relative;"
@ -772,14 +787,20 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
"font-size:14px;"
"x-overflow:;"
, mi[i].m_url
, coll
);
if ( i == pageNum )
sb.safePrintf(
//if ( i == pageNum )
bool matched = false;
if ( strcasecmp(mi[i].m_text,tabName) == 0 )
matched = true;
if ( matched )
sb->safePrintf(
"border-color:blue;"
"color:black;"
"background-color:white;\" ");
else
sb.safePrintf("border-color:white;"
sb->safePrintf("border-color:white;"
"color:white;"
"background-color:blue;\" "
" onmouseover=\""
@ -790,7 +811,7 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
"this.style.color='white';\""
);
sb.safePrintf(">"
sb->safePrintf(">"
// make button wider
"<nobr>"
"&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; "
@ -800,8 +821,8 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
//
// begin hack: white out the blue border line!!
//
if ( i == pageNum )
sb.safePrintf(
if ( matched )
sb->safePrintf(
"<div style=padding:5px;top:0;"
"background-color:white;"
"display:inline-block;"
@ -810,7 +831,7 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
"</div>"
);
// end hack
sb.safePrintf(
sb->safePrintf(
"</div>"
"</a>"
"<br>"
@ -818,9 +839,11 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
}
// admin link
sb.safePrintf(
"<a href=/admin/settings>"
sb->safePrintf(
"<a href=/admin/settings?c=%s>"
"<div style=\"background-color:green;"
// for try it out bubble:
//"position:relative;"
"padding:5px;"
"text-align:right;"
"border-width:3px;"
@ -841,24 +864,66 @@ bool printFrontPageShell ( SafeBuf &sb , long pageNum ) {
"this.style.backgroundColor='green';"
"this.style.color='white';\""
">"
/*
// try it out bubble div
"<div "
" onmouseover=\""
"this.style.box-shadow='10px 10px 5px #888888';"
"\""
" onmouseout=\""
"this.style.box-shadow='';"
"\""
"style=\""
"vertical-align:middle;"
"text-align:left;"
"cursor:pointer;"
"cursor:hand;"
//"border-color:black;"
//"border-style:solid;"
//"border-width:2px;"
"padding:3px;"
//"width:30px;"
//"height:20px;"
//"margin-top:-20px;"
"margin-left:-120px;"
"position:absolute;"
//"top:-20px;"
//"left:10px;"
"display:inline-block;"
"\""
">"
"<b style=font-size:11px;>"
"Click for demo"
"</b>"
"</div>"
*/
// end try it out bubble div
"<b>ADMIN</b> &nbsp; &nbsp;"
"</div>"
"</a>"
"<br>"
, coll
);
//
// now the MAIN column
//
sb.safePrintf("\n</TD><TD valign=top style=padding-left:30px;>\n");
sb->safePrintf("\n</TD><TD valign=top style=padding-left:30px;>\n");
sb.safePrintf("<br><br>");
sb->safePrintf("<br><br>");
sb.safePrintf("<a href=/><img border=0 width=470 "
sb->safePrintf("<a href=/><img border=0 width=470 "
"height=44 src=/gigablast.jpg></a>\n");
// sb.safePrintf("<br>"
// sb->safePrintf("<br>"
// "<img border=0 width=470 "
// "height=15 src=/bar.jpg>\n");
@ -891,7 +956,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
cr );//CollectionRec *cr ) {
}
printFrontPageShell ( sb ,0 );
printFrontPageShell ( &sb , "search" , cr );
//sb.safePrintf("<br><br>\n");
@ -1027,7 +1092,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
sb.safePrintf("<td><font size=+1><b>Open Source!</b>"
"</font><br>\n");
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=http://www.gigablast.com/admin.html#features>Features</a>. Limited support available for free."
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=http://www.gigablast.com/faq.html#features>Features</a>. Limited support available for free."
,80);
sb.safePrintf("<br><br>");
sb.safePrintf("</td></tr>\n");
@ -1169,6 +1234,128 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
sb.safePrintf("</td></tr>\n");
*/
//
// begin new stuff
//
// gradients
sb.safePrintf("<style><!--\n");
sb.safePrintf(".grad {");
sb.safePrintf("background: rgb(190,201,247);");
sb.safePrintf("background: url();");
sb.safePrintf("background: -moz-linear-gradient(-45deg, rgba(190,201,247,1) 0%%, rgba(11,60,237,1) 100%%);");
sb.safePrintf("background: -webkit-gradient(linear, left top, right bottom, color-stop(0%%,rgba(190,201,247,1)), color-stop(100%%,rgba(11,60,237,1)));");
sb.safePrintf("background: -webkit-linear-gradient(-45deg, rgba(190,201,247,1) 0%%,rgba(11,60,237,1) 100%%);");
sb.safePrintf("background: -o-linear-gradient(-45deg, rgba(190,201,247,1) 0%%,rgba(11,60,237,1) 100%%);");
sb.safePrintf("background: -ms-linear-gradient(-45deg, rgba(190,201,247,1) 0%%,rgba(11,60,237,1) 100%%);");
sb.safePrintf("background: linear-gradient(135deg, rgba(190,201,247,1) 0%%,rgba(11,60,237,1) 100%%);");
sb.safePrintf("filter: progid:DXImageTransform.Microsoft.gradient( startColorstr='#bec9f7', endColorstr='#0b3ced',GradientType=1 );");
sb.safePrintf("}");
sb.safePrintf("-->");
sb.safePrintf("</style>\n");
sb.safePrintf("<br>");
sb.safePrintf("<div class=grad style=\"border-radius:200px;border-color:blue;border-style:solid;border-width:3px;padding:12px;width:320px;height:320px;display:inline-block;z-index:100;color:black;position:relative;background-color:lightgray;\">");
sb.safePrintf("<br>");
sb.safePrintf("<b>");
sb.safePrintf("<font style=font-size:18px;margin-left:80px;>");
sb.safePrintf("Build Your Own");
sb.safePrintf("</font>");
sb.safePrintf("<br>");
sb.safePrintf("<font style=font-size:18px;margin-left:80px;>");
sb.safePrintf("Search Engine in the");
sb.safePrintf("</font>");
sb.safePrintf("<br>");
sb.safePrintf("<font style=font-size:18px;margin-left:80px;>");
sb.safePrintf("Cloud");
sb.safePrintf("</font>");
sb.safePrintf("</b>");
sb.safePrintf("<br>");
sb.safePrintf("<br>");
sb.safePrintf("<div style=margin-left:20px;width:270px;>");
sb.safePrintf("<a href=/admin/addcoll><img style=float:left;padding-right:15px; height=188px width=101px src=/robot3.png></a>");
//sb.safePrintf("<br>");
sb.safePrintf("<b>STEP 1.</b> <a href=/admin/addcoll?guide=1>"
"Click here to");
sb.safePrintf("<br>");
sb.safePrintf("<b>name your engine</b></a>.");
sb.safePrintf("<br>");
sb.safePrintf("<br>");
sb.safePrintf("<b>STEP 2.</b> <a href=/admin/settings?guide=1>"
"Click here to ");
sb.safePrintf("<br>");
sb.safePrintf("<b>add websites to index</b></a>.");
sb.safePrintf("<br>");
sb.safePrintf("<br>");
sb.safePrintf("<b>STEP 3.</b> <a href=/widgets.html?guide=1>"
"Click here to");
sb.safePrintf("<br>");
sb.safePrintf("<b>insert search box</b></a>.");
sb.safePrintf("</div>");
sb.safePrintf("</div>");
sb.safePrintf("<div class=grad style=\"border-radius:200px;border-color:blue;border-style:solid;border-width:3px;padding:12px;width:280px;height:280px;display:inline-block;z-index:105;color:black;margin-left:-50px;position:absolute;margin-top:50px;background-color:lightgray;\">");
sb.safePrintf("<br>");
sb.safePrintf("<br style=line-height:25px;>");
sb.safePrintf("<b>");
sb.safePrintf("<font style=font-size:18px;margin-left:40px;>");
sb.safePrintf("Web Search Appliance");
sb.safePrintf("</font>");
sb.safePrintf("<br>");
sb.safePrintf("<br>");
sb.safePrintf("<br>");
sb.safePrintf("</b>");
sb.safePrintf("<div style=margin-left:20px;width:270px;>");
sb.safePrintf("<a href=http://www.gigablast.com/appliance.html><img style=float:left;padding-bottom:20px;padding-right:10px; height=81px width=121px src=/computer2.png></a>");
sb.safePrintf("Put the web in your closet. ");
sb.safePrintf("Jump start your efforts with four 1U supermicro servers loaded with the top 2 billion pages from the web. <a href=http://www.gigablast.com/appliance.html>[learn more]</a>");
sb.safePrintf("</font>");
sb.safePrintf("</div>");
sb.safePrintf("</div>");
sb.safePrintf("<div class=grad style=\"border-radius:300px;border-color:blue;border-style:solid;border-width:3px;padding:12px;width:240px;height:240px;display:inline-block;z-index:110;color:black;margin-left:-240px;position:absolute;margin-top:230px;background-color:lightgray;\">");
sb.safePrintf("<br>");
sb.safePrintf("<b>");
sb.safePrintf("<font style=font-size:18px;margin-left:60px;>");
sb.safePrintf("Open Source");
sb.safePrintf("</font>");
sb.safePrintf("<br>");
sb.safePrintf("<br>");
sb.safePrintf("</b>");
sb.safePrintf("<div style=margin-left:30px;margin-right:5px;>");
sb.safePrintf("<a href=http://www.gigablast.com/faq.html#features><img style=float:left;padding-right:10px height=71px width=71px src=/unlocked2.png></a>");
sb.safePrintf("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=http://www.gigablast.com/faq.html#features>Features</a>.");
sb.safePrintf("</div>");
sb.safePrintf("</div>");
//sb.safePrintf("</TD></TR></TABLE></body></html>");
//
// end new stuff
//
sb.safePrintf("\n");
sb.safePrintf("\n");
@ -1180,7 +1367,9 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
printFrontPageShell ( sb , 3 );
CollectionRec *cr = g_collectiondb.getRec ( r );
printFrontPageShell ( &sb , "add url" , cr );
sb.safePrintf("<script type=\"text/javascript\">\n"
@ -1215,7 +1404,6 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
sb.safePrintf("<form method=GET "
"action=/addurl name=f>\n" );
CollectionRec *cr = g_collectiondb.getRec ( r );
char *coll = "";
if ( cr ) coll = cr->m_coll;
if ( cr )
@ -1378,8 +1566,9 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
if ( format != FORMAT_HTML )
return printTopDirectory ( sb , format );
CollectionRec *cr = g_collectiondb.getRec ( r );
printFrontPageShell ( sb , 1 );
printFrontPageShell ( &sb , "directory" , cr );
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
@ -1388,7 +1577,6 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<form method=GET "
"action=/search name=f>\n");
CollectionRec *cr = g_collectiondb.getRec ( r );
if ( cr )
sb.safePrintf("<input type=hidden name=c value=\"%s\">",
cr->m_coll);
@ -1648,7 +1836,7 @@ bool printTopDirectory ( SafeBuf& sb , char format ) {
"<br>"
"<b>"
"Please follow the set up "
"<a href=/admin.html#dmoz>"
"<a href=/faq.html#dmoz>"
"instructions"
"</a>."
"</b>"
@ -1839,7 +2027,7 @@ public:
//Msg4 m_msg4;
Msg7 m_msg7;
TcpSocket *m_socket;
bool m_isAdmin;
bool m_isRootAdmin;
char m_coll[MAX_COLL_LEN+1];
bool m_goodAnswer;
bool m_doTuringTest;
@ -2017,7 +2205,7 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
mnew ( st1 , sizeof(State1i) , "PageAddUrl" );
// save socket and isAdmin
st1->m_socket = sock;
st1->m_isAdmin = isAdmin;
st1->m_isRootAdmin = isAdmin;
/*
// save the url
@ -2089,7 +2277,7 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
long now = getTimeGlobal();
// . allow 1 submit every 1 hour
// . restrict by submitter domain ip
if ( ! st1->m_isAdmin &&
if ( ! st1->m_isRootAdmin &&
! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
// return error page
//g_errno = ETOOEARLY;
@ -2115,7 +2303,7 @@ bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
// check it, if turing test is enabled for this collection
/*
if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
if ( ! st1->m_isRootAdmin && cr->m_doTuringTest &&
! g_turingTest.isHuman(r) ) {
// log note so we know it didn't make it
g_msg = " (error: bad answer)";
@ -2170,7 +2358,7 @@ void doneInjectingWrapper3 ( void *st ) {
log(LOG_INFO,"http: add url %s (%s)",url ,mstrerror(g_errno));
// extract info from state
TcpSocket *sock = st1->m_socket;
//bool isAdmin = st1->m_isAdmin;
//bool isAdmin = st1->m_isRootAdmin;
//char *url = NULL;
//if ( st1->m_urlLen ) url = st1->m_url;
// re-null it out if just http://
@ -2391,7 +2579,9 @@ bool sendPageAdvanced ( TcpSocket *sock , HttpRequest *hr ) {
SafeBuf sb;
printFrontPageShell ( sb , 2 );
CollectionRec *cr = g_collectiondb.getRec ( hr );
printFrontPageShell ( &sb , "advanced" , cr );
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
@ -2400,7 +2590,6 @@ bool sendPageAdvanced ( TcpSocket *sock , HttpRequest *hr ) {
sb.safePrintf("<form method=GET "
"action=/search name=f>\n" );
CollectionRec *cr = g_collectiondb.getRec ( hr );
char *coll = "";
if ( cr ) coll = cr->m_coll;
if ( cr )
@ -2566,7 +2755,9 @@ bool sendPageAbout ( TcpSocket *sock , HttpRequest *hr ) {
SafeBuf sb;
printFrontPageShell ( sb , 4 );
CollectionRec *cr = g_collectiondb.getRec ( hr );
printFrontPageShell ( &sb , "about" , cr );
sb.safePrintf("<br>\n");
@ -2576,7 +2767,6 @@ bool sendPageAbout ( TcpSocket *sock , HttpRequest *hr ) {
//sb.safePrintf("<form method=GET "
// "action=/addurl name=f>\n" );
CollectionRec *cr = g_collectiondb.getRec ( hr );
char *coll = "";
if ( cr ) coll = cr->m_coll;
if ( cr )
@ -2618,7 +2808,7 @@ bool sendPageAbout ( TcpSocket *sock , HttpRequest *hr ) {
" </p>"
""
"<p>"
"Matt Wells is currently the sole maintainer and programmer of Gigablast and is open for <u>consulting work</u>. For more information, contact us at <br><img src=>"
"For more information, contact Matt directly at <br><img src=>"
"</p>"
"<br>"
"<center>"
@ -2657,7 +2847,9 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
SafeBuf sb;
printFrontPageShell ( sb , 5 );
CollectionRec *cr = g_collectiondb.getRec ( hr );
printFrontPageShell ( &sb , "syntax" , cr );
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
@ -2666,7 +2858,6 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
//sb.safePrintf("<form method=GET "
// "action=/addurl name=f>\n" );
// CollectionRec *cr = g_collectiondb.getRec ( hr );
// char *coll = "";
// if ( cr ) coll = cr->m_coll;
// if ( cr )

View File

@ -805,9 +805,15 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
"<tr class=poo><td><b>Uptime</b></td><td>%s</td></tr>\n"
"<tr class=poo><td><b>Process ID</b></td><td>%lu</td></tr>\n"
"<tr class=poo><td><b>Corrupted Disk Reads</b></td><td>%li</td></tr>\n"
"<tr class=poo><td><b>SIGALRMS</b></td><td>%li</td></tr>\n"
"<tr class=poo><td><b>SIGVTALRMS</b></td><td>%li</td></tr>\n"
"<tr class=poo><td><b>read signals</b></td><td>%lli</td></tr>\n"
"<tr class=poo><td><b>write signals</b></td><td>%lli</td></tr>\n"
"<tr class=poo><td><b>SIGCHLDS</b></td><td>%li</td></tr>\n"
"<tr class=poo><td><b>SIGQUEUES</b></td><td>%li</td></tr>\n"
"<tr class=poo><td><b>SIGOTHERS</b></td><td>%li</td></tr>\n"
//"<tr class=poo><td><b>read signals</b></td><td>%lli</td></tr>\n"
//"<tr class=poo><td><b>write signals</b></td><td>%lli</td></tr>\n"
"<tr class=poo><td><b>quickpolls</b></td><td>%li</td></tr>\n"
"<tr class=poo><td><b>Kernel Version</b></td><td>%s</td></tr>\n"
//"<tr class=poo><td><b>Gigablast Version</b></td><td>%s %s</td></tr>\n"
@ -825,8 +831,14 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
(unsigned long)getpid(),
g_numCorrupt,
g_numAlarms,
g_stats.m_readSignals,
g_stats.m_writeSignals,
g_numVTAlarms,
g_numSigChlds,
g_numSigQueues,
g_numSigOthers,
//g_stats.m_readSignals,
//g_stats.m_writeSignals,
g_numQuickPolls,
kv ,
//GBPROJECTNAME,

View File

@ -26,7 +26,7 @@ class State4 {
public:
TcpSocket *m_socket;
XmlDoc m_xd;
bool m_isAdmin;
bool m_isRootAdmin;
bool m_isLocal;
long long m_docId;
char *m_pwd;
@ -58,7 +58,7 @@ bool sendPageTitledb ( TcpSocket *s , HttpRequest *r ) {
// copy it
st->m_r.copy ( r );
// remember if http request is internal/local or not
st->m_isAdmin = g_conf.isCollAdmin ( s , r );
st->m_isRootAdmin = g_conf.isCollAdmin ( s , r );
st->m_isLocal = r->isLocal();
st->m_docId = docId;
// password, too

View File

@ -51,7 +51,7 @@ public:
long m_turkIp;
long m_date1;
long m_date2;
//char m_isAdmin;
//char m_isRootAdmin;
char m_coll [ MAX_COLL_LEN + 1];
long m_collLen;
TcpSocket *m_socket;
@ -1687,7 +1687,7 @@ public:
CaptchaState m_cst;
bool m_isSuperTurk;
long long m_tuid64;
//char m_isAdmin;
//char m_isRootAdmin;
char m_coll [ MAX_COLL_LEN + 1];
long m_collLen;
TcpSocket *m_socket;
@ -2027,7 +2027,7 @@ void doneReindexing ( void *state ) {
bool presentTurkForm ( State61 *st ) {
// set stuff now
//st->m_isAdmin = isAdmin;
//st->m_isRootAdmin = isAdmin;
// if g_errno was set then the last injection did not go through
// perhaps because of ENOMEM or the geocoder was down!

167
Pages.cpp
View File

@ -667,7 +667,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
// . will only add parm recs we have permission to modify
// . if no collection supplied will just return true with no g_errno
if ( isAdmin &&
! g_parms.convertHttpRequestToParmList ( r , parmList , page ) )
! g_parms.convertHttpRequestToParmList ( r, parmList, page, s))
return g_httpServer.sendErrorReply(s,505,mstrerror(g_errno));
@ -975,7 +975,7 @@ bool printTopNavButton ( char *text,
else
sb->safePrintf(
"<a style=text-decoration:none; href=%s?%s>"
"<a style=text-decoration:none; href=%s?c=%s>"
"<div "
" onmouseover=\""
@ -1190,10 +1190,11 @@ bool Pages::printAdminTop (SafeBuf *sb ,
"valign=top "
"style=\""
"width:210px;"
"max-width:210px;"
"border-right:3px solid blue;"
"\">"
"<br>"
"<br style=line-height:14px;>"
"<center>"
"<a href=/?c=%s>"
@ -1269,7 +1270,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//"max-width:200px;"
//"min-width:200px;"
"width:200px;"
"width:190px;"
"padding:4px;" // same as TABLE_STYLE
"margin-left:10px;"
@ -1295,7 +1296,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//status&=printCollectionNavBar ( sb, page , username , coll,pwd, qs );
// collection navbar
status&=printCollectionNavBar ( sb, page , username , coll,pwd, qs );
status&=printCollectionNavBar ( sb, page , username, coll,pwd, qs,s,r);
sb->safePrintf("</div>");
@ -1453,6 +1454,12 @@ bool Pages::printAdminTop (SafeBuf *sb ,
sb->safePrintf("<br>");
if ( page != PAGE_BASIC_SETTINGS )
return true;
// gigabot helper blurb
printGigabotAdvice ( sb , page , r );
// begin 2nd row in big table
//sb->safePrintf("</td></TR>");
@ -1461,6 +1468,117 @@ bool Pages::printAdminTop (SafeBuf *sb ,
return true;
}
// . prints the "gigabot" step-by-step helper box for the guided
//   "build your own search engine" walkthrough
// . only prints for html replies whose request has a non-zero "guide" parm
// . "page" selects which step's text to show (PAGE_ADDCOLL,
//   PAGE_BASIC_SETTINGS or PAGE_BASIC_STATUS). any other page gets the
//   hidden guide input but no advice box.
// . always returns true. the return values of the safePrintf() calls are
//   not checked here.
bool printGigabotAdvice ( SafeBuf *sb , long page , HttpRequest *hr ) {

	// the advice box is html markup, so skip it for other reply formats
	char format = hr->getFormat();
	if ( format != FORMAT_HTML ) return true;

	// keep this as a long. storing it in a char would truncate the
	// value so that something like guide=256 would look like 0.
	long guide = hr->getLong("guide",0);
	if ( ! guide ) return true;

	// emit guide=1 as a hidden input. presumably this lands inside the
	// enclosing admin form so the guide stays on across submits --
	// NOTE(review): confirm against the caller's form markup.
	sb->safePrintf("<input type=hidden name=guide value=1>\n");

	// we only show to guest users. if we are logged in as master admin
	// then skip this step.
	//if ( hr->isGuestAdmin() )
	//	return false;

	// also, only show if running in matt's data center
	//if ( ! g_conf.m_isMattWells )
	//	return true;

	// light blue box that wraps the robot image and the advice text.
	// this string is printed as a "%s" ARGUMENT below, not as a format
	// string, so a percent sign must NOT be doubled in here or the
	// browser gets a literal "100%%".
	const char *box =
		"<table cellpadding=5 "
		// full width of enclosing div
		"width=100% "
		"style=\""
		//"background-color:gold;"
		//"border:3px blue solid;"
		"background-color:lightblue;"
		"border:3px blue solid;"
		"border-radius:8px;"
		//"max-width:500px;"
		"\" "
		"border=0"
		">"
		"<tr><td>";

	const char *boxEnd =
		"</td></tr></table>";

	// pick the text for the step this page represents
	const char *advice = NULL;

	if ( page == PAGE_ADDCOLL )
		advice =
			"STEP 1 of 3. "
			"<br>"
			"<br>"
			//"Human, I am Gigabot."
			//"<br><br>"
			"Enter the name of your collection "
			"(search engine) in the box below then hit "
			"submit."
			"<br>"
			"<br>"
			"Remember this name so you can access the controls "
			"later."
			// "Do not deviate from this path or you may "
			// "be blasted."
			;

	if ( page == PAGE_BASIC_SETTINGS )
		advice =
			"STEP 2 of 3. "
			"<br>"
			"<br>"
			"Enter the list of websites you want to be in your "
			"search engine into the box marked <i>site list</i>."
			// "<br>"
			// "<br>"
			// "Do not deviate from this path, or, as is always "
			// "the case, you may "
			// "be blasted."
			;

	if ( page == PAGE_BASIC_STATUS )
		advice =
			"STEP 3 of 3. "
			"<br>"
			"<br>"
			"Ensure you see search results appearing in "
			"the box below. If not, then you have spider "
			"problems."
			"<br>"
			"<br>"
			"Click on the links in the lower right to expose "
			"the source code. Copy and paste this code "
			"into your website to make a search box that "
			"connects to the search engine you have created. "
			;

	// no walkthrough step is associated with this page
	if ( ! advice ) return true;

	// constrain the width of the whole advice widget
	sb->safePrintf("<div style=max-width:490px;"
		       "padding-right:10px;>");

	sb->safePrintf("%s",box);

	// the mean looking robot goes in the left cell, the advice text
	// in the right cell
	sb->safePrintf("<img style=float:left;padding-right:15px; "
		       "height=141px width=75px src=/robot3.png>"
		       "</td><td>"
		       "<b>"
		       "%s"
		       "</b>"
		       , advice
		       );

	sb->safePrintf("%s",boxEnd);

	sb->safePrintf("<br><br></div>");

	return true;
}
/*
bool Pages::printAdminTop2 (SafeBuf *sb ,
@ -2190,9 +2308,10 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
}
// print documentation links
/*
if ( ! isBasic )
sb->safePrintf(" <a style=text-decoration:none "
"href=/admin.html>"
"href=/faq.html>"
"<b>"
"admin guide"
"</b></a> "
@ -2204,6 +2323,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
"<b>dev guide</b></a>"
);
*/
sb->safePrintf("</div>");
@ -2223,7 +2343,9 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
char *username,
char *coll ,
char *pwd ,
char *qs ) {
char *qs ,
TcpSocket *sock ,
HttpRequest *hr ) {
bool status = true;
//if ( ! pwd ) pwd = "";
if ( ! qs ) qs = "";
@ -2262,9 +2384,17 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
char *s = "s";
if ( g_collectiondb.m_numRecsUsed == 1 ) s = "";
bool isRootAdmin = g_conf.isRootAdmin ( sock , hr );
if ( isRootAdmin )
sb->safePrintf ( "<center><nobr><b>%li Collection%s</b></nobr>"
"</center>\n",
g_collectiondb.m_numRecsUsed , s );
else
sb->safePrintf ( "<center><nobr><b>Collections</b></nobr>"
"</center>\n");
sb->safePrintf( "<center>"
"<nobr>"
@ -2295,6 +2425,18 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cc = g_collectiondb.m_recs[i];
if ( ! cc ) continue;
//
// CLOUD SEARCH ENGINE SUPPORT
//
// if not root admin and collrec's password does not match
// the one we are logged in with (in the cookie) then skip it
// if ( ! isRootAdmin &&
// cr->m_password &&
// ! strcmp(cr->m_password,pwd) )
// continue;
char *cname = cc->m_coll;
row++;
@ -2712,7 +2854,7 @@ int parmcmp ( const void *a, const void *b ) {
#define DARK_YELLOW "ffaaaa"
#define LIGHT_YELLOW "ffcccc"
bool printFrontPageShell ( SafeBuf &sb , long pageNum ) ;
bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr ) ;
// let's use a separate section for each "page"
// then have 3 tables, the input parms,
@ -2740,7 +2882,7 @@ bool sendPageAPI ( TcpSocket *s , HttpRequest *r ) {
// new stuff
printFrontPageShell ( p , 6 );
printFrontPageShell ( &p , "api" , cr );
//p.safePrintf("<style>body,td,p,.h{font-family:arial,helvetica-neue; "
@ -2771,6 +2913,11 @@ bool sendPageAPI ( TcpSocket *s , HttpRequest *r ) {
"should use POST.");
p.safePrintf("<br><br>");
p.safePrintf("NOTE: All APIs support both http and https "
"protocols.");
p.safePrintf("<br><br>");
p.safePrintf(//"<div style=padding-left:10%%>"
"<font size=+2><b>API by pages</b></font>"
"<ul>"
@ -2967,7 +3114,7 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
"</tr>"
"<tr bgcolor=#%s>"
"<td><b>#</b></td>"
"<td><b>parm</b></td>"
"<td><b>Parm</b></td>"
//"<td><b>Page</b></td>"
"<td><b>Type</b></td>"
"<td><b>Title</b></td>"

View File

@ -29,11 +29,12 @@ extern char *g_msg;
// . declare all dynamic functions here
// . these are all defined in Page*.cpp files
// . these are called to send a dynamic page
bool sendPageWidgets ( TcpSocket *socket , HttpRequest *hr ) ;
bool sendPageBasicSettings ( TcpSocket *s , HttpRequest *r );
bool sendPageBasicStatus ( TcpSocket *s , HttpRequest *r );
//bool sendPageBasicDiffbot ( TcpSocket *s , HttpRequest *r );
bool printGigabotAdvice ( SafeBuf *sb , long page , HttpRequest *hr ) ;
bool sendPageRoot ( TcpSocket *s , HttpRequest *r );
bool sendPageRoot ( TcpSocket *s , HttpRequest *r, char *cookie );
@ -267,7 +268,9 @@ class Pages {
char *username,
char *coll ,
char *pwd ,
char *qs );
char *qs ,
TcpSocket *sock ,
HttpRequest *hr );
/*
char *printCollectionNavBar ( char *p ,
char *pend ,

177
Parms.cpp
View File

@ -1162,6 +1162,31 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r ) {
long page = g_pages.getDynamicPageNumber ( r );
char format = r->getReplyFormat();
//
// CLOUD SEARCH ENGINE SUPPORT
//
char *action = r->getString("action",NULL);
if ( page == PAGE_BASIC_SETTINGS &&
// this is non-null if handling a submit request
action &&
format == FORMAT_HTML ) {
//return g_parms.sendPageGeneric ( s, r, PAGE_BASIC_SETTINGS );
// just redirect to it
char *coll = r->getString("c",NULL);
if ( coll ) {
sb->safePrintf("<meta http-equiv=Refresh "
"content=\"0; URL=/widgets.html"
"?guide=1&c=%s\">",
coll);
return g_httpServer.sendDynamicPage (s,
sb->getBufStart(),
sb->length());
}
}
//
// some "generic" pages do additional processing on the provided input
// so we need to call those functions here...
@ -1176,7 +1201,6 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r ) {
//}
// print standard header
char format = r->getReplyFormat();
if ( format != FORMAT_XML && format != FORMAT_JSON )
g_pages.printAdminTop ( sb , s , r );
@ -2991,6 +3015,14 @@ void Parms::setParm ( char *THIS , Parm *m , long mm , long j , char *s ,
*(float *)(THIS + m->m_off + 4*j) = (float)atof ( s );
newVal = *(float *)(THIS + m->m_off + 4*j);
goto changed; }
else if ( t == TYPE_DOUBLE ) {
if( fromRequest &&
*(double *)(THIS + m->m_off + 4*j) == (double)atof ( s ) )
return;
if ( fromRequest ) oldVal = *(double *)(THIS + m->m_off + 4*j);
*(double *)(THIS + m->m_off + 4*j) = (double)atof ( s );
newVal = *(double *)(THIS + m->m_off + 4*j);
goto changed; }
else if ( t == TYPE_IP ) {
if ( fromRequest && *(long *)(THIS + m->m_off + 4*j) ==
(long)atoip (s,gbstrlen(s) ) )
@ -5638,7 +5670,7 @@ void Parms::init ( ) {
m->m_desc = "Remove all documents from the collection and re-add "
"seed urls from site list.";
// If you do this accidentally there "
//"is a <a href=/admin.html#recover>recovery procedure</a> to "
//"is a <a href=/faq.html#recover>recovery procedure</a> to "
// "get back the trashed data.";
m->m_cgi = "restart";
m->m_page = PAGE_BASIC_SETTINGS;
@ -6065,6 +6097,25 @@ void Parms::init ( ) {
m->m_flags = PF_API | PF_REQUIRED;
m++;
//
// CLOUD SEARCH ENGINE SUPPORT
//
// used to prevent a guest ip adding more than one coll
m->m_title = "user ip";
m->m_desc = "IP of user adding collection.";
m->m_cgi = "userip";
m->m_xml = "userIp";
m->m_off = (char *)&cr.m_userIp - x;
m->m_type = TYPE_STRING;
m->m_size = 16;
m->m_def = "";
m->m_group = 0;
m->m_flags = PF_HIDDEN;// | PF_NOSAVE;
m->m_page = PAGE_ADDCOLL;
m->m_obj = OBJ_COLL;
m++;
m->m_title = "add custom crawl";
m->m_desc = "add custom crawl";
m->m_cgi = "addCrawl";
@ -9061,7 +9112,7 @@ void Parms::init ( ) {
m->m_title = "admin override";
m->m_desc = "admin override";
m->m_off = (char *)&si.m_isAdmin - y;
m->m_off = (char *)&si.m_isRootAdmin - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_cgi = "admin";
@ -9456,7 +9507,11 @@ void Parms::init ( ) {
m->m_title = "max total spiders";
m->m_desc = "What is the maximum number of web "
"pages the spider is allowed to download "
"simultaneously for ALL collections PER HOST?";
"simultaneously for ALL collections PER HOST? Caution: "
"raising this too high could result in some Out of Memory "
"(OOM) errors. The hard limit is currently 300. Each "
"collection has its own limit in the <i>spider controls</i> "
"that you may have to increase as well.";
m->m_cgi = "mtsp";
m->m_off = (char *)&g_conf.m_maxTotalSpiders - g;
m->m_type = TYPE_LONG;
@ -9488,6 +9543,18 @@ void Parms::init ( ) {
m->m_obj = OBJ_CONF;
m++;
m->m_title = "allow cloud users";
m->m_desc = "Can guest users create a collection?";
m->m_cgi = "acu";
m->m_off = (char *)&g_conf.m_allowCloudUsers - g;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF;
m++;
m->m_title = "auto save frequency";
m->m_desc = "Save data in memory to disk after this many minutes "
"have passed without the data having been dumped or saved "
@ -11899,7 +11966,7 @@ void Parms::init ( ) {
m->m_cgi = "qmdt";
m->m_off = (char *)&g_conf.m_queryMaxDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "50";
m->m_def = "100";
m->m_units = "threads";
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_MASTER;
@ -11911,7 +11978,7 @@ void Parms::init ( ) {
m->m_cgi = "qmbdt";
m->m_off = (char *)&g_conf.m_queryMaxBigDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "30"; // 1
m->m_def = "60"; // 1
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
@ -11924,7 +11991,7 @@ void Parms::init ( ) {
m->m_cgi = "qmmdt";
m->m_off = (char *)&g_conf.m_queryMaxMedDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "30"; // 3
m->m_def = "80"; // 3
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
@ -11937,7 +12004,7 @@ void Parms::init ( ) {
m->m_cgi = "qmsdt";
m->m_off = (char *)&g_conf.m_queryMaxSmaDiskThreads - g;
m->m_type = TYPE_LONG;
m->m_def = "40";
m->m_def = "80";
m->m_units = "threads";
m->m_group = 0;
m->m_flags = 0;//PF_HIDDEN | PF_NOSAVE;
@ -15718,11 +15785,14 @@ void Parms::init ( ) {
m->m_title = "max spiders";
m->m_desc = "What is the maximum number of web "
"pages the spider is allowed to download "
"simultaneously PER HOST for THIS collection?";
"simultaneously PER HOST for THIS collection? The "
"maximum number of spiders over all collections is "
"controlled in the <i>master controls</i>.";
m->m_cgi = "mns";
m->m_off = (char *)&cr.m_maxNumSpiders - x;
m->m_type = TYPE_LONG;
m->m_def = "100";
// make it the hard max so control is really in the master controls
m->m_def = "300";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
@ -19594,11 +19664,37 @@ bool Parms::addCurrentParmToList2 ( SafeBuf *parmList ,
// returns false and sets g_errno on error
bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
long page ){
long page , TcpSocket *sock ) {
// false = useDefaultRec?
CollectionRec *cr = g_collectiondb.getRec ( hr , false );
//
// CLOUD SEARCH ENGINE SUPPORT
//
// if not the root admin, only allow the user to change settings, etc.
// if the collection rec is a guest collection, i.e. in the cloud.
//
bool isRootAdmin = g_conf.isRootAdmin(sock,hr);
bool isRootColl = false;
if ( cr && strcmp(cr->m_coll,"main")==0 ) isRootColl = true;
if ( cr && strcmp(cr->m_coll,"dmoz")==0 ) isRootColl = true;
if ( cr && strcmp(cr->m_coll,"demo")==0 ) isRootColl = true;
// the main,dmoz and demo collections are root admin only
if ( ! isRootAdmin && isRootColl ) {
g_errno = ENOPERM;
return log("parms: root admin can only change main/dmoz/demo"
" collections.");
}
// just knowing the collection name is enough for a cloud user to
// modify the collection's parms. however, to modify the master
// controls or stuff in g_conf, you have to be root admin.
if ( ! g_conf.m_allowCloudUsers && ! isRootAdmin ) {
g_errno = ENOPERM;
return log("parms: permission denied for user");
}
//if ( c ) {
// cr = g_collectiondb.getRec ( hr );
// if ( ! cr ) log("parms: coll not found");
@ -19792,6 +19888,13 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
// given as a string.
val = oldCollName;
//
// CLOUD SEARCH ENGINE SUPPORT
//
// master controls require root permission
if ( m->m_obj == OBJ_CONF && ! isRootAdmin )
continue;
// add the cmd parm
if ( ! addNewParmToList2 ( parmList ,
// it might be a collection-less
@ -19830,6 +19933,29 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
//}
//
// CLOUD SEARCH ENGINE SUPPORT
//
// provide userip so when adding a new collection we can
// store it in the collection rec to ensure that the same
// IP address cannot add more than one collection.
//
if ( sock && page == PAGE_ADDCOLL ) {
char *ipStr = iptoa(sock->m_ip);
long occNum;
Parm *um = getParmFast1 ( "userip" , &occNum); // NULL = occNum
if ( ! addNewParmToList2 ( parmList ,
// HACK! operate on the to-be-added
// collrec, if there was an addcoll
// reset or restart coll cmd...
parmCollnum ,
ipStr, // val ,
occNum ,
um ) )
return false;
}
//
// now add the parms that are NOT commands
//
@ -19850,6 +19976,35 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
long occNum;
Parm *m = getParmFast1 ( field , &occNum );
//
// CLOUD SEARCH ENGINE SUPPORT
//
// master controls require root permission. otherwise, just
// knowing the collection name is enough for a cloud user
// to change settings.
//
if ( m && m->m_obj == OBJ_CONF && ! isRootAdmin )
continue;
//
// CLOUD SEARCH ENGINE SUPPORT
//
// if this IP c-block has already added a collection then do not
// allow it to add another.
//
if ( m && strcmp(m->m_cgi,"addcoll")==0 && ! isRootAdmin ) {
// see if user's c block has already added a collection
long numAdded = 0;
if ( numAdded >= 1 ) {
g_errno = ENOPERM;
log("parms: already added a collection from "
"this cloud user's c-block.");
return false;
}
}
//
// map "pause" to spidering enabled
//

View File

@ -462,7 +462,7 @@ class Parms {
long occNum ,
Parm *m ) ;
bool convertHttpRequestToParmList (HttpRequest *hr,SafeBuf *parmList,
long page);
long page , TcpSocket *sock );
Parm *getParmFast2 ( long cgiHash32 ) ;
Parm *getParmFast1 ( char *cgi , long *occNum ) ;
bool broadcastParmList ( SafeBuf *parmList ,

View File

@ -2529,7 +2529,7 @@ public:
HttpRequest m_hr;
long m_submittingNewUser;
// is this really the admin logged in as another user?
bool m_isAdmin;
bool m_isRootAdmin;
long long m_adminSessId;
long m_adminId;
@ -2702,7 +2702,7 @@ UserInfo *Proxy::getLoggedInUserInfo ( StateUser *su , SafeBuf *errmsg ) {
// reset shit
su->m_userId32 = -1;
su->m_sessionId64 = 0;
su->m_isAdmin = false;
su->m_isRootAdmin = false;
su->m_adminSessId = 0LL;
su->m_adminId = 0;
@ -2732,7 +2732,7 @@ UserInfo *Proxy::getLoggedInUserInfo ( StateUser *su , SafeBuf *errmsg ) {
// check it
if ( ui->m_lastSessionId64 != asi ) continue;
// got a match
su->m_isAdmin = true;
su->m_isRootAdmin = true;
// save the underlying admin user info
su->m_adminSessId = asi;
su->m_adminId = ui->m_userId32;
@ -3099,7 +3099,7 @@ bool printLogoutPage ( StateUser *su ) {
// disguise, then redirect back to our main page. but if we are
// the admin and NOT logged in as someone else, then log us out
// as normal!
if ( su->m_isAdmin && su->m_adminSessId != su->m_sessionId64 ) {
if ( su->m_isRootAdmin && su->m_adminSessId != su->m_sessionId64 ) {
sb.reset();
sb.safePrintf("<META HTTP-EQUIV=refresh "
"content=\"0;URL=/account\">");
@ -4551,7 +4551,7 @@ bool Proxy::printAccountingInfoPage ( StateUser *su , SafeBuf *errmsg ) {
// the admin can credit the account if he receives a wire or a check
// from a user...
/*
if ( su->m_isAdmin )
if ( su->m_isRootAdmin )
sb->safePrintf("<br>"
"<font color=red>"
"Record Wire of "

View File

@ -3592,7 +3592,7 @@ bool SafeBuf::printTimeAgo ( long ago , long now , bool shorthand ) {
}
// do not show if more than 1 wk old! we want to seem as
// fresh as possible
if ( ! printed && ago > 0 ) { // && si->m_isAdmin ) {
if ( ! printed && ago > 0 ) { // && si->m_isRootAdmin ) {
long ts = now - ago;
struct tm *timeStruct = localtime ( &ts );
char tmp[100];

View File

@ -357,8 +357,8 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
//
//////
// set m_isAdmin to zero if no correct ip or password
if ( ! g_conf.isRootAdmin ( sock , &m_hr ) ) m_isAdmin = 0;
// set m_isRootAdmin to zero if no correct ip or password
if ( ! g_conf.isRootAdmin ( sock , &m_hr ) ) m_isRootAdmin = 0;
//////////////////////////////////////

View File

@ -117,7 +117,7 @@ class SearchInput {
Query *m_q2;
char m_isAdmin;
char m_isRootAdmin;
// these are set from things above

View File

@ -1448,7 +1448,7 @@ void handleRequestc1 ( UdpSlot *slot , long niceness ) ;
// . max spiders we can have going at once for this process
// . limit to 50 to prevent OOM conditions
#define MAX_SPIDERS 100
#define MAX_SPIDERS 300
class SpiderLoop {

View File

@ -18,8 +18,8 @@ Stats::Stats ( ) {
//m_minWindowStartTime = 0;
memset ( m_pts , 0 , sizeof(StatPoint)*MAX_POINTS );
m_readSignals = 0;
m_writeSignals = 0;
//m_readSignals = 0;
//m_writeSignals = 0;
m_slowDiskReads = 0;
m_queryTimes = 0;
m_numQueries = 0;
@ -606,10 +606,10 @@ void Stats::printGraphInHtml ( SafeBuf &sb ) {
//"background-color:#000000;"
"z-index:110;"
"min-height:20px;"
"min-width:3px;\">%lis</div>\n"
"min-width:3px;\">%.01fs</div>\n"
, (long)x-10
// the label:
,(long)(DT * (long long)x / (long long)DX)/1000
,(float)(DT* (long long)x / (long long)DX)/1000.0
);
// move cursor

View File

@ -110,8 +110,8 @@ class Stats {
long m_totalNumFails;
float m_avgQueryTime;
float m_successRate;
long long m_readSignals;
long long m_writeSignals;
//long long m_readSignals;
//long long m_writeSignals;
// set in BigFile.cpp
long m_slowDiskReads;

View File

@ -4041,7 +4041,7 @@ public:
//long m_bufLen;
bool m_isLocal;
//long m_fileNum;
//bool m_isAdmin;
//bool m_isRootAdmin;
//bool m_isAssassin;
// . Commented by Gourav
// . Reason:user perm no longer used
@ -4103,7 +4103,7 @@ bool sendPageTagdb ( TcpSocket *s , HttpRequest *req ) {
sizeof(State12),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));}
mnew ( st , sizeof(State12) , "PageTagdb" );
//st->m_isAdmin = isAdmin;
//st->m_isRootAdmin = isAdmin;
//st->m_isAssassin = isAssassin;
// . Commented by Gourav
// . Reason:user perm no longer used
@ -4537,13 +4537,13 @@ bool sendReply2 ( void *state ) {
"%s</textarea></td></tr>" , uu );
// spam assassins should not use this much power, too risky
//if ( st->m_isAdmin ) {
//if ( st->m_isRootAdmin ) {
// sb.safePrintf ("<i><font size=-1>Note: use 1.2.3.<b>0</b> to "
// "specify ip domain.</i><br>");
//}
// allow filename to load them from
//if ( st->m_isAdmin ) {
//if ( st->m_isRootAdmin ) {
sb.safePrintf("<tr class=poo>"
"<td>"
"<b>file of urls to tag</b>"

View File

@ -763,10 +763,11 @@ long Threads::timedCleanUp (long maxTime, long niceness) {
m_threadQueues[j].timedCleanUp ( i );
launchThreads();
if ( maxTime < 0 ) continue;
took = startTime - gettimeofdayInMillisecondsLocal();
if ( took <= maxTime ) continue;
// ok, we have to cut it short...
m_needsCleanup = true;
break;
@ -2347,7 +2348,13 @@ int startUp ( void *state ) {
// . it does not send us a signal automatically, so we must do it!
// . i noticed during the linkdb rebuild we were not getting the signal
sigqueue ( s_pid, GB_SIGRTMIN + 1 + t->m_niceness, svt ) ;
//sigqueue ( s_pid, GB_SIGRTMIN + 1 + t->m_niceness, svt ) ;
// i verified this breaks select() in Loop.cpp out of its sleep
//fprintf(stderr,"threads sending SIGCHLD\n");
// try a sigchld now! doesn't it already do this? no...
sigqueue ( s_pid, SIGCHLD, svt ) ;
return 0;
}

View File

@ -22,7 +22,7 @@ pid_t getpidtid();
#define UNLINK_THREAD 5
#define GENERIC_THREAD 6
//#define SSLACCEPT_THREAD 7
#define GB_SIGRTMIN (SIGRTMIN+4)
//#define GB_SIGRTMIN (SIGRTMIN+4)
#define MAX_NICENESS 2
// . a ThreadQueue has a list of thread entries
// . each thread entry represents a thread in progress or waiting to be created

View File

@ -2631,13 +2631,13 @@ bool UdpServer::makeCallback_ass ( UdpSlot *slot ) {
// happen since we're already in an interrupt handler, so we have
// to let g_loop know to poll
// . TODO: won't he have to wakeup before he'll poll?????
#ifndef _POLLONLY_
if ( ! g_loop.m_needToPoll &&
sigqueue ( s_pid, GB_SIGRTMIN + 1 , svt ) < 0 )
g_loop.m_needToPoll = true;
#else
g_loop.m_needToPoll = true;
#endif
// #ifndef _POLLONLY_
// if ( ! g_loop.m_needToPoll &&
// sigqueue ( s_pid, GB_SIGRTMIN + 1 , svt ) < 0 )
// g_loop.m_needToPoll = true;
// #else
// g_loop.m_needToPoll = true;
// #endif
// . tell g_loop that we did a queue
// . he sets this to false before calling our makeCallbacks_ass()
g_someAreQueued = true;

View File

@ -16727,10 +16727,10 @@ void XmlDoc::filterStart_r ( bool amThread ) {
// These ulimit sizes are max virtual memory in kilobytes. let's
// keep them to 25 Megabytes
if ( ctype == CT_PDF )
snprintf(cmd,2047 ,"ulimit -v 25000 ; ulimit -t 30 ; nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
snprintf(cmd,2047 ,"ulimit -v 25000 ; ulimit -t 30 ; timeout 30s nice -n 19 %s/pdftohtml -q -i -noframes -stdout %s > %s", wdir , in ,out );
else if ( ctype == CT_DOC )
// "wdir" include trailing '/'? not sure
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; export ANTIWORDHOME=%s/antiword-dir ; timeout 30s nice -n 19 %s/antiword %s> %s" , wdir , wdir , in , out );
else if ( ctype == CT_XLS )
snprintf(cmd,2047, "ulimit -v 25000 ; ulimit -t 30 ; timeout 10s nice -n 19 %s/xlhtml %s > %s" , wdir , in , out );
// this is too buggy for now... causes hanging threads because it

12
html/appliance.html Normal file
View File

@ -0,0 +1,12 @@
<br><br><h1>Gigablast Web Search Appliance (GWSA)</h1>
<br><br>
<div style=width:500px;font-size:15px;>
Enjoy your own multi-billion page search engine on your own network. The index is a mirror copy of what you find when searching on <a href=http://www.gigablast.com/>gigablast.com</a>. It should be just about as fast searching or spidering, too. Periodic index updates are available.
<br><br>
Please note that the server pictured here is for entertainment only and does not resemble the Gigablast Web Search Appliance. The GWSA is much more intense looking.
<br><br>
<a href=/about.html>Contact Gigablast</a> for more information.
</div>

View File

@ -1,36 +1,5 @@
<html>
<head>
<title>Gigablast's Matt Wells Career Highlights</title>
<style>
a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
</style>
<body>
<center>
<a href=/>
<img src=/logo-med.jpg height=122 width=500>
</a>
</center>
<br>
<table width=100% cellpadding="5" cellspacing="0" border="0">
<tr bgcolor="#0340fd">
<th colspan="2">
<font color=33dcff>
Matt Wells' Career Highlights</font>
</th>
</tr>
<tr>
<td>
<br>
<br><br><br>
<h1>Matt Wells' Career Bio Page</h1>
<center>
@ -70,7 +39,7 @@ Albuquerque Journal (page <a href=/abqnews1.jpg>1</a> and <a href=/abqnews2.jpg>
<tr><td>
<a href=http://searchenginewatch.com/article/2067980/A-Conversation-With-Gigablasts-Matt-Wells><img width=128 height=128 src=http://profile.ak.fbcdn.net/hprofile-ak-ash2/373034_339395460816_1094066588_n.jpg></a>
<a href=http://searchenginewatch.com/article/2067980/A-Conversation-With-Gigablasts-Matt-Wells><img width=128 height=46 src=/sew.png></a>
</td><td>2003. Interviewed by
<a href=http://searchenginewatch.com/article/2067980/A-Conversation-With-Gigablasts-Matt-Wells>Search Engine Watch</a>
</tr></tr>
@ -141,7 +110,7 @@ Donated <a href=http://www.jezebelgallery.com/>chandelier from Jezebel Lighting<
<tr><td>
<img src=/nsa.jpg width=128 height=128>
</td><td>
The <a href="http://books.google.com/books?id=qLzoWKp2JHcC&pg=PA141&lpg=PA141&dq=gigablast&source=bl&ots=JfwmfpIPKW&sig=sXUFaCsUlxhVfrd2cc4kBKif5LY&hl=en&sa=X&ei=cbn6UfvJB-6GyQGYwICwAg&ved=0CCoQ6AEwADhQ#v=onepage&q=gigablast&f=false">NSA reviewed Gigablast</a> in its book, "Untangling the Web" which was declassified in 2013 under the Freedom of Information Act. Unfortunately for PRISM and XKeywords, Gigablast is very secure from any evesdropping and has a strict <a href=/privacy.html>Privacy Policy</a>.
The <a href="http://books.google.com/books?id=qLzoWKp2JHcC&pg=PA141&lpg=PA141&dq=gigablast&source=bl&ots=JfwmfpIPKW&sig=sXUFaCsUlxhVfrd2cc4kBKif5LY&hl=en&sa=X&ei=cbn6UfvJB-6GyQGYwICwAg&ved=0CCoQ6AEwADhQ#v=onepage&q=gigablast&f=false">NSA reviewed Gigablast</a> in its book, "Untangling the Web" which was declassified in 2013 under the Freedom of Information Act.
</td></tr>
@ -154,12 +123,3 @@ The <a href="http://books.google.com/books?id=qLzoWKp2JHcC&pg=PA141&lpg=PA141&dq
</td></tr>
</table>
<center>
Copyright &copy; 2013. All rights reserved.
</center>
</body>
</html>

BIN
html/computer2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

View File

@ -1,27 +1,9 @@
<html>
<head>
<title>Contact Gigablast</title>
<style>
a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
</style>
<body>
<center>
<a href=/>
<img src=/logo-med.jpg height=122 width=500>
</a>
</center>
<br>
<br><br><br>
<table width=100% cellpadding="5" cellspacing="0" border="0">
<tr bgcolor="#0340fd">
<th colspan="2">
<font color=33dcff>
Contact Gigablast</font>
</th>
</tr>
<tr>
<td>
<br>
@ -44,11 +26,6 @@ the e-mail address displayed below is an image rather than text.</p>
<!-- http://dataurl.net/#dataurlmaker -->
<img src="" title="Email Gigablast" alt="Email Gigablast" height="30" width="300">
<br><br>
Yours Truly,
<br><br>
Matt Wells
</td>
</tr>
</tbody></table>
@ -63,13 +40,3 @@ the e-mail address displayed below is an image rather than text.</p>
</center>
</table>
<br>
<center>
Copyright &copy; 2013. All rights reserved.
</center>
</body>
</html>

View File

@ -1,25 +1,9 @@
<html>
<head>
<title>Gigablast Engineering Overview</title>
<style>
pre {
/* width: 100%; */
background: #f8f8f8;
padding: 10px 5px;
}
</style>
</head>
<body text=#000000 bgcolor=#ffffff link=#000000 vlink=#000000 alink=#000000><style><!--body,td,a,p,.h{font-family:arial,sans-serif; font-size: 15px;}//--></style>
<center>
<img border=0 width=500 height=122 src=/logo-med.jpg>
<br><br>
</center>
<h1>Gigablast Developer Documentation</h1>
<h1>Developer Documentation</h1>
Administration documentation is <a href=/admin.html>here</a>.
FAQ is <a href=/faq.html>here</a>.
<br><br>
A work-in-progress <a href=/compare.html>comparison to SOLR</a>.

View File

@ -1,19 +1,10 @@
<html>
<head>
<title>Gigablast Administrator Documentaion</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf8" />
</head>
<body text=#000000 bgcolor=#ffffff link=#000000 vlink=#000000 alink=#000000 >
<style>body,td,p,.h{font-family:arial,sans-serif; font-size: 15px;} </style>
<div style=max-width:700px;>
<center>
<br>
<img border=0 width=500 height=122 src=/logo-med.jpg>
<br><br>
</center>
<h1>FAQ</h1>
<h1>Gigablast Administrator Documentation</h1>
Developer documentation is <a href=/developer.html>here</a>.
<br><br>
A work-in-progress <a href=/compare.html>comparison to SOLR</a>.
@ -1234,5 +1225,5 @@ All sequences of punctuation greater than 3 characters break phrases with the so
-->
</html>
</div>

406
html/news.html Normal file
View File

@ -0,0 +1,406 @@
<div style=max-width:700px;>
<br>
<br><br>
<a name=revival></a>
<font size=+1><b>15 Year Anniversary</b></font><br>
<i>September 1, 2014</i><br><br>
It's been 15 years since I first started Gigablast. It's taken some interesting directions as of late, most notably going open source. I've decided to revive the old blog entries that you can find below and continue working on top of those.
<br><br><br><br>
<a name=gigabits></a>
<font size=+1><b>Giga Bits Introduced</b></font><br>
<i>Jan 31, 2004</i><br><br>
Gigablast now generates related concepts for your query. I call them Giga Bits. I believe it is the best concept generator in the industry, but if you don't think so please <a href="/contact.html">drop me a note</a> explaining why not, so I can improve it.
<br><br>
You can also ask Gigablast a simple question like <a href="/search?q=Who+is+President+of+Russia%3F">"Who is President of Russia?"</a> and it often comes up with the correct answer in the Giga Bits section. How do you think it does that?
<br><br>
In other news, the spider speed ups I rolled a few weeks ago are tremendously successful. I can easily burn all my bandwidth quota with insignificant load on my servers. I could not be happier with this.
<br><br>
Now I'm planning on turning Gigablast into a default AND engine. Why? Because it will decrease query latency by several times, believe it or not. That should put Gigablast on par with the fastest engines in the world, even though it only runs on 8 desktop machines. But don't worry, I will still leave the default OR functionality intact.
<br>
<br>
<br>
<br>
<a name=update></a>
<font size=+1><b>January Update Rolled</b></font><br>
<i>Jan 8, 2004</i><br><br>
Gigablast now has a more professional, but still recognizable, logo, and a new catch phrase, "Information Acceleration". Lots of changes on the back end. You should notice significantly higher quality searches. The spider algorithm was sped up several times. Gigablast should be able to index several million documents per day, but that still remains to be tested. &lt;knock on wood&gt;. Site clustering was sped up. I added the ability to force all query terms to be required by using the &rat=1 cgi parm. Now Gigablast will automatically regenerate some of its databases when they are missing. And I think I wasted two weeks working like a dog on code that I'm not going to end up using! I hate when that happens...
<br>
<br>
<br>
<br>
<a name=traffic></a>
<font size=+1><b>An Easy way to Slash Motor Vehicle Emissions</b></font><br>
<i>Dec 11, 2003</i><br><br>
Blanket the whole city with wi-fi access. (like <a href="/?redir=http://story.news.yahoo.com/news?tmpl=story&ncid=1293&e=2&u=/ap/20031211/ap_on_hi_te/wi_fi_city&sid=95573418">Cerritos, California</a>) When you want to travel from point A
to point B, tell the central traffic computer. It will then give you a time
window in which to begin your voyage and, most importantly, it will ensure that
as long as you stay within the window you will always hit green lights.
<br><br>
If you stray from your path, you'll be able to get a new window via the wi-fi network.
If everyone's car has gps and is connected to the wi-fi network,
the central computer will also be able to monitor the flow of traffic and
make adjustments to your itinerary in real-time.
Essentially, the traffic computer will be solving a large system of linear,
and possibly non-linear, constraints in real-time. Lots of fun... and think of
how much more efficient travel will be!! If someone wants to secure funding,
count me in.
<br>
<br>
<br>
<br>
<a name=spellchecker></a>
<font size=+1><b>Spellchecker Finally Finished</b></font><br>
<i>Nov 18, 2003</i><br><br>
After a large, countable number of interruptions, I've finally completed the spellchecker. I tested the word '<b>dooty</b>' on several search engines to see how they handled that misspelling. Here's what I got:
<br><br>
<table>
<tr><td><b>Source</b></td><td><b>Result</b></td></tr>
<tr><td>Alltheweb</td><td><a href="http://www.alltheweb.com/search?query=dooty">booty</a></td></tr>
<tr><td>Altavista</td><td><a href="http://search01.altavista.com/web/results?q=dooty">dhooti</a></td></tr>
<tr><td>Gigablast</td><td><a href="http://www.gigablast.com/search?q=dooty">door</a></td></tr>
<tr><td>Google</td><td><a href="http://www.google.com/search?q=dooty">doody</a></td></tr>
<tr><td>Microsoft Word</td><td>Doty</td></tr>
<tr><td>Teoma</td><td><a href="http://s.teoma.com/search?q=dooty">doty</a></td></tr>
<tr><td>Wisenut</td><td>N/A (no spellchecker)</td></tr>
</table>
<br>
So there is no one way to code a spellchecker. It's a guessing game. And, hey Wisenut, want to license a good spellchecker for cheap? <a href="/contact.html">Let me know</a>.
<br><br>
Gigablast uses its cached web pages to generate its dictionary instead of the query logs. When a word or phrase is not found in the dictionary, Gigablast replaces it with the closest match in the dictionary. If multiple words or phrases are equally close, then Gigablast resorts to a popularity ranking.
<br><br>
One interesting thing I noticed is that in Google's spellchecker you must at least get the first letter of the word correct, otherwise, Google will not be able to recommend the correct spelling. I made Gigablast this way too, because it really cuts down on the number of words it has to search to come up with a recommendation. This also allows you to have an extremely large dictionary distributed amongst several machines, where each machine is responsible for a letter.
<br><br>
Also of note: I am planning on purchasing the hardware required for achieving a 5 billion document index capable of serving hundreds of queries per second within the next 12 months. Wish me luck... and thanks for using Gigablast.
<br>
<br>
<br>
<br>
<a name=onagain></a>
<font size=+1><b>Spiders On Again</b></font><br>
<i>Nov 10, 2003</i><br><br>
After updating the spider code I've reactivated the spiders. Gigablast should be able to spider at a faster rate with even less impact on query response time than before. So add your urls now while the adding's good.
<br>
<br>
<br>
<br>
<a name=speed></a>
<font size=+1><b>Going For Speed</b></font><br>
<i>Nov 3, 2003</i><br><br>
I've finally got around to working on Gigablast's distributed caches. It was not doing a lot of caching before. The new cache class I rigged up has no memory fragmentation and minimal record overhead. It is vurhy nice.<br><br>
I've stopped spidering just for a bit so I can dedicate all Gigablast's RAM to the multi-level cache system I have in place now and see how much I can reduce query latency. Disks are still my main point of contention by far so the caching helps out a lot. But I could still use more memory.<br><br>
Take Gigablast for a <a href="/">spin</a>. See how fast it is.
<br>
<br>
<br>
<br>
<a name=metas></a>
<font size=+1><b>Bring Me Your Meta Tags</b></font><br>
<i>Oct 11, 2003</i><br><br>
As of now Gigablast supports the indexing, searching and displaying of generic meta tags. You name them I fame them. For instance, if you have a tag like <i>&lt;meta name="foo" content="bar baz"&gt;</i> in your document, then you will be able to do a search like <i><a href="/search?q=foo%3Abar&dt=foo">foo:bar</a></i> or <i><a href="/search?q=foo%3A%22bar+baz%22&dt=foo">foo:"bar baz"</a></i> and Gigablast will find your document.
<br><br>
You can tell Gigablast to display the contents of arbitrary meta tags in the search results, like <a href="/search?q=gigablast&s=10&dt=author+keywords%3A32">this</a>. Note that you must assign the <i>dt</i> cgi parameter to a space-separated list of the names of the meta tags you want to display. You can limit the number of returned characters of each tag to X characters by appending a <i>:X</i> to the name of the meta tag supplied to the <i>dt</i> parameter. In the link above, I limited the displayed keywords to 32 characters.
<br><br>
Why use generic metas? Because it is very powerful. It allows you to embed custom data in your documents, search for it and retrieve it. Originally I wanted to do something like this in XML, but now my gut instincts are that XML is not catching on because it is ugly and bloated. Meta tags are pretty and slick.
<br>
<br>
<br>
<br>
<a name=verisignstopped></a>
<font size=+1><b>Verisign Stops Destroying the Internet</b></font><br>
<i>Oct 11, 2003</i><br><br>
Ok, they actually stopped about a week ago, but I didn't get around to posting it until now. They really ought to lose their privileged position so this does not happen again. Please do not stop your boycott. They have not learned from their mistakes.
<br>
<br>
<br>
<br>
<a name=moreverisign></a>
<font size=+1><b>Verisign Continues to Damage Gigablast's Index</b></font><br>
<i>September 30, 2003</i><br><br>
When the Gigablast spider tries to download a page from a domain it first gets the associated robots.txt file for that domain. When the domain does not exist it ends up downloading a robots.txt file from verisign. There are two major problems with this. The first is that verisign's servers may be slow which will slow down Gigablast's indexing. Secondly, and this has been happening for a while now, Gigablast will still index any incoming link text for that domain, thinking that the domain still exists, but just that spider permission was denied by the robots.txt file.
<br>
<br>
So, hats off to you verisign, thanks for enhancing my index with your fantastic "service". I hope your company is around for many years so you can continue providing me with your great "services".
<br>
<br>
If you have been hurt because of verisign's greed you might want to consider joining the <a href="/?redir=http://www.geek.com/news/geeknews/2003Sep/gee20030929021965.htm">class-action lawsuit</a> announced Friday, September 26th, by the <a href="/?redir=http://www.techfirm.com/">Ira Rothken law firm</a>.
<br>
<br>
Want to learn more about how the internet is run? Check out <a href="/?redir=http://www.paradigm.nu/icann/">the ICANN movie page</a>. Movie #1 portrays verisign's CEO, Stratton Sclavos, quite well in my opinion.
<br>
<br>
<b>(10/01/03) Update #5:</b> verisign <a href="/?redir=http://www.pcworld.com/news/article/0,aid,112712,00.asp">comes under further scrutiny</a>.
<br>
<br>
<br>
<br>
<a name=verisign></a>
<font size=+1><b>Verisign Redesigns the Internet for their Own Profit</b></font><br>
<i>September 24, 2003</i><br><br>
My spiders expect to get "not found" messages when they look up a domain that does not have an IP. When verisign uses their privileged position to change the underlying fundamentals of the internet just to line their own greedy pockets it really, really perturbs me. Now, rather than get the "not found" message, my spiders get back a valid IP, the IP of verisign's commercial servers. That causes my spiders to then proceed to download the robots.txt from that domain. This can take forever if their servers are slow. What a pain. Now I have to fix my freakin' code. And that's just one of many problems this company has caused.
<br>
<br>
Please join me in boycott. I'm going to discourage everyone I know from supporting this abusive, monopolistic entity.
<br>
<br>
<b>(9/22/03) Update #1:</b> verisign <a href="/?redir=http://www.icann.org/correspondence/lewis-to-twomey-21sep03.htm">responded</a> to ICANN's request that they stop. <a href="/?redir=http://slashdot.org/articles/03/09/22/2255202.shtml?tid=126&tid=95&tid=99">See what the slashdot community has to say about this response.</a>
<br>
<br>
<b>(9/22/03) Update #2:</b> ICANN has now posted some complaints in this <a href="/?redir=http://forum.icann.org/alac-forum/redirect/">forum</a>.
<br>
<br>
<b>(9/24/03) Update #3:</b> Slashdot has more <a href="/?redir=http://yro.slashdot.org/yro/03/09/24/0134256.shtml?tid=126&tid=95&tid=98&tid=99">coverage</a>.
<br>
<br>
<b>(9/24/03) Update #4:</b> Please sign the <a href="/?redir=http://www.whois.sc/verisign-dns/">petition</a> to stop verisign.
<br>
<br>
<br>
<br>
<a name=geotags></a>
<font size=+1><b>Geo-Sensitive Search</b></font><br>
<i>September 18, 2003</i><br><br>
Gigablast now supports some special new meta tags that allow for constraining a search to a particular zipcode, city, state or country. Support was also added for the standard author, language and classification meta tags. This <a href="/tagsdemo.html">page</a> explains more. These meta tags should be standard, everyone should use them (but not abuse them!) and things will be easier for everybody.
<br><br>
Secondly, I have declared jihad against stale indexes. I am planning a significantly faster update cycle, not to mention growing the index to about 400 million pages, all hopefully in the next few months.
<br>
<br>
<br>
<br>
<a name=turing></a>
<font size=+1><b>Foiling the Addurl Scripts</b></font><br>
<i>September 6, 2003</i><br><br>
The new pseudo-Turing test on the <a href="/addurl">addurl page</a> should prevent most automated scripts from submitting boatloads of URLs. If someone actually takes the time to code a way around it then I'll just have to take it a step further. I would rather work on other things, though, so please quit abusing my free service and discontinue your scripts. Thanks.
<br>
<br>
<br>
<br>
<a name=boolean></a>
<font size=+1><b>Boolean is Here</b></font><br>
<i>September 1, 2003</i><br><br>
I just rolled out the new boolean logic code. You should be able to do nested boolean queries using the traditional AND, OR and NOT boolean operators. See the updated <a href="/help.html#boolean">help page</a> for more detail.
<br><br>
I have declared jihad against swapping and am now running the 2.4.21-rc6-rmap15j Linux kernel with swap tuned to zero using the /proc/sys/vm/pagecache knobs. So far no machines have swapped, which is great, but I'm unsure of this kernel's stability.
<br>
<br>
<br>
<br>
<a name=swap></a>
<font size=+1><b>All Swapped Out</b></font><br>
<i>August 29, 2003</i><br><br>
I no longer recommend turning the swap off, at least not on linux 2.4.22. A kernel panicked on me and froze a server. Not good. If anyone has any ideas for how I can prevent my app from being swapped out, please let me know. I've tried mlockall() within my app but that makes its memory usage explode for some reason. I've also tried Rik van Riel's 2.4.21-rc6-rmap15j.txt patch on the 2.4.21 kernel, but it still does unnecessary swapping (although, strangely, only when spidering). If you know how to fix this problem, please help!!! <a href="vmstat.html">Here</a> is the output from the vmstat command on one of my production machines running 2.4.22. And <a href="vmstatrik.html">here</a> is the output from my test machine running 2.4.21-rc6-rmap15j.txt.
<br>
<br>
<br>
<br>
<a name=kernel></a>
<font size=+1><b>Kernel Update</b></font><br>
<i>August 28, 2003</i><br><br>
I updated the Linux kernel to 2.4.22, which was just released a few days ago on <a href="/?redir=http://www.kernel.org/">kernel.org</a>. Now my gigabit cards are working, yay! I finally had to turn off swap using the swapoff command. When an application runs out of memory the swapper is supposed to write infrequently used memory to disk so it can give that memory to the application that needs it. Unfortunately, the Linux virtual memory manager enjoys swapping out an application's memory for no good reason. This can often make an application disastrously slow, especially when the application ends up blocking on code that it doesn't expect to! And, furthermore, when the application uses the disk intensely it has to wait even longer for memory to get swapped back in from disk. I recommend that anyone who needs high performance turn off the swap and just make sure their program does not use more physical memory than is available.
<br>
<br>
<br>
<br>
<a name=gang></a>
<font size=+1><b>The Gang's All Here</b></font><br>
<i>August 17, 2003</i><br><br>
I decided to add PostScript (<a href="/search?q=type:ps">.ps</a>) , PowerPoint (<a href="/search?q=type:ppt">.ppt</a>), Excel SpreadSheet (<a href="/search?q=type:xls">.xls</a>) and Microsoft Word (<a href="/search?q=type:doc">.doc</a>) support in addition to the PDF support. Woo-hoo.
<br>
<br>
<br>
<br>
<a name=pdf></a>
<font size=+1><b>PDF Support</b></font><br>
<i>August 14, 2003</i><br><br>
Gigablast now indexes PDF documents. Try the search <a href="/search?q=type:pdf"><i>type:pdf</i></a> to see some PDF results. <i>type</i> is a new search field. It also supports the text type, <a href="/search?q=type:text"><i>type:text</i></a>, and will support other file types in the future.
<br>
<br>
<br>
<br>
<a name=codeupdate3></a>
<font size=+1><b>Minor Code Updates</b></font><br>
<i>July 17, 2003</i><br><br>
I've cleaned up the keyword highlight routines so they don't highlight isolated stop words. Gigablast now displays a <a href="/superRecall.html">blue bar</a> above returned search results that do not have <b>all</b> of your query terms. When returning a page of search results Gigablast lets you know how long ago that page was cached by displaying a small message at the bottom of that page. NOTE: This small message is at the bottom of the page containing the search results, not at the bottom of any pages from the web page cache, that is a different cache entirely. Numerous updates to less user-visible things on the back end. Many bugs fixed, but still more to go. Thanks a bunch to Bruce Perens for writing the <a href="/?redir=http://www.perens.com/FreeSoftware/">Electric Fence</a> debug utility.
<br>
<br>
<br>
<br>
<a name=codeupdate2></a>
<font size=+1><b>Gigablast 2.0</b></font><br>
<i>June 20, 2003</i><br><br>
I've recently released Gigablast 2.0. Right now Gigablast can do about twice as many queries per second as before. When I take care of a few more things that rate should double again.
<br><br>
The ranking algorithm now treats phrase weights much better. If you search for something like <i><a href="/search?q=boots+in+the+uk">boots in the uk</a></i> you won't get a bunch of results that have that exact phrase in them, but rather you will get UK sites about boots (theoretically). And when you do a search like <i><a href="/search?q=all+the+king%27s+men">all the king's men</a></i> you will get results that have that exact phrase. If you find any queries for which Gigablast is especially bad, but a competing search engine is good, please <a href="/contact.html">let me know</a>, I am very interested.
<br><br>
2.0 also introduced a new index format. The new index is half the size of the old one. This allows my current setup to index over 400 million pages with dual redundancy. Before it was only able to index about 300 million pages. The decreased index size also speeds up the query process since only half as much data needs to be read from disk to satisfy a query.
<br><br>
I've also started a full index refresh, starting with top level pages that haven't been spidered in a while. This is especially nice because a lot of pages that were indexed before all my anti-spam algorithms were 100% in place are just now getting filtered appropriately. I've manually removed over 100,000 spam pages so far, too.
<br>
<br>
<br>
<br>
<a name=grub></a>
<font size=+1><b>My Take on Looksmart's Grub</b></font><br>
<i>Apr 19, 2003</i><br><br>
There's been some press about Grub, a program from Looksmart which you install on your machine to help Looksmart spider the web. Looksmart is only using Grub to save on their bandwidth. Essentially Grub just compresses web pages before sending them to Looksmart's indexer thus reducing the bandwidth they have to pay for by a factor of 5 or so. The same thing could be accomplished through a proxy which compresses web pages. Eventually, once the HTTP mime standard for requesting compressed web pages is better supported by web servers, Grub will not be necessary.
<br>
<br>
<br>
<br>
<a name=codeupdate></a>
<font size=+1><b>Code Update</b></font><br>
<i>Mar 25, 2003</i><br><br>
I just rolled some significant updates to Gigablast's back-end. Gigablast now has a uniformly-distributed, unreplicated search results cache. This means that if someone has done your search within the last several hours then you will get results back very fast. This also means that Gigablast can handle a lot more queries per second.
<br>
<br>
I also added lots of debug and timing messages that can be turned on and off via the Gigablast admin page. This allows me to quickly isolate problems and identify bottlenecks.
<br>
<br>
Gigablast now synchronizes the clocks on all machines on the network so the instant add-url should be more "instant". Before I made this change, one machine would tell another to spider a new url "now", where "now" was actually a few minutes into the future on the spider machine. But since everyone's currently synchronized, this will not be a problem anymore.
<br>
<br>
There were about 100 other changes and bug fixes, minor and major, that I made, too, that should result in significant performance gains. My next big set of changes should make searches at least 5 times faster, but it will probably take several months until completed. I will keep you posted.
<br>
<br>
<br>
<br>
<a name=downtime></a>
<font size=+1><b>Downtime</b></font><br>
<i>Feb 20, 2003</i><br><br>
To combat downtime I wrote a monitoring program. It will send me a text message on my cellphone if gigablast ever stops responding to queries. This should prevent extended periods of downtime by alerting me to the problem so I can promptly fix it.
<br>
<br>
<br>
<br>
<a name=uunet></a>
<font size=+1><b>Connectivity Problems. Bah!</b></font><br>
<i>Feb 14, 2003</i><br><br>
I had to turn off the main refresh spiders a few weeks ago because of internet connectivity problems. Lots of pages were inaccessible or were timing out to the point that spider performance was suffering too much.
<br><br>
After running tcpdump in combination with wget I noticed that the FIN packets of some web page transfers were being lost or delayed for over a minute. The TCP FIN packet is typically the last TCP packet sent to your browser when it retrieves a web page. It tells your browser to close the connection. Once it is received the little spinning logo in the upper right corner of your browser window should stop spinning.
<br><br>
The most significant problem was, however, that the initial incoming data packet for some URLs was being lost or excessively delayed. You can get by without receiving FIN packets but you absolutely need these TCP "P" packets. I've tested my equipment and my ISP has tested their equipment and we have both concluded that the problem is upstream. Yesterday my ISP submitted a ticket to Worldcom/UUNet. Worldcom's techs have verified the problem and thought it was... "interesting".
<br><br>
I personally think it is a bug in some filtering or monitoring software installed at one of Worldcom's NAPs (Network Access Points). NAPs are where the big internet providers interface with each other. The most popular NAPs are in big cities, the Tier-1 cities, as they're called. There are also companies that host NAP sites where the big carriers like Worldcom can install their equipment. The big carriers then set up Peering Agreements with each other. Peering Agreements state the conditions under which two or more carriers will exchange internet traffic.
<br><br>
Once you have a peering agreement in place with another carrier then you must pay them based on how much data you transfer from your network to their network across a NAP. This means that downloading a file is much cheaper than uploading a file. When you send a request to retrieve some information, that request is small compared to the amount of data it retrieves. Therefore, the carrier that hosted the server from which you got the data will end up paying more. Doh! I got off the topic. I hope they fix the problem soon!
<br>
<br>
<br>
<br>
<a name=ads></a>
<font size=+1><b>Considering Advertisements</b></font><br>
<i>Jan 10, 2003</i><br><br>
I'm now looking into serving text advertisements on top of the search results page so I can continue to fund my information retrieval research. I am also exploring the possibility of injecting ads into some of my xml-based search feeds. If you're interested in a search feed I should be able to give you an even better deal provided you can display the ads I feed you, in addition to any other ads you might want to add. If anyone has any good advice concerning what ad company I should use, I'd love to hear it.
<br>
<br>
<br>
<br>
<a name=codeupdate></a>
<font size=+1><b>Code Update</b></font><br>
<i>Dec 27, 2002</i><br><br>
After a brief hiatus I've restarted the Gigablast spiders. The problem was they were having a negative impact on the query engine's performance, but now, all spider processing yields computer resources much better to the query traffic. The result is that the spidering process only runs in the space between queries. This actually involved a lot of work. I had to insert code to suspend spider-related, network transactions and cancel disk-read and disk-write threads.<br><br>
I've also launched my <a href="/gigaboost.html">Gigaboost</a> campaign. This rewards pages that link to gigablast.com with a boost in the search results rankings. The boost is only utilized to resolve ties in ranking scores so it does not taint the quality of the index.<br><br>
Gigablast.nu, in Scandinavia, now has a news index built from news sources in the Scandinavian region. It is not publicly available just yet because there are still a few details we are working out.
I've also added better duplicate detection and removal. It won't be very noticeable until the index refresh cycle completes.
In addition Gigablast now removes session ids from urls, but, this only applies to new links and will be back pedaled to fix urls already in the index at a later date.
There is also a new summary generator installed. It's over ten times faster than the old one. If you notice any problems with it please contact me. As always, I appreciate any constructive input you have to give.
<br>
<br>
<br>
<br>
<a name=corruption></a>
<font size=+1><b>Data Corruption Mysteries</b></font><br>
<i>Dec 20, 2002</i><br><br>
I've been having problems with my hard drives. I have a bunch of Maxtor 160GB drives (Model # = 4G160J8) running on Linux 2.4.17 with the <a href="/ide.2.4.17.02152002.patch.bz2">48-bit LBA patch</a>. Each machine has 4 of these drives on them, 2 on each IDE slot. I've had about 160 gigabytes of data on one before so I know the patch seems to do the job. But every now and then a drive will mess up a write. I do a lot of writing and it usually takes tens of gigabytes of writing before a drive does this. It writes out about 8 bytes that don't match what should have been written. This causes index corruption and I've had to install work-arounds in my code to detect and patch it.
<br>
<br>
I'm not sure if the problem is with the hard drive itself or with Linux. I've made sure that the problem wasn't in my code by doing a read after each write to verify. I thought it might be my motherboard or CPU. I use AMDs and Giga-byte motherboards. But gigablast.nu in Sweden has the same problem and it uses a Pentium 3. Furthermore, gigablast.nu uses a RAID of 160GB Maxtors, whereas gigablast.com does not. Gigablast.nu uses version 2.4.19 of Linux with the 48-bit LBA patch. So the problem seems to be with Linux, the LBA patch or the hard drive itself.
<br>
<br>
On top of all this mess, about 1 Maxtor, out of the 32 I have, completely fails on me every 4 months. The drive just gives I/O errors to the kernel and brings the whole system down. Luckily, gigablast.com implements a redundant architecture so the failing server will be replaced by his backup. So far Maxtor has replaced the drives I had fail. If you give them your credit card number they'll even send the replacements out in advance. But I believe the failure problem is an indicator that the data corruption problem is hard drive related, not Linux related. If anyone has any insight into this problem please let me know, you could quite easily be my hero.
<br>
<br>
If you're still reading this you're pretty hard core so <a href="/output.html">here's</a> what /var/log/messages says when the 4G160J8 completely fails.
<br>
<br>
<br>
<br>
<a name=pvr></a>
<font size=+1><b>Personal Video Recorders (PVRs)</b></font><br>
<i>Dec 20, 2002</i><br><br>
Boy, these things are great. I bought a Tivo last year for my wife and she loved it. At first though she wasn't that enthusiastic because she wasn't very familiar with it. But now we rarely rent any more video tapes from Blockbuster or Hollywood video because there's always something interesting to watch on the Tivo. You just let it know what shows you like and it will record them anytime they come on. We always have an overflow of Simpsons and Seinfeld episodes on there.
<br>
<br>
In the future though I don't think Tivo is going to make it. The reason? Home networking. Because I'm a professional computer person, we already have a home network installed. If the TV had an ethernet jack it would be in our network. 100Mbps is fast enough to send it a high-quality video stream from the computers already on the network. I have a cable modem which, in the future, should allow the computer using it to rip signals from the cable station, as well. For now though, you could split your cable and plug the new end into a tuner card on your PC. So once someone comes out with a small device for the television that converts an ethernet-based mpeg stream to a video signal we can use our home PC to act as the TIVO. This device should be pretty cheap, I'd imagine around $30 or so. The only thing you'd need then is a way to allow the remote control to talk to your PC.
<br>
<br>
Now I read about the EFF suing "Hollywood" in order to clarify consumer rights of fair use. Specifically, the EFF was said to be representing Replay TV. Hey! Isn't Replay TV owned in part by Disney (aka Hollywood)... hmmmm... Seems like Disney might have pretty good control over the outcome of this case. I think it's a conflict of interest when such an important trial, which would set precedent for many cases to come, has the same plaintiff as defendant.
<br>
<br>
This makes me wonder about when Disney's Go.com division got sued by Overture (then known as Goto.com) for logo infringement. Disney had to pay around 20 million to Overture. I wonder what kind of ties Disney had to Overture. Ok, maybe I'm being a conspiracy theorist, so I'll stop now.
<br>
<br>
<br>
<br>
<a name=ecs></a>
<font size=+1><b>ECS K7S5A Motherboard Mayhem</b></font><br>
<i>Dec 20, 2002</i><br><br>
I pinch pennies. When I bought my 8 servers I got the cheapest motherboards I could get for my AMD 1.4GHz Athlon T-Birds. At the time, in late January 2002, they turned out to be the K7S5A's. While running my search engine on them I experienced lots of segmentation faults. I spent a couple of days poring over the code wondering if I was tripping out. It wasn't until I ran memtest86 at boot time (run by lilo) that I found memory was being corrupted. I even tried new memory sticks to no avail. Fortunately I found some pages on the web that addressed the problem. It was the motherboard. It took me many hours to replace them on all 8 servers. I don't recommend ECS. I've been very happy with the Giga-byte motherboards I have now.
<br><br><br>
<br><br><br>
</div>

BIN
html/robot3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

BIN
html/sew.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.3 KiB

BIN
html/unlocked2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

BIN
html/user1.jpeg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.5 KiB

BIN
html/user2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 68 KiB

BIN
html/user3.jpeg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 KiB

18
html/users.html Normal file
View File

@ -0,0 +1,18 @@
<br><br><br>
<h1>People that Use Gigablast</h1>
<table cellpadding=10 style=max-width:500px;><tr><td>
<img width=80 height=120 src=/user1.jpeg>
</td><td>
"somewhere in albuquerque there's a 1 person company that produces extremely efficient, fast and insanely flexible search product. we use it for an extra-secret BI project and been up and running in less than an hour. with matt nearby is like having a top google engineer tweak and improve your very own search engine. thank you matt for everything you've done for us!"
<br>
<br>
<font color=gray>-- Anonymous Corporate User</font>
</td></tr>
</table>

View File

@ -4016,7 +4016,7 @@ void doCmdAll ( int fd, void *state ) {
SafeBuf parmList;
// returns false and sets g_errno on error
if ( ! g_parms.convertHttpRequestToParmList ( &s_r , &parmList ,0) ) {
if (!g_parms.convertHttpRequestToParmList(&s_r,&parmList,0,NULL)){
log("cmd: error converting command: %s",mstrerror(g_errno));
exit(0);
}