mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
spider proxy updates
This commit is contained in:
parent
0b9b77ea46
commit
806cf79b73
@ -87,7 +87,10 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
char *userAgent , char *proto , bool doPost ,
|
||||
char *cookie , char *additionalHeader ,
|
||||
// if posting something, how many bytes is it?
|
||||
long postContentLen ) {
|
||||
long postContentLen ,
|
||||
// are we sending the request through an http proxy?
|
||||
// if so this will be non-zero
|
||||
long proxyIp ) {
|
||||
|
||||
m_reqBufValid = false;
|
||||
|
||||
@ -96,6 +99,9 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
char *hptr = getHostFast ( url , &hlen , &port );
|
||||
char *path = getPathFast ( url );
|
||||
|
||||
// use the full url if sending to an http proxy
|
||||
if ( proxyIp ) path = url;
|
||||
|
||||
char *pathEnd = NULL;
|
||||
char *postData = NULL;
|
||||
if ( doPost ) {
|
||||
|
@ -54,7 +54,8 @@ class HttpRequest {
|
||||
bool doPost = false ,
|
||||
char *cookie = NULL ,
|
||||
char *additionalHeader = NULL , // does not incl \r\n
|
||||
long postContentLen = -1 ); // for content-length of POST
|
||||
long postContentLen = -1 , // for content-length of POST
|
||||
long proxyIp = 0 );
|
||||
|
||||
// use this
|
||||
SafeBuf m_reqBuf;
|
||||
|
@ -136,6 +136,10 @@ bool HttpServer::getDoc ( char *url ,
|
||||
if ( ip == -1 )
|
||||
log("http: you probably didn't mean to set ip=-1 did you? "
|
||||
"try setting to 0.");
|
||||
|
||||
// ignore if -1 as well
|
||||
if ( proxyIp == -1 ) proxyIp = 0;
|
||||
|
||||
//log(LOG_WARN, "http: get doc %s", url->getUrl());
|
||||
// use the HttpRequest class
|
||||
HttpRequest r;
|
||||
@ -165,7 +169,12 @@ bool HttpServer::getDoc ( char *url ,
|
||||
if ( ! fullRequest ) {
|
||||
if ( ! r.set ( url , offset , size , ifModifiedSince ,
|
||||
userAgent , proto , doPost , cookie ,
|
||||
additionalHeader , pcLen ) ) return true;
|
||||
// pass in proxyIp because if it is a
|
||||
// request being sent to a proxy we have to
|
||||
// say "GET http://www.xyz.com/" the full
|
||||
// url, not just a relative path.
|
||||
additionalHeader , pcLen , proxyIp ) )
|
||||
return true;
|
||||
reqSize = r.getRequestLen();
|
||||
req = (char *) mmalloc( reqSize + pcLen ,"HttpServer");
|
||||
if ( req )
|
||||
|
@ -1127,6 +1127,9 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
|
||||
// this must be outside of table, submit button follows
|
||||
if ( fmt == FORMAT_HTML ) sb->safePrintf ( "<br>\n" );
|
||||
|
||||
if ( page == PAGE_SPIDERPROXIES )
|
||||
printSpiderProxyTable ( sb );
|
||||
|
||||
// url filter page has a test table
|
||||
if ( page == PAGE_FILTERS && fmt == FORMAT_HTML ) {
|
||||
// wrap up the form, print a submit button
|
||||
|
12
Process.cpp
12
Process.cpp
@ -587,10 +587,6 @@ bool Process::isAnyTreeSaving ( ) {
|
||||
void powerMonitorWrapper ( int fd , void *state ) {
|
||||
if ( g_isYippy ) return;
|
||||
|
||||
// also download test urls from spider proxies to ensure they
|
||||
// are up and running properly
|
||||
downloadTestUrlFromProxies();
|
||||
|
||||
// only if in matt wells datacenter
|
||||
if ( ! g_conf.m_isMattWells )
|
||||
return;
|
||||
@ -849,6 +845,11 @@ void doneCmdWrapper ( void *state ) {
|
||||
}
|
||||
|
||||
void hdtempWrapper ( int fd , void *state ) {
|
||||
|
||||
// also download test urls from spider proxies to ensure they
|
||||
// are up and running properly
|
||||
downloadTestUrlFromProxies();
|
||||
|
||||
// reset this... why?
|
||||
g_errno = 0;
|
||||
// do not get if already getting
|
||||
@ -1789,6 +1790,9 @@ bool Process::saveBlockingFiles1 ( ) {
|
||||
// save the login table
|
||||
g_users.save();
|
||||
|
||||
// save stats on spider proxies if any
|
||||
saveSpiderProxyStats();
|
||||
|
||||
// save the query log buffer if it was modified by the
|
||||
// runSeoQueryLoop() in seo.cpp which updates its
|
||||
// QueryLogEntry::m_minTop50Score member and corresponding timestamp
|
||||
|
@ -5156,6 +5156,11 @@ char *Proxy::storeLoginBar ( char *reply ,
|
||||
newReply[len] = c;
|
||||
return newReply;
|
||||
}
|
||||
|
||||
// temp fix take it out because it is not working right
|
||||
mp[0] = 'x';
|
||||
return newReply;
|
||||
|
||||
// point to first digit in there
|
||||
mp += 16;
|
||||
// store our new content length as ascii into test buf
|
||||
|
18
SafeBuf.cpp
18
SafeBuf.cpp
@ -3378,23 +3378,26 @@ bool SafeBuf::base64Encode ( char *sx , long len , long niceness ) {
|
||||
}
|
||||
|
||||
// "ago" is a delta-t in seconds (how long ago the event happened)
|
||||
bool SafeBuf::printTimeAgo ( long ts , long now ) {
|
||||
bool SafeBuf::printTimeAgo ( long ago , long now ) {
|
||||
// Jul 23, 1971
|
||||
if ( ! reserve2x(200) ) return false;
|
||||
// for printing
|
||||
long secs = 1000;
|
||||
long mins = 1000;
|
||||
long hrs = 1000;
|
||||
long days ;
|
||||
if ( ts > 0 ) {
|
||||
mins = (long)((now - ts)/60);
|
||||
hrs = (long)((now - ts)/3600);
|
||||
days = (long)((now - ts)/(3600*24));
|
||||
if ( ago > 0 ) {
|
||||
secs = (long)((ago)/1);
|
||||
mins = (long)((ago)/60);
|
||||
hrs = (long)((ago)/3600);
|
||||
days = (long)((ago)/(3600*24));
|
||||
if ( mins < 0 ) mins = 0;
|
||||
if ( hrs < 0 ) hrs = 0;
|
||||
if ( days < 0 ) days = 0;
|
||||
}
|
||||
// print the time ago
|
||||
if ( mins ==1)safePrintf("%li minute ago",mins);
|
||||
if ( mins==0 ) safePrintf("%li seconds ago",secs);
|
||||
else if ( mins ==1)safePrintf("%li minute ago",mins);
|
||||
else if (mins<60)safePrintf ( "%li minutes ago",mins);
|
||||
else if ( hrs == 1 )safePrintf ( "%li hour ago",hrs);
|
||||
else if ( hrs < 24 )safePrintf ( "%li hours ago",hrs);
|
||||
@ -3402,7 +3405,8 @@ bool SafeBuf::printTimeAgo ( long ts , long now ) {
|
||||
else if (days< 7 )safePrintf ( "%li days ago",days);
|
||||
// do not show if more than 1 wk old! we want to seem as
|
||||
// fresh as possible
|
||||
else if ( ts > 0 ) { // && si->m_isAdmin ) {
|
||||
else if ( ago > 0 ) { // && si->m_isAdmin ) {
|
||||
long ts = now - ago;
|
||||
struct tm *timeStruct = localtime ( &ts );
|
||||
char tmp[100];
|
||||
strftime(tmp,100,"%b %d %Y",timeStruct);
|
||||
|
@ -131,8 +131,8 @@ bool buildProxyTable ( ) {
|
||||
msg = "not enough digits for an ip";
|
||||
if ( pc > 1 )
|
||||
msg = "too many colons";
|
||||
if ( dc != 4 )
|
||||
msg = "need 4 dots for an ip address";
|
||||
if ( dc != 3 )
|
||||
msg = "need 3 dots for an ip address";
|
||||
if ( bc )
|
||||
msg = "got illegal char in ip:port listing";
|
||||
if ( msg ) {
|
||||
@ -155,7 +155,7 @@ bool buildProxyTable ( ) {
|
||||
|
||||
// and the port default is 80
|
||||
long port = 80;
|
||||
if ( portStr ) port = atol2(portStr,s-portStr);
|
||||
if ( portStr ) port = atol2(portStr+1,s-portStr-1);
|
||||
if ( port < 0 || port > 65535 ) {
|
||||
log("spider: got bad proxy port for %s",p);
|
||||
return false;
|
||||
@ -174,6 +174,9 @@ bool buildProxyTable ( ) {
|
||||
// see if in table
|
||||
long islot = s_iptab.getSlot( &ipKey);
|
||||
|
||||
// advance p
|
||||
p = s;
|
||||
|
||||
// if in there, keep it as is
|
||||
if ( islot >= 0 ) continue;
|
||||
|
||||
@ -206,6 +209,17 @@ bool buildProxyTable ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// save the stats
|
||||
bool saveSpiderProxyStats ( ) {
|
||||
// save the s_iptab hash table to "spiderproxystats.dat" in g_hostdb.m_dir and return the result of s_iptab.save()
|
||||
return s_iptab.save(g_hostdb.m_dir,"spiderproxystats.dat");
|
||||
}
|
||||
|
||||
bool loadSpiderProxyStats ( ) {
|
||||
// load the s_iptab hash table from "spiderproxystats.dat" in g_hostdb.m_dir and return the result of s_iptab.load()
|
||||
return s_iptab.load(g_hostdb.m_dir,"spiderproxystats.dat");
|
||||
}
|
||||
|
||||
// . we call this from Parms.cpp which prints out the proxy related controls
|
||||
// and this table below them...
|
||||
// . allows user to see the stats of each spider proxy
|
||||
@ -265,7 +279,8 @@ bool printSpiderProxyTable ( SafeBuf *sb ) {
|
||||
|
||||
char *bg = LIGHT_BLUE;
|
||||
// mark with light red bg if last test url attempt failed
|
||||
if ( sp->m_lastDownloadTookMS == -1 )
|
||||
if ( sp->m_lastDownloadTookMS == -1 &&
|
||||
sp->m_lastDownloadTestAttemptMS>0 )
|
||||
bg = "ffa6a6";
|
||||
|
||||
// print it
|
||||
@ -279,23 +294,31 @@ bool printSpiderProxyTable ( SafeBuf *sb ) {
|
||||
);
|
||||
|
||||
// last SUCCESSFUL download time ago. when it completed.
|
||||
long ago = now - sp->m_lastSuccessfulTestMS;
|
||||
long ago = now - sp->m_lastSuccessfulTestMS/1000;
|
||||
sb->safePrintf("<td>");
|
||||
// like 1 minute ago etc.
|
||||
sb->printTimeAgo ( ago , now );
|
||||
if ( sp->m_lastSuccessfulTestMS <= 0 )
|
||||
sb->safePrintf("none");
|
||||
else
|
||||
sb->printTimeAgo ( ago , now );
|
||||
sb->safePrintf("</td>");
|
||||
|
||||
// last download time ago
|
||||
ago = now - sp->m_lastDownloadTestAttemptMS;
|
||||
ago = now - sp->m_lastDownloadTestAttemptMS/1000;
|
||||
sb->safePrintf("<td>");
|
||||
// like 1 minute ago etc.
|
||||
sb->printTimeAgo ( ago , now );
|
||||
if ( sp->m_lastDownloadTestAttemptMS<= 0 )
|
||||
sb->safePrintf("none");
|
||||
else
|
||||
sb->printTimeAgo ( ago , now );
|
||||
sb->safePrintf("</td>");
|
||||
|
||||
// how long to download the test url?
|
||||
if ( sp->m_lastDownloadTookMS != -1 )
|
||||
sb->safePrintf("<td>%lims</td>",
|
||||
(long)sp->m_lastDownloadTookMS);
|
||||
else if ( sp->m_lastDownloadTestAttemptMS<= 0 )
|
||||
sb->safePrintf("<td>unknown</td>");
|
||||
else
|
||||
sb->safePrintf("<td>"
|
||||
"<font color=red>FAILED</font>"
|
||||
@ -304,7 +327,7 @@ bool printSpiderProxyTable ( SafeBuf *sb ) {
|
||||
sb->safePrintf("</tr>\n");
|
||||
}
|
||||
|
||||
sb->safePrintf("</table>");
|
||||
sb->safePrintf("</table><br>");
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -320,6 +343,10 @@ void gotTestUrlReplyWrapper ( void *state , TcpSocket *s ) {
|
||||
// free that thing
|
||||
//mfree ( ss , sizeof(spip) ,"spip" );
|
||||
|
||||
// note it
|
||||
log("sproxy: got test url reply: %s",
|
||||
s->m_readBuf);
|
||||
|
||||
// we can get the spider proxy ip/port from the socket because
|
||||
// we sent this url download request to that spider proxy
|
||||
unsigned long long key = (unsigned long)s->m_ip;
|
||||
@ -360,7 +387,7 @@ bool downloadTestUrlFromProxies ( ) {
|
||||
Host *h0 = g_hostdb.getFirstAliveHost();
|
||||
if ( g_hostdb.m_myHost != h0 ) return true;
|
||||
|
||||
long nowms = gettimeofdayInMillisecondsLocal();
|
||||
long long nowms = gettimeofdayInMillisecondsLocal();
|
||||
|
||||
for ( long i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
|
||||
|
||||
@ -371,8 +398,8 @@ bool downloadTestUrlFromProxies ( ) {
|
||||
|
||||
long long elapsed = nowms - sp->m_lastDownloadTestAttemptMS;
|
||||
|
||||
// hit test url once per minute
|
||||
if ( elapsed < 60 ) continue;
|
||||
// hit test url once per 31 seconds
|
||||
if ( elapsed < 31000 ) continue;
|
||||
|
||||
// or if never came back yet!
|
||||
if ( sp->m_isWaiting ) continue;
|
||||
@ -641,6 +668,11 @@ bool initSpiderProxyStuff() {
|
||||
if ( ! g_udpServer.registerHandler ( 0x55, handleRequest55 ))
|
||||
return false;
|
||||
|
||||
// key is ip/port
|
||||
s_iptab.set(8,sizeof(SpiderProxy),0,NULL,0,false,0,"siptab");
|
||||
|
||||
loadSpiderProxyStats();
|
||||
|
||||
// build the s_iptab hashtable for the first time
|
||||
buildProxyTable ();
|
||||
|
||||
|
@ -11,4 +11,10 @@ bool downloadTestUrlFromProxies();
|
||||
// called by Parms.cpp when user changes the list of proxyips
|
||||
bool buildProxyTable ( );
|
||||
|
||||
// show spider proxy stats, called by Parms.cpp
|
||||
bool printSpiderProxyTable ( SafeBuf *sb ) ;
|
||||
|
||||
// save stats on the spider proxies if any
|
||||
bool saveSpiderProxyStats();
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user