#include "gb-include.h"
#include "HttpServer.h"
#include "Pages.h"
#include "Collectiondb.h"
#include "HashTable.h"
#include "Stats.h"
#include "Users.h"
#include "XmlDoc.h" // gbzip
#include "UdpServer.h"
#include "Proxy.h"
#include "PageCrawlBot.h"
// the one global HttpServer instance (extern'd in the .h file); the C-style
// wrappers below forward to it
HttpServer g_httpServer;
// page generators implemented in other files
// this is defined in PageEvents.cpp
//bool sendPageSiteMap ( TcpSocket *s , HttpRequest *r ) ;
bool sendPageApi ( TcpSocket *s , HttpRequest *r ) ;
bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) ;
// we get like 100k submissions a day!!!
// file-local table; it is cleared in HttpServer::reset()
static HashTable s_htable;
static bool s_init = false;
static long s_lastTime = 0;
// declare our C functions (plain functions usable as callbacks)
static void requestHandlerWrapper ( TcpSocket *s ) ;
static void cleanUp ( void *state , TcpSocket *s ) ;
static void getMsgPieceWrapper ( int fd , void *state /*sockDesc*/ );
static void getSSLMsgPieceWrapper ( int fd , void *state /*sockDesc*/ );
// we now use the socket descriptor as state info for TcpServer instead of
// the TcpSocket ptr in case it got destroyed
static long getMsgPiece ( TcpSocket *s );
// completion callback handed to TcpServer::sendMsg() by getDoc(); its "state"
// is the index into states[]/callbacks[]
static void gotDocWrapper ( void *state, TcpSocket *s );
// udp handler registered for msg type 0xfd in HttpServer::init()
static void handleRequestfd ( UdpSlot *slot , long niceness ) ;
//bool sendPageAbout ( TcpSocket *s , HttpRequest *r , char *path ) ;
// number of getDoc() downloads currently tracked in states[]/callbacks[];
// incremented in getDoc() and decremented in gotDoc()
static long s_numOutgoingSockets = 0;
// reset the tcp servers
void HttpServer::reset() {
m_tcp.reset();
m_ssltcp.reset();
s_htable.reset();
s_numOutgoingSockets = 0;
}
// . set up the plain and the ssl TcpServer and register the 0xfd udp handler
// . "port" and "sslPort" are handed to m_tcp and m_ssltcp respectively
// . "handlerWrapper" is called with the socket of each incoming request; if
//   it is NULL and at least one port is given we use requestHandlerWrapper()
// . a NULL handler with no ports is allowed (qatest sets up a client-only
//   httpserver), so the handler is passed along as a plain pointer and is
//   never dereferenced here
// . returns false if the mime table, the udp handler registration or the
//   plain TcpServer fails; an ssl failure is only logged
bool HttpServer::init ( short port,
			short sslPort,
			void handlerWrapper( TcpSocket *s )) {
	// our mime table that maps a file extension to a content type
	HttpMime mm;
	if ( ! mm.init() ) return false;
	// not infinite: take the limit from the config. m_tcp is given the
	// address of this member below.
	m_maxOpenSockets = g_conf.m_httpMaxSockets;
	// NOTE(review): counters start at 1 rather than 0, presumably so a
	// ratio of the two never divides by zero -- confirm
	m_uncompressedBytes = m_bytesDownloaded = 1;
	// . if we haven't been given the handlerwrapper, use default
	// . used only by proxy right now
	// . qatest sets up a client-only httpserver, so don't set a
	//   handlerWrapper if no listening port
	if ( ! handlerWrapper && ( port || sslPort ) )
		handlerWrapper = requestHandlerWrapper;
	// the proxy forwards http requests to us as udp msg type 0xfd
	if ( ! g_udpServer.registerHandler ( 0xfd , handleRequestfd ) )
		return false;
	// . set our base TcpServer class
	// . pass the handler pointer itself, exactly like the ssl call below
	if ( ! m_tcp.init ( handlerWrapper ,
			    getMsgSize ,
			    getMsgPiece ,
			    port ,
			    &m_maxOpenSockets ) ) return false;
	// set our secure TcpServer class
	if ( ! m_ssltcp.init ( handlerWrapper ,
			       getMsgSize ,
			       getMsgPiece ,
			       sslPort ,
			       &g_conf.m_httpsMaxSockets ,
			       true ) ) {
		// . this was required for communicating with an email alert
		//   web server, but no longer do i use them
		// . don't break, just log and don't do SSL
		log ( "https: SSL Server Failed To Init, Continuing..." );
		m_ssltcp.reset();
	}
	// log an innocent msg
	log(LOG_INIT,"http: Listening on TCP port %i with sd=%i",
	    port, m_tcp.m_sock );
	// log for https
	if (m_ssltcp.m_ready)
		log(LOG_INIT,"https: Listening on TCP port %i with sd=%i",
		    sslPort, m_ssltcp.m_sock );
	return true;
}
// . get a url's document
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . IMPORTANT: we free doc/docLen, so NULLify s->m_readBuf if you want it
// . IMPORTANT: same goes for s->m_sendBuf
// . timeout in milliseconds since no read OR no write
// . proxyIp is 0 if we don't have one; if set, the request is sent to
//   proxyIp:proxyPort instead of "ip"
// . if "ip" is 0 (and no proxy) the hostname parsed out of "url" is handed
//   to the TcpServer instead
// . if "fullRequest" is non-NULL it is sent as is, otherwise the request is
//   built from url/offset/size/... and "postContent" is appended to it
// . https:// urls go out over m_ssltcp (default port 443), all others over
//   m_tcp (default port 80)
// . if "state" or "callback" is NULL the request is not tracked in
//   states[]/callbacks[]; both are passed straight through to sendMsg()
// . otherwise "callback" is called with "state" through gotDocWrapper()
bool HttpServer::getDoc ( char   *url      ,
			  long    ip       ,
			  long    offset   ,
			  long    size     ,
			  time_t  ifModifiedSince ,
			  void   *state    ,
			  void   (* callback)( void *state , TcpSocket *s ) ,
			  long    timeout  ,
			  long    proxyIp  ,
			  short   proxyPort,
			  long    maxTextDocLen  ,
			  long    maxOtherDocLen ,
			  char   *userAgent ,
			  char   *proto ,
			  bool    doPost ,
			  char   *cookie ,
			  char   *additionalHeader ,
			  char   *fullRequest ,
			  char   *postContent ) {
	// sanity
	if ( ip == -1 )
		log("http: you probably didn't mean to set ip=-1 did you? "
		    "try setting to 0.");
	// use the HttpRequest class
	HttpRequest r;
	// set default port
	long defPort = 80;
	// pick the tcp server: plain by default, ssl for a secured site
	TcpServer *tcp = &m_tcp;
	if ( url && strncasecmp(url, "https://", 8) == 0 ) {
		if (!m_ssltcp.m_ready) {
			log("https: Trying to get HTTPS site when SSL "
			    "TcpServer not ready: %s",url);
			g_errno = ESSLNOTREADY;
			return true;
		}
		tcp = &m_ssltcp;
		defPort = 443;
	}
	long pcLen = 0;
	if ( postContent ) pcLen = gbstrlen(postContent);
	char *req = NULL;
	long  reqSize;
	if ( ! fullRequest ) {
		// this returns false and sets g_errno on error
		if ( ! r.set ( url , offset , size , ifModifiedSince ,
			       userAgent , proto , doPost , cookie ,
			       additionalHeader , pcLen ) ) return true;
		reqSize = r.getRequestLen();
		// room for the request followed by the post content
		req = (char *) mmalloc( reqSize + pcLen ,"HttpServer");
		if ( req )
			memcpy ( req , r.getRequest() , reqSize );
		if ( req && pcLen ) {
			memcpy ( req + reqSize, postContent , pcLen );
			reqSize += pcLen;
		}
	}
	else {
		// the dup does not contain the \0
		reqSize = gbstrlen(fullRequest);
		req = (char *) mdup ( fullRequest , reqSize,"HttpServer");
	}
	// return true with g_errno set if we could not get the buffer
	if ( ! req ) return true;
	long  hostLen ;
	long  port = defPort;
	char *host = getHostFast ( url , &hostLen , &port );
	// a proxy, if given, overrides the destination
	if ( proxyIp ) { ip = proxyIp ; port = proxyPort; }
	// special NULL case
	if ( ! state || ! callback ) {
		// . send it away untracked
		// . use "tcp", not m_tcp, so that https urls really go out
		//   over the ssl server selected above
		// . be sure to free "req/reqSize" in callback() somewhere
		if ( ip )
			return tcp->sendMsg ( ip       ,
					      port     ,
					      req      ,
					      reqSize  ,
					      reqSize  ,
					      reqSize  , // msgTotalSize
					      state    ,
					      callback ,
					      timeout  ,
					      maxTextDocLen ,
					      maxOtherDocLen );
		// otherwise pass the hostname
		return tcp->sendMsg ( host     ,
				      hostLen  ,
				      port     ,
				      req      ,
				      reqSize  ,
				      reqSize  ,
				      reqSize  , // msgTotalSize
				      state    ,
				      callback ,
				      timeout  ,
				      maxTextDocLen ,
				      maxOtherDocLen );
	}
	// if too many downloads already, return error
	if ( s_numOutgoingSockets >= MAX_DOWNLOADS ) {
		mfree ( req, reqSize, "HttpServer" );
		g_errno = ETOOMANYDOWNLOADS;
		log("http: already have %li sockets downloading. Sending "
		    "back ETOOMANYDOWNLOADS.",(long)MAX_DOWNLOADS);
		return true;
	}
	// find the first free slot to park state/callback in
	long n = 0;
	while ( states[n] ) {
		n++;
		// should not happen...
		if ( n >= MAX_DOWNLOADS ) {
			mfree ( req, reqSize, "HttpServer" );
			g_errno = ETOOMANYDOWNLOADS;
			log("http: already have %li sockets downloading",
			    (long)MAX_DOWNLOADS);
			return true;
		}
	}
	states   [n] = state;
	callbacks[n] = callback;
	s_numOutgoingSockets++;
	// debug
	log(LOG_DEBUG,"http: Getting doc with ip=%s state=%lu url=%s.",
	    iptoa(ip),(unsigned long)state,url);
	// . send it away
	// . gotDocWrapper() gets the slot number as its state and will call
	//   the real callback on completion of the transaction
	// . be sure to free "req/reqSize" in callback() somewhere
	bool done;
	if ( ip )
		done = tcp->sendMsg ( ip       ,
				      port     ,
				      req      ,
				      reqSize  ,
				      reqSize  ,
				      reqSize  , // msgTotalSize
				      (void*)n ,
				      gotDocWrapper ,
				      timeout  ,
				      maxTextDocLen ,
				      maxOtherDocLen );
	// otherwise pass the hostname
	else
		done = tcp->sendMsg ( host     ,
				      hostLen  ,
				      port     ,
				      req      ,
				      reqSize  ,
				      reqSize  ,
				      reqSize  , // msgTotalSize
				      (void*)n ,
				      gotDocWrapper ,
				      timeout  ,
				      maxTextDocLen ,
				      maxOtherDocLen );
	// blocked? gotDoc() will release the slot later
	if ( ! done ) return false;
	// otherwise we didn't block, so release the slot ourselves
	states   [n] = NULL;
	callbacks[n] = NULL;
	s_numOutgoingSockets--;
	return true;
}
// . send an already formed http request to ip:port and get the reply
// . always uses the plain (non-ssl) TcpServer
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . IMPORTANT: we free doc/docLen, so NULLify s->m_readBuf if you want it
// . IMPORTANT: same goes for s->m_sendBuf
// . timeout in milliseconds since no read OR no write
// . "request" must be NUL terminated: "requestLen" is ignored and the
//   length is measured with gbstrlen() instead
// . "callback" is called with "state" through gotDocWrapper()
bool HttpServer::getDoc ( long ip,
			  long port,
			  char *request,
			  long requestLen,
			  void *state ,
			  void (* callback)( void *state , TcpSocket *s ) ,
			  long timeout ,
			  long maxTextDocLen ,
			  long maxOtherDocLen ) {
	TcpServer *tcp = &m_tcp;
	// stupid incoming request has 1024 bytes mostly, while we need to
	// send exactly what was needed
	long reqSize = gbstrlen ( request );
	// NOTE: this copy holds reqSize bytes and therefore no trailing \0
	char *req = (char *) mdup ( request, reqSize,"HttpServer" );
	if ( ! req ) return true;
	// if too many downloads already, return error
	if ( s_numOutgoingSockets >= MAX_DOWNLOADS ) {
		mfree ( req, reqSize, "HttpServer" );
		g_errno = ETOOMANYDOWNLOADS;
		log("http: already have %li sockets downloading",
		    (long)MAX_DOWNLOADS);
		return true;
	}
	// find the first free slot to park state/callback in
	long n = 0;
	while ( states[n] ) {
		n++;
		// should not happen...
		if ( n >= MAX_DOWNLOADS ) {
			mfree ( req, reqSize, "HttpServer" );
			g_errno = ETOOMANYDOWNLOADS;
			log("http: already have %li sockets downloading",
			    (long)MAX_DOWNLOADS);
			return true;
		}
	}
	states   [n] = state;
	callbacks[n] = callback;
	s_numOutgoingSockets++;
	// . debug
	// . print the caller's NUL terminated "request", not "req", because
	//   the copy has no terminator for %s to stop at
	log(LOG_DEBUG,"http: Getting doc with ip=%s state=%lu. %s",
	    iptoa(ip),(unsigned long)state, request);
	// . send it away
	// . gotDocWrapper() gets the slot number as its state and will call
	//   the real callback on completion of the transaction
	// . be sure to free "req/reqSize" in callback() somewhere
	if ( ! tcp->sendMsg ( ip       ,
			      port     ,
			      req      ,
			      reqSize  ,
			      reqSize  ,
			      reqSize  , // msgTotalSize
			      (void*)n ,
			      gotDocWrapper ,
			      timeout  ,
			      maxTextDocLen ,
			      maxOtherDocLen ) )
		return false;
	// otherwise we didn't block, so release the slot ourselves
	states   [n] = NULL;
	callbacks[n] = NULL;
	s_numOutgoingSockets--;
	return true;
}
// . completion callback that getDoc() hands to TcpServer::sendMsg()
// . "state" is not a pointer here: it carries the index of the slot in
//   states[]/callbacks[] that getDoc() reserved
void gotDocWrapper ( void *state, TcpSocket *s ) {
	long slotNum = (long)state;
	g_httpServer.gotDoc ( slotNum , s );
}
// . called through gotDocWrapper() when a getDoc() transaction completes
// . "n" is the slot in states[]/callbacks[] that getDoc() reserved
// . releases the slot, unzips the reply if there is one and no error, then
//   calls the caller's callback with the caller's state
// . always returns true
bool HttpServer::gotDoc ( long n, TcpSocket *s ) {
	// copy these out before we release the slot
	void *state = states[n];
	void (*callback)(void *state, TcpSocket *s) = callbacks[n];
	// debug. cast to match %lu, same as the log in getDoc()
	log(LOG_DEBUG,"http: Got doc with state=%lu.",(unsigned long)state);
	// release the slot
	states   [n] = NULL;
	callbacks[n] = NULL;
	s_numOutgoingSockets--;
	// . figure out if it came back zipped, unzip if so.
	// . now wikipedia force gzip on us regardless
	if( !g_errno && s->m_readBuf) {
		// this could set g_errno to EBADMIME or ECORRUPTHTTPGZIP
		s = unzipReply(s);
	}
	// the ip-based getDoc() does not reject a NULL callback, so do not
	// jump through a NULL pointer here
	if ( callback ) callback ( state, s );
	return true;
}
// . default handler for an incoming HTTP request
// . plain C function so it can be used as a TcpServer callback; it just
//   forwards the socket to the global HttpServer
void requestHandlerWrapper ( TcpSocket *s ) {
	HttpServer *server = &g_httpServer;
	server->requestHandler ( s );
}
// . a udp handler wrapper
// . the proxy sends us udp packets with msgtype = 0xfd ("forward")
// . we dress the forwarded http request up as a fake TcpSocket and run it
//   through the regular http request handler (or the proxy's handler if we
//   are a proxy ourselves)
void handleRequestfd ( UdpSlot *slot , long niceness ) {
	// . a compression proxy may forward a query to a regular proxy so
	//   that the search result pages can be compressed to save bandwidth
	// . but a query compression proxy itself must never receive this
	//   msg, so core on purpose if we are one
	if ( g_hostdb.m_myHost->m_type==HT_QCPROXY) {char *xx=NULL;*xx=0;}
	// pull the raw request out of the slot
	char *reqBuf = slot->m_readBuf;
	long  reqLen = slot->m_readBufSize;
	long  reqCap = slot->m_readBufMaxSize;
	// sanity check, must at least contain \0 and ip (5 bytes total)
	if ( reqLen < 5 ) { char *xx=NULL;*xx=0; }
	// make a fake, zeroed out TcpSocket
	TcpSocket *sock = (TcpSocket *)mcalloc(sizeof(TcpSocket),"tcpudp");
	// this sucks
	if ( ! sock ) {
		log("http: could not allocate for TcpSocket. Out of memory.");
		g_udpServer.sendErrorReply ( slot , g_errno );
		return;
	}
	// pretend it belongs to the plain tcp server
	sock->m_this    = &g_httpServer.m_tcp;
	// HACK: let TcpServer know to send a UDP reply, not a tcp reply!
	sock->m_udpSlot = slot;
	// HACK: Proxy.cpp crammed the real ip into the end of the request
	sock->m_ip = *(long *)(reqBuf+reqLen-4);
	// . hand the read buffer over to the fake socket
	// . the read offset excludes the ending \0 as well as the 4 byte ip
	sock->m_readBuf     = reqBuf;
	sock->m_readOffset  = reqLen - 5;
	sock->m_readBufSize = reqCap;
	// . the callee frees the buffer along with the TcpSocket, there in
	//   TcpServer.cpp::sendMsg()
	// . PageDirectory.cpp actually realloc()s TcpSocket::m_readBuf, so
	//   UdpServer must not free it through the udpSlot as well
	slot->m_readBuf = NULL;
	// . HACK: m_sd is used as a unique identifier for registering
	//   callbacks, so set the high bit to avoid conflicting with normal
	//   TCP socket descriptors that might be reading the same file
	// . the proxy no longer forwards regular file requests, so
	//   hopefully this hacked m_sd is not even used
	sock->m_sd = (long)slot | 0x80000000;
	// . if we are a proxy receiving a request from a compression proxy
	//   then use the proxy handler function
	// . that should call g_httpServer.sendDynamicPage() which should
	//   compress the 0xfd reply sent back to the compression proxy
	if ( g_proxy.isProxy() ) {
		g_proxy.handleRequest ( sock );
		return;
	}
	// log this out on gk144 to see why dropping
	if ( g_conf.m_logDebugBuild )
		log("fd: handling request transid=%li %s",
		    slot->m_transId, reqBuf );
	// ultimately, Tcp::sendMsg() should be called which will free "sock"
	g_httpServer.requestHandler ( sock );
}
// . handle one incoming http request that has been fully read into "s"
// . parses the request, enforces the open socket quota, replies with a
//   400 if the request is bad and otherwise passes it on to sendReply()
// . finally logs the request, unless it was for a picture or logging is
//   turned off for it
// . may block on something inside sendReply()
// . never calls m_tcp.destroySocket ( s ) itself, but sendErrorReply() can
//   destroy "s" if it cannot form the error reply
void HttpServer::requestHandler ( TcpSocket *s ) {
	// parse the http request
	HttpRequest r;
	// . this returns false and sets g_errno on error
	// . this should copy the request into its own local buffer
	// . we pass "s" so it can get the src ip/port
	// . a bad request is answered further below, after the socket
	//   quota check
	bool status = r.set ( s->m_readBuf , s->m_readOffset , s ) ;
	// admin status is decided by r.isLocal()
	bool isAdmin = r.isLocal();
	// never proxy admin pages for security reasons
	if ( s->m_udpSlot ) isAdmin = false;
	// get the server this socket uses
	TcpServer *tcp = s->m_this;
	// get the max sockets that can be opened at any one time
	long max;
	if ( tcp == &m_ssltcp ) max = g_conf.m_httpsMaxSockets;
	else                    max = g_conf.m_httpMaxSockets;
	// just a safety catch
	if ( max < 2 ) max = 2;
	// limit injects to less sockets than the max, so the administrator
	// and regular queries will take precedence
	long imax = max - 10;
	if ( imax < 10 ) imax = max - 1;
	if ( imax <  2 ) imax = 2;
	if ( strncmp ( s->m_readBuf , "GET /inject" , 11 ) == 0 ) {
		// reset "max" to something smaller
		max = imax;
		// do not consider injects to be coming from admin ever
		isAdmin = false;
	}
	// . enforce the open socket quota iff not admin, not forwarded by
	//   the proxy, and the tcp server could not close a socket for us
	if ( ! isAdmin && tcp->m_numIncomingUsed >= max &&
	     // make sure request is not from proxy
	     ! s->m_udpSlot &&
	     ! tcp->closeLeastUsed() ) {
		// log at most one of these every 5 seconds and count the
		// ones we skipped in between
		static long s_last  = 0;
		static long s_count = 0;
		long now = getTimeLocal();
		if ( now - s_last < 5 )
			s_count++;
		else {
			log("query: Too many sockets open. Sending 500 "
			    "http status code to %s. (msgslogged=%li)",
			    iptoa(s->m_ip),s_count);
			s_count = 0;
			s_last  = now;
		}
		sendErrorReply ( s , 500 , "Too many sockets open.");
		// count as a failed query so we send an email alert if too
		// many of these happen
		g_stats.m_closedSockets++;
		return;
	}
	// if the HttpRequest was bogus come here
	if ( ! status ) {
		// log a bad request
		log("http: Got bad request from %s: %s",
		    iptoa(s->m_ip),mstrerror(g_errno));
		// cancel the g_errno, we'll send a BAD REQUEST reply to them
		g_errno = 0;
		// this will destroy(s) if cannot malloc send buffer
		sendErrorReply ( s , 400, "Bad Request" );
		return;
	}
	// . do not log the request if the filename ends in one of these
	// . the comparison is case sensitive
	static const char * const s_picExts[] = {
		".gif" , ".jpg" , ".bmp" , ".png" , ".ico" };
	char *f    = r.getFilename();
	long  flen = r.getFilenameLen();
	bool isPic = false;
	if ( f && flen >= 4 ) {
		long numExts = (long)(sizeof(s_picExts)/sizeof(s_picExts[0]));
		for ( long i = 0 ; i < numExts && ! isPic ; i++ )
			isPic = ( strncmp(&f[flen-4],s_picExts[i],4) == 0 );
	}
	// . get time format: "Jul 23 10:45:32"
	// . crap this cores if we use getTimeGlobal() and we are not synced
	//   with host #0, so just use local time i guess in that case
	time_t tt ;
	if ( isClockInSync() ) tt = getTimeGlobal();
	else                   tt = getTimeLocal();
	struct tm *timeStruct = localtime ( &tt );
	// never tell strftime() the buffer is bigger than it really is, and
	// leave an empty string if localtime() could not convert the time
	char buf[64];
	buf[0] = '\0';
	if ( timeStruct )
		strftime ( buf , sizeof(buf) , "%b %d %T", timeStruct);
	// save ip in case "s" gets destroyed
	long ip = s->m_ip;
	// get this value before we send the reply, because r can be
	// destroyed when we send.
	long dontLog = r.getLong("dontlog",0);
	// . TcpServer::sendMsg() may free s->m_readBuf if doing udp
	//   forwarding, so keep a truncated copy of the request for the log
	// . can't log r.getRequest() instead because it has \0's inserted
	//   for cgi parm parsing
	char stackMem[1024];
	long maxLen = s->m_readOffset;
	if ( maxLen > 1020 ) maxLen = 1020;
	if ( maxLen <    0 ) maxLen = 0;
	memcpy(stackMem,s->m_readBuf,maxLen);
	stackMem[maxLen] = '\0';
	// . sendReply returns false if blocked, true otherwise
	// . sets g_errno on error
	// . this calls sendErrorReply (404) if file does not exist
	// . g_msg is a ptr to a message like " (perm denied)" for ex. and
	//   it should be set by PageAddUrl.cpp, PageResults.cpp, etc.
	g_msg = "";
	sendReply ( s , &r , isAdmin) ;
	// . log the request down here now so we can include
	//   "(permission denied)" on the line if we should
	// . never log picture requests
	if ( isPic ) return;
	// what url refered user to this one?
	const char *ref = r.getReferer();
	if ( ! ref ) ref = "";
	// skip over http:// in the referer
	if ( strncasecmp ( ref , "http://" , 7 ) == 0 ) ref += 7;
	// if autobanned and we should not log, skip it
	bool autobanned = ( g_msg && ! g_conf.m_logAutobannedQueries &&
			    strstr(g_msg,"autoban") != NULL );
	// dont log if an admin asks us not to
	bool hushed = ( isAdmin && dontLog );
	if ( autobanned || hushed || ! g_conf.m_logHttpRequests ) return;
	// . log the request
	// . uses the saved "ip" and "stackMem" copies, not "s", which may be
	//   gone by now
	logf (LOG_INFO,"http: %s %s %s %s "
	      "%s",
	      buf,iptoa(ip),
	      stackMem,
	      ref,
	      g_msg);
}
/*
// it's better to hardcode this so we never lose it!
bool sendPageRobotsTxt ( TcpSocket *s , HttpRequest *r ) {
SafeBuf sb;
sb.safePrintf ( "User-Agent: *\n"
"Disallow: *\n"
//"Disallow: /search?makewidget=1\n"
//"Disallow: /search?clockset=\n"
"\n"
);
// this should copy it since sb is on stack
return g_httpServer.sendDynamicPage ( s ,
sb.getBufStart(),
sb.m_length ,
0 ,
"text/html");
}
*/
// . returns true if the last "needleLen" bytes of "haystack" are the same
//   as the first "needleLen" bytes of "needle"
// . lengths are passed in explicitly; the comparison is case sensitive
bool endsWith(char *haystack, int haystackLen, char *needle, int needleLen) {
	// a needle longer than the haystack can never be its suffix
	if ( needleLen > haystackLen ) return false;
	// line the needle up with the tail of the haystack and compare
	char *tail = haystack + haystackLen - needleLen;
	return strncmp ( tail , needle , needleLen ) == 0;
}
// . reply to a GET (including partial get) or HEAD request
// . HEAD just returns the MIME header for the file requested
// . returns false if blocked, true otherwise
// . sets g_errno on error
// . it calls TcpServer::sendMsg(s,...)
// . with a File * as the callback data
// . and with cleanUp() as the callback function
// . if sendMsg(s,...) blocks this cleanUp() will be called before the
// socket gets recycled or destroyed (on error)
bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// get the server this socket uses
TcpServer *tcp = s->m_this;
// if there is a redir=http:// blah in the request then redirect
long redirLen = r->getRedirLen() ;
char *redir = NULL;
// . we may be serving multiple hostnames
// . www.gigablast.com, gigablast.com, www.inifinte.info,
// infinite.info, www.microdemocracy.com
// . get the host: field from the MIME
// . should be NULL terminated
char *h = r->getHost();
if(redirLen > 0) redir = r->getRedir();
else if (!isAdmin &&
*g_conf.m_redirect != '\0' &&
// was "raw"
r->getLong("xml", -1) == -1 &&
// do not redirect a 'gb proxy stop' request away,
// which POSTS cast=0&save=1. that is done from the
// command line, and for some reason it is not isAdmin
r->getLong("save", -1) != -1 &&
r->getString("site") == NULL &&
r->getString("sites") == NULL) {
//direct all non-raw, non admin traffic away.
redir = g_conf.m_redirect;
redirLen = gbstrlen(g_conf.m_redirect);
}
char *hdom = h;
if ( strncasecmp(hdom,"www.",4) == 0 ) hdom += 4;
// auto redirect eventguru.com to www.eventguru.com so cookies
// are consistent
if ( ! redir &&
( strcasecmp ( h , "eventguru.com" ) == 0 ||
strcasecmp ( hdom , "flurbit.com" ) == 0 ||
strcasecmp ( hdom , "flurbits.com" ) == 0 ||
strcasecmp ( hdom , "flurpit.com" ) == 0 ||
strcasecmp ( hdom , "flurbot.com" ) == 0 ||
strcasecmp ( hdom , "flurbits.com" ) == 0 ||
strcasecmp ( hdom , "flurbyte.com" ) == 0 ||
strcasecmp ( hdom , "eventstereo.com" ) == 0 ||
strcasecmp ( hdom , "eventcrier.com" ) == 0 ||
strcasecmp ( hdom , "eventwidget.com" ) == 0 ) ) {
redir = "http://www.eventguru.com/";
redirLen = gbstrlen(redir);
}
if ( redirLen > 0 ) {
// . generate our mime header
// . see http://www.vbip.com/winsock/winsock_http_08_01.asp
HttpMime m;
m.makeRedirMime (redir,redirLen);
// the query compress proxy, qcproxy, should handle this
// on its level... but we will support ZET anyway
return sendReply2 ( m.getMime(),m.getMimeLen(), NULL,0,s);
}
// . get info about the file requested
// . use a "size" of -1 for the WHOLE file
// . a non GET request should use a "size" of 0 (like HEAD)
char *path = r->getFilename();
long pathLen = r->getFilenameLen();
// paths with ..'s are from hackers!
for ( char *p = path ; *p ; p++ )
if ( *p == '.' && *(p+1) == '.' )
return sendErrorReply(s,404,"bad request");
// dump urls or json objects or pages?
// "GET /crawlbot/downloadurls"
// "GET /crawlbot/downloadobjects"
// "GET /crawlbot/downloadpages"
if ( strncmp ( path , "/crawlbot/download/" ,19 ) == 0 ||
// add 4 to length of needle to account for /vXX.
(pathLen >= 20 && strnstr(path, "/crawl/download/", 20)) ||
(pathLen >= 19 && strnstr(path, "/bulk/download/", 19)) )
return sendBackDump ( s , r );
// "GET /download/mycoll_urls.csv"
if ( strncmp ( path , "/download/", 10 ) == 0 )
return sendBackDump ( s , r );
// . is it a diffbot api request, like "GET /api/*"
// . ie "/api/startcrawl" or "/api/stopcrawl" etc.?
//if ( strncmp ( path , "/api/" , 5 ) == 0 )
// // this will call g_httpServer.sendDynamicPage() to send
// // back the reply when it is done generating the reply.
// // this function is in Diffbot.cpp.
// return handleDiffbotRequest ( s , r );
// for adding to browser list of search engines
if ( strncmp ( path, "/eventguru.xml", 14 ) == 0 ) {
SafeBuf sb(256);
sb.safePrintf(
"