// open-source-search-engine/Msg13.h
// Matt Wells, copyright Oct 2001
// . ask another host to download a url for you
// . the remote host will also use a cache if m_maxCacheAge > 0
// . used for downloading and caching robots.txt
// . if m_compressReply is true then the remote host compresses the http
// reply before sending it back to you via udp
#ifndef _MSG13_H_
#define _MSG13_H_
#include "Url.h" // MAX_URL_LEN
#include "SpiderProxy.h" // MAXUSERNAMEPWD
// max crawl delay from proxy backoff of 1 minute (60 seconds)
#define MAX_PROXYCRAWLDELAYMS 60000
void resetMsg13Caches ( ) ;
bool printHammerQueueTable ( SafeBuf *sb ) ;
extern char *g_fakeReply;
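// The hammer queue shown by printHammerQueueTable() holds download
// requests that are waiting out a per-ip crawl delay so we do not
// hammer an ip. A minimal sketch of that politeness check, assuming
// s_hammerCache (declared at the bottom of this file) maps an ip to
// the time we last downloaded from it; the helper name is hypothetical:
static inline bool canDownloadNow ( int64_t lastDownloadTimeMS ,
				    int64_t nowMS ,
				    int32_t crawlDelayMS ) {
	// -1 means the crawl delay is unknown or none is imposed
	if ( crawlDelayMS < 0 ) return true;
	// otherwise enough time must have elapsed since the last download
	return nowMS - lastDownloadTimeMS >= (int64_t)crawlDelayMS;
}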
class Msg13Request {
public:
// the top portion of Msg13Request is sent to handleRequest54()
// in SpiderProxy.cpp to get and return proxies, as well as to
// ban proxies.
int32_t getProxyRequestSize() { return (char *)&m_lastHack - (char *)this; }
int32_t m_urlIp;
int32_t m_lbId; // loadbucket id
// the http proxy to use to download
int32_t m_proxyIp;
uint16_t m_proxyPort;
int32_t m_banProxyIp;
uint16_t m_banProxyPort;
char m_opCode;
char m_lastHack;
collnum_t m_collnum;
// not part of the proxy request, but set from ProxyReply:
int32_t m_numBannedProxies;
// . if using proxies, how many proxies have we tried to download
// this url through
// . used internally in Msg13.cpp
int32_t m_proxyTries;
// if using proxies, did host #0 tell us there were more to try if
// this one did not work out?
bool m_hasMoreProxiesToTry;
// we call this function after the imposed crawl-delay is over
void (*m_hammerCallback)(class Msg13Request *r);
int64_t m_urlHash48;
int32_t m_firstIp;
// when it was stored in the hammer queue
int64_t m_stored;
// a tmp hack var referencing into the url buffer (ptr_url) below
char *m_proxiedUrl;
int32_t m_proxiedUrlLen;
int64_t m_downloadStartTimeMS;
char m_niceness;
int32_t m_ifModifiedSince;
int32_t m_maxCacheAge;
int32_t m_maxTextDocLen;
int32_t m_maxOtherDocLen;
// in milliseconds. use -1 if none or unknown.
int32_t m_crawlDelayMS;
// for linked list, this is the hammer queue
class Msg13Request *m_nextLink;
char m_proxyUsernamePwdAuth[MAXUSERNAMEPWD];
// if doing spider compression, compute contentHash32 of document
// downloaded, and if it matches this then send back EDOCUNCHANGED
int32_t m_contentHash32;
// copy of CollectionRec::m_customCrawl: 0 or 1 for crawls, 2 for bulks
char m_isCustomCrawl;
// send back error ENOGOODDATE if the doc does not have a good date. but
// if m_harvestLinksIfNoGoodDate is true, just send back a filtered list
// of links
int32_t m_requireGoodDate:1;
int32_t m_harvestLinksIfNoGoodDate:1;
int32_t m_compressReply:1;
int32_t m_useCompressionProxy:1;
// if m_forwardDownloadRequest is true then we pick the host to
// download this url based on the IP address, the idea being that
// only one host is responsible for downloading from a particular
// ip address. this keeps webmasters happier because they can block us
// by blocking just one ip address, and it makes it easier for them
// to analyze their web logs. (see the sketch after this class)
int32_t m_forwardDownloadRequest:1;
int32_t m_isScraping:1;
// does url end in /robots.txt ?
int32_t m_isRobotsTxt:1;
// should we call getTestDoc()/addTestDoc() like for the "test" coll
// and for Test.cpp?
int32_t m_useTestCache:1;
int32_t m_addToTestCache:1;
int32_t m_skipHammerCheck:1;
int32_t m_attemptedIframeExpansion:1;
int32_t m_crawlDelayFromEnd:1;
int32_t m_forEvents:1;
// does m_url represent a FULL http request mime and NOT just a url?
// this happens when gigablast is being used like a squid proxy.
int32_t m_isSquidProxiedUrl:1;
int32_t m_foundInCache:1;
int32_t m_forceUseFloaters:1;
int32_t m_wasInTableBeforeStarting:1;
int32_t m_isRootSeedUrl:1;
//int32_t m_testParserEnabled:1;
//int32_t m_testSpiderEnabled:1;
//int32_t m_isPageParser:1;
//int32_t m_isPageInject:1;
// if we just end up calling HttpServer::getDoc() via downloadDoc()
// then we set this for callback purposes
class Msg13 *m_parent;
// on the other hand, if we are called indirectly by handleRequest13()
// then we set m_udpSlot.
class UdpSlot *m_udpSlot;
class TcpSocket *m_tcpSocket;
// used for addTestDoc() and caching. msg13 sets this
int64_t m_urlHash64;
int32_t m_spideredTime;
// used for caching (and for request table, wait in line table)
int64_t m_cacheKey;
char m_testDir[32];
// msg13 sets this too, so you don't have to worry about setting it
//int32_t m_urlLen;
// includes \0 termination
//char m_url[MAX_URL_LEN+1];
char *ptr_url;
char *ptr_cookie;
int32_t size_url;
int32_t size_cookie;
// string buf for deserializeMsg() function
char m_buf[0];
int32_t getSize() {
return ((char *)ptr_url - (char *)this) + size_url + size_cookie; }
// zero it all out and set the -1 defaults (see the setup sketch after
// this class)
void reset() {
//memset (this,0,(char *)m_url - (char *)this + 1);
memset (this,0,sizeof(Msg13Request));
m_maxTextDocLen = -1; // no limit
m_maxOtherDocLen = -1; // no limit
m_crawlDelayMS = -1; // unknown or none
m_collnum = (collnum_t)-1;
}
};
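// A minimal sketch, not from the original source, of filling in a
// request before handing it to Msg13::getDoc() below. The helper name
// is hypothetical and the field choices are illustrative only:
static inline void initMsg13Request ( Msg13Request *r ,
				      char *url , int32_t urlLen ) {
	r->reset();                   // zero it and set the -1 defaults
	r->ptr_url       = url;       // NULL-terminated url
	r->size_url      = urlLen + 1;// includes the \0 termination
	r->m_niceness    = 1;         // low priority (background) spidering
	r->m_maxCacheAge = 0;         // 0 = bypass the cache (see top comment)
}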
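// A minimal sketch, not from the original source, of the idea behind
// m_forwardDownloadRequest above: map a url's ip to a single host id so
// the same host always downloads from a given ip address and webmasters
// only ever see one of our ips. The helper is hypothetical; the real
// selection logic lives in Msg13.cpp.
static inline int32_t pickDownloadHostId ( int32_t urlIp , int32_t numHosts ) {
	// the same ip always hashes to the same host id
	return (int32_t)((uint32_t)urlIp % (uint32_t)numHosts);
}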
class Msg13 {
public:
Msg13() ;
~Msg13();
void reset() ;
// register our request handler with g_udpServer (called by main.cpp)
static bool registerHandler();
static class RdbCache *getHttpCacheRobots();
static class RdbCache *getHttpCacheOthers();
bool getDoc ( Msg13Request *r ,
bool isTestColl ,
void *state ,
void (*callback)(void *state) );
bool forwardRequest();
bool gotForwardedReply ( class UdpSlot *slot );
bool gotFinalReply ( char *reply, int32_t replySize, int32_t replyAllocSize);
// keep public so wrappers can access
void *m_state;
void (* m_callback) (void *state );
// we now store the uncompressed http reply in here
char *m_replyBuf;
int32_t m_replyBufSize;
int32_t m_replyBufAllocSize;
// point to it
Msg13Request *m_request;
//char m_tmpBuf32[32];
};
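// Example usage of getDoc(), a minimal sketch with hypothetical names.
// Following the usual gigablast convention it returns false if it
// blocked, in which case the callback fires once the reply is in:
//
//   static Msg13 s_msg13;
//   static Msg13Request s_request;
//
//   static void gotDocWrapper ( void *state ) {
//       Msg13 *m = (Msg13 *)state;
//       // m->m_replyBuf/m->m_replyBufSize hold the uncompressed reply
//   }
//
//   initMsg13Request ( &s_request , url , gbstrlen(url) );
//   if ( ! s_msg13.getDoc ( &s_request, false, &s_msg13, gotDocWrapper ) )
//       return false; // blocked; gotDocWrapper will be called later
//   // returned true: the reply was available immediately (e.g. cached)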
bool getTestSpideredDate ( Url *u , int32_t *origSpideredDate , char *testDir ) ;
bool addTestSpideredDate ( Url *u , int32_t spideredTime , char *testDir ) ;
extern RdbCache s_hammerCache;
#endif