// Matt Wells, copyright Oct 2001

// . ask another host to download a url for you
// . the remote host will also use a cache if m_maxCacheAge > 0
// . used for downloading and caching robots.txt
// . if m_compressReply is true then the host compressed the http reply
//   before sending it back to you via udp

#ifndef _MSG13_H_
#define _MSG13_H_

#include <string.h> // memset(), used by Msg13Request::reset()
#include "Url.h"    // MAX_URL_LEN

void resetMsg13Caches ( ) ;

extern char *g_fakeReply;

class Msg13Request {
public:
	// the top portion of Msg13Request, up to but not including
	// m_lastHack, is sent to handleRequest54() in SpiderProxy.cpp to
	// get and return proxies, as well as to ban proxies
	long getProxyRequestSize() {
		return (char *)&m_lastHack - (char *)this; };
	long  m_urlIp;
	long  m_lbId; // loadbucket id
	// the http proxy to use for the download
	long  m_proxyIp;
	short m_proxyPort;
	long  m_banProxyIp;
	short m_banProxyPort;
	char  m_opCode;
	char  m_lastHack;

	// not part of the proxy request, but set from the ProxyReply:
	long  m_numBannedProxies;
	// . if using proxies, how many proxies have we tried to download
	//   this url through?
	// . used internally in Msg13.cpp
	long  m_proxyTries;
	// if using proxies, did host #0 tell us there were more to try if
	// this one did not work out?
	bool  m_hasMoreProxiesToTry;

	// we call this function after the imposed crawl-delay is over
	void (*m_hammerCallback)(class Msg13Request *r);

	long long m_urlHash48;
	long      m_firstIp;

	// a tmp hack var referencing into the url buffer (ptr_url) below
	char *m_proxiedUrl;
	long  m_proxiedUrlLen;

	char  m_niceness;
	long  m_ifModifiedSince;
	long  m_maxCacheAge;
	long  m_maxTextDocLen;  // -1 means no limit
	long  m_maxOtherDocLen; // -1 means no limit
	// in milliseconds. use -1 if none or unknown.
	long  m_crawlDelayMS;

	// link in the hammer queue linked list
	class Msg13Request *m_nextLink;

	// if doing spider compression, compute the contentHash32 of the
	// downloaded document, and if it matches this then send back
	// EDOCUNCHANGED
	long  m_contentHash32;

	// copy of CollectionRec::m_customCrawl: 0 or 1 for crawls, 2 for
	// bulk jobs
	char  m_isCustomCrawl;

	// send back the error ENOGOODDATE if the doc does not have a good
	// date. but if m_harvestLinksIfNoGoodDate is true, just send back
	// a filtered list of its links.
	long  m_requireGoodDate:1;
	long  m_harvestLinksIfNoGoodDate:1;
	long  m_compressReply:1;
	long  m_useCompressionProxy:1;

	// if m_forwardDownloadRequest is true then we pick the host to
	// download this url based on the IP address, the idea being that
	// only one host is responsible for downloading from a particular
	// ip address. this keeps webmasters happier because they can block
	// us by blocking just one ip address, and it makes it easier for
	// them to analyze their web logs.
	long  m_forwardDownloadRequest:1;

	long  m_isScraping:1;

	// does the url end in /robots.txt ?
	long  m_isRobotsTxt:1;

	// should we call getTestDoc()/addTestDoc() like for the "test"
	// coll and for Test.cpp?
	long  m_useTestCache:1;
	long  m_addToTestCache:1;

	long  m_skipHammerCheck:1;
	long  m_attemptedIframeExpansion:1;
	long  m_crawlDelayFromEnd:1;
	long  m_forEvents:1;

	// does m_url represent a FULL http request mime and NOT just a
	// url? this happens when gigablast is being used like a squid
	// proxy.
	long  m_isSquidProxiedUrl:1;

	long  m_foundInCache:1;
	long  m_forceUseFloaters:1;

	//long m_testParserEnabled:1;
	//long m_testSpiderEnabled:1;
	//long m_isPageParser:1;
	//long m_isPageInject:1;

	// if we just end up calling HttpServer::getDoc() via calling
	// downloadDoc() then we set this for callback purposes
	class Msg13 *m_parent;

	// on the other hand, if we are called indirectly by
	// handleRequest13() then we set m_udpSlot.
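	// (presumably m_tcpSocket plays the analogous role when the
	// request arrives over tcp, e.g. the squid proxy case flagged
	// by m_isSquidProxiedUrl above)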
	class UdpSlot   *m_udpSlot;
	class TcpSocket *m_tcpSocket;

	// used for addTestDoc() and caching. msg13 sets this.
	long long m_urlHash64;
	long      m_spideredTime;

	// used for caching (and for the request table / wait-in-line table)
	long long m_cacheKey;

	char m_testDir[32];

	// msg13 sets this too, so you don't have to worry about setting it
	//long m_urlLen; // includes \0 termination
	//char m_url[MAX_URL_LEN+1];

	char *ptr_url;
	char *ptr_cookie;

	long  size_url;
	long  size_cookie;

	// string buf for the deserializeMsg() function
	char  m_buf[0];

	long getSize() {
		return ((char *)ptr_url - (char *)this) +
			size_url + size_cookie; };

	// zero it all out
	void reset() {
		//memset ( this , 0 , (char *)m_url - (char *)this + 1 );
		memset ( this , 0 , sizeof(Msg13Request) );
		m_maxTextDocLen  = -1; // no limit
		m_maxOtherDocLen = -1; // no limit
		m_crawlDelayMS   = -1; // unknown or none
	};
};

class Msg13 {
public:
	Msg13  () ;
	~Msg13 () ;

	void reset() ;

	// register our request handler with g_udpServer (called by main.cpp)
	static bool registerHandler();

	static class RdbCache *getHttpCacheRobots();
	static class RdbCache *getHttpCacheOthers();

	// returns false if it blocked and will call "callback" later,
	// returns true otherwise
	bool getDoc ( Msg13Request *r ,
		      bool isTestColl ,
		      void *state ,
		      void (*callback)(void *state) );

	bool forwardRequest();

	bool gotForwardedReply ( class UdpSlot *slot );
	bool gotFinalReply ( char *reply ,
			     long  replySize ,
			     long  replyAllocSize );

	// keep these public so wrappers can access them
	void *m_state;
	void (*m_callback)(void *state);

	// we now store the uncompressed http reply in here
	char *m_replyBuf;
	long  m_replyBufSize;
	long  m_replyBufAllocSize;

	// points to the request we are serving
	Msg13Request *m_request;

	//char m_tmpBuf32[32];
};

bool getTestSpideredDate ( Url *u ,
			   long *origSpideredDate ,
			   char *testDir ) ;

bool addTestSpideredDate ( Url *u ,
			   long  spideredTime ,
			   char *testDir ) ;

extern RdbCache s_hammerCache;

#endif
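// A minimal usage sketch, inferred from the declarations above rather than
// taken from the gb source; "s_msg13", "s_state", "gotDocWrapper" and
// "downloadUrl" are hypothetical caller-side names:
//
//	static Msg13  s_msg13;
//	static void  *s_state = NULL;
//
//	static void gotDocWrapper ( void *state ) {
//		// the uncompressed http reply is now in s_msg13.m_replyBuf,
//		// s_msg13.m_replyBufSize bytes long
//	}
//
//	bool downloadUrl ( char *url , long urlIp ) {
//		// static because the request must persist until the callback
//		static Msg13Request r;
//		r.reset();              // also sets doc-len limits to -1
//		r.ptr_url  = url;       // url buffer must outlive the request
//		r.size_url = strlen(url) + 1; // include the \0
//		r.m_urlIp  = urlIp;
//		r.m_maxCacheAge = 3600; // cache age limit (assuming seconds)
//		// false means we blocked; gotDocWrapper will be called later
//		return s_msg13.getDoc ( &r, false, s_state, gotDocWrapper );
//	}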