// open-source-search-engine/Msg13.h

// Matt Wells, copyright Oct 2001

// . ask another host to download a url for you
// . the remote host will also use a cache if m_maxCacheAge > 0
// . used for downloading and caching robots.txt
// . if m_compressReply then the host compressed the http reply before
//   sending it back to you via udp

#ifndef _MSG13_H_
#define _MSG13_H_

#include "Url.h" // MAX_URL_LEN

void resetMsg13Caches ( ) ;

extern char *g_fakeReply;
class Msg13Request {
public:

	// the top portion of Msg13Request is sent to handleRequest54()
	// in SpiderProxy.cpp to get and return proxies, as well as to
	// ban proxies.
	long getProxyRequestSize() { return (char *)&m_lastHack-(char *)this;};
	long m_urlIp;
	long m_lbId; // loadbucket id
	// the http proxy to use to download
	long m_proxyIp;
	short m_proxyPort;
	long m_banProxyIp;
	short m_banProxyPort;
	char m_opCode;
	char m_lastHack;
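	// . illustrative sketch (not in the original header): because the
	//   proxy fields above sit at the very front of the struct, the
	//   bytes handed to handleRequest54() are just this prefix, i.e.
	//   something along the lines of
	//
	//       Msg13Request r;
	//       r.reset();
	//       r.m_urlIp  = urlIp;       // ip we want a proxy for
	//       r.m_opCode = OP_GETPROXY; // opcode name/value assumed here
	//       long prefixLen = r.getProxyRequestSize();
	//       // send only the first prefixLen bytes of &r to host #0 via
	//       // udp (the actual send call lives in Msg13.cpp/SpiderProxy.cpp)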
	// not part of the proxy request, but set from ProxyReply:
	long m_numBannedProxies;
	// . if using proxies, how many proxies have we tried to download
	//   this url through
	// . used internally in Msg13.cpp
	long m_proxyTries;
	// if using proxies, did host #0 tell us there were more to try if
	// this one did not work out?
	bool m_hasMoreProxiesToTry;
	// we call this function after the imposed crawl-delay is over
	void (*m_hammerCallback)(class Msg13Request *r);
	long long m_urlHash48;
	long m_firstIp;

	// a tmp hack var referencing into m_url[] below
	char *m_proxiedUrl;
	long m_proxiedUrlLen;

	char m_niceness;
	long m_ifModifiedSince;
	long m_maxCacheAge;
	long m_maxTextDocLen;
	long m_maxOtherDocLen;
	// in milliseconds. use -1 if none or unknown.
	long m_crawlDelayMS;
	// for linked list, this is the hammer queue
	class Msg13Request *m_nextLink;
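	// . illustrative sketch (not copied from Msg13.cpp): m_nextLink
	//   chains waiting requests into the hammer queue, and once the
	//   crawl delay for a request's ip has elapsed its m_hammerCallback
	//   is fired, roughly:
	//
	//       // "head", "now" and lastHitTime() (the last download time
	//       // for an ip, e.g. kept in s_hammerCache declared at the
	//       // bottom of this file) are assumptions in this sketch
	//       for ( Msg13Request *r = head ; r ; r = r->m_nextLink )
	//               if ( now - lastHitTime ( r->m_firstIp ) >=
	//                    r->m_crawlDelayMS )
	//                       r->m_hammerCallback ( r );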
	// if doing spider compression, compute contentHash32 of document
	// downloaded, and if it matches this then send back EDOCUNCHANGED
	long m_contentHash32;
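	// . illustrative sketch (assumed, not copied from Msg13.cpp): after
	//   the handling host downloads the document it can do roughly
	//
	//       long h = computeContentHash32 ( content , contentLen );
	//       if ( r->m_contentHash32 && h == r->m_contentHash32 )
	//               // reply with the error code instead of the doc
	//               return sendErrorReply ( r , EDOCUNCHANGED );
	//
	//   computeContentHash32() and sendErrorReply() are hypothetical
	//   helper names used only for this sketch.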
	// copy of CollectionRec::m_customCrawl: 0 or 1 for crawls,
	// 2 for bulk jobs
	char m_isCustomCrawl;

	// send back error ENOGOODDATE if it does not have one. but if
	// harvestLinks is true, just send back a filtered list of links
	long m_requireGoodDate:1;
	long m_harvestLinksIfNoGoodDate:1;
	long m_compressReply:1;
	long m_useCompressionProxy:1;
	// if m_forwardDownloadRequest is true then we pick the host to
	// download this url based on the IP address, the idea being that
	// only one host is responsible for downloading from a particular
	// ip address. this keeps webmasters happier so they can block us
	// by just blocking one ip address. and it makes it easier for them
	// to analyze their web logs. (see the illustrative sketch after
	// these bit flags.)
	long m_forwardDownloadRequest:1;
	long m_isScraping:1;
	// does url end in /robots.txt ?
	long m_isRobotsTxt:1;
	// should we call getTestDoc()/addTestDoc() like for the "test" coll
	// and for Test.cpp?
	long m_useTestCache:1;
	long m_addToTestCache:1;
	long m_skipHammerCheck:1;
	long m_attemptedIframeExpansion:1;
	long m_crawlDelayFromEnd:1;
	long m_forEvents:1;
	// does m_url represent a FULL http request mime and NOT just a url?
	// this happens when gigablast is being used like a squid proxy.
	long m_isSquidProxiedUrl:1;
	long m_foundInCache:1;
	//long m_testParserEnabled:1;
	//long m_testSpiderEnabled:1;
	//long m_isPageParser:1;
	//long m_isPageInject:1;
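	// . illustrative sketch for m_forwardDownloadRequest (assumed, not
	//   the actual Msg13.cpp logic): pick the responsible host from the
	//   url's ip so each ip is always downloaded by the same host:
	//
	//       long numHosts = g_hostdb.getNumHosts();
	//       long hostId   = ((unsigned long)r->m_urlIp) % numHosts;
	//       // forward the request to that host over udp and let it
	//       // call HttpServer::getDoc() locally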
	// if we just end up calling HttpServer::getDoc() via calling
	// downloadDoc() then we set this for callback purposes
	class Msg13 *m_parent;
	// on the other hand, if we are called indirectly by handleRequest13()
	// then we set m_udpSlot.
	class UdpSlot *m_udpSlot;
	class TcpSocket *m_tcpSocket;
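	// . illustrative sketch (assumed, not the Msg13.cpp code): whichever
	//   of these is non-NULL decides where a finished reply is routed:
	//
	//       if ( r->m_udpSlot )     // arrived via handleRequest13()
	//               sendReplyOnSlot ( r->m_udpSlot , reply , replySize );
	//       else if ( r->m_parent ) // local call through downloadDoc()
	//               r->m_parent->gotFinalReply ( reply , replySize ,
	//                                            replyAllocSize );
	//
	//   sendReplyOnSlot() is a hypothetical helper; gotFinalReply() is
	//   the member declared on class Msg13 below.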
	// used for addTestDoc() and caching. msg13 sets this
	long long m_urlHash64;
	long m_spideredTime;
	// used for caching (and for request table, wait in line table)
	long long m_cacheKey;
	char m_testDir[32];
	// msg13 sets this too, so you don't have to worry about setting it
	//long m_urlLen;
	// includes \0 termination
	//char m_url[MAX_URL_LEN+1];
	char *ptr_url;
	char *ptr_cookie;
	long size_url;
	long size_cookie;
	// string buf for deserializeMsg() function
	char m_buf[0];

	long getSize() {
		return ((char *)ptr_url-(char *)this) +size_url+size_cookie;};
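	// . illustrative sketch (assumed, not the real serialize code): the
	//   variable-length strings live right after the fixed fields, so a
	//   sender can lay the request out in one flat buffer like
	//
	//       char *p = r.m_buf;            // string area after the fields
	//       r.ptr_url     = p;
	//       r.size_url    = urlLen + 1;   // include the \0
	//       memcpy ( p , url , r.size_url );
	//       p += r.size_url;
	//       r.ptr_cookie  = p;
	//       r.size_cookie = cookieLen + 1;
	//       memcpy ( p , cookie , r.size_cookie );
	//       long totalToSend = r.getSize(); // fixed part + both strings
	//
	//   (this assumes the caller allocated enough room after the struct)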
	// zero it all out
	void reset() {
		//memset (this,0,(char *)m_url - (char *)this + 1);
		memset (this,0,sizeof(Msg13Request));
		m_maxTextDocLen = -1; // no limit
		m_maxOtherDocLen = -1; // no limit
		m_crawlDelayMS = -1; // unknown or none
	};
};
class Msg13 {
public:
	Msg13() ;
	~Msg13();
	void reset() ;
	// register our request handler with g_udpServer (called by main.cpp)
	static bool registerHandler();
	static class RdbCache *getHttpCacheRobots();
	static class RdbCache *getHttpCacheOthers();
	bool getDoc ( Msg13Request *r ,
	              bool isTestColl ,
	              void *state ,
	              void (*callback)(void *state) );
	bool forwardRequest();
	bool gotForwardedReply ( class UdpSlot *slot );
	bool gotFinalReply ( char *reply, long replySize, long replyAllocSize);
	// keep public so wrappers can access
	void *m_state;
	void (* m_callback) (void *state );
	// we now store the uncompressed http reply in here
	char *m_replyBuf;
	long m_replyBufSize;
	long m_replyBufAllocSize;
	// point to it
	Msg13Request *m_request;
	//char m_tmpBuf32[32];
};
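
// . usage sketch (illustrative only, not part of the original header):
//   a caller (e.g. XmlDoc.cpp) would typically fill in a Msg13Request and
//   hand it to Msg13::getDoc(), supplying a callback that fires when the
//   reply (stored in m_replyBuf) is ready. the field values, helper names
//   and the callback below are assumptions for this sketch, not code
//   taken from the repo.
#if 0
static void gotDocWrapper ( void *state ) {
	Msg13 *m = (Msg13 *)state;
	// the uncompressed http reply is now in m->m_replyBuf and is
	// m->m_replyBufSize bytes long
}

static bool downloadUrl ( Msg13 *m , Msg13Request *r , char *url , long ip ) {
	r->reset();
	r->ptr_url       = url;
	r->size_url      = gbstrlen(url) + 1; // include the \0
	r->m_urlIp       = ip;
	r->m_maxCacheAge = 3600; // serve a cached copy if young enough
	                         // (units assumed to be seconds)
	// getDoc() returns false if it blocked; gotDocWrapper() will be
	// called when the reply comes in
	return m->getDoc ( r , false /*isTestColl*/ , m , gotDocWrapper );
}
#endif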
bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) ;
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) ;
extern RdbCache s_hammerCache;
#endif