// open-source-search-engine/Msg13.h

// Matt Wells, copyright Oct 2001

// . ask another host to download a url for you
// . the remote host will also use a cache if m_maxCacheAge > 0
// . used for downloading and caching robots.txt
// . if m_compressReply then the host compressed the http reply before
//   sending it back to you via udp

#ifndef _MSG13_H_
#define _MSG13_H_

#include "Url.h" // MAX_URL_LEN

void resetMsg13Caches ( ) ;

extern char *g_fakeReply;
class Msg13Request {
public:

	// the top portion of Msg13Request is sent to handleRequest54()
	// in SpiderProxy.cpp to get and return proxies, as well as to
	// ban proxies.
	long getProxyRequestSize() { return (char *)&m_lastHack-(char *)this;};
	long m_urlIp;
	long m_lbId; // loadbucket id
	// the http proxy to use to download
	long m_proxyIp;
	short m_proxyPort;
	long m_banProxyIp;
	short m_banProxyPort;
	char m_opCode;
	char m_lastHack;
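	// . illustrative sketch (not in the original header): because the
	//   proxy fields above sit at the very front of the struct, the
	//   bytes handed to handleRequest54() are just this prefix, i.e.
	//   something along the lines of
	//
	//       Msg13Request r;
	//       r.reset();
	//       r.m_urlIp  = urlIp;       // ip we want a proxy for
	//       r.m_opCode = OP_GETPROXY; // opcode name/value assumed here
	//       long prefixLen = r.getProxyRequestSize();
	//       // send only the first prefixLen bytes of &r to host #0 via
	//       // udp (the actual send call lives in Msg13.cpp/SpiderProxy.cpp)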
	// not part of the proxy request, but set from ProxyReply:
	long m_numBannedProxies;
	// . if using proxies, how many proxies have we tried to download
	//   this url through
	// . used internally in Msg13.cpp
	long m_proxyTries;
	// if using proxies, did host #0 tell us there were more to try if
	// this one did not work out?
	bool m_hasMoreProxiesToTry;
	// we call this function after the imposed crawl-delay is over
	void (*m_hammerCallback)(class Msg13Request *r);
	long long m_urlHash48;
	long m_firstIp;

	// a tmp hack var referencing into m_url[] below
	char *m_proxiedUrl;
	long m_proxiedUrlLen;

	char m_niceness;
	long m_ifModifiedSince;
	long m_maxCacheAge;
	long m_maxTextDocLen;
	long m_maxOtherDocLen;
	// in milliseconds. use -1 if none or unknown.
	long m_crawlDelayMS;
	// for linked list, this is the hammer queue
	class Msg13Request *m_nextLink;
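	// . illustrative sketch (not copied from Msg13.cpp): m_nextLink
	//   chains waiting requests into the hammer queue, and once the
	//   crawl delay for a request's ip has elapsed its m_hammerCallback
	//   is fired, roughly:
	//
	//       // "head", "now" and lastHitTime() (the last download time
	//       // for an ip, e.g. kept in s_hammerCache declared at the
	//       // bottom of this file) are assumptions in this sketch
	//       for ( Msg13Request *r = head ; r ; r = r->m_nextLink )
	//               if ( now - lastHitTime ( r->m_firstIp ) >=
	//                    r->m_crawlDelayMS )
	//                       r->m_hammerCallback ( r );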
	// if doing spider compression, compute contentHash32 of document
	// downloaded, and if it matches this then send back EDOCUNCHANGED
	long m_contentHash32;
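	// . illustrative sketch (assumed, not copied from Msg13.cpp): after
	//   the handling host downloads the document it can do roughly
	//
	//       long h = computeContentHash32 ( content , contentLen );
	//       if ( r->m_contentHash32 && h == r->m_contentHash32 )
	//               // reply with the error code instead of the doc
	//               return sendErrorReply ( r , EDOCUNCHANGED );
	//
	//   computeContentHash32() and sendErrorReply() are hypothetical
	//   helper names used only for this sketch.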
	// copy of CollectionRec::m_customCrawl: 0 or 1 for crawls,
	// 2 for bulk jobs
	char m_isCustomCrawl;

	// send back error ENOGOODDATE if it does not have one. but if
	// harvestLinks is true, just send back a filtered list of links
	long m_requireGoodDate:1;
	long m_harvestLinksIfNoGoodDate:1;
	long m_compressReply:1;
	long m_useCompressionProxy:1;
	// if m_forwardDownloadRequest is true then we pick the host to
	// download this url based on the IP address, the idea being that
	// only one host is responsible for downloading from a particular
	// ip address. this keeps webmasters happier so they can block us
	// by just blocking one ip address. and it makes it easier for them
	// to analyze their web logs. (see the illustrative sketch after
	// these bit flags.)
	long m_forwardDownloadRequest:1;
	long m_isScraping:1;
	// does url end in /robots.txt ?
	long m_isRobotsTxt:1;
	// should we call getTestDoc()/addTestDoc() like for the "test" coll
	// and for Test.cpp?
	long m_useTestCache:1;
	long m_addToTestCache:1;
	long m_skipHammerCheck:1;
	long m_attemptedIframeExpansion:1;
	long m_crawlDelayFromEnd:1;
	long m_forEvents:1;
	// does m_url represent a FULL http request mime and NOT just a url?
	// this happens when gigablast is being used like a squid proxy.
	long m_isSquidProxiedUrl:1;
	long m_foundInCache:1;
	//long m_testParserEnabled:1;
	//long m_testSpiderEnabled:1;
	//long m_isPageParser:1;
	//long m_isPageInject:1;
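	// . illustrative sketch for m_forwardDownloadRequest (assumed, not
	//   the actual Msg13.cpp logic): pick the responsible host from the
	//   url's ip so each ip is always downloaded by the same host:
	//
	//       long numHosts = g_hostdb.getNumHosts();
	//       long hostId   = ((unsigned long)r->m_urlIp) % numHosts;
	//       // forward the request to that host over udp and let it
	//       // call HttpServer::getDoc() locally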
	// if we just end up calling HttpServer::getDoc() via calling
	// downloadDoc() then we set this for callback purposes
	class Msg13 *m_parent;
	// on the other hand, if we are called indirectly by handleRequest13()
	// then we set m_udpSlot.
	class UdpSlot *m_udpSlot;
	class TcpSocket *m_tcpSocket;
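	// . illustrative sketch (assumed, not the Msg13.cpp code): whichever
	//   of these is non-NULL decides where a finished reply is routed:
	//
	//       if ( r->m_udpSlot )     // arrived via handleRequest13()
	//               sendReplyOnSlot ( r->m_udpSlot , reply , replySize );
	//       else if ( r->m_parent ) // local call through downloadDoc()
	//               r->m_parent->gotFinalReply ( reply , replySize ,
	//                                            replyAllocSize );
	//
	//   sendReplyOnSlot() is a hypothetical helper; gotFinalReply() is
	//   the member declared on class Msg13 below.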
	// used for addTestDoc() and caching. msg13 sets this
	long long m_urlHash64;
	long m_spideredTime;
	// used for caching (and for request table, wait in line table)
	long long m_cacheKey;
	char m_testDir[32];
	// msg13 sets this too, so you don't have to worry about setting it
	//long m_urlLen;
	// includes \0 termination
	//char m_url[MAX_URL_LEN+1];
	char *ptr_url;
	char *ptr_cookie;
	long size_url;
	long size_cookie;
	// string buf for deserializeMsg() function
	char m_buf[0];

	long getSize() {
		return ((char *)ptr_url-(char *)this) +size_url+size_cookie;};
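	// . illustrative sketch (assumed, not the real serialize code): the
	//   variable-length strings live right after the fixed fields, so a
	//   sender can lay the request out in one flat buffer like
	//
	//       char *p = r.m_buf;            // string area after the fields
	//       r.ptr_url     = p;
	//       r.size_url    = urlLen + 1;   // include the \0
	//       memcpy ( p , url , r.size_url );
	//       p += r.size_url;
	//       r.ptr_cookie  = p;
	//       r.size_cookie = cookieLen + 1;
	//       memcpy ( p , cookie , r.size_cookie );
	//       long totalToSend = r.getSize(); // fixed part + both strings
	//
	//   (this assumes the caller allocated enough room after the struct)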
	// zero it all out
	void reset() {
		//memset (this,0,(char *)m_url - (char *)this + 1);
		memset (this,0,sizeof(Msg13Request));
		m_maxTextDocLen = -1; // no limit
		m_maxOtherDocLen = -1; // no limit
		m_crawlDelayMS = -1; // unknown or none
	};
};
class Msg13 {
public:
	Msg13() ;
	~Msg13();
	void reset() ;
	// register our request handler with g_udpServer (called by main.cpp)
	static bool registerHandler();
	static class RdbCache *getHttpCacheRobots();
	static class RdbCache *getHttpCacheOthers();
	bool getDoc ( Msg13Request *r ,
	              bool isTestColl ,
	              void *state ,
	              void (*callback)(void *state) );
	bool forwardRequest();
	bool gotForwardedReply ( class UdpSlot *slot );
	bool gotFinalReply ( char *reply, long replySize, long replyAllocSize);
	// keep public so wrappers can access
	void *m_state;
	void (* m_callback) (void *state );
	// we now store the uncompressed http reply in here
	char *m_replyBuf;
	long m_replyBufSize;
	long m_replyBufAllocSize;
	// point to it
	Msg13Request *m_request;
	//char m_tmpBuf32[32];
};
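
// . usage sketch (illustrative only, not part of the original header):
//   a caller (e.g. XmlDoc.cpp) would typically fill in a Msg13Request and
//   hand it to Msg13::getDoc(), supplying a callback that fires when the
//   reply (stored in m_replyBuf) is ready. the field values, helper names
//   and the callback below are assumptions for this sketch, not code
//   taken from the repo.
#if 0
static void gotDocWrapper ( void *state ) {
	Msg13 *m = (Msg13 *)state;
	// the uncompressed http reply is now in m->m_replyBuf and is
	// m->m_replyBufSize bytes long
}

static bool downloadUrl ( Msg13 *m , Msg13Request *r , char *url , long ip ) {
	r->reset();
	r->ptr_url       = url;
	r->size_url      = gbstrlen(url) + 1; // include the \0
	r->m_urlIp       = ip;
	r->m_maxCacheAge = 3600; // serve a cached copy if young enough
	                         // (units assumed to be seconds)
	// getDoc() returns false if it blocked; gotDocWrapper() will be
	// called when the reply comes in
	return m->getDoc ( r , false /*isTestColl*/ , m , gotDocWrapper );
}
#endif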
bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) ;
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) ;
extern RdbCache s_hammerCache;
#endif