// mirror of https://github.com/gigablast/open-source-search-engine.git
// synced 2024-10-04 20:27:43 +03:00
#include "gb-include.h"
|
|
|
|
#include "Msg13.h"
|
|
#include "UdpServer.h"
|
|
#include "HttpServer.h"
|
|
#include "Stats.h"
|
|
#include "HashTableX.h"
|
|
#include "XmlDoc.h"
|
|
#include "Test.h"
|
|
#include "Speller.h"

long convertIntoLinks ( char *reply , long replySize ) ;
long filterRobotsTxt ( char *reply , long replySize , HttpMime *mime ,
		       long niceness , char *userAgent , long uaLen ) ;
bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts );
void gotIframeExpandedContent ( void *state ) ;

void scanHammerQueue ( int fd , void *state );
void downloadTheDocForReals ( Msg13Request *r ) ;

// utility functions
bool getTestSpideredDate ( Url *u , long *origSpiderDate , char *testDir ) ;
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) ;
bool getTestDoc ( char *u , class TcpSocket *ts , Msg13Request *r );
bool addTestDoc ( long long urlHash64 , char *httpReply , long httpReplySize ,
		  long err , Msg13Request *r ) ;

static void gotForwardedReplyWrapper ( void *state , UdpSlot *slot ) ;
static void handleRequest13 ( UdpSlot *slot , long niceness ) ;
//static bool downloadDoc ( UdpSlot *slot, Msg13Request* r ) ;
static void gotHttpReply ( void *state , TcpSocket *ts ) ;
static void gotHttpReply2 ( void *state ,
			    char *reply ,
			    long replySize ,
			    long replyAllocSize ,
			    TcpSocket *ts ) ;
static void passOnReply ( void *state , UdpSlot *slot ) ;

bool hasIframe ( char *reply, long replySize , long niceness );
long hasGoodDates ( char *content,
		    long contentLen,
		    Xml *xml,
		    Words *words,
		    char ctype,
		    long niceness );
char getContentTypeQuick ( HttpMime *mime, char *reply, long replySize ,
			   long niceness ) ;
long convertIntoLinks ( char *reply, long replySize , Xml *xml ,
			long niceness ) ;

// cache for robots.txt pages
static RdbCache s_httpCacheRobots;
// cache for other pages
static RdbCache s_httpCacheOthers;
// queue up identical requests
static HashTableX s_rt;
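
// Editor's note: s_rt ("wait in line" table) is keyed by the request's
// 64-bit cache key and stores pointers to the waiting Msg13Requests, so
// identical urls being fetched at the same time share one physical
// download (see downloadTheDocForReals() and the reply loop in
// gotHttpReply2()).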

void resetMsg13Caches ( ) {
	s_httpCacheRobots.reset();
	s_httpCacheOthers.reset();
	s_rt.reset();
}

RdbCache *Msg13::getHttpCacheRobots() { return &s_httpCacheRobots; }
RdbCache *Msg13::getHttpCacheOthers() { return &s_httpCacheOthers; }

Msg13::Msg13() {
	m_replyBuf = NULL;
}

Msg13::~Msg13() {
	reset();
}

void Msg13::reset() {
	if (m_replyBuf) mfree(m_replyBuf,m_replyBufAllocSize,"msg13rb");
	m_replyBuf = NULL;
}

bool Msg13::registerHandler ( ) {
	// . register ourselves with the udp server
	// . it calls our callback when it receives a msg of type 0x13
	if ( ! g_udpServer.registerHandler ( 0x13, handleRequest13 ))
		return false;

	// use 3MB per cache
	long memRobots = 3000000;
	long memOthers = 2000000;
	// assume 15k avg cache file
	long maxCacheNodesRobots = memRobots / 106;
	long maxCacheNodesOthers = memOthers / (10*1024);

	if ( ! s_httpCacheRobots.init ( memRobots ,
					-1    , // fixedDataSize
					false , // lists of recs?
					maxCacheNodesRobots ,
					false , // use half keys
					"robots.txt" , // dbname
					true )) // save to disk
		return false;

	if ( ! s_httpCacheOthers.init ( memOthers ,
					-1    , // fixedDataSize
					false , // lists of recs?
					maxCacheNodesOthers ,
					false , // use half keys
					"htmlPages" , // dbname
					true )) // save to disk
		return false;

	// . set up the request table (aka wait in line table)
	// . allowDups = "true"
	if ( ! s_rt.set ( 8 , 4 , 0 , NULL , 0 , true,0,"wait13tbl") )
		return false;

	if ( ! g_loop.registerSleepCallback(10,NULL,scanHammerQueue) )
		return log("build: Failed to register timer callback for "
			   "hammer queue.");

	// success
	return true;
}
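
// Editor's note: a minimal, illustrative call pattern for Msg13::getDoc()
// (the real callers live elsewhere, e.g. in XmlDoc.cpp; "gotDocWrapper" is
// a hypothetical callback name). "r" is a Msg13Request whose m_url,
// m_urlIp and m_firstIp the caller has already filled in:
//
//	static void gotDocWrapper ( void *state ) {
//		// download finished; inspect msg13.m_replyBuf here
//	}
//	...
//	if ( ! msg13.getDoc ( r , false , state , gotDocWrapper ) )
//		return false; // blocked, gotDocWrapper() fires later
//	// did not block: m_replyBuf (or g_errno) is already set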

// . returns false if blocked, returns true otherwise
// . returns true and sets g_errno on error
bool Msg13::getDoc ( Msg13Request *r,
		     bool isTestColl ,
		     void *state,void(*callback)(void *state)){

	// reset in case we are being reused
	reset();

	/*
	char buf[1024];
	char *s = "<td class=\"smallfont\" align=\"right\">November 14th, 2011 10:06 AM</td>\r\n\t\t";
	strcpy(buf,s);
	Xml xml;
	long status = hasGoodDates ( buf ,
				     gbstrlen(buf),
				     &xml ,
				     CT_HTML,
				     0 );
	*/

	// set these even though we are not doing events, so we can use
	// the event spider proxies on scproxy3
	r->m_requireGoodDate = 0;
	r->m_harvestLinksIfNoGoodDate = 1;

	m_state    = state;
	m_callback = callback;

	m_request = r;
	// sanity check
	if ( r->m_urlIp ==  0 ) { char *xx = NULL; *xx = 0; }
	if ( r->m_urlIp == -1 ) { char *xx = NULL; *xx = 0; }

	// set this
	r->m_urlLen    = gbstrlen ( r->m_url );
	r->m_urlHash64 = hash64 ( r->m_url , r->m_urlLen );

	// sanity check, if spidering the test coll make sure one of
	// these is true!! this prevents us from mistakenly turning it off
	// and not using the doc cache on disk like we should
	if ( isTestColl &&
	     ! r->m_testDir[0] &&
	     //! g_conf.m_testSpiderEnabled &&
	     //! g_conf.m_testParserEnabled &&
	     //! r->m_isPageParser &&
	     r->m_useTestCache ) {
		char *xx=NULL;*xx=0; }

	//r->m_testSpiderEnabled = (bool)g_conf.m_testSpiderEnabled;
	//r->m_testParserEnabled = (bool)g_conf.m_testParserEnabled;
	// but default to parser dir if we are the test coll so that
	// the [analyze] link works!
	//if ( isTestColl && ! r->m_testSpiderEnabled )
	//	r->m_testParserEnabled = true;

	// is this a /robots.txt url?
	if ( r->m_urlLen > 12 &&
	     ! strncmp ( r->m_url + r->m_urlLen - 11,"/robots.txt",11))
		r->m_isRobotsTxt = true;

	// force caching if getting robots.txt so is compressed in cache
	if ( r->m_isRobotsTxt )
		r->m_compressReply = true;

	// do not get .google.com/ crap
	//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }

	// set it for this too
	//if ( g_conf.m_useCompressionProxy ) {
	//	r->m_useCompressionProxy = true;
	//	r->m_compressReply       = true;
	//}

	// make the cache key
	r->m_cacheKey = r->m_urlHash64;
	// a compressed reply is different than a non-compressed reply
	if ( r->m_compressReply ) r->m_cacheKey ^= 0xff;

	// always forward these so we can use the robots.txt cache
	if ( r->m_isRobotsTxt ) r->m_forwardDownloadRequest = true;

	// always forward for now until things work better!
	r->m_forwardDownloadRequest = true;

	// download it ourselves rather than forward it off to another host?
	//if ( r->m_forwardDownloadRequest ) return forwardRequest ( );

	return forwardRequest ( );

	// gotHttpReply() and passOnReply() call our Msg13::gotDocReply*()
	// functions if Msg13Request::m_parent is non-NULL
	//r->m_parent = this;

	// . returns false if blocked, etc.
	// . if this doesn't block it calls getFinalReply()
	//return downloadDoc ( NULL , r ) ;
}
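
// Editor's note: forwardRequest() below picks the target host purely from
// the url's firstIp (hash32h(firstIp>>8) % numHosts), so every url on the
// same IP lands on the same host. That is what lets handleRequest13() do
// its per-IP crawl-delay bookkeeping in a host-local s_hammerCache.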

bool Msg13::forwardRequest ( ) {

	// shortcut
	Msg13Request *r = m_request;

	//
	// forward this request to the host responsible for this url's ip
	//
	long nh     = g_hostdb.m_numHosts;
	long hostId = hash32h(((unsigned long)r->m_firstIp >> 8), 0) % nh;
	// get host to send to from hostId
	Host *h = NULL;
	// . pick first alive host, starting with "hostId" as the hostId
	// . if all dead, send to the original and we will timeout > 200 secs
	for ( long count = 0 ; count <= nh ; count++ ) {
		// get that host
		//h = g_hostdb.getProxy ( hostId );;
		h = g_hostdb.getHost ( hostId );
		// stop if he is alive
		if ( ! g_hostdb.isDead ( h ) ) break;
		// get the next otherwise
		if ( ++hostId >= nh ) hostId = 0;
	}

	// forward it to self if we are the spider proxy!!!
	if ( g_hostdb.m_myHost->m_isProxy )
		h = g_hostdb.m_myHost;

	// log it
	if ( g_conf.m_logDebugSpider )
		logf ( LOG_DEBUG,
		       "spider: sending download request of %s firstIp=%s "
		       "uh48=%llu to "
		       "host %li (child=%li)", r->m_url, iptoa(r->m_firstIp),
		       r->m_urlHash48, hostId,
		       r->m_skipHammerCheck);


	// fill up the request
	long requestSize = r->getSize();

	// . otherwise, send the request to the key host
	// . returns false and sets g_errno on error
	// . now wait for 2 minutes before timing out
	if ( ! g_udpServer.sendRequest ( (char *)r ,
					 requestSize ,
					 0x13 , // msgType 0x13
					 h->m_ip ,
					 h->m_port ,
					 // it was not using the proxy! because
					 // it thinks the hostid #0 is not
					 // the proxy... b/c ninad screwed that
					 // up by giving proxies the same ids
					 // as regular hosts!
					 -1 , // h->m_hostId ,
					 NULL ,
					 this , // state data
					 gotForwardedReplyWrapper ,
					 200 )){ // 200 sec timeout
		// sanity check
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		// report it
		log("spider: msg13 request: %s",mstrerror(g_errno));
		// g_errno must be set!
		return true;
	}
	// otherwise we block
	return false;
}

void gotForwardedReplyWrapper ( void *state , UdpSlot *slot ) {
	// shortcut
	Msg13 *THIS = (Msg13 *)state;
	// return if this blocked
	if ( ! THIS->gotForwardedReply ( slot ) ) return;
	// callback
	THIS->m_callback ( THIS->m_state );
}

bool Msg13::gotForwardedReply ( UdpSlot *slot ) {
	// don't let udpserver free the request, it's our m_request[]
	slot->m_sendBufAlloc = NULL;
	// what did he give us?
	char *reply          = slot->m_readBuf;
	long  replySize      = slot->m_readBufSize;
	long  replyAllocSize = slot->m_readBufMaxSize;
	// UdpServer::makeReadBuf() sets m_readBuf to -1 when calling
	// alloc() with a zero length, so fix that
	if ( replySize == 0 ) reply = NULL;
	// this is messed up. why is it happening?
	if ( reply == (void *)-1 ) { char *xx=NULL;*xx=0; }

	// we are responsible for freeing reply now
	if ( ! g_errno ) slot->m_readBuf = NULL;

	return gotFinalReply ( reply , replySize , replyAllocSize );
}
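
// Editor's note: a compressed reply on the wire is laid out as
//
//	[ 4 bytes: uncompressed length ][ deflated payload from gbcompress() ]
//
// gotHttpReply2() on the downloading host builds that buffer and
// gotFinalReply() below undoes it with gbuncompress().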

bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){

	// how is this happening? ah from image downloads...
	if ( m_replyBuf ) { char *xx=NULL;*xx=0; }

	// assume none
	m_replyBuf     = NULL;
	m_replyBufSize = 0;

	// shortcut
	Msg13Request *r = m_request;

	//log("msg13: reply=%lx replysize=%li g_errno=%s",
	//    (long)reply,(long)replySize,mstrerror(g_errno));

	if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
		logf(LOG_DEBUG,"spider: FINALIZED %s firstIp=%s",
		     r->m_url,iptoa(r->m_firstIp));


	// . if timed out probably the host is now dead so try another one!
	// . return if that blocked
	if ( g_errno == EUDPTIMEDOUT ) {
		// try again
		log("spider: retrying1. had error for %s : %s",
		    r->m_url,mstrerror(g_errno));
		// return if that blocked
		if ( ! forwardRequest ( ) ) return false;
		// a different g_errno should be set now!
	}

	if ( g_errno ) {
		// this error msg is repeated in XmlDoc::logIt() so no need
		// for it here
		if ( g_conf.m_logDebugSpider )
			log("spider: error for %s: %s",
			    r->m_url,mstrerror(g_errno));
		return true;
	}

	// set it
	m_replyBuf          = reply;
	m_replyBufSize      = replySize;
	m_replyBufAllocSize = replyAllocSize;

	// sanity check
	if ( replySize > 0 && ! reply ) { char *xx=NULL;*xx=0; }

	// no uncompressing if reply is empty
	if ( replySize == 0 ) return true;

	// if it was not compressed we are done! no need to uncompress it
	if ( ! r->m_compressReply ) return true;

	// get uncompressed size
	uint32_t unzippedLen = *(long*)reply;
	// sanity checks
	if ( unzippedLen < 0 || unzippedLen > 10000000 ) {
		log("spider: downloaded probable corrupt gzipped doc "
		    "with unzipped len of %li",(long)unzippedLen);
		g_errno = ECORRUPTDATA;
		return true;
	}
	// make buffer to hold uncompressed data
	char *newBuf = (char*)mmalloc(unzippedLen, "Msg13Unzip");
	if( ! newBuf ) {
		g_errno = ENOMEM;
		return true;
	}
	// make another var to get mangled by gbuncompress
	unsigned long uncompressedLen = unzippedLen;
	// uncompress it
	int zipErr = gbuncompress( (unsigned char*)newBuf ,  // dst
				   &uncompressedLen ,        // dstLen
				   (unsigned char*)reply+4 , // src
				   replySize - 4 );          // srcLen
	if(zipErr != Z_OK || uncompressedLen!=(long unsigned int)unzippedLen) {
		log("spider: had error unzipping Msg13 reply. unzipped "
		    "len should be %li but is %li. ziperr=%li",
		    (long)uncompressedLen,(long)unzippedLen,(long)zipErr);
		mfree (newBuf, unzippedLen, "Msg13UnzipError");
		g_errno = ECORRUPTDATA;//EBADREPLYSIZE;
		return true;
	}
	// all http replies should end in a \0. otherwise its likely
	// a compression error. i think i saw this on roadrunner core
	// a machine once in XmlDoc.cpp because httpReply did not end in \0
	//if ( uncompressedLen>0 && newBuf[uncompressedLen-1] ) {
	//	log("spider: had http reply with no NULL term");
	//	mfree(newBuf,unzippedLen,"Msg13Null");
	//	g_errno = EBADREPLYSIZE;
	//	return true;
	//}

	// count it for stats
	g_stats.m_compressedBytesIn += replySize;

	// free compressed
	mfree ( reply , replyAllocSize ,"ufree" );

	// assign uncompressed
	m_replyBuf          = newBuf;
	m_replyBufSize      = uncompressedLen;
	m_replyBufAllocSize = unzippedLen;


	// log it for now
	if ( g_conf.m_logDebugSpider )
		log("http: got doc %s %li to %li",
		    r->m_url,(long)replySize,(long)uncompressedLen);

	return true;
}
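
// Editor's note on the hammer cache below: it maps a firstIp to the time
// (in ms) of the last download from that IP. A value of 0 means a download
// from that IP is in progress right now, and -1 (not found) means we have
// never downloaded from it. handleRequest13() uses it to enforce
// m_crawlDelayMS, parking requests on the s_hammerQueueHead/Tail linked
// list that scanHammerQueue() re-evaluates every 10ms.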

RdbCache s_hammerCache;
static bool s_flag = false;
Msg13Request *s_hammerQueueHead = NULL;
Msg13Request *s_hammerQueueTail = NULL;

// . only return false if you want slot to be nuked w/o replying
// . MUST always call g_udpServer::sendReply() or sendErrorReply()
void handleRequest13 ( UdpSlot *slot , long niceness ) {

	// cast it
	Msg13Request *r = (Msg13Request *)slot->m_readBuf;
	// use slot niceness
	r->m_niceness = niceness;
	// . sanity - otherwise xmldoc::set cores!
	// . no! sometimes the niceness gets converted!
	//if ( niceness == 0 ) { char *xx=NULL;*xx=0; }

	// make sure we do not download gigablast.com admin pages!
	if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->m_urlLen >= 7 ) {
		Url url;
		url.set ( r->m_url );
		// . never download /master urls from ips of hosts in cluster
		// . TODO: FIX! the pages might be in another cluster!
		if ( ( strncasecmp ( url.getPath() , "/master/" , 8 ) == 0 ||
		       strncasecmp ( url.getPath() , "/admin/"  , 7 ) == 0 )) {
			log("spider: Got request to download possible "
			    "gigablast control page %s. Sending back "
			    "ERESTRICTEDPAGE.",
			    url.getUrl());
			g_errno = ERESTRICTEDPAGE;
			g_udpServer.sendErrorReply(slot,g_errno);
			return;
		}
	}

	// . use a max cached age of 24hrs for robots.txt files
	// . this returns true if robots.txt file for hostname found in cache
	// . don't copy since we analyze immediately and don't block
	char *rec;
	long  recSize;
	// get the cache
	RdbCache *c = &s_httpCacheOthers;
	if ( r->m_isRobotsTxt ) c = &s_httpCacheRobots;
	// the key is just the 64 bit hash of the url
	key_t k; k.n1 = 0; k.n0 = r->m_cacheKey;
	// see if in there already
	bool inCache = c->getRecord ( (collnum_t)0 , // share btwn colls
				      k , // cacheKey
				      &rec ,
				      &recSize ,
				      true , // copy?
				      r->m_maxCacheAge , // 24*60*60 ,
				      true ); // stats?

	// . an empty rec is a cached not found (no robots.txt file)
	// . therefore it's allowed, so set *reply to 1 (true)
	if ( inCache ) {
		// helpful for debugging. even though you may see a robots.txt
		// redirect and think we are downloading that each time,
		// we are not... the redirect is cached here as well.
		//log("spider: %s was in cache",r->m_url);
		// . send the cached reply back
		// . this will free send/read bufs on completion/g_errno
		g_udpServer.sendReply_ass ( rec , recSize , rec, recSize,slot);
		return;
	}

	// log it so we can see if we are hammering
	if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
		logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
		     r->m_url,iptoa(r->m_firstIp));

	// temporary hack
	if ( r->m_parent ) { char *xx=NULL;*xx=0; }

	// assume we do not add it!
	r->m_addToTestCache = false;

	if ( ! s_flag ) {
		s_flag = true;
		s_hammerCache.init ( 5000      , // maxcachemem
				     8         , // fixed data size
				     false     , // support lists?
				     500       , // max nodes
				     false     , // use half keys?
				     "hamcache", // dbname
				     false     , // load from disk?
				     12        , // key size
				     12        , // data key size?
				     -1        );// numPtrsMax
	}

	// we skip it if its a frame page, robots.txt, root doc or some other
	// page that is a "child" page of the main page we are spidering
	if ( ! r->m_skipHammerCheck ) {
		// . make sure we are not hammering an ip
		// . returns 0 if currently downloading a url from that ip
		// . returns -1 if not found
		long long last=s_hammerCache.getLongLong(0,r->m_firstIp,
							 30,true);
		// get time now
		long long nowms = gettimeofdayInMilliseconds();
		// how long has it been since last download START time?
		long long waited = nowms - last;

		bool queueIt = false;
		if ( last > 0 && waited < r->m_crawlDelayMS ) queueIt = true;
		// a "last" of 0 means currently downloading
		if ( r->m_crawlDelayMS > 0 && last == 0LL ) queueIt = true;
		// a last of -1 means not found. so first time i guess.
		if ( last == -1 ) queueIt = false;

		// . queue it up if we haven't waited long enough
		// . then the function, scanHammerQueue(), will re-eval all
		//   the download requests in this hammer queue every 10ms.
		// . it will just lookup the lastdownload time in the cache,
		//   which will store maybe a -1 if currently downloading...
		if ( queueIt ) {
			// debug
			//log("spider: adding %s to crawldelayqueue",r->m_url);
			// save this
			r->m_udpSlot  = slot;
			r->m_nextLink = NULL;
			// add it to queue
			if ( ! s_hammerQueueHead ) {
				s_hammerQueueHead = r;
				s_hammerQueueTail = r;
			}
			else {
				s_hammerQueueTail->m_nextLink = r;
				s_hammerQueueTail = r;
			}
			return;
		}

		// if we had it in cache check the wait time
		if ( last > 0 && waited < r->m_crawlDelayMS ) {
			log("spider: hammering firstIp=%s url=%s "
			    "only waited %lli ms of %li ms",
			    iptoa(r->m_firstIp),r->m_url,waited,
			    r->m_crawlDelayMS);
			// this guy has too many redirects and it fails us...
			// BUT do not core if running live, only if for test
			// collection
			// for now disable it seems like 99.9% good... but
			// still cores on some weird stuff...
			//if(r->m_useTestCache && r->m_firstIp!=-1944679785 ) {
			//	char*xx = NULL; *xx = 0; }
		}
		// store time now
		//s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
		// note it
		//if ( g_conf.m_logDebugSpider )
		//	log("spider: adding download end time of %llu for "
		//	    "firstIp=%s "
		//	    "url=%s "
		//	    "to msg13::hammerCache",
		//	    nowms,iptoa(r->m_firstIp),r->m_url);
		// clear error from that if any, not important really
		g_errno = 0;
	}

	// try to get it from the test cache?
	TcpSocket ts;
	if ( r->m_useTestCache && getTestDoc ( r->m_url, &ts , r ) ) {
		// save this
		r->m_udpSlot = slot;
		// store the request so gotHttpReply can reply to it
		if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
			g_udpServer.sendErrorReply(slot,g_errno);
			return;
		}
		// sanity check
		if ( ts.m_readOffset  < 0 ) { char *xx=NULL;*xx=0; }
		if ( ts.m_readBufSize < 0 ) { char *xx=NULL;*xx=0; }
		// reply to it right away
		gotHttpReply ( r , &ts );
		// done
		return;
	}

	// if wanted it to be in test cache but it was not, we have to
	// download it, so use a fresh ip! we ran into a problem when
	// downloading a new doc from an old ip in ips.txt!!
	if ( r->m_useTestCache )
		r->m_urlIp = 0;

	// save this
	r->m_udpSlot = slot;
	// sanity check
	if ( ! slot ) { char *xx=NULL;*xx=0; }

	// send to a proxy if we are doing compression and not a proxy
	if ( r->m_useCompressionProxy && ! g_hostdb.m_myHost->m_isProxy ) {
		// use this key to select which proxy host
		long key = ((uint32_t)r->m_firstIp >> 8);
		// send to host "h"
		Host *h = g_hostdb.getBestSpiderCompressionProxy(&key);
		if ( g_conf.m_logDebugSpider )
			log(LOG_DEBUG,"spider: sending to compression proxy "
			    "%s:%lu",iptoa(h->m_ip),(unsigned long)h->m_port);
		// . otherwise, send the request to the key host
		// . returns false and sets g_errno on error
		// . now wait for 2 minutes before timing out
		if ( ! g_udpServer.sendRequest ( (char *)r ,
						 r->getSize() ,
						 0x13 , // msgType 0x13
						 h->m_ip ,
						 h->m_port ,
						 // we are sending to the proxy
						 // so make this -1
						 -1 , // h->m_hostId ,
						 NULL ,
						 r , // state data
						 passOnReply ,
						 200 , // 200 sec timeout
						 -1,   // backoff
						 -1,   // maxwait
						 NULL, // replybuf
						 0,    // replybufmaxsize
						 niceness)) {
			// g_errno should be set
			g_udpServer.sendErrorReply(slot,g_errno);
			return;
		}
		// wait for it
		return;
	}


	// do not get .google.com/ crap
	//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }

	downloadTheDocForReals ( r );
}
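
// Editor's note: downloadTheDocForReals() dedups concurrent downloads of
// the same url. The first request for a given m_cacheKey triggers the
// actual g_httpServer.getDoc() call; later requests just register in s_rt
// and receive their copy of the reply when gotHttpReply2() loops over every
// waiter for that key.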

void downloadTheDocForReals ( Msg13Request *r ) {

	// are we the first?
	bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
	// wait in line cuz someone else downloading it now
	if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
		g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
		return;
	}

	// this means our callback will be called
	if ( ! firstInLine ) {
		//log("spider: inlining %s",r->m_url);
		return;
	}

	// . store time now
	// . no, now we store 0 to indicate in progress, then we
	//   will overwrite it with a timestamp when the download completes
	// . but if measuring crawldelay from beginning of the download then
	//   store the current time
	// . do NOT do this when downloading robots.txt etc. type files
	//   which should have skipHammerCheck set to true
	if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck ) {
		s_hammerCache.addLongLong(0,r->m_firstIp, 0LL);//nowms);
	}
	else if ( ! r->m_skipHammerCheck ) {
		// get time now
		long long nowms = gettimeofdayInMilliseconds();
		s_hammerCache.addLongLong(0,r->m_firstIp, nowms);
	}

	// note it
	if ( g_conf.m_logDebugSpider )
		log("spider: adding special \"in-progress\" time of %lli for "
		    "firstIp=%s "
		    "url=%s "
		    "to msg13::hammerCache",
		    -1LL,iptoa(r->m_firstIp),r->m_url);


	// flag this
	r->m_addToTestCache = true;
	// note it here
	if ( g_conf.m_logDebugSpider )
		log("spider: downloading %s (%s) (skiphammercheck=%li)",
		    r->m_url,iptoa(r->m_urlIp) ,
		    (long)r->m_skipHammerCheck);

	// use the default agent unless scraping
	// force to event guru bot for now
	//char *agent = "Mozilla/5.0 (compatible; ProCogSEOBot/1.0; +http://www.procog.com/ )";
	//char *agent = "Mozilla/5.0 (compatible; GigaBot/1.0; +http://www.gigablast.com/ )";
	char *agent = g_conf.m_spiderUserAgent;
	if ( r->m_isScraping )
		agent = "Mozilla/4.0 "
			"(compatible; MSIE 6.0; Windows 98; "
			"Win 9x 4.90)" ;

	// for bulk jobs avoid actual downloads of the page for efficiency
	if ( r->m_isCustomCrawl == 2 ) {
		char *s =
			"HTTP/1.0 200 (OK)\r\n"
			"Content-Length: 0\r\n"
			"Connection: Close\r\n"
			"Content-Type: text/html\r\n\r\n";
		long slen = gbstrlen(s);
		long fakeBufSize = slen + 1;
		char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
		gotHttpReply2 ( r ,
				fakeBuf,
				fakeBufSize, // include \0
				fakeBufSize, // allocsize
				NULL );      // tcpsock
		return;
	}


	// download it
	if ( ! g_httpServer.getDoc ( r->m_url ,
				     r->m_urlIp ,
				     0 , // offset
				     -1 ,
				     r->m_ifModifiedSince ,
				     r , // state
				     gotHttpReply , // callback
				     30*1000 , // 30 sec timeout
				     r->m_httpProxyIp ,
				     r->m_httpProxyPort ,
				     r->m_maxTextDocLen ,
				     r->m_maxOtherDocLen ,
				     agent ) )
		// return false if blocked
		return;
	// . log this so i know about it
	// . g_errno MUST be set so that we do not DECREMENT
	//   the outstanding dom/ip counts in gotDoc() below
	//   because we did not increment them above
	logf(LOG_DEBUG,"spider: http server had error: %s",mstrerror(g_errno));
	// g_errno should be set
	if ( ! g_errno ) { char *xx=NULL;*xx=0; }
	// if called from ourselves. return true with g_errno set.
	//if ( r->m_parent ) return true;
	// if did not block -- should have been an error. call callback
	gotHttpReply ( r , NULL );
	return ;
}

void gotHttpReply ( void *state , TcpSocket *ts ) {
	// if we had no error, TcpSocket should be legit
	if ( ts ) {
		gotHttpReply2 ( state ,
				ts->m_readBuf ,
				ts->m_readOffset ,
				ts->m_readBufSize,
				ts );
		// now after we return TcpServer will DESTROY "ts" and
		// free m_readBuf... so we should not have any reference to it
		return;
	}
	// sanity check, if ts is NULL must have g_errno set
	if ( ! g_errno ) { char *xx=NULL;*xx=0; } // g_errno=EBADENG...
	// if g_errno is set i guess ts is NULL!
	gotHttpReply2 ( state , NULL ,0 , 0 , NULL );
}

void gotHttpReply2 ( void *state ,
		     char *reply ,
		     long  replySize ,
		     long  replyAllocSize ,
		     TcpSocket *ts ) {

	// save error
	long savedErr = g_errno;

	Msg13Request *r = (Msg13Request *) state;
	UdpSlot *slot   = r->m_udpSlot;

	// error?
	if ( g_errno && g_conf.m_logDebugSpider )
		log("spider: http reply (msg13) had error = %s "
		    "for %s at ip %s",
		    mstrerror(g_errno),r->m_url,iptoa(r->m_urlIp));

	// get time now
	long long nowms = gettimeofdayInMilliseconds();
	// . now store the current time in the cache
	// . do NOT do this for robots.txt etc. where we skip hammer check
	if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck )
		s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
	// note it
	if ( g_conf.m_logDebugSpider )
		log("spider: adding final download end time of %lli for "
		    "firstIp=%s "
		    "url=%s "
		    "to msg13::hammerCache",
		    nowms,iptoa(r->m_firstIp),r->m_url);


	// sanity. this was happening from iframe download
	//if ( g_errno == EDNSTIMEDOUT ) { char *xx=NULL;*xx=0; }

	// . sanity check - robots.txt requests must always be compressed
	// . saves space in the cache
	if ( ! r->m_compressReply && r->m_isRobotsTxt ) {char *xx=NULL; *xx=0;}
	// null terminate it always! -- unless already null terminated...
	if ( replySize > 0 && reply[replySize-1] ) reply[replySize++] = '\0';
	// sanity check
	if ( replySize > replyAllocSize ) { char *xx=NULL;*xx=0; }

	// save original size
	long originalSize = replySize;

	// . add the reply to our test cache
	// . if g_errno is set to something like "TCP Timed Out" then
	//   we end up saving a blank robots.txt or doc here...
	if ( r->m_useTestCache && r->m_addToTestCache )
		addTestDoc ( r->m_urlHash64,reply,replySize,
			     savedErr , r );

	// note it
	if ( r->m_useTestCache && g_conf.m_logDebugSpider )
		logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%llu",
		     r->m_url,iptoa(r->m_firstIp),r->m_urlHash48);

	long niceness = r->m_niceness;

	// sanity check
	if ( replySize>0 && reply[replySize-1]!= '\0') { char *xx=NULL;*xx=0; }

	// assume http status is 200
	bool goodStatus = true;

	long long *docsPtr     = NULL;
	long long *bytesInPtr  = NULL;
	long long *bytesOutPtr = NULL;

	// use this mime
	HttpMime mime;
	long httpStatus = 0; // 200;

	// do not do any of the content analysis routines below if we
	// had a g_errno like ETCPTIMEDOUT or EBADMIME or whatever...
	if ( savedErr ) goodStatus = false;

	// no, its on the content only, NOT including mime
	long mimeLen = 0;

	// only bother rewriting the error mime if user wanted compression
	// otherwise, don't bother rewriting it.
	// DO NOT do this if savedErr is set because we end up calling
	// sendErrorReply() below for that!
	if ( replySize>0 && r->m_compressReply && ! savedErr ) {
		// exclude the \0 i guess. use NULL for url.
		mime.set ( reply , replySize - 1, NULL );
		// no, its on the content only, NOT including mime
		mimeLen = mime.getMimeLen();
		// get this
		httpStatus = mime.getHttpStatus();
		// if it's -1, unknown i guess, then force to 505
		// server side error. we get an EBADMIME for our g_errno
		// when we enter this loop sometimes, so in that case...
		if ( httpStatus == -1 ) httpStatus = 505;
		if ( savedErr         ) httpStatus = 505;
		// if bad http status, re-write it
		if ( httpStatus != 200 ) {
			char tmpBuf[2048];
			char *p = tmpBuf;
			p += sprintf( tmpBuf,
				      "HTTP/1.0 %li\r\n"
				      "Content-Length: 0\r\n" ,
				      httpStatus );
			// convey redirect urls back to requester
			char *loc    = mime.getLocationField();
			long  locLen = mime.getLocationFieldLen();
			// if too big, forget it! otherwise we breach tmpBuf
			if ( loc && locLen > 0 && locLen < 1024 ) {
				p += sprintf ( p , "Location: " );
				memcpy ( p , loc , locLen );
				p += locLen;
				memcpy ( p , "\r\n", 2 );
				p += 2;
			}
			// close it up
			p += sprintf ( p , "\r\n" );
			// copy it over as new reply, include \0
			long newSize = p - tmpBuf + 1;
			if ( newSize >= 2048 ) { char *xx=NULL;*xx=0; }
			// record in the stats
			docsPtr     = &g_stats.m_compressMimeErrorDocs;
			bytesInPtr  = &g_stats.m_compressMimeErrorBytesIn;
			bytesOutPtr = &g_stats.m_compressMimeErrorBytesOut;
			// only replace orig reply if we are smaller
			if ( newSize < replySize ) {
				memcpy ( reply , tmpBuf , newSize );
				replySize = newSize;
			}
			// reset content hash
			goodStatus = false;
		}
	}
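
	// Editor's note: for a non-200 status the full reply is thus boiled
	// down to a minimal mime before compression, e.g. (illustrative):
	//
	//	HTTP/1.0 301
	//	Content-Length: 0
	//	Location: http://example.com/new-path
	//
	// so redirects still reach the requester without shipping the body.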

	//Xml xml;
	//Words words;

	// point to the content
	char *content = reply + mimeLen;
	// reduce length by that
	long contentLen = replySize - 1 - mimeLen;
	// fix bad crap
	if ( contentLen < 0 ) contentLen = 0;

	/*
	if ( replySize > 0 &&
	     goodStatus &&
	     r->m_forEvents &&
	     ! r->m_isRobotsTxt &&
	     r->m_compressReply ) {
		// Links class required Xml class
		if ( ! xml.set ( content ,
				 contentLen , // lennotsize! do not include \0
				 false , // ownData?
				 false , // purexml?
				 0 ,     // version! (unused)
				 false , // set parents?
				 niceness ) )
			log("scproxy: xml set had error: %s",
			    mstrerror(g_errno));
		// definitely compute the wordids so Dates.cpp can see if they
		// are a month name or whatever...
		if ( ! words.set ( &xml , true , niceness ) )
			log("scproxy: words set had error: %s",
			    mstrerror(g_errno));
	}

	if ( replySize > 0 &&
	     goodStatus &&
	     r->m_forEvents &&
	     !r->m_isRobotsTxt &&
	     r->m_compressReply ) {
		long cs = getCharsetFast ( &mime,
					   r->m_url,
					   content,
					   contentLen,
					   niceness);
		if ( cs != csUTF8 &&      // UTF-8
		     cs != csISOLatin1 && // ISO-8859-1
		     cs != csASCII &&
		     cs != csUnknown &&
		     cs != cswindows1256 &&
		     cs != cswindows1250 &&
		     cs != cswindows1255 &&
		     cs != cswindows1252 ) { // windows-1252
			// record in the stats
			docsPtr     = &g_stats.m_compressBadCharsetDocs;
			bytesInPtr  = &g_stats.m_compressBadCharsetBytesIn;
			bytesOutPtr = &g_stats.m_compressBadCharsetBytesOut;
			replySize = 0;
		}
	}
	*/

	if ( replySize > 0 &&
	     goodStatus &&
	     //r->m_forEvents &&
	     !r->m_isRobotsTxt &&
	     r->m_compressReply ) {
		// get the content type from mime
		char ct = mime.getContentType();
		if ( ct != CT_HTML &&
		     ct != CT_TEXT &&
		     ct != CT_XML &&
		     ct != CT_PDF &&
		     ct != CT_DOC &&
		     ct != CT_XLS &&
		     ct != CT_PPT &&
		     ct != CT_PS ) {
			// record in the stats
			docsPtr     = &g_stats.m_compressBadCTypeDocs;
			bytesInPtr  = &g_stats.m_compressBadCTypeBytesIn;
			bytesOutPtr = &g_stats.m_compressBadCTypeBytesOut;
			replySize = 0;
		}
	}

	/*
	if ( replySize > 0 &&
	     goodStatus &&
	     r->m_forEvents &&
	     ! r->m_isRobotsTxt &&
	     r->m_compressReply ) {
		// make sure we loaded the unifiedDict (do now in main.cpp)
		//g_speller.init();
		// detect language, if we can
		long score;
		// returns -1 and sets g_errno on error,
		// because 0 means langUnknown
		long langid = words.getLanguage(NULL,1000,niceness,&score);
		// anything 2+ is non-english
		if ( langid >= 2 ) {
			// record in the stats
			docsPtr     = &g_stats.m_compressBadLangDocs;
			bytesInPtr  = &g_stats.m_compressBadLangBytesIn;
			bytesOutPtr = &g_stats.m_compressBadLangBytesOut;
			replySize = 0;
		}
	}
	*/

	// sanity
	if ( reply && replySize>0 && reply[replySize-1]!='\0') {
		char *xx=NULL;*xx=0; }

	bool hasIframe2 = false;
	if ( r->m_compressReply &&
	     goodStatus &&
	     ! r->m_isRobotsTxt )
		hasIframe2 = hasIframe ( reply , replySize, niceness ) ;

	// sanity
	if ( reply && replySize>0 && reply[replySize-1]!='\0') {
		char *xx=NULL;*xx=0; }

	if ( hasIframe2 && ! r->m_attemptedIframeExpansion ) {
		// must have ts i think
		if ( ! ts ) { char *xx=NULL; *xx=0; }
		// sanity
		if ( ts->m_readBuf != reply ) { char *xx=NULL;*xx=0;}
		// . try to expand each iframe tag in there
		// . return without sending a reply back if this blocks
		// . it will return true and set g_errno on error
		// . when it has fully expanded the doc's iframes we
		//   re-call this gotHttpReply() function but with the
		//   TcpServer's buf swapped out to be the buf that has the
		//   expanded iframes in it
		// . returns false if blocks
		// . returns true if did not block, sets g_errno on error
		// . if it blocked it will recall THIS function
		if ( ! getIframeExpandedContent ( r , ts ) )
			return;
		// ok, did we have an error?
		if ( g_errno )
			log("scproxy: xml set for %s had error: %s",
			    r->m_url,mstrerror(g_errno));
		// otherwise, i guess we had no iframes worthy of expanding
		// so pretend we do not have any iframes
		hasIframe2 = false;
		// crap... had an error, give up i guess
		// record in the stats
		//docsPtr     = &g_stats.m_compressHasIframeDocs;
		//bytesInPtr  = &g_stats.m_compressHasIframeBytesIn;
		//bytesOutPtr = &g_stats.m_compressHasIframeBytesOut;
	}

	// sanity
	if ( reply && replySize>0 && reply[replySize-1]!='\0') {
		char *xx=NULL;*xx=0; }

	// compute content hash
	if ( r->m_contentHash32 &&
	     replySize>0 &&
	     goodStatus &&
	     r->m_compressReply &&
	     // if we got iframes we can't tell if content changed
	     ! hasIframe2 ) {
		// compute it
		long ch32 = getContentHash32Fast( (unsigned char *)content ,
						  contentLen ,
						  niceness );
		// unchanged?
		if ( ch32 == r->m_contentHash32 ) {
			// record in the stats
			docsPtr     = &g_stats.m_compressUnchangedDocs;
			bytesInPtr  = &g_stats.m_compressUnchangedBytesIn;
			bytesOutPtr = &g_stats.m_compressUnchangedBytesOut;
			// do not send anything back
			replySize = 0;
			// and set error
			savedErr = EDOCUNCHANGED;
		}
	}

	// nuke the content if from flurbit.com website!!
	if ( r->m_url &&
	     replySize>0 &&
	     goodStatus &&
	     strstr ( r->m_url,"flurbit.com/" ) ) {
		// note it in log
		log("msg13: got flurbit url: %s",r->m_url);
		// record in the stats
		docsPtr     = &g_stats.m_compressUnchangedDocs;
		bytesInPtr  = &g_stats.m_compressUnchangedBytesIn;
		bytesOutPtr = &g_stats.m_compressUnchangedBytesOut;
		// do not send anything back
		replySize = 0;
	}


	// by default assume it has a good date
	long status = 1;

	// sanity
	if ( reply && replySize>0 && reply[replySize-1]!='\0') {
		char *xx=NULL;*xx=0; }

	// sanity
	if ( reply && replySize>0 && reply[replySize-1]!='\0') {
		char *xx=NULL;*xx=0; }

	// force it good for debugging
	//status = 1;
	// xml set error?
	//if ( status == -1 ) {
	//	// sanity
	//	if ( ! g_errno ) { char *xx=NULL;*xx=0; }
	//	// g_errno must have been set!
	//	savedErr = g_errno;
	//	replySize = 0;
	//}
	// these are typically roots!
	if ( status == 1 &&
	     // override HasIFrame with "FullPageRequested" if it has
	     // an iframe, because that is the overriding stat. i.e. if
	     // we ignored if it had iframes, we'd still end up here...
	     ( ! docsPtr || docsPtr == &g_stats.m_compressHasIframeDocs ) &&
	     r->m_compressReply ) {
		// record in the stats
		docsPtr     = &g_stats.m_compressFullPageDocs;
		bytesInPtr  = &g_stats.m_compressFullPageBytesIn;
		bytesOutPtr = &g_stats.m_compressFullPageBytesOut;
	}
	// hey, it had a good date on it...
	else if ( status == 1 &&
		  ! docsPtr &&
		  r->m_compressReply ) {
		// record in the stats
		docsPtr     = &g_stats.m_compressHasDateDocs;
		bytesInPtr  = &g_stats.m_compressHasDateBytesIn;
		bytesOutPtr = &g_stats.m_compressHasDateBytesOut;
	}

	// sanity check
	if ( status != -1 && status != 0 && status != 1 ){char *xx=NULL;*xx=0;}

	if ( r->m_isRobotsTxt &&
	     goodStatus &&
	     ! savedErr &&
	     r->m_compressReply &&
	     httpStatus == 200 ) {
		// . just take out the lines we need...
		// . if no user-agent line matches * or gigabot/flurbot we
		//   will get just a \0 for the reply, replySize=1!
		//char *ua = "ProCogBot";//"EventGuruBot";//r->m_userAgent;
		// take this out until it works for
		// user-agent: *\ndisallow: blah
		//char *ua = "Gigabot";
		//long uaLen = gbstrlen(ua);
		//replySize = filterRobotsTxt (reply,replySize,&mime,niceness,
		//			       ua,uaLen);
		// record in the stats
		docsPtr     = &g_stats.m_compressRobotsTxtDocs;
		bytesInPtr  = &g_stats.m_compressRobotsTxtBytesIn;
		bytesOutPtr = &g_stats.m_compressRobotsTxtBytesOut;
	}

	// unknown by default
	if ( ! docsPtr ) {
		// record in the stats
		docsPtr     = &g_stats.m_compressUnknownTypeDocs;
		bytesInPtr  = &g_stats.m_compressUnknownTypeBytesIn;
		bytesOutPtr = &g_stats.m_compressUnknownTypeBytesOut;
	}

	// assume we did not compress it
	bool compressed = false;
	// compress if we should. do not compress if we are original requester
	// because we call gotFinalReply() with the reply right below here.
	// CAUTION: do not compress empty replies.
	// do not bother if savedErr is set because we use sendErrorReply
	// to send that back!
	if ( r->m_compressReply && replySize>0 && ! savedErr ) {
		// how big should the compression buf be?
		long need = sizeof(long) +             // unzipped size
			    (long)(replySize * 1.01) + // worst case size
			    25;                        // for zlib
		// for 7-zip
		need += 300;
		// back buffer to hold compressed reply
		unsigned long compressedLen;
		char *compressedBuf = (char*)mmalloc(need, "Msg13Zip");
		if ( ! compressedBuf ) {
			g_errno = ENOMEM;
			g_udpServer.sendErrorReply(slot,g_errno);
			return;
		}

		// store uncompressed length as first four bytes in the
		// compressedBuf
		*(long *)compressedBuf = replySize;
		// the remaining bytes are for data
		compressedLen = need - 4;
		// leave the first 4 bytes to hold the uncompressed size
		int zipErr = gbcompress( (unsigned char*)compressedBuf+4,
					 &compressedLen,
					 (unsigned char*)reply,
					 replySize);
		if(zipErr != Z_OK) {
			log("spider: had error zipping Msg13 reply.");
			mfree (compressedBuf, need, "Msg13ZipError");
			g_errno = ECORRUPTDATA;
			g_udpServer.sendErrorReply(slot,g_errno);
			return;
		}
		// . free the uncompressed reply so tcpserver does not have to
		// . no, now TcpServer will nuke it!!! or if called from
		//   gotIframeExpansion(), then deleting the xmldoc will nuke
		//   it
		//mfree ( reply , replyAllocSize , "msg13ubuf" );
		// it is toast
		//if ( ts ) ts->m_readBuf = NULL;
		// record the uncompressed size.
		reply          = compressedBuf;
		replySize      = 4 + compressedLen;
		replyAllocSize = need;
		// sanity check
		if ( replySize<0||replySize>100000000 ) { char *xx=NULL;*xx=0;}
		// we did compress it
		compressed = true;
	}

	// record the stats
	if ( docsPtr ) {
		// we download a doc
		*docsPtr = *docsPtr + 1;
		// we spidered it at this size
		*bytesInPtr  += originalSize;
		// and spit it back out at this size
		*bytesOutPtr += replySize;
		// and this always, the total
		g_stats.m_compressAllDocs++;
		g_stats.m_compressAllBytesIn  += originalSize;
		g_stats.m_compressAllBytesOut += replySize;
	}

	// store reply in the cache (might be compressed)
	if ( r->m_maxCacheAge > 0 ) { // && ! r->m_parent ) {
		// get the cache
		RdbCache *c = &s_httpCacheOthers;
		// use robots cache if we are a robots.txt file
		if ( r->m_isRobotsTxt ) c = &s_httpCacheRobots;
		// key is based on url hash
		key_t k; k.n1 = 0; k.n0 = r->m_cacheKey;
		// add it, use a generic collection
		c->addRecord ( (collnum_t) 0 , k , reply , replySize );
		// ignore errors caching it
		g_errno = 0;
	}

	// shortcut
	UdpServer *us = &g_udpServer;

	// how many have this key?
	long count = s_rt.getCount ( &r->m_cacheKey );
	// sanity check
	if ( count < 1 ) { char *xx=NULL;*xx=0; }

	// send a reply for all waiting in line
	long tableSlot;
	// loop
	for ( ; ( tableSlot = s_rt.getSlot ( &r->m_cacheKey) ) >= 0 ; ) {
		// use this
		long err = 0;
		// set g_errno appropriately
		//if ( ! ts || savedErr ) err = savedErr;
		if ( savedErr ) err = savedErr;
		// sanity check. must be empty on any error
		if ( reply && replySize > 0 && err ) {
			// ETCPTIMEDOUT can happen with a partial buf
			if ( err != ETCPTIMEDOUT &&
			     // sometimes zipped content from page
			     // is corrupt... we don't even request
			     // gzipped http replies but they send it anyway
			     err != ECORRUPTHTTPGZIP &&
			     // now httpserver::gotDoc's call to
			     // unzipReply() can also set g_errno to
			     // EBADMIME
			     err != EBADMIME &&
			     // this happens sometimes in unzipReply()
			     err != ENOMEM &&
			     // this page had a bad mime
			     err != ECORRUPTHTTPGZIP &&
			     // broken pipe
			     err != EPIPE &&
			     // connection reset by peer
			     err != ECONNRESET ) {
				char*xx=NULL;*xx=0;}
		}
		// replicate the reply. might return NULL and set g_errno
		char *copy          = reply;
		long  copyAllocSize = replyAllocSize;
		// . only copy it if we are not the last guy in the table
		// . no, now always copy it
		if ( --count > 0 && ! err ) {
			copy          = (char *)mdup(reply,replySize,"msg13d");
			copyAllocSize = replySize;
		}
		// get request
		Msg13Request *r2;
		r2 = *(Msg13Request **)s_rt.getValueFromSlot(tableSlot);
		// get udp slot for this transaction
		UdpSlot *slot = r2->m_udpSlot;
		// remove from list
		s_rt.removeSlot ( tableSlot );
		// send back error? maybe...
		if ( err ) {
			if ( g_conf.m_logDebugSpider )
				log("proxy: msg13: sending back error: %s "
				    "for url %s with ip %s",
				    mstrerror(err),
				    r2->m_url,
				    iptoa(r2->m_urlIp));
			g_udpServer.sendErrorReply ( slot , err );
			continue;
		}
		// send reply
		us->sendReply_ass ( copy,replySize,copy,copyAllocSize, slot );
		// now final udp slot will free the reply, so tcp server
		// no longer has to. set this tcp buf to null then.
		if ( ts && ts->m_readBuf == reply && count == 0 )
			ts->m_readBuf = NULL;
	}
	// return now if we sent a regular non-error reply. it will have
	// sent the reply buffer and udpserver will free it when its done
	// transmitting it.
	//if ( ts && ! savedErr ) return;
	// otherwise, we sent back a quick little error reply and have to
	// free the buffer here now. i think this was the mem leak we were
	// seeing.
	//if ( ! reply ) return;
	// do not let tcpserver free it
	//if ( ts ) ts->m_readBuf = NULL;
	// we free it - if it was never sent over a udp slot
	if ( savedErr && compressed )
		mfree ( reply , replyAllocSize , "msg13ubuf" );
}
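
// Editor's note on buffer ownership above: every waiter except the last
// gets an mdup()'d copy of the reply; the last one is handed the original
// buffer, and ts->m_readBuf is nulled so TcpServer does not free memory
// the udp slot is now responsible for. On error the compressed buffer is
// freed here since it never went out over a udp slot.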

void passOnReply ( void *state , UdpSlot *slot ) {
	// send that back
	Msg13Request *r = (Msg13Request *)state;
	// core for now
	//char *xx=NULL;*xx=0;
	// don't let udpserver free the request, it's our m_request[]
	slot->m_sendBufAlloc = NULL;

	/*
	// do not pass it on, we are where it stops if this is non-null
	if ( r->m_parent ) {
		r->m_parent->gotForwardedReply ( slot );
		return ;
	}
	*/

	if ( g_errno ) {
		log("spider: error from proxy for %s: %s",
		    r->m_url,mstrerror(g_errno));
		g_udpServer.sendErrorReply(r->m_udpSlot, g_errno);
		return;
	}

	// what did he give us?
	char *reply          = slot->m_readBuf;
	long  replySize      = slot->m_readBufSize;
	long  replyAllocSize = slot->m_readBufMaxSize;
	// do not allow "slot" to free the read buf since it is being used
	// as the send buf for "udpSlot"
	slot->m_readBuf     = NULL;
	slot->m_readBufSize = 0;
	//long replyAllocSize = slot->m_readBufSize;
	// just forward it on
	g_udpServer.sendReply_ass (reply,replySize,
				   reply,replyAllocSize,
				   r->m_udpSlot);
}
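
// Editor's note: passOnReply() is the callback used when the request was
// relayed to a spider compression proxy; it hands the proxy's (already
// compressed) reply straight back to the original requester's udp slot
// without inspecting it.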

//
//
// . UTILITY FUNCTIONS for injecting into the "qatest123" collection
// . we need to ensure that the web pages remain constant so we store them
//
//

// . returns true if found on disk in the test subdir
// . returns false with g_errno set on error
// . now that we are lower level in Msg13.cpp, set "ts" not "slot"
bool getTestDoc ( char *u , TcpSocket *ts , Msg13Request *r ) {
	// sanity check
	//if ( strcmp(m_coll,"qatest123") ) { char *xx=NULL;*xx=0; }
	// hash the url into 64 bits
	long long h = hash64 ( u , gbstrlen(u) );
	// read the spider date file first
	char fn[300];
	File f;

	// default to being from PageInject
	//char *td = "test-page-inject";
	//if ( r->m_testSpiderEnabled ) td = "test-spider";
	//if ( r->m_testParserEnabled ) td = "test-parser";
	//if ( r->m_isPageParser ) td = "test-page-parser";
	char *td = r->m_testDir;
	//if ( r->m_isPageInject ) td = "test-page-inject";
	//if ( ! td ) td = "test-page-parser";
	if ( ! td[0] ) { char *xx=NULL;*xx=0; }
	// make http reply filename
	sprintf(fn,"%s/%s/doc.%llu.html",g_hostdb.m_dir,td,h);
	// look it up
	f.set ( fn );
	// try to get it
	if ( ! f.doesExist() ) {
		//if ( g_conf.m_logDebugSpider )
		log("test: doc not found in test cache: %s (%llu)",
		    u,h);
		return false;
	}
	// get size
	long fs = f.getFileSize();
	// error?
	if ( fs == -1 )
		return log("test: error getting file size from test");
	// make a buf
	char *buf = (char *)mmalloc ( fs + 1 , "gtd");
	// no mem?
	if ( ! buf ) return log("test: no mem to get html file");
	// open it
	f.open ( O_RDWR );
	// read the HTTP REPLY in
	long rs = f.read ( buf , fs , 0 );
	// not read enough?
	if ( rs != fs ) {
		mfree ( buf,fs,"gtd");
		return log("test: read returned %li != %li",rs,fs);
	}
	f.close();
	// null term it
	buf[fs] = '\0';

	// was it errno=%lu ?
	if ( ! strncmp(buf,"errno=",6) ) {
		ts->m_readBuf     = NULL;
		ts->m_readBufSize = 0;
		ts->m_readOffset  = 0;
		g_errno = atol(buf+6);
		// fix mem leak
		mfree ( buf , fs+1 , "gtd" );
		// log it for now
		if ( g_conf.m_logDebugSpider )
			log("test: GOT ERROR doc in test cache: %s (%llu) "
			    "[%s]",u,h, mstrerror(g_errno));
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		return true;
	}

	// log it for now
	//if ( g_conf.m_logDebugSpider )
	log("test: GOT doc in test cache: %s (%llu)",u,h);

	//fprintf(stderr,"scp gk252:/e/test-spider/doc.%llu.* /home/mwells/gigablast/test-parser/\n",h);

	// set the slot up now
	//slot->m_readBuf = buf;
	//slot->m_readBufSize = fs;
	//slot->m_readBufMaxSize = fs;
	ts->m_readBuf    = buf;
	ts->m_readOffset = fs ;
	// if we had something, trim off the \0 so msg13.cpp can add it back
	if ( fs > 0 ) ts->m_readOffset--;
	ts->m_readBufSize = fs + 1;
	return true;
}
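
// Editor's note on the test cache layout: each url is stored under
// <g_hostdb.m_dir>/<r->m_testDir>/ as doc.<urlhash64>.html holding the raw
// http reply (or the literal text "errno=<N>" for a cached error), and as
// doc.<urlhash64>.spiderdate.txt holding the original spider timestamp.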

bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) {
	// hash the url into 64 bits
	long long uh64 = hash64(u->getUrl(),u->getUrlLen());
	// read the spider date file first
	char fn[300];
	File f;
	// get the spider date then
	sprintf(fn,"%s/%s/doc.%llu.spiderdate.txt",
		g_hostdb.m_dir,testDir,uh64);
	// look it up
	f.set ( fn );
	// try to get it
	if ( ! f.doesExist() ) return false;
	// get size
	long fs = f.getFileSize();
	// error?
	if ( fs == -1 ) return log("test: error getting file size from test");
	// open it
	f.open ( O_RDWR );
	// make a buf
	char dbuf[200];
	// read the date in (int format)
	long rs = f.read ( dbuf , fs , 0 );
	// sanity check
	if ( rs <= 0 ) { char *xx=NULL;*xx=0; }
	// get it
	*origSpideredDate = atoi ( dbuf );
	// close it
	f.close();
	// note it
	//log("test: read spiderdate of %lu for %s",*origSpideredDate,
	//    u->getUrl());
	// good to go
	return true;
}

bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) {
	// set this
	long long uh64 = hash64(u->getUrl(),u->getUrlLen());
	// make that into a filename
	char fn[300];
	sprintf(fn,"%s/%s/doc.%llu.spiderdate.txt",
		g_hostdb.m_dir,testDir,uh64);
	// look it up
	File f; f.set ( fn );
	// if already there, return now
	if ( f.doesExist() ) return true;
	// make it into buf
	char dbuf[200]; sprintf ( dbuf ,"%lu\n",spideredTime);
	// open it
	f.open ( O_RDWR | O_CREAT );
	// write it now
	long ws = f.write ( dbuf , gbstrlen(dbuf) , 0 );
	// close it
	f.close();
	// panic?
	if ( ws != (long)gbstrlen(dbuf) )
		return log("test: error writing %li != %li to %s",ws,
			   (long)gbstrlen(dbuf),fn);
	// close it up
	//f.close();
	return true;
}

// add it to our "qatest123" subdir
bool addTestDoc ( long long urlHash64 , char *httpReply , long httpReplySize ,
		  long err , Msg13Request *r ) {

	char fn[300];
	// default to being from PageInject
	//char *td = "test-page-inject";
	//if ( r->m_testSpiderEnabled ) td = "test-spider";
	//if ( r->m_testParserEnabled ) td = "test-parser";
	//if ( r->m_isPageParser ) td = "test-page-parser";
	//if ( r->m_isPageInject ) td = "test-page-inject";
	char *td = r->m_testDir;
	if ( ! td[0] ) { char *xx=NULL;*xx=0; }
	// make that into a filename
	sprintf(fn,"%s/%s/doc.%llu.html",g_hostdb.m_dir,td,urlHash64);
	// look it up
	File f; f.set ( fn );
	// if already there, return now
	if ( f.doesExist() ) return true;
	// open it
	f.open ( O_RDWR | O_CREAT );
	// log it for now
	if ( g_conf.m_logDebugSpider )
		log("test: ADDING doc to test cache: %llu",urlHash64);

	// write error only?
	if ( err ) {
		char ebuf[256];
		sprintf(ebuf,"errno=%lu\n",err);
		f.write(ebuf,gbstrlen(ebuf),0);
		f.close();
		return true;
	}

	// write it now
	long ws = f.write ( httpReply , httpReplySize , 0 );
	// close it
	f.close();
	// panic?
	if ( ws != httpReplySize )
		return log("test: error writing %li != %li to %s",ws,
			   httpReplySize,fn);
	// all done, success
	return true;
}

// . convert html/xml doc in place into a buffer of links, \n separated
// . return new reply size
// . return -1 on error w/ g_errno set on error
// . replySize includes terminating \0??? i dunno
long convertIntoLinks ( char *reply ,
			long replySize ,
			Xml *xml ,
			long niceness ) {
	// the "doQuickSet" is just for us and make things faster and
	// more compressed...
	Links links;
	if ( ! links.set ( false , // useRelNoFollow
			   xml ,
			   NULL , // parentUrl
			   false , // setLinkHashes
			   NULL , // baseUrl
			   0 , // version (unused)
			   niceness ,
			   false ,
			   NULL,
			   true ) ) // doQuickSet? YES!!!
		return -1;
	// use this to ensure we do not breach
	char *dstEnd = reply + replySize;
	// . store into the new buffer
	// . use memcpy() because it deals with potential overlap issues
	char *dst = reply;
	// store the thing first
	if ( dst + 100 >= dstEnd )
		// if no room, forget it
		return 0;
	// first the mime
	dst += sprintf ( dst ,
			 "HTTP/1.0 200\r\n"
			 "Content-Length: " );
	// save that place
	char *saved = dst;
	// now write a placeholder number
	dst += sprintf ( dst , "00000000\r\n\r\n" );

	// save this
	char *content = dst;

	// this tells xmldoc.cpp what's up
	//memcpy ( dst , "<!--links-->\n", 13 );
	//dst += 13;
	// iterate over the links
	for ( long i = 0 ; i < links.m_numLinks ; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		// get link
		char *str = links.getLink(i);
		// get size
		long len = links.getLinkLen(i);
		// . ensure no breach. if so, return now
		// . we write "<a href=" (8), the link (len), "></a>\n" (6)
		//   and eventually a final \0, so require len+15 bytes
		if ( dst + len + 15 > dstEnd ) return dst - reply;
		// lead it
		memcpy ( dst, "<a href=", 8 );
		dst += 8;
		// copy over, should be ok with overlaps
		memcpy ( dst , str , len );
		dst += len;
		// end tag and line
		memcpy ( dst , "></a>\n", 6 );
		dst += 6;
	}
	// null term it!
	*dst++ = '\0';
	// content length
	long clen = dst - content - 1;
	// the last digit
	char *dptr = saved + 7;
	// store it up top in the mime header
	for ( long x = 0 ; x < 8 ; x++ ) {
		//if ( clen == 0 ) *dptr-- = ' ';
		if ( clen == 0 ) break;
		*dptr-- = '0' + (clen % 10);
		clen /= 10;
	}
	// the new replysize is just this plain list of links
	return dst - reply;
}
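
// Editor's note: convertIntoLinks() overwrites the doc in place, so the
// rewritten reply can never grow past the original replySize. The
// Content-Length value is back-filled into the 8-char "00000000"
// placeholder least-significant digit first (ending at saved+7), so any
// unused leading positions simply remain '0'.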

// returns true if <iframe> tag in there
bool hasIframe ( char *reply, long replySize , long niceness ) {
	if ( ! reply || replySize <= 0 ) return false;
	char *p = reply;
	// exclude \0
	char *pend = reply + replySize - 1;
	for ( ; p < pend ; p++ ) {
		QUICKPOLL(niceness);
		if ( *p != '<' ) continue;
		if ( to_lower_a (p[1]) != 'i' ) continue;
		if ( to_lower_a (p[2]) != 'f' ) continue;
		if ( to_lower_a (p[3]) != 'r' ) continue;
		if ( to_lower_a (p[4]) != 'a' ) continue;
		if ( to_lower_a (p[5]) != 'm' ) continue;
		if ( to_lower_a (p[6]) != 'e' ) continue;
		return true;
	}
	return false;
}

// . returns -1 with g_errno set on error
// . returns 0 if has no future date
// . returns 1 if does have future date
// . TODO: for each street/city/state address, whether it is inlined or not,
//   look it up in zak's db that has all the street names and their
//   city/state. if it's in there then set AF_VERIFIED_STREET i guess...
long hasGoodDates ( char *content ,
		    long contentLen ,
		    Xml *xml ,
		    Words *words,
		    char ctype ,
		    long niceness ) {
	// now scan the text nodes for dates i guess...
	Dates dates;
	if ( ! dates.parseDates ( words ,
				  DF_FROM_BODY ,
				  NULL , // bits
				  NULL , // sections
				  niceness ,
				  NULL ,
				  ctype ) )
		return -1;
	// get the current year/month/etc in utc
	long now = getTimeLocal();
	struct tm *timeStruct = gmtime ( &now );
	long year = 1900 + timeStruct->tm_year;
	// day of month. starts at 1.
	long day = timeStruct->tm_mday;
	// 0 is january. but we use 1 for january in Dates.cpp, so add 1.
	long month = timeStruct->tm_mon + 1;

	bool gotTOD = false;
	bool gotMonthDow = false;

	Date *d1 = NULL;
	Date *d2 = NULL;

	// scan the dates we got, looking for certain types
	for ( long i = 0 ; i < dates.m_numDatePtrs ; i++ ) {
		// shortcut
		Date *di = dates.m_datePtrs[i];
		// skip if nuked
		if ( ! di ) continue;
		// shortcut
		datetype_t dt = di->m_hasType;
		// must be a tod, month or dow
		if ( !(dt & (DT_TOD|DT_MONTH|DT_DOW)) ) continue;
		// get the date's year
		long diyear = di->m_maxYear;
		if ( (long)di->m_year <= 0 ) diyear = 0;
		// if it has a year but it is old, forget it
		if ( diyear > 0 && diyear < year ) continue;
		// get the date's month
		long dimonth = di->m_month;
		// it has this year, but the month and day of month have
		// already passed
		if ( diyear == year && // this year
		     dimonth == month && // this month
		     di->m_dayNum > 0 &&
		     di->m_dayNum <= day )
			continue;
		// the same, but the month is any month before this one
		if ( diyear == year &&
		     dimonth > 0 &&
		     dimonth < month ) continue;
		// an unknown year (clock detector kinda)
		if ( diyear == 0 &&
		     dimonth == month &&
		     di->m_dayNum > 0 &&
		     di->m_dayNum <= day )
			continue;
		// recently past date PROBABLY...
		if ( diyear == 0 &&
		     dimonth > 0 &&
		     dimonth < month &&
		     // but more than 3 months back might be referring to
		     // NEXT YEAR!!! so cap it at that
		     dimonth > month - 3 )
			continue;
		// got one
		if ( dt & DT_TOD ) {
			gotTOD = true;
			if ( ! d1 ) d1 = di;
		}
		if ( dt & (DT_MONTH|DT_DOW) ) {
			gotMonthDow = true;
			if ( ! d2 ) d2 = di;
		}
	}
	// none found!
	if ( ! gotTOD ) return 0;
	if ( ! gotMonthDow ) return 0;

	Addresses aa;
	if ( ! aa.set ( NULL , // sections
			words ,
			NULL , // bits
			NULL , // tag rec
			NULL , // url
			0 , // docid
			0 , // collnum
			0 , // domhash32
			0 , // ip
			niceness ,
			NULL , // pbuf, safebuf
			NULL , // state
			NULL , // callback
			ctype ,
			NULL , // siteTitleBuf
			0 , // siteTitleBufSize
			NULL )) // xmldoc ptr
		// return -1 with g_errno set on error
		return -1;
	// scan the addresses
	for ( long i = 0 ; i < aa.m_am.getNumPtrs() ; i++ ) {
		// breathe
		QUICKPOLL(niceness);
		// get it
		Address *ad = (Address *)aa.m_am.getPtr(i);
		// inlined?
		bool inlined = (ad->m_flags & AF_INLINED);
		// that is good enough
		if ( inlined ) return 1;
		// verified somehow?
		bool vs = ( ad->m_flags & AF_VERIFIED_STREET);
		// that is good too, although how did it get verified?
		if ( vs ) return 1;
	}
	// ok, nothing inlined or verified...
	return 0;
}
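
// Illustrative reading of the checks above (not from the original source):
// a page that mentions a time of day like "7:30 pm" (a DT_TOD date) together
// with a month or day-of-week date that is not clearly in the past, AND that
// carries at least one inlined or verified street address, makes
// hasGoodDates() return 1; pages missing either the time-of-day, the
// future-ish month/dow date, or a usable address fall through to return 0.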

char getContentTypeQuick ( HttpMime *mime,
			   char *reply ,
			   long replySize ,
			   long niceness ) {
	char ctype = mime->getContentType();
	char ctype2 = 0;
	if ( replySize > 0 && reply ) {
		// mime is start of reply, so skip to content section
		char *content = reply + mime->getMimeLen();
		// defined in XmlDoc.cpp...
		ctype2 = getContentTypeFromContent(content,niceness);
	}
	if ( ctype2 ) ctype = ctype2;
	return ctype;
}
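
// A hedged usage note (not from the original source): the sniffed type from
// getContentTypeFromContent(), when it finds one, overrides whatever the
// Content-Type header claimed. So a reply whose mime says text/plain but
// whose body clearly begins with an <html> tag would likely come back as
// the html content type here.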

// . return new size, might be zero...
// . use a minimal mime as well
// . keep in same buffer
long filterRobotsTxt ( char *reply ,
		       long replySize ,
		       HttpMime *mime ,
		       long niceness ,
		       char *userAgent ,
		       long userAgentLen ) {
	// bail if nothing
	if ( ! reply || replySize <= 0 ) return replySize;
	// skip mime
	char *content = reply + mime->getMimeLen();
	char *s = content;
	// end of a line
	char *end;
	char *agent = NULL;
	char *dst = reply;
	// get first user-agent
	for ( ; *s ; s = end ) {
		// breathe
		QUICKPOLL(niceness);
		// record line start
		char *start = s;
		// skip non breaking white space
		while ( *s && (*s == ' ' || *s == '\t') ) s++;
		// find the end of this line
		for ( end = s ; *end && *end != '\n' ; end++ );
		// advance over \n
		if ( *end ) end++;
		// is it a comment line? skip if so
		if ( *s == '#' ) continue;
		// need "user-agent", but eof works too...
		if ( *s ) {
			if ( to_lower_a(s[0]) != 'u' ) continue;
			if ( to_lower_a(s[1]) != 's' ) continue;
			if ( strncasecmp ( s, "user-agent",10 ) ) continue;
		}
		// if we already had an agent and now another one... flush it
		if ( ! *s || agent ) {
			// this is a problem... if somehow it's got a smaller
			// mime than us, we can't let our new mime overwrite
			// the user-agent line we were going to memcpy()
			if ( reply + 16 > agent ) return replySize;
			if ( dst == reply ) {
				memcpy ( dst , "HTTP/1.0 200\r\n\r\n", 16 );
				dst += 16;
			}
			// store the user-agent and following allows/disallows
			memcpy ( dst, agent , start - agent );
			dst += ( start - agent );
			// restart
			agent = NULL;
			// eof?
			if ( ! *s ) break;
		}
		// record line start
		char *lineStart = s;
		// skip over "user-agent"
		s += 10;
		// then a colon or not!
		for ( ; *s ; s++ ) {
			if ( *s == ':' ) continue;
			if ( *s == ' ' ) continue;
			if ( *s == '\t' ) continue;
			break;
		}
		// craziness? need a bot name, otherwise, skip the line
		if ( ! is_alnum_a(*s) && *s != '*' ) continue;
		// did the user-agent line match our bot name?
		bool match = false;
		// a wildcard matches everybody
		if ( *s == '*' ) match = true;
		if ( strncasecmp(s,userAgent,userAgentLen) == 0 ) match = true;
		/*
		if ( strncasecmp(s,"gigabot",7) == 0 ) match = true;
		if ( strncasecmp(s,"flurbot",7) == 0 ) match = true;
		if ( strncasecmp(s,"eventgurubot",12) == 0 ) match = true;
		if ( strncasecmp(s,"procogbot",8) == 0 ) match = true;
		if ( strncasecmp(s,"probot",6) == 0 ) match = true;
		*/
		// record agent position if we matched!
		if ( match ) agent = lineStart;
		// now a sequence of allow/disallow lines until
		// we hit another user agent
	}
	// if nothing matched keep it zero
	if ( dst - reply == 0 ) return 0;
	// otherwise null term it. this could be a one byte \0 reply!!! no mime
	*dst++ = '\0';
	// all done, return new replysize...
	return dst - reply;
}
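
// Illustrative sketch (not from the original source), assuming our
// userAgent only matches via the "*" wildcard. Given a robots.txt body of
//
//   User-agent: *
//   Crawl-delay: 5
//   Disallow: /tmp
//   User-agent: googlebot
//   Disallow: /
//
// the matching block is copied out when the next "User-agent:" line is
// seen, so the reply buffer ends up holding just
//
//   HTTP/1.0 200\r\n\r\n
//   User-agent: *
//   Crawl-delay: 5
//   Disallow: /tmp
//
// i.e. a minimal mime plus only the blocks that apply to us, which keeps
// the filtered reply small (presumably for the robots.txt cache).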

// returns false if blocks, true otherwise
bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {

	if ( ! ts ) { char *xx=NULL;*xx=0; }

	long niceness = r->m_niceness;

	// ok, we've made an attempt now
	r->m_attemptedIframeExpansion = true;

	// we are doing something to destroy reply, so make a copy of it!
	long copySize = ts->m_readOffset + 1;
	char *copy = (char *)mdup ( ts->m_readBuf , copySize , "ifrmcpy" );
	if ( ! copy ) return true;
	// sanity, must include \0 at the end
	if ( copy[copySize-1] ) { char *xx=NULL;*xx=0; }

	// need a new state for it, use XmlDoc itself
	XmlDoc *xd;
	try { xd = new ( XmlDoc ); }
	catch ( ... ) {
		mfree ( copy , copySize , "ifrmcpy" );
		g_errno = ENOMEM;
		return true;
	}
	mnew ( xd , sizeof(XmlDoc),"msg13xd");

	// make a fake spider request so we can do it
	SpiderRequest sreq;
	sreq.reset();
	strcpy(sreq.m_url,r->m_url);
	long firstIp = hash32n(r->m_url);
	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
	sreq.setKey( firstIp,0LL, false );
	sreq.m_isInjecting = 1;
	sreq.m_isPageInject = 1;
	sreq.m_hopCount = 0;//m_hopCount;
	sreq.m_hopCountValid = 1;
	sreq.m_fakeFirstIp = 1;
	sreq.m_firstIp = firstIp;

	// log it now
	if ( g_conf.m_logDebugBuild )
		log("scproxy: expanding iframes for %s",r->m_url);

	// . use the enormous power of our new XmlDoc class
	// . this returns false with g_errno set on error
	// . sometimes niceness is 0, like when the UdpSlot
	//   gets its niceness converted (see UdpSlot::m_convertedNiceness)
	if ( ! xd->set4 ( &sreq ,
			  NULL ,
			  "main", // HACK!! m_coll ,
			  NULL , // pbuf
			  // give it a niceness of 1, we have to be
			  // careful since we are a niceness of 0!!!!
			  1, //niceness, // 1 ,
			  NULL , // content ,
			  false, // deleteFromIndex ,
			  0 )) { // forcedIp
		// log it
		log("scproxy: xmldoc set error: %s",mstrerror(g_errno));
		// now nuke xmldoc
		mdelete ( xd , sizeof(XmlDoc) , "msg13xd" );
		delete ( xd );
		// g_errno should be set if that returned false
		return true;
	}

	// . re-set the niceness because it will core if we set it with
	//   a niceness of 0...
	xd->m_niceness = niceness;

	// we already downloaded the httpReply so this is valid. no need
	// to check robots.txt again for that url, but perhaps for the
	// iframe urls.
	xd->m_isAllowed = true;
	xd->m_isAllowedValid = true;

	// save stuff for calling gotHttpReply() back later with the
	// iframe expanded document
	xd->m_r = r;

	// so XmlDoc::getExtraDoc doesn't have any issues
	xd->m_firstIp = 123456;
	xd->m_firstIpValid = true;

	// try using xmldoc to do it
	xd->m_httpReply = copy;
	xd->m_httpReplySize = copySize;
	xd->m_httpReplyAllocSize = copySize;
	xd->m_httpReplyValid = true;

	// we claimed this buffer, do not let TcpServer destroy it!
	//ts->m_readBuf = NULL;//(char *)0x1234;

	// tell it to skip msg13 and call httpServer.getDoc directly
	xd->m_isSpiderProxy = true;

	// do not let XmlDoc::getRedirUrl() try to get old title rec
	xd->m_oldDocValid = true;
	xd->m_oldDoc = NULL;
	// can't be NULL, xmldoc uses for g_errno
	xd->ptr_linkInfo1 = (LinkInfo *)0x01;
	xd->ptr_linkInfo2 = (LinkInfo *)0x01;
	xd->size_linkInfo1 = 0 ;
	xd->size_linkInfo2 = 0 ;
	xd->m_linkInfo1Valid = true;
	xd->m_linkInfo2Valid = true;

	// call this as callback
	xd->setCallback ( xd , gotIframeExpandedContent );

	xd->m_redirUrlValid = true;
	xd->ptr_redirUrl = NULL;
	xd->size_redirUrl = 0;

	xd->m_downloadEndTimeValid = true;
	xd->m_downloadEndTime = gettimeofdayInMillisecondsLocal();

	// now get the expanded content
	char **ec = xd->getExpandedUtf8Content();
	// this means it blocked
	if ( ec == (void *)-1 ) {
		//log("scproxy: waiting for %s",r->m_url);
		return false;
	}
	// return true with g_errno set
	if ( ! ec ) {
		log("scproxy: iframe expansion error: %s",mstrerror(g_errno));
		// g_errno should be set
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		// clean up
	}

	// it did not block so signal gotIframeExpandedContent to not call
	// gotHttpReply()
	//xd->m_r = NULL;

	// hey... it did block and we are still printing this!!
	// it happens when the iframe src is google or bing.. usually maps
	// so i'd think indicative of something special
	if ( g_conf.m_logDebugBuild )
		log("scproxy: got iframe expansion without blocking for url=%s"
		    " err=%s",r->m_url,mstrerror(g_errno));

	// save g_errno for returning
	long saved = g_errno;

	// this also means that the iframe tag was probably not expanded
	// because it was from google.com or bing.com or had a bad src
	// attribute or a bad url in the src attribute.
	// so we have set m_attemptedIframeExpansion, just recall using
	// the original TcpSocket ptr... and this time we should not be
	// re-called because m_attemptedIframeExpansion is now true
	//gotHttpReply2 ( r, NULL , 0 , 0 , NULL );

	// we can't be messing with it!! otherwise we'd have to return
	// a new reply size i guess
	if ( xd->m_didExpansion ) { char *xx=NULL;*xx=0; }

	// try to reconstruct ts
	//ts->m_readBuf = xd->m_httpReply;
	// and do not allow xmldoc to free that buf
	//xd->m_httpReply = NULL;

	// now nuke xmldoc
	mdelete ( xd , sizeof(XmlDoc) , "msg13xd" );
	delete ( xd );

	// reinstate g_errno in case mdelete() reset it
	g_errno = saved;

	// no blocking then...
	return true;
}
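
// A hedged usage note (not from the original source): a false return from
// getIframeExpandedContent() means XmlDoc blocked and will invoke the
// gotIframeExpandedContent() callback registered above once the iframe
// fetches finish; a true return means the caller should handle the reply
// (or g_errno) immediately, since no callback is expected to fire.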

void gotIframeExpandedContent ( void *state ) {
	// save error in case mdelete nukes it
	long saved = g_errno;

	XmlDoc *xd = (XmlDoc *)state;
	// this was stored in xd
	Msg13Request *r = xd->m_r;

	//log("scproxy: done waiting for %s",r->m_url);

	// note it
	if ( g_conf.m_logDebugBuild )
		log("scproxy: got iframe expansion for url=%s",r->m_url);

	// assume we had no expansion or there was an error
	char *reply = NULL;
	long replySize = 0;

	// . if no error, then grab it
	// . if failed to get the iframe content then m_didExpansion should
	//   be false
	if ( ! g_errno && xd->m_didExpansion ) {
		// original mime should have been valid
		if ( ! xd->m_mimeValid ) { char *xx=NULL;*xx=0; }
		// insert the mime into the expansion buffer! m_esbuf
		xd->m_esbuf.insert2 ( xd->m_httpReply ,
				      xd->m_mime.getMimeLen() ,
				      0 );
		// . get our buffer with the expanded iframes in it
		// . make sure that has the mime in it too
		//reply = xd->m_expandedUtf8Content;
		//replySize = xd->m_expandedUtf8ContentSize;
		// just to make sure nothing bad happens, null this out
		xd->m_expandedUtf8Content = NULL;
		// this new reply includes the original mime!
		reply = xd->m_esbuf.getBufStart();
		// include \0? yes.
		replySize = xd->m_esbuf.length() + 1;
		// sanity. must be null terminated
		if ( reply[replySize-1] ) { char *xx=NULL;*xx=0; }
	}
	// if expansion did not pan out, use original reply i guess
	else if ( ! g_errno ) {
		reply = xd->m_httpReply;
		replySize = xd->m_httpReplySize;
	}

	// log it so we know why we are getting EDNSTIMEDOUT msgs back
	// on the main cluster!
	if ( g_errno )
		log("scproxy: error getting iframe content for url=%s : %s",
		    r->m_url,mstrerror(g_errno));
	// sanity check
	if ( reply && reply[replySize-1] != '\0') { char *xx=NULL;*xx=0; }
	// pass back the error we had, if any
	g_errno = saved;
	// . then resume the reply processing up above as if this was the
	//   document that was downloaded.
	// . PASS g_errno BACK TO THIS if it was set, like ETCPTIMEDOUT
	gotHttpReply2 ( r, reply, replySize , replySize , NULL );

	// no, let's not dup it and pass what we got in, since ts is NULL
	// it should not free it!!!

	// . now destroy it
	// . the reply should have been sent back as a msg13 reply either
	//   as a normal reply or an error reply
	// . nuke out state then, including the xmldoc
	// . was there an error, maybe a TCPTIMEDOUT???
	mdelete ( xd , sizeof(XmlDoc) , "msg13xd" );
	delete ( xd );
}

// call this once every 10ms to launch queued up download requests so that
// we respect crawl delay for sure
void scanHammerQueue ( int fd , void *state ) {

	if ( ! s_hammerQueueHead ) return;

	long long nowms = gettimeofdayInMilliseconds();

 top:

	Msg13Request *r = s_hammerQueueHead;
	if ( ! r ) return;

	Msg13Request *prev = NULL;
	long long waited = -1LL;
	Msg13Request *nextLink = NULL;

	// scan down the linked list of queued msg13 requests
	for ( ; r ; prev = r , r = nextLink ) {

		// downloadTheDocForReals() could free "r" so save this here
		nextLink = r->m_nextLink;

		long long last;
		last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
		// is one from this ip outstanding?
		if ( last == 0LL && r->m_crawlDelayFromEnd ) continue;
		// download finished?
		if ( last > 0 ) {
			waited = nowms - last;
			// but skip if we haven't waited long enough
			if ( waited < r->m_crawlDelayMS ) continue;
		}
		// debug
		//log("spider: downloading %s from crawldelay queue "
		//    "waited=%llims crawldelay=%lims",
		//    r->m_url,waited,r->m_crawlDelayMS);

		// good to go
		downloadTheDocForReals ( r );
		//
		// remove from future scans
		//
		if ( prev )
			prev->m_nextLink = nextLink;

		if ( s_hammerQueueHead == r )
			s_hammerQueueHead = nextLink;

		if ( s_hammerQueueTail == r )
			s_hammerQueueTail = prev;

		// if "r" was freed by downloadTheDocForReals() then
		// in the next iteration of this loop, "prev" will point
		// to a freed memory area, so start from the top again
		goto top;

		// try to download some more i guess...
	}
}
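
// A hedged timing note (not from the original source): since the scan above
// is driven roughly every 10ms (per the comment on scanHammerQueue), a
// queued request whose host has a crawl delay of, say, 5000ms is launched
// at most about 10ms after that delay has actually elapsed; the granularity
// of crawl-delay enforcement is the sleep-callback interval, not the delay
// itself.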