mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
fix nyt.com cookie redir bug.
fixed bug when POSTing injection request with multipart/form-data.
This commit is contained in:
parent
146e45db56
commit
cc1ceaaac2
@ -159,9 +159,10 @@ bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) {
|
|||||||
}
|
}
|
||||||
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 )
|
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 )
|
||||||
m_contentType = getContentTypePrivate ( p + 13 );
|
m_contentType = getContentTypePrivate ( p + 13 );
|
||||||
else if ( strncasecmp ( p , "Set-Cookie: " ,11) == 0 ) {
|
else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) {
|
||||||
m_cookie = p + 11;
|
m_cookie = p + 11;
|
||||||
m_cookieLen = gbstrlen ( p + 11 );
|
if ( m_cookie[0] == ' ' ) m_cookie++;
|
||||||
|
m_cookieLen = gbstrlen ( m_cookie );
|
||||||
}
|
}
|
||||||
else if ( strncasecmp ( p , "Location:" , 9) == 0 ) {
|
else if ( strncasecmp ( p , "Location:" , 9) == 0 ) {
|
||||||
// point to it
|
// point to it
|
||||||
|
@ -181,6 +181,10 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
|||||||
if ( size == 0 ) cmd = "HEAD";
|
if ( size == 0 ) cmd = "HEAD";
|
||||||
if ( doPost ) cmd = "POST";
|
if ( doPost ) cmd = "POST";
|
||||||
|
|
||||||
|
// crap, can't spider nyt.com if we are 1.0, so use 1.0 but also
|
||||||
|
// note Connection: Close\r\n when making requests
|
||||||
|
//proto = "HTTP/1.1";
|
||||||
|
|
||||||
// . now use "Accept-Language: en" to tell servers we prefer english
|
// . now use "Accept-Language: en" to tell servers we prefer english
|
||||||
// . i removed keep-alive connection since some connections close on
|
// . i removed keep-alive connection since some connections close on
|
||||||
// non-200 ok http statuses and we think they're open since close
|
// non-200 ok http statuses and we think they're open since close
|
||||||
@ -212,6 +216,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
|||||||
"Host: %s\r\n"
|
"Host: %s\r\n"
|
||||||
"%s"
|
"%s"
|
||||||
"User-Agent: %s\r\n"
|
"User-Agent: %s\r\n"
|
||||||
|
"Connection: Close\r\n"
|
||||||
//"Connection: Keep-Alive\r\n"
|
//"Connection: Keep-Alive\r\n"
|
||||||
"Accept-Language: en\r\n"
|
"Accept-Language: en\r\n"
|
||||||
//"Accept: */*\r\n\r\n" ,
|
//"Accept: */*\r\n\r\n" ,
|
||||||
@ -226,6 +231,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
|||||||
"Host: %s\r\n"
|
"Host: %s\r\n"
|
||||||
"%s"
|
"%s"
|
||||||
"User-Agent: %s\r\n"
|
"User-Agent: %s\r\n"
|
||||||
|
"Connection: Close\r\n"
|
||||||
//"Connection: Keep-Alive\r\n"
|
//"Connection: Keep-Alive\r\n"
|
||||||
"Accept-Language: en\r\n"
|
"Accept-Language: en\r\n"
|
||||||
//"Accept: */*\r\n"
|
//"Accept: */*\r\n"
|
||||||
@ -246,6 +252,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
|||||||
"Host: %s\r\n"
|
"Host: %s\r\n"
|
||||||
"%s"
|
"%s"
|
||||||
"User-Agent: %s\r\n"
|
"User-Agent: %s\r\n"
|
||||||
|
"Connection: Close\r\n"
|
||||||
//"Connection: Keep-Alive\r\n"
|
//"Connection: Keep-Alive\r\n"
|
||||||
"Accept-Language: en\r\n"
|
"Accept-Language: en\r\n"
|
||||||
//"Accept: */*\r\n"
|
//"Accept: */*\r\n"
|
||||||
@ -275,6 +282,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
|||||||
"Accept: */*\r\n"
|
"Accept: */*\r\n"
|
||||||
"Host: %s\r\n"
|
"Host: %s\r\n"
|
||||||
"%s"
|
"%s"
|
||||||
|
"Connection: Close\r\n"
|
||||||
//"Connection: Keep-Alive\r\n"
|
//"Connection: Keep-Alive\r\n"
|
||||||
//"Accept-Language: en\r\n"
|
//"Accept-Language: en\r\n"
|
||||||
"%s",
|
"%s",
|
||||||
@ -417,6 +425,12 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
|||||||
else log("http: Got POST request without \\r\\n\\r\\n.");
|
else log("http: Got POST request without \\r\\n\\r\\n.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool multipart = false;
|
||||||
|
if ( m_requestType == 2 ) { // is POST?
|
||||||
|
char *cd =strcasestr(req,"Content-Type: multipart/form-data");
|
||||||
|
if ( cd ) multipart = true;
|
||||||
|
}
|
||||||
|
|
||||||
// . point to the file path
|
// . point to the file path
|
||||||
// . skip over the "GET "
|
// . skip over the "GET "
|
||||||
long filenameStart = 4 ;
|
long filenameStart = 4 ;
|
||||||
@ -812,7 +826,8 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Put '\0' back into the HttpRequest buffer...
|
// Put '\0' back into the HttpRequest buffer...
|
||||||
if (m_cgiBuf){
|
// crap, not if we are multi-part unencoded stuff...
|
||||||
|
if ( m_cgiBuf && ! multipart ) {
|
||||||
// do not mangle the "ucontent"!
|
// do not mangle the "ucontent"!
|
||||||
long cgiBufLen = m_cgiBufLen;
|
long cgiBufLen = m_cgiBufLen;
|
||||||
cgiBufLen -= m_ucontentLen;
|
cgiBufLen -= m_ucontentLen;
|
||||||
|
@ -190,8 +190,14 @@ bool HttpServer::getDoc ( char *url ,
|
|||||||
char *host = getHostFast ( url , &hostLen , &port );
|
char *host = getHostFast ( url , &hostLen , &port );
|
||||||
|
|
||||||
|
|
||||||
|
// mdw23
|
||||||
//if ( g_conf.m_logDebugSpider )
|
//if ( g_conf.m_logDebugSpider )
|
||||||
// log("spider: httprequest = %s", req );
|
// {
|
||||||
|
// SafeBuf tmp;
|
||||||
|
// tmp.safeMemcpy ( req , reqSize );
|
||||||
|
// tmp.nullTerm();
|
||||||
|
// log("spider: httprequest = %s", tmp.getBufStart() );
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
// do we have an ip to send to? assume not
|
// do we have an ip to send to? assume not
|
||||||
|
@ -692,7 +692,9 @@ bool Images::downloadImage ( ) {
|
|||||||
r->m_addToTestCache = 1;
|
r->m_addToTestCache = 1;
|
||||||
}
|
}
|
||||||
// url is the most important
|
// url is the most important
|
||||||
strcpy(r->m_url,m_imageUrl.getUrl());
|
//strcpy(r->m_url,m_imageUrl.getUrl());
|
||||||
|
r-> ptr_url = m_imageUrl.getUrl();
|
||||||
|
r->size_url = m_imageUrl.getUrlLen()+1; // include \0
|
||||||
// . try to download it
|
// . try to download it
|
||||||
// . i guess we are ignoring hammers at this point
|
// . i guess we are ignoring hammers at this point
|
||||||
if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper))
|
if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper))
|
||||||
|
112
Msg13.cpp
112
Msg13.cpp
@ -164,8 +164,8 @@ bool Msg13::getDoc ( Msg13Request *r,
|
|||||||
if ( r->m_urlIp == -1 ) { char *xx = NULL; *xx = 0; }
|
if ( r->m_urlIp == -1 ) { char *xx = NULL; *xx = 0; }
|
||||||
|
|
||||||
// set this
|
// set this
|
||||||
r->m_urlLen = gbstrlen ( r->m_url );
|
//r->m_urlLen = gbstrlen ( r->ptr_url );
|
||||||
r->m_urlHash64 = hash64 ( r->m_url , r->m_urlLen );
|
r->m_urlHash64 = hash64 ( r->ptr_url , r->size_url-1);//m_urlLen );
|
||||||
|
|
||||||
// sanity check, if spidering the test coll make sure one of
|
// sanity check, if spidering the test coll make sure one of
|
||||||
// these is true!! this prevents us from mistakenly turning it off
|
// these is true!! this prevents us from mistakenly turning it off
|
||||||
@ -186,8 +186,8 @@ bool Msg13::getDoc ( Msg13Request *r,
|
|||||||
// r->m_testParserEnabled = true;
|
// r->m_testParserEnabled = true;
|
||||||
|
|
||||||
// is this a /robots.txt url?
|
// is this a /robots.txt url?
|
||||||
if ( r->m_urlLen > 12 &&
|
if ( r->size_url - 1 > 12 &&
|
||||||
! strncmp ( r->m_url + r->m_urlLen - 11,"/robots.txt",11))
|
! strncmp ( r->ptr_url + r->size_url -1 -11,"/robots.txt",11))
|
||||||
r->m_isRobotsTxt = true;
|
r->m_isRobotsTxt = true;
|
||||||
|
|
||||||
// force caching if getting robots.txt so is compressed in cache
|
// force caching if getting robots.txt so is compressed in cache
|
||||||
@ -195,7 +195,7 @@ bool Msg13::getDoc ( Msg13Request *r,
|
|||||||
r->m_compressReply = true;
|
r->m_compressReply = true;
|
||||||
|
|
||||||
// do not get .google.com/ crap
|
// do not get .google.com/ crap
|
||||||
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
//if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||||
|
|
||||||
// set it for this too
|
// set it for this too
|
||||||
//if ( g_conf.m_useCompressionProxy ) {
|
//if ( g_conf.m_useCompressionProxy ) {
|
||||||
@ -261,19 +261,33 @@ bool Msg13::forwardRequest ( ) {
|
|||||||
logf ( LOG_DEBUG,
|
logf ( LOG_DEBUG,
|
||||||
"spider: sending download request of %s firstIp=%s "
|
"spider: sending download request of %s firstIp=%s "
|
||||||
"uh48=%llu to "
|
"uh48=%llu to "
|
||||||
"host %li (child=%li)", r->m_url, iptoa(r->m_firstIp),
|
"host %li (child=%li)", r->ptr_url, iptoa(r->m_firstIp),
|
||||||
r->m_urlHash48, hostId,
|
r->m_urlHash48, hostId,
|
||||||
r->m_skipHammerCheck);
|
r->m_skipHammerCheck);
|
||||||
|
|
||||||
|
|
||||||
// fill up the request
|
// fill up the request
|
||||||
long requestSize = r->getSize();
|
long requestBufSize = r->getSize();
|
||||||
|
|
||||||
|
// we have to serialize it now because it has cookies as well as
|
||||||
|
// the url.
|
||||||
|
char *requestBuf = serializeMsg ( sizeof(Msg39Request),
|
||||||
|
&r->size_url,
|
||||||
|
&r->size_cookie,
|
||||||
|
&r->ptr_url,
|
||||||
|
r,
|
||||||
|
&requestBufSize ,
|
||||||
|
NULL ,
|
||||||
|
0,//RBUF_SIZE ,
|
||||||
|
false );
|
||||||
|
// g_errno should be set in this case, most likely to ENOMEM
|
||||||
|
if ( ! requestBuf ) return true;
|
||||||
|
|
||||||
// . otherwise, send the request to the key host
|
// . otherwise, send the request to the key host
|
||||||
// . returns false and sets g_errno on error
|
// . returns false and sets g_errno on error
|
||||||
// . now wait for 2 minutes before timing out
|
// . now wait for 2 minutes before timing out
|
||||||
if ( ! g_udpServer.sendRequest ( (char *)r ,
|
if ( ! g_udpServer.sendRequest ( requestBuf, // (char *)r ,
|
||||||
requestSize ,
|
requestBufSize ,
|
||||||
0x13 , // msgType 0x13
|
0x13 , // msgType 0x13
|
||||||
h->m_ip ,
|
h->m_ip ,
|
||||||
h->m_port ,
|
h->m_port ,
|
||||||
@ -309,7 +323,8 @@ void gotForwardedReplyWrapper ( void *state , UdpSlot *slot ) {
|
|||||||
|
|
||||||
bool Msg13::gotForwardedReply ( UdpSlot *slot ) {
|
bool Msg13::gotForwardedReply ( UdpSlot *slot ) {
|
||||||
// don't let udpserver free the request, it's our m_request[]
|
// don't let udpserver free the request, it's our m_request[]
|
||||||
slot->m_sendBufAlloc = NULL;
|
// no, now let him free it because it was serialized into there
|
||||||
|
//slot->m_sendBufAlloc = NULL;
|
||||||
// what did he give us?
|
// what did he give us?
|
||||||
char *reply = slot->m_readBuf;
|
char *reply = slot->m_readBuf;
|
||||||
long replySize = slot->m_readBufSize;
|
long replySize = slot->m_readBufSize;
|
||||||
@ -343,7 +358,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
|||||||
|
|
||||||
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
|
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
|
||||||
logf(LOG_DEBUG,"spider: FINALIZED %s firstIp=%s",
|
logf(LOG_DEBUG,"spider: FINALIZED %s firstIp=%s",
|
||||||
r->m_url,iptoa(r->m_firstIp));
|
r->ptr_url,iptoa(r->m_firstIp));
|
||||||
|
|
||||||
|
|
||||||
// . if timed out probably the host is now dead so try another one!
|
// . if timed out probably the host is now dead so try another one!
|
||||||
@ -351,7 +366,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
|||||||
if ( g_errno == EUDPTIMEDOUT ) {
|
if ( g_errno == EUDPTIMEDOUT ) {
|
||||||
// try again
|
// try again
|
||||||
log("spider: retrying1. had error for %s : %s",
|
log("spider: retrying1. had error for %s : %s",
|
||||||
r->m_url,mstrerror(g_errno));
|
r->ptr_url,mstrerror(g_errno));
|
||||||
// return if that blocked
|
// return if that blocked
|
||||||
if ( ! forwardRequest ( ) ) return false;
|
if ( ! forwardRequest ( ) ) return false;
|
||||||
// a different g_errno should be set now!
|
// a different g_errno should be set now!
|
||||||
@ -362,7 +377,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
|||||||
// for it here
|
// for it here
|
||||||
if ( g_conf.m_logDebugSpider )
|
if ( g_conf.m_logDebugSpider )
|
||||||
log("spider: error for %s: %s",
|
log("spider: error for %s: %s",
|
||||||
r->m_url,mstrerror(g_errno));
|
r->ptr_url,mstrerror(g_errno));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -435,7 +450,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
|||||||
// log it for now
|
// log it for now
|
||||||
if ( g_conf.m_logDebugSpider )
|
if ( g_conf.m_logDebugSpider )
|
||||||
log("http: got doc %s %li to %li",
|
log("http: got doc %s %li to %li",
|
||||||
r->m_url,(long)replySize,(long)uncompressedLen);
|
r->ptr_url,(long)replySize,(long)uncompressedLen);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -458,9 +473,9 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
|||||||
//if ( niceness == 0 ) { char *xx=NULL;*xx=0; }
|
//if ( niceness == 0 ) { char *xx=NULL;*xx=0; }
|
||||||
|
|
||||||
// make sure we do not download gigablast.com admin pages!
|
// make sure we do not download gigablast.com admin pages!
|
||||||
if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->m_urlLen >= 7 ) {
|
if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->size_url-1 >= 7 ) {
|
||||||
Url url;
|
Url url;
|
||||||
url.set ( r->m_url );
|
url.set ( r->ptr_url );
|
||||||
// . never download /master urls from ips of hosts in cluster
|
// . never download /master urls from ips of hosts in cluster
|
||||||
// . TODO: FIX! the pages might be in another cluster!
|
// . TODO: FIX! the pages might be in another cluster!
|
||||||
if ( ( strncasecmp ( url.getPath() , "/master/" , 8 ) == 0 ||
|
if ( ( strncasecmp ( url.getPath() , "/master/" , 8 ) == 0 ||
|
||||||
@ -500,7 +515,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
|||||||
// helpful for debugging. even though you may see a robots.txt
|
// helpful for debugging. even though you may see a robots.txt
|
||||||
// redirect and think we are downloading that each time,
|
// redirect and think we are downloading that each time,
|
||||||
// we are not... the redirect is cached here as well.
|
// we are not... the redirect is cached here as well.
|
||||||
//log("spider: %s was in cache",r->m_url);
|
//log("spider: %s was in cache",r->ptr_url);
|
||||||
// . send the cached reply back
|
// . send the cached reply back
|
||||||
// . this will free send/read bufs on completion/g_errno
|
// . this will free send/read bufs on completion/g_errno
|
||||||
g_udpServer.sendReply_ass ( rec , recSize , rec, recSize,slot);
|
g_udpServer.sendReply_ass ( rec , recSize , rec, recSize,slot);
|
||||||
@ -510,7 +525,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
|||||||
// log it so we can see if we are hammering
|
// log it so we can see if we are hammering
|
||||||
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
|
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
|
||||||
logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
|
logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
|
||||||
r->m_url,iptoa(r->m_firstIp));
|
r->ptr_url,iptoa(r->m_firstIp));
|
||||||
|
|
||||||
// temporary hack
|
// temporary hack
|
||||||
if ( r->m_parent ) { char *xx=NULL;*xx=0; }
|
if ( r->m_parent ) { char *xx=NULL;*xx=0; }
|
||||||
@ -559,7 +574,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
|||||||
// which will store maybe a -1 if currently downloading...
|
// which will store maybe a -1 if currently downloading...
|
||||||
if ( queueIt ) {
|
if ( queueIt ) {
|
||||||
// debug
|
// debug
|
||||||
//log("spider: adding %s to crawldelayqueue",r->m_url);
|
//log("spider: adding %s to crawldelayqueue",r->ptr_url);
|
||||||
// save this
|
// save this
|
||||||
r->m_udpSlot = slot;
|
r->m_udpSlot = slot;
|
||||||
r->m_nextLink = NULL;
|
r->m_nextLink = NULL;
|
||||||
@ -580,7 +595,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
|||||||
if ( last > 0 && waited < r->m_crawlDelayMS ) {
|
if ( last > 0 && waited < r->m_crawlDelayMS ) {
|
||||||
log("spider: hammering firstIp=%s url=%s "
|
log("spider: hammering firstIp=%s url=%s "
|
||||||
"only waited %lli ms of %li ms",
|
"only waited %lli ms of %li ms",
|
||||||
iptoa(r->m_firstIp),r->m_url,waited,
|
iptoa(r->m_firstIp),r->ptr_url,waited,
|
||||||
r->m_crawlDelayMS);
|
r->m_crawlDelayMS);
|
||||||
// this guy has too many redirects and it fails us...
|
// this guy has too many redirects and it fails us...
|
||||||
// BUT do not core if running live, only if for test
|
// BUT do not core if running live, only if for test
|
||||||
@ -598,14 +613,14 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
|||||||
// "firstIp=%s "
|
// "firstIp=%s "
|
||||||
// "url=%s "
|
// "url=%s "
|
||||||
// "to msg13::hammerCache",
|
// "to msg13::hammerCache",
|
||||||
// nowms,iptoa(r->m_firstIp),r->m_url);
|
// nowms,iptoa(r->m_firstIp),r->ptr_url);
|
||||||
// clear error from that if any, not important really
|
// clear error from that if any, not important really
|
||||||
g_errno = 0;
|
g_errno = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// try to get it from the test cache?
|
// try to get it from the test cache?
|
||||||
TcpSocket ts;
|
TcpSocket ts;
|
||||||
if ( r->m_useTestCache && getTestDoc ( r->m_url, &ts , r ) ) {
|
if ( r->m_useTestCache && getTestDoc ( r->ptr_url, &ts , r ) ) {
|
||||||
// save this
|
// save this
|
||||||
r->m_udpSlot = slot;
|
r->m_udpSlot = slot;
|
||||||
// store the request so gotHttpReply can reply to it
|
// store the request so gotHttpReply can reply to it
|
||||||
@ -672,7 +687,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
|||||||
|
|
||||||
|
|
||||||
// do not get .google.com/ crap
|
// do not get .google.com/ crap
|
||||||
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
//if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||||
|
|
||||||
downloadTheDocForReals ( r );
|
downloadTheDocForReals ( r );
|
||||||
}
|
}
|
||||||
@ -689,7 +704,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
|||||||
|
|
||||||
// this means our callback will be called
|
// this means our callback will be called
|
||||||
if ( ! firstInLine ) {
|
if ( ! firstInLine ) {
|
||||||
//log("spider: inlining %s",r->m_url);
|
//log("spider: inlining %s",r->ptr_url);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -715,7 +730,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
|||||||
"firstIp=%s "
|
"firstIp=%s "
|
||||||
"url=%s "
|
"url=%s "
|
||||||
"to msg13::hammerCache",
|
"to msg13::hammerCache",
|
||||||
-1LL,iptoa(r->m_firstIp),r->m_url);
|
-1LL,iptoa(r->m_firstIp),r->ptr_url);
|
||||||
|
|
||||||
|
|
||||||
// flag this
|
// flag this
|
||||||
@ -723,7 +738,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
|||||||
// note it here
|
// note it here
|
||||||
if ( g_conf.m_logDebugSpider )
|
if ( g_conf.m_logDebugSpider )
|
||||||
log("spider: downloading %s (%s) (skiphammercheck=%li)",
|
log("spider: downloading %s (%s) (skiphammercheck=%li)",
|
||||||
r->m_url,iptoa(r->m_urlIp) ,
|
r->ptr_url,iptoa(r->m_urlIp) ,
|
||||||
(long)r->m_skipHammerCheck);
|
(long)r->m_skipHammerCheck);
|
||||||
|
|
||||||
// use the default agent unless scraping
|
// use the default agent unless scraping
|
||||||
@ -755,7 +770,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
|||||||
|
|
||||||
|
|
||||||
// download it
|
// download it
|
||||||
if ( ! g_httpServer.getDoc ( r->m_url ,
|
if ( ! g_httpServer.getDoc ( r->ptr_url ,
|
||||||
r->m_urlIp ,
|
r->m_urlIp ,
|
||||||
0 , // offset
|
0 , // offset
|
||||||
-1 ,
|
-1 ,
|
||||||
@ -767,7 +782,10 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
|||||||
r->m_httpProxyPort ,
|
r->m_httpProxyPort ,
|
||||||
r->m_maxTextDocLen ,
|
r->m_maxTextDocLen ,
|
||||||
r->m_maxOtherDocLen ,
|
r->m_maxOtherDocLen ,
|
||||||
agent ) )
|
agent ,
|
||||||
|
"HTTP/1.0" , // protocol
|
||||||
|
false , // do POST?
|
||||||
|
r->ptr_cookie ) )
|
||||||
// return false if blocked
|
// return false if blocked
|
||||||
return;
|
return;
|
||||||
// . log this so i know about it
|
// . log this so i know about it
|
||||||
@ -818,7 +836,7 @@ void gotHttpReply2 ( void *state ,
|
|||||||
if ( g_errno && g_conf.m_logDebugSpider )
|
if ( g_errno && g_conf.m_logDebugSpider )
|
||||||
log("spider: http reply (msg13) had error = %s "
|
log("spider: http reply (msg13) had error = %s "
|
||||||
"for %s at ip %s",
|
"for %s at ip %s",
|
||||||
mstrerror(g_errno),r->m_url,iptoa(r->m_urlIp));
|
mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
|
||||||
|
|
||||||
// get time now
|
// get time now
|
||||||
long long nowms = gettimeofdayInMilliseconds();
|
long long nowms = gettimeofdayInMilliseconds();
|
||||||
@ -832,7 +850,7 @@ void gotHttpReply2 ( void *state ,
|
|||||||
"firstIp=%s "
|
"firstIp=%s "
|
||||||
"url=%s "
|
"url=%s "
|
||||||
"to msg13::hammerCache",
|
"to msg13::hammerCache",
|
||||||
nowms,iptoa(r->m_firstIp),r->m_url);
|
nowms,iptoa(r->m_firstIp),r->ptr_url);
|
||||||
|
|
||||||
|
|
||||||
// sanity. this was happening from iframe download
|
// sanity. this was happening from iframe download
|
||||||
@ -859,7 +877,7 @@ void gotHttpReply2 ( void *state ,
|
|||||||
// note it
|
// note it
|
||||||
if ( r->m_useTestCache && g_conf.m_logDebugSpider )
|
if ( r->m_useTestCache && g_conf.m_logDebugSpider )
|
||||||
logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%llu",
|
logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%llu",
|
||||||
r->m_url,iptoa(r->m_firstIp),r->m_urlHash48);
|
r->ptr_url,iptoa(r->m_firstIp),r->m_urlHash48);
|
||||||
|
|
||||||
long niceness = r->m_niceness;
|
long niceness = r->m_niceness;
|
||||||
|
|
||||||
@ -986,7 +1004,7 @@ void gotHttpReply2 ( void *state ,
|
|||||||
!r->m_isRobotsTxt &&
|
!r->m_isRobotsTxt &&
|
||||||
r->m_compressReply ) {
|
r->m_compressReply ) {
|
||||||
long cs = getCharsetFast ( &mime,
|
long cs = getCharsetFast ( &mime,
|
||||||
r->m_url,
|
r->ptr_url,
|
||||||
content,
|
content,
|
||||||
contentLen,
|
contentLen,
|
||||||
niceness);
|
niceness);
|
||||||
@ -1088,7 +1106,7 @@ void gotHttpReply2 ( void *state ,
|
|||||||
// ok, did we have an error?
|
// ok, did we have an error?
|
||||||
if ( g_errno )
|
if ( g_errno )
|
||||||
log("scproxy: xml set for %s had error: %s",
|
log("scproxy: xml set for %s had error: %s",
|
||||||
r->m_url,mstrerror(g_errno));
|
r->ptr_url,mstrerror(g_errno));
|
||||||
// otherwise, i guess we had no iframes worthy of expanding
|
// otherwise, i guess we had no iframes worthy of expanding
|
||||||
// so pretend we do not have any iframes
|
// so pretend we do not have any iframes
|
||||||
hasIframe2 = false;
|
hasIframe2 = false;
|
||||||
@ -1128,12 +1146,12 @@ void gotHttpReply2 ( void *state ,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// nuke the content if from flurbit.com website!!
|
// nuke the content if from flurbit.com website!!
|
||||||
if ( r->m_url &&
|
if ( r->ptr_url &&
|
||||||
replySize>0 &&
|
replySize>0 &&
|
||||||
goodStatus &&
|
goodStatus &&
|
||||||
strstr ( r->m_url,"flurbit.com/" ) ) {
|
strstr ( r->ptr_url,"flurbit.com/" ) ) {
|
||||||
// note it in log
|
// note it in log
|
||||||
log("msg13: got flurbit url: %s",r->m_url);
|
log("msg13: got flurbit url: %s",r->ptr_url);
|
||||||
// record in the stats
|
// record in the stats
|
||||||
docsPtr = &g_stats.m_compressUnchangedDocs;
|
docsPtr = &g_stats.m_compressUnchangedDocs;
|
||||||
bytesInPtr = &g_stats.m_compressUnchangedBytesIn;
|
bytesInPtr = &g_stats.m_compressUnchangedBytesIn;
|
||||||
@ -1366,7 +1384,7 @@ void gotHttpReply2 ( void *state ,
|
|||||||
log("proxy: msg13: sending back error: %s "
|
log("proxy: msg13: sending back error: %s "
|
||||||
"for url %s with ip %s",
|
"for url %s with ip %s",
|
||||||
mstrerror(err),
|
mstrerror(err),
|
||||||
r2->m_url,
|
r2->ptr_url,
|
||||||
iptoa(r2->m_urlIp));
|
iptoa(r2->m_urlIp));
|
||||||
g_udpServer.sendErrorReply ( slot , err );
|
g_udpServer.sendErrorReply ( slot , err );
|
||||||
continue;
|
continue;
|
||||||
@ -1412,7 +1430,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
|
|||||||
|
|
||||||
if ( g_errno ) {
|
if ( g_errno ) {
|
||||||
log("spider: error from proxy for %s: %s",
|
log("spider: error from proxy for %s: %s",
|
||||||
r->m_url,mstrerror(g_errno));
|
r->ptr_url,mstrerror(g_errno));
|
||||||
g_udpServer.sendErrorReply(r->m_udpSlot, g_errno);
|
g_udpServer.sendErrorReply(r->m_udpSlot, g_errno);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -2014,8 +2032,8 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
|||||||
// make a fake spider request so we can do it
|
// make a fake spider request so we can do it
|
||||||
SpiderRequest sreq;
|
SpiderRequest sreq;
|
||||||
sreq.reset();
|
sreq.reset();
|
||||||
strcpy(sreq.m_url,r->m_url);
|
strcpy(sreq.m_url,r->ptr_url);
|
||||||
long firstIp = hash32n(r->m_url);
|
long firstIp = hash32n(r->ptr_url);
|
||||||
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
||||||
sreq.setKey( firstIp,0LL, false );
|
sreq.setKey( firstIp,0LL, false );
|
||||||
sreq.m_isInjecting = 1;
|
sreq.m_isInjecting = 1;
|
||||||
@ -2027,7 +2045,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
|||||||
|
|
||||||
// log it now
|
// log it now
|
||||||
if ( g_conf.m_logDebugBuild )
|
if ( g_conf.m_logDebugBuild )
|
||||||
log("scproxy: expanding iframes for %s",r->m_url);
|
log("scproxy: expanding iframes for %s",r->ptr_url);
|
||||||
|
|
||||||
// . use the enormous power of our new XmlDoc class
|
// . use the enormous power of our new XmlDoc class
|
||||||
// . this returns false with g_errno set on error
|
// . this returns false with g_errno set on error
|
||||||
@ -2108,7 +2126,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
|||||||
char **ec = xd->getExpandedUtf8Content();
|
char **ec = xd->getExpandedUtf8Content();
|
||||||
// this means it blocked
|
// this means it blocked
|
||||||
if ( ec == (void *)-1 ) {
|
if ( ec == (void *)-1 ) {
|
||||||
//log("scproxy: waiting for %s",r->m_url);
|
//log("scproxy: waiting for %s",r->ptr_url);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// return true with g_errno set
|
// return true with g_errno set
|
||||||
@ -2128,7 +2146,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
|||||||
// so i'd think indicative of something special
|
// so i'd think indicative of something special
|
||||||
if ( g_conf.m_logDebugBuild )
|
if ( g_conf.m_logDebugBuild )
|
||||||
log("scproxy: got iframe expansion without blocking for url=%s"
|
log("scproxy: got iframe expansion without blocking for url=%s"
|
||||||
" err=%s",r->m_url,mstrerror(g_errno));
|
" err=%s",r->ptr_url,mstrerror(g_errno));
|
||||||
|
|
||||||
// save g_errno for returning
|
// save g_errno for returning
|
||||||
long saved = g_errno;
|
long saved = g_errno;
|
||||||
@ -2169,11 +2187,11 @@ void gotIframeExpandedContent ( void *state ) {
|
|||||||
// this was stored in xd
|
// this was stored in xd
|
||||||
Msg13Request *r = xd->m_r;
|
Msg13Request *r = xd->m_r;
|
||||||
|
|
||||||
//log("scproxy: done waiting for %s",r->m_url);
|
//log("scproxy: done waiting for %s",r->ptr_url);
|
||||||
|
|
||||||
// note it
|
// note it
|
||||||
if ( g_conf.m_logDebugBuild )
|
if ( g_conf.m_logDebugBuild )
|
||||||
log("scproxy: got iframe expansion for url=%s",r->m_url);
|
log("scproxy: got iframe expansion for url=%s",r->ptr_url);
|
||||||
|
|
||||||
// assume we had no expansion or there was an error
|
// assume we had no expansion or there was an error
|
||||||
char *reply = NULL;
|
char *reply = NULL;
|
||||||
@ -2212,7 +2230,7 @@ void gotIframeExpandedContent ( void *state ) {
|
|||||||
// on the main cluster!
|
// on the main cluster!
|
||||||
if ( g_errno )
|
if ( g_errno )
|
||||||
log("scproxy: error getting iframe content for url=%s : %s",
|
log("scproxy: error getting iframe content for url=%s : %s",
|
||||||
r->m_url,mstrerror(g_errno));
|
r->ptr_url,mstrerror(g_errno));
|
||||||
// sanity check
|
// sanity check
|
||||||
if ( reply && reply[replySize-1] != '\0') { char *xx=NULL;*xx=0; }
|
if ( reply && reply[replySize-1] != '\0') { char *xx=NULL;*xx=0; }
|
||||||
// pass back the error we had, if any
|
// pass back the error we had, if any
|
||||||
@ -2270,7 +2288,7 @@ void scanHammerQueue ( int fd , void *state ) {
|
|||||||
// debug
|
// debug
|
||||||
//log("spider: downloading %s from crawldelay queue "
|
//log("spider: downloading %s from crawldelay queue "
|
||||||
// "waited=%llims crawldelay=%lims",
|
// "waited=%llims crawldelay=%lims",
|
||||||
// r->m_url,waited,r->m_crawlDelayMS);
|
// r->ptr_url,waited,r->m_crawlDelayMS);
|
||||||
|
|
||||||
// good to go
|
// good to go
|
||||||
downloadTheDocForReals ( r );
|
downloadTheDocForReals ( r );
|
||||||
|
15
Msg13.h
15
Msg13.h
@ -80,16 +80,23 @@ public:
|
|||||||
long long m_cacheKey;
|
long long m_cacheKey;
|
||||||
char m_testDir[32];
|
char m_testDir[32];
|
||||||
// msg13 sets this too, so you don't have to worry about setting it
|
// msg13 sets this too, so you don't have to worry about setting it
|
||||||
long m_urlLen;
|
//long m_urlLen;
|
||||||
// includes \0 termination
|
// includes \0 termination
|
||||||
char m_url[MAX_URL_LEN+1];
|
//char m_url[MAX_URL_LEN+1];
|
||||||
|
|
||||||
|
char *ptr_url;
|
||||||
|
char *ptr_cookie;
|
||||||
|
|
||||||
|
long size_url;
|
||||||
|
long size_cookie;
|
||||||
|
|
||||||
long getSize() {
|
long getSize() {
|
||||||
return ((char *)m_url-(char *)this) +m_urlLen +1;};
|
return ((char *)ptr_url-(char *)this) +size_url+size_cookie;};
|
||||||
|
|
||||||
// zero it all out
|
// zero it all out
|
||||||
void reset() {
|
void reset() {
|
||||||
memset (this,0,(char *)m_url - (char *)this + 1);
|
//memset (this,0,(char *)m_url - (char *)this + 1);
|
||||||
|
memset (this,0,sizeof(Msg13Request));
|
||||||
m_maxTextDocLen = -1; // no limit
|
m_maxTextDocLen = -1; // no limit
|
||||||
m_maxOtherDocLen = -1; // no limit
|
m_maxOtherDocLen = -1; // no limit
|
||||||
m_crawlDelayMS = -1; // unknown or none
|
m_crawlDelayMS = -1; // unknown or none
|
||||||
|
16
Url.cpp
16
Url.cpp
@ -195,14 +195,14 @@ void Url::set ( char *t , long tlen , bool addWWW , bool stripSessionId ,
|
|||||||
s[len]='\0';
|
s[len]='\0';
|
||||||
|
|
||||||
// make http:////www.xyz.com into http://www.xyz.com
|
// make http:////www.xyz.com into http://www.xyz.com
|
||||||
if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" , 9 ) ) {
|
// if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" ,9) ){
|
||||||
memcpy (s+7,s+9,len-9+1);
|
// memcpy (s+7,s+9,len-9+1);
|
||||||
len -= 2;
|
// len -= 2;
|
||||||
}
|
// }
|
||||||
if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////", 10 ) ) {
|
// if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////",10)){
|
||||||
memcpy (s+8,s+10,len-9+1);
|
// memcpy (s+8,s+10,len-9+1);
|
||||||
len -= 2;
|
// len -= 2;
|
||||||
}
|
// }
|
||||||
|
|
||||||
// . remove session ids from s
|
// . remove session ids from s
|
||||||
// . ';' most likely precedes a session id
|
// . ';' most likely precedes a session id
|
||||||
|
32
XmlDoc.cpp
32
XmlDoc.cpp
@ -9412,6 +9412,23 @@ Url **XmlDoc::getRedirUrl() {
|
|||||||
// breathe
|
// breathe
|
||||||
QUICKPOLL(m_niceness);
|
QUICKPOLL(m_niceness);
|
||||||
|
|
||||||
|
// get cookie for redirect to fix nyt.com
|
||||||
|
char *cookie = mime.getCookie();
|
||||||
|
// find end of cookie at the semicolon
|
||||||
|
char *s = cookie;
|
||||||
|
for ( ; s && *s && *s != ';' ; s++ );
|
||||||
|
if ( s && *s == ';' ) {
|
||||||
|
// do not include ;
|
||||||
|
long clen = s - cookie;
|
||||||
|
m_redirCookieBuf.reset();
|
||||||
|
m_redirCookieBuf.safeMemcpy ( cookie , clen );
|
||||||
|
m_redirCookieBuf.nullTerm();
|
||||||
|
m_redirCookieBufValid = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// mdw23
|
||||||
|
//log("http: reply=%s",m_httpReply);
|
||||||
|
|
||||||
// a hack for removing session ids already in there. for
|
// a hack for removing session ids already in there. for
|
||||||
// brilliantshopper's bs4 collection and gk0 cluster
|
// brilliantshopper's bs4 collection and gk0 cluster
|
||||||
//bool forceRedirect = false;
|
//bool forceRedirect = false;
|
||||||
@ -9520,7 +9537,7 @@ Url **XmlDoc::getRedirUrl() {
|
|||||||
// . if we followed too many then bail
|
// . if we followed too many then bail
|
||||||
// . www.motorolamobility.com www.outlook.com ... failed when we
|
// . www.motorolamobility.com www.outlook.com ... failed when we
|
||||||
// had >= 4 here
|
// had >= 4 here
|
||||||
if ( ++m_numRedirects >= 7 ) {
|
if ( ++m_numRedirects >= 10 ) {
|
||||||
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
|
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
|
||||||
return &m_redirUrlPtr;
|
return &m_redirUrlPtr;
|
||||||
}
|
}
|
||||||
@ -14711,7 +14728,9 @@ char **XmlDoc::getHttpReply2 ( ) {
|
|||||||
// clear it first
|
// clear it first
|
||||||
r->reset();
|
r->reset();
|
||||||
// and set the url
|
// and set the url
|
||||||
strcpy ( r->m_url , cu->getUrl() );
|
//strcpy ( r->m_url , cu->getUrl() );
|
||||||
|
r->ptr_url = cu->getUrl();
|
||||||
|
r->size_url = cu->getUrlLen()+1;
|
||||||
// sanity check
|
// sanity check
|
||||||
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
||||||
// max to download in bytes. currently 1MB.
|
// max to download in bytes. currently 1MB.
|
||||||
@ -14747,6 +14766,15 @@ char **XmlDoc::getHttpReply2 ( ) {
|
|||||||
r->m_ifModifiedSince = 0;
|
r->m_ifModifiedSince = 0;
|
||||||
r->m_skipHammerCheck = 0;
|
r->m_skipHammerCheck = 0;
|
||||||
|
|
||||||
|
if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
|
||||||
|
r->ptr_cookie = m_redirCookieBuf.getBufStart();
|
||||||
|
r->size_cookie = m_redirCookieBuf.length() + 1;
|
||||||
|
// . only do once per redirect
|
||||||
|
// . do not invalidate because we might have to carry it
|
||||||
|
// through to the next redir... unless we change domain
|
||||||
|
// . this fixes the nyt.com bug some more
|
||||||
|
//m_redirCookieBufValid = false;
|
||||||
|
}
|
||||||
|
|
||||||
// . this is -1 if unknown. none found in robots.txt or provided
|
// . this is -1 if unknown. none found in robots.txt or provided
|
||||||
// in the custom crawl parms.
|
// in the custom crawl parms.
|
||||||
|
2
XmlDoc.h
2
XmlDoc.h
@ -962,6 +962,7 @@ class XmlDoc {
|
|||||||
|
|
||||||
Url m_redirUrl;
|
Url m_redirUrl;
|
||||||
Url *m_redirUrlPtr;
|
Url *m_redirUrlPtr;
|
||||||
|
SafeBuf m_redirCookieBuf;
|
||||||
Url m_metaRedirUrl;
|
Url m_metaRedirUrl;
|
||||||
Url *m_metaRedirUrlPtr;
|
Url *m_metaRedirUrlPtr;
|
||||||
Url m_canonicalRedirUrl;
|
Url m_canonicalRedirUrl;
|
||||||
@ -1235,6 +1236,7 @@ class XmlDoc {
|
|||||||
//bool m_tryAgainTimeDeltaValid;
|
//bool m_tryAgainTimeDeltaValid;
|
||||||
//bool m_eliminateMenusValid;
|
//bool m_eliminateMenusValid;
|
||||||
bool m_redirUrlValid;
|
bool m_redirUrlValid;
|
||||||
|
bool m_redirCookieBufValid;
|
||||||
bool m_metaRedirUrlValid;
|
bool m_metaRedirUrlValid;
|
||||||
bool m_canonicalRedirUrlValid;
|
bool m_canonicalRedirUrlValid;
|
||||||
bool m_statusMsgValid;
|
bool m_statusMsgValid;
|
||||||
|
Loading…
Reference in New Issue
Block a user