Fix nyt.com cookie redirect bug.

Fixed a bug when POSTing an injection request with multipart/form-data.
This commit is contained in:
mwells 2014-08-05 17:04:11 -07:00
parent 146e45db56
commit cc1ceaaac2
9 changed files with 145 additions and 66 deletions

View File

@ -159,9 +159,10 @@ bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) {
} }
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 ) else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 )
m_contentType = getContentTypePrivate ( p + 13 ); m_contentType = getContentTypePrivate ( p + 13 );
else if ( strncasecmp ( p , "Set-Cookie: " ,11) == 0 ) { else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) {
m_cookie = p + 11; m_cookie = p + 11;
m_cookieLen = gbstrlen ( p + 11 ); if ( m_cookie[0] == ' ' ) m_cookie++;
m_cookieLen = gbstrlen ( m_cookie );
} }
else if ( strncasecmp ( p , "Location:" , 9) == 0 ) { else if ( strncasecmp ( p , "Location:" , 9) == 0 ) {
// point to it // point to it

View File

@ -181,6 +181,10 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
if ( size == 0 ) cmd = "HEAD"; if ( size == 0 ) cmd = "HEAD";
if ( doPost ) cmd = "POST"; if ( doPost ) cmd = "POST";
// crap, can't spider nyt.com if we are 1.0, so use 1.0 but also
// note Connection: Close\r\n when making requests
//proto = "HTTP/1.1";
// . now use "Accept-Language: en" to tell servers we prefer english // . now use "Accept-Language: en" to tell servers we prefer english
// . i removed keep-alive connection since some connections close on // . i removed keep-alive connection since some connections close on
// non-200 ok http statuses and we think they're open since close // non-200 ok http statuses and we think they're open since close
@ -212,6 +216,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
"Host: %s\r\n" "Host: %s\r\n"
"%s" "%s"
"User-Agent: %s\r\n" "User-Agent: %s\r\n"
"Connection: Close\r\n"
//"Connection: Keep-Alive\r\n" //"Connection: Keep-Alive\r\n"
"Accept-Language: en\r\n" "Accept-Language: en\r\n"
//"Accept: */*\r\n\r\n" , //"Accept: */*\r\n\r\n" ,
@ -226,6 +231,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
"Host: %s\r\n" "Host: %s\r\n"
"%s" "%s"
"User-Agent: %s\r\n" "User-Agent: %s\r\n"
"Connection: Close\r\n"
//"Connection: Keep-Alive\r\n" //"Connection: Keep-Alive\r\n"
"Accept-Language: en\r\n" "Accept-Language: en\r\n"
//"Accept: */*\r\n" //"Accept: */*\r\n"
@ -246,6 +252,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
"Host: %s\r\n" "Host: %s\r\n"
"%s" "%s"
"User-Agent: %s\r\n" "User-Agent: %s\r\n"
"Connection: Close\r\n"
//"Connection: Keep-Alive\r\n" //"Connection: Keep-Alive\r\n"
"Accept-Language: en\r\n" "Accept-Language: en\r\n"
//"Accept: */*\r\n" //"Accept: */*\r\n"
@ -275,6 +282,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
"Accept: */*\r\n" "Accept: */*\r\n"
"Host: %s\r\n" "Host: %s\r\n"
"%s" "%s"
"Connection: Close\r\n"
//"Connection: Keep-Alive\r\n" //"Connection: Keep-Alive\r\n"
//"Accept-Language: en\r\n" //"Accept-Language: en\r\n"
"%s", "%s",
@ -417,6 +425,12 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
else log("http: Got POST request without \\r\\n\\r\\n."); else log("http: Got POST request without \\r\\n\\r\\n.");
} }
bool multipart = false;
if ( m_requestType == 2 ) { // is POST?
char *cd =strcasestr(req,"Content-Type: multipart/form-data");
if ( cd ) multipart = true;
}
// . point to the file path // . point to the file path
// . skip over the "GET " // . skip over the "GET "
long filenameStart = 4 ; long filenameStart = 4 ;
@ -812,7 +826,8 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
} }
// Put '\0' back into the HttpRequest buffer... // Put '\0' back into the HttpRequest buffer...
if (m_cgiBuf){ // crap, not if we are multi-part unencoded stuff...
if ( m_cgiBuf && ! multipart ) {
// do not mangle the "ucontent"! // do not mangle the "ucontent"!
long cgiBufLen = m_cgiBufLen; long cgiBufLen = m_cgiBufLen;
cgiBufLen -= m_ucontentLen; cgiBufLen -= m_ucontentLen;

View File

@ -190,8 +190,14 @@ bool HttpServer::getDoc ( char *url ,
char *host = getHostFast ( url , &hostLen , &port ); char *host = getHostFast ( url , &hostLen , &port );
// mdw23
//if ( g_conf.m_logDebugSpider ) //if ( g_conf.m_logDebugSpider )
// log("spider: httprequest = %s", req ); // {
// SafeBuf tmp;
// tmp.safeMemcpy ( req , reqSize );
// tmp.nullTerm();
// log("spider: httprequest = %s", tmp.getBufStart() );
// }
// do we have an ip to send to? assume not // do we have an ip to send to? assume not

View File

@ -692,7 +692,9 @@ bool Images::downloadImage ( ) {
r->m_addToTestCache = 1; r->m_addToTestCache = 1;
} }
// url is the most important // url is the most important
strcpy(r->m_url,m_imageUrl.getUrl()); //strcpy(r->m_url,m_imageUrl.getUrl());
r-> ptr_url = m_imageUrl.getUrl();
r->size_url = m_imageUrl.getUrlLen()+1; // include \0
// . try to download it // . try to download it
// . i guess we are ignoring hammers at this point // . i guess we are ignoring hammers at this point
if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper)) if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper))

112
Msg13.cpp
View File

@ -164,8 +164,8 @@ bool Msg13::getDoc ( Msg13Request *r,
if ( r->m_urlIp == -1 ) { char *xx = NULL; *xx = 0; } if ( r->m_urlIp == -1 ) { char *xx = NULL; *xx = 0; }
// set this // set this
r->m_urlLen = gbstrlen ( r->m_url ); //r->m_urlLen = gbstrlen ( r->ptr_url );
r->m_urlHash64 = hash64 ( r->m_url , r->m_urlLen ); r->m_urlHash64 = hash64 ( r->ptr_url , r->size_url-1);//m_urlLen );
// sanity check, if spidering the test coll make sure one of // sanity check, if spidering the test coll make sure one of
// these is true!! this prevents us from mistakenly turning it off // these is true!! this prevents us from mistakenly turning it off
@ -186,8 +186,8 @@ bool Msg13::getDoc ( Msg13Request *r,
// r->m_testParserEnabled = true; // r->m_testParserEnabled = true;
// is this a /robots.txt url? // is this a /robots.txt url?
if ( r->m_urlLen > 12 && if ( r->size_url - 1 > 12 &&
! strncmp ( r->m_url + r->m_urlLen - 11,"/robots.txt",11)) ! strncmp ( r->ptr_url + r->size_url -1 -11,"/robots.txt",11))
r->m_isRobotsTxt = true; r->m_isRobotsTxt = true;
// force caching if getting robots.txt so is compressed in cache // force caching if getting robots.txt so is compressed in cache
@ -195,7 +195,7 @@ bool Msg13::getDoc ( Msg13Request *r,
r->m_compressReply = true; r->m_compressReply = true;
// do not get .google.com/ crap // do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; } //if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }
// set it for this too // set it for this too
//if ( g_conf.m_useCompressionProxy ) { //if ( g_conf.m_useCompressionProxy ) {
@ -261,19 +261,33 @@ bool Msg13::forwardRequest ( ) {
logf ( LOG_DEBUG, logf ( LOG_DEBUG,
"spider: sending download request of %s firstIp=%s " "spider: sending download request of %s firstIp=%s "
"uh48=%llu to " "uh48=%llu to "
"host %li (child=%li)", r->m_url, iptoa(r->m_firstIp), "host %li (child=%li)", r->ptr_url, iptoa(r->m_firstIp),
r->m_urlHash48, hostId, r->m_urlHash48, hostId,
r->m_skipHammerCheck); r->m_skipHammerCheck);
// fill up the request // fill up the request
long requestSize = r->getSize(); long requestBufSize = r->getSize();
// we have to serialize it now because it has cookies as well as
// the url.
char *requestBuf = serializeMsg ( sizeof(Msg39Request),
&r->size_url,
&r->size_cookie,
&r->ptr_url,
r,
&requestBufSize ,
NULL ,
0,//RBUF_SIZE ,
false );
// g_errno should be set in this case, most likely to ENOMEM
if ( ! requestBuf ) return true;
// . otherwise, send the request to the key host // . otherwise, send the request to the key host
// . returns false and sets g_errno on error // . returns false and sets g_errno on error
// . now wait for 2 minutes before timing out // . now wait for 2 minutes before timing out
if ( ! g_udpServer.sendRequest ( (char *)r , if ( ! g_udpServer.sendRequest ( requestBuf, // (char *)r ,
requestSize , requestBufSize ,
0x13 , // msgType 0x13 0x13 , // msgType 0x13
h->m_ip , h->m_ip ,
h->m_port , h->m_port ,
@ -309,7 +323,8 @@ void gotForwardedReplyWrapper ( void *state , UdpSlot *slot ) {
bool Msg13::gotForwardedReply ( UdpSlot *slot ) { bool Msg13::gotForwardedReply ( UdpSlot *slot ) {
// don't let udpserver free the request, it's our m_request[] // don't let udpserver free the request, it's our m_request[]
slot->m_sendBufAlloc = NULL; // no, now let him free it because it was serialized into there
//slot->m_sendBufAlloc = NULL;
// what did he give us? // what did he give us?
char *reply = slot->m_readBuf; char *reply = slot->m_readBuf;
long replySize = slot->m_readBufSize; long replySize = slot->m_readBufSize;
@ -343,7 +358,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads ) if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
logf(LOG_DEBUG,"spider: FINALIZED %s firstIp=%s", logf(LOG_DEBUG,"spider: FINALIZED %s firstIp=%s",
r->m_url,iptoa(r->m_firstIp)); r->ptr_url,iptoa(r->m_firstIp));
// . if timed out probably the host is now dead so try another one! // . if timed out probably the host is now dead so try another one!
@ -351,7 +366,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
if ( g_errno == EUDPTIMEDOUT ) { if ( g_errno == EUDPTIMEDOUT ) {
// try again // try again
log("spider: retrying1. had error for %s : %s", log("spider: retrying1. had error for %s : %s",
r->m_url,mstrerror(g_errno)); r->ptr_url,mstrerror(g_errno));
// return if that blocked // return if that blocked
if ( ! forwardRequest ( ) ) return false; if ( ! forwardRequest ( ) ) return false;
// a different g_errno should be set now! // a different g_errno should be set now!
@ -362,7 +377,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
// for it here // for it here
if ( g_conf.m_logDebugSpider ) if ( g_conf.m_logDebugSpider )
log("spider: error for %s: %s", log("spider: error for %s: %s",
r->m_url,mstrerror(g_errno)); r->ptr_url,mstrerror(g_errno));
return true; return true;
} }
@ -435,7 +450,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
// log it for now // log it for now
if ( g_conf.m_logDebugSpider ) if ( g_conf.m_logDebugSpider )
log("http: got doc %s %li to %li", log("http: got doc %s %li to %li",
r->m_url,(long)replySize,(long)uncompressedLen); r->ptr_url,(long)replySize,(long)uncompressedLen);
return true; return true;
} }
@ -458,9 +473,9 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
//if ( niceness == 0 ) { char *xx=NULL;*xx=0; } //if ( niceness == 0 ) { char *xx=NULL;*xx=0; }
// make sure we do not download gigablast.com admin pages! // make sure we do not download gigablast.com admin pages!
if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->m_urlLen >= 7 ) { if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->size_url-1 >= 7 ) {
Url url; Url url;
url.set ( r->m_url ); url.set ( r->ptr_url );
// . never download /master urls from ips of hosts in cluster // . never download /master urls from ips of hosts in cluster
// . TODO: FIX! the pages might be in another cluster! // . TODO: FIX! the pages might be in another cluster!
if ( ( strncasecmp ( url.getPath() , "/master/" , 8 ) == 0 || if ( ( strncasecmp ( url.getPath() , "/master/" , 8 ) == 0 ||
@ -500,7 +515,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// helpful for debugging. even though you may see a robots.txt // helpful for debugging. even though you may see a robots.txt
// redirect and think we are downloading that each time, // redirect and think we are downloading that each time,
// we are not... the redirect is cached here as well. // we are not... the redirect is cached here as well.
//log("spider: %s was in cache",r->m_url); //log("spider: %s was in cache",r->ptr_url);
// . send the cached reply back // . send the cached reply back
// . this will free send/read bufs on completion/g_errno // . this will free send/read bufs on completion/g_errno
g_udpServer.sendReply_ass ( rec , recSize , rec, recSize,slot); g_udpServer.sendReply_ass ( rec , recSize , rec, recSize,slot);
@ -510,7 +525,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// log it so we can see if we are hammering // log it so we can see if we are hammering
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads ) if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s", logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
r->m_url,iptoa(r->m_firstIp)); r->ptr_url,iptoa(r->m_firstIp));
// temporary hack // temporary hack
if ( r->m_parent ) { char *xx=NULL;*xx=0; } if ( r->m_parent ) { char *xx=NULL;*xx=0; }
@ -559,7 +574,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// which will store maybe a -1 if currently downloading... // which will store maybe a -1 if currently downloading...
if ( queueIt ) { if ( queueIt ) {
// debug // debug
//log("spider: adding %s to crawldelayqueue",r->m_url); //log("spider: adding %s to crawldelayqueue",r->ptr_url);
// save this // save this
r->m_udpSlot = slot; r->m_udpSlot = slot;
r->m_nextLink = NULL; r->m_nextLink = NULL;
@ -580,7 +595,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
if ( last > 0 && waited < r->m_crawlDelayMS ) { if ( last > 0 && waited < r->m_crawlDelayMS ) {
log("spider: hammering firstIp=%s url=%s " log("spider: hammering firstIp=%s url=%s "
"only waited %lli ms of %li ms", "only waited %lli ms of %li ms",
iptoa(r->m_firstIp),r->m_url,waited, iptoa(r->m_firstIp),r->ptr_url,waited,
r->m_crawlDelayMS); r->m_crawlDelayMS);
// this guy has too many redirects and it fails us... // this guy has too many redirects and it fails us...
// BUT do not core if running live, only if for test // BUT do not core if running live, only if for test
@ -598,14 +613,14 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// "firstIp=%s " // "firstIp=%s "
// "url=%s " // "url=%s "
// "to msg13::hammerCache", // "to msg13::hammerCache",
// nowms,iptoa(r->m_firstIp),r->m_url); // nowms,iptoa(r->m_firstIp),r->ptr_url);
// clear error from that if any, not important really // clear error from that if any, not important really
g_errno = 0; g_errno = 0;
} }
// try to get it from the test cache? // try to get it from the test cache?
TcpSocket ts; TcpSocket ts;
if ( r->m_useTestCache && getTestDoc ( r->m_url, &ts , r ) ) { if ( r->m_useTestCache && getTestDoc ( r->ptr_url, &ts , r ) ) {
// save this // save this
r->m_udpSlot = slot; r->m_udpSlot = slot;
// store the request so gotHttpReply can reply to it // store the request so gotHttpReply can reply to it
@ -672,7 +687,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// do not get .google.com/ crap // do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; } //if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }
downloadTheDocForReals ( r ); downloadTheDocForReals ( r );
} }
@ -689,7 +704,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
// this means our callback will be called // this means our callback will be called
if ( ! firstInLine ) { if ( ! firstInLine ) {
//log("spider: inlining %s",r->m_url); //log("spider: inlining %s",r->ptr_url);
return; return;
} }
@ -715,7 +730,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
"firstIp=%s " "firstIp=%s "
"url=%s " "url=%s "
"to msg13::hammerCache", "to msg13::hammerCache",
-1LL,iptoa(r->m_firstIp),r->m_url); -1LL,iptoa(r->m_firstIp),r->ptr_url);
// flag this // flag this
@ -723,7 +738,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
// note it here // note it here
if ( g_conf.m_logDebugSpider ) if ( g_conf.m_logDebugSpider )
log("spider: downloading %s (%s) (skiphammercheck=%li)", log("spider: downloading %s (%s) (skiphammercheck=%li)",
r->m_url,iptoa(r->m_urlIp) , r->ptr_url,iptoa(r->m_urlIp) ,
(long)r->m_skipHammerCheck); (long)r->m_skipHammerCheck);
// use the default agent unless scraping // use the default agent unless scraping
@ -755,7 +770,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
// download it // download it
if ( ! g_httpServer.getDoc ( r->m_url , if ( ! g_httpServer.getDoc ( r->ptr_url ,
r->m_urlIp , r->m_urlIp ,
0 , // offset 0 , // offset
-1 , -1 ,
@ -767,7 +782,10 @@ void downloadTheDocForReals ( Msg13Request *r ) {
r->m_httpProxyPort , r->m_httpProxyPort ,
r->m_maxTextDocLen , r->m_maxTextDocLen ,
r->m_maxOtherDocLen , r->m_maxOtherDocLen ,
agent ) ) agent ,
"HTTP/1.0" , // protocol
false , // do POST?
r->ptr_cookie ) )
// return false if blocked // return false if blocked
return; return;
// . log this so i know about it // . log this so i know about it
@ -818,7 +836,7 @@ void gotHttpReply2 ( void *state ,
if ( g_errno && g_conf.m_logDebugSpider ) if ( g_errno && g_conf.m_logDebugSpider )
log("spider: http reply (msg13) had error = %s " log("spider: http reply (msg13) had error = %s "
"for %s at ip %s", "for %s at ip %s",
mstrerror(g_errno),r->m_url,iptoa(r->m_urlIp)); mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
// get time now // get time now
long long nowms = gettimeofdayInMilliseconds(); long long nowms = gettimeofdayInMilliseconds();
@ -832,7 +850,7 @@ void gotHttpReply2 ( void *state ,
"firstIp=%s " "firstIp=%s "
"url=%s " "url=%s "
"to msg13::hammerCache", "to msg13::hammerCache",
nowms,iptoa(r->m_firstIp),r->m_url); nowms,iptoa(r->m_firstIp),r->ptr_url);
// sanity. this was happening from iframe download // sanity. this was happening from iframe download
@ -859,7 +877,7 @@ void gotHttpReply2 ( void *state ,
// note it // note it
if ( r->m_useTestCache && g_conf.m_logDebugSpider ) if ( r->m_useTestCache && g_conf.m_logDebugSpider )
logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%llu", logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%llu",
r->m_url,iptoa(r->m_firstIp),r->m_urlHash48); r->ptr_url,iptoa(r->m_firstIp),r->m_urlHash48);
long niceness = r->m_niceness; long niceness = r->m_niceness;
@ -986,7 +1004,7 @@ void gotHttpReply2 ( void *state ,
!r->m_isRobotsTxt && !r->m_isRobotsTxt &&
r->m_compressReply ) { r->m_compressReply ) {
long cs = getCharsetFast ( &mime, long cs = getCharsetFast ( &mime,
r->m_url, r->ptr_url,
content, content,
contentLen, contentLen,
niceness); niceness);
@ -1088,7 +1106,7 @@ void gotHttpReply2 ( void *state ,
// ok, did we have an error? // ok, did we have an error?
if ( g_errno ) if ( g_errno )
log("scproxy: xml set for %s had error: %s", log("scproxy: xml set for %s had error: %s",
r->m_url,mstrerror(g_errno)); r->ptr_url,mstrerror(g_errno));
// otherwise, i guess we had no iframes worthy of expanding // otherwise, i guess we had no iframes worthy of expanding
// so pretend we do not have any iframes // so pretend we do not have any iframes
hasIframe2 = false; hasIframe2 = false;
@ -1128,12 +1146,12 @@ void gotHttpReply2 ( void *state ,
} }
// nuke the content if from flurbit.com website!! // nuke the content if from flurbit.com website!!
if ( r->m_url && if ( r->ptr_url &&
replySize>0 && replySize>0 &&
goodStatus && goodStatus &&
strstr ( r->m_url,"flurbit.com/" ) ) { strstr ( r->ptr_url,"flurbit.com/" ) ) {
// note it in log // note it in log
log("msg13: got flurbit url: %s",r->m_url); log("msg13: got flurbit url: %s",r->ptr_url);
// record in the stats // record in the stats
docsPtr = &g_stats.m_compressUnchangedDocs; docsPtr = &g_stats.m_compressUnchangedDocs;
bytesInPtr = &g_stats.m_compressUnchangedBytesIn; bytesInPtr = &g_stats.m_compressUnchangedBytesIn;
@ -1366,7 +1384,7 @@ void gotHttpReply2 ( void *state ,
log("proxy: msg13: sending back error: %s " log("proxy: msg13: sending back error: %s "
"for url %s with ip %s", "for url %s with ip %s",
mstrerror(err), mstrerror(err),
r2->m_url, r2->ptr_url,
iptoa(r2->m_urlIp)); iptoa(r2->m_urlIp));
g_udpServer.sendErrorReply ( slot , err ); g_udpServer.sendErrorReply ( slot , err );
continue; continue;
@ -1412,7 +1430,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
if ( g_errno ) { if ( g_errno ) {
log("spider: error from proxy for %s: %s", log("spider: error from proxy for %s: %s",
r->m_url,mstrerror(g_errno)); r->ptr_url,mstrerror(g_errno));
g_udpServer.sendErrorReply(r->m_udpSlot, g_errno); g_udpServer.sendErrorReply(r->m_udpSlot, g_errno);
return; return;
} }
@ -2014,8 +2032,8 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
// make a fake spider request so we can do it // make a fake spider request so we can do it
SpiderRequest sreq; SpiderRequest sreq;
sreq.reset(); sreq.reset();
strcpy(sreq.m_url,r->m_url); strcpy(sreq.m_url,r->ptr_url);
long firstIp = hash32n(r->m_url); long firstIp = hash32n(r->ptr_url);
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
sreq.setKey( firstIp,0LL, false ); sreq.setKey( firstIp,0LL, false );
sreq.m_isInjecting = 1; sreq.m_isInjecting = 1;
@ -2027,7 +2045,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
// log it now // log it now
if ( g_conf.m_logDebugBuild ) if ( g_conf.m_logDebugBuild )
log("scproxy: expanding iframes for %s",r->m_url); log("scproxy: expanding iframes for %s",r->ptr_url);
// . use the enormous power of our new XmlDoc class // . use the enormous power of our new XmlDoc class
// . this returns false with g_errno set on error // . this returns false with g_errno set on error
@ -2108,7 +2126,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
char **ec = xd->getExpandedUtf8Content(); char **ec = xd->getExpandedUtf8Content();
// this means it blocked // this means it blocked
if ( ec == (void *)-1 ) { if ( ec == (void *)-1 ) {
//log("scproxy: waiting for %s",r->m_url); //log("scproxy: waiting for %s",r->ptr_url);
return false; return false;
} }
// return true with g_errno set // return true with g_errno set
@ -2128,7 +2146,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
// so i'd think indicative of something special // so i'd think indicative of something special
if ( g_conf.m_logDebugBuild ) if ( g_conf.m_logDebugBuild )
log("scproxy: got iframe expansion without blocking for url=%s" log("scproxy: got iframe expansion without blocking for url=%s"
" err=%s",r->m_url,mstrerror(g_errno)); " err=%s",r->ptr_url,mstrerror(g_errno));
// save g_errno for returning // save g_errno for returning
long saved = g_errno; long saved = g_errno;
@ -2169,11 +2187,11 @@ void gotIframeExpandedContent ( void *state ) {
// this was stored in xd // this was stored in xd
Msg13Request *r = xd->m_r; Msg13Request *r = xd->m_r;
//log("scproxy: done waiting for %s",r->m_url); //log("scproxy: done waiting for %s",r->ptr_url);
// note it // note it
if ( g_conf.m_logDebugBuild ) if ( g_conf.m_logDebugBuild )
log("scproxy: got iframe expansion for url=%s",r->m_url); log("scproxy: got iframe expansion for url=%s",r->ptr_url);
// assume we had no expansion or there was an error // assume we had no expansion or there was an error
char *reply = NULL; char *reply = NULL;
@ -2212,7 +2230,7 @@ void gotIframeExpandedContent ( void *state ) {
// on the main cluster! // on the main cluster!
if ( g_errno ) if ( g_errno )
log("scproxy: error getting iframe content for url=%s : %s", log("scproxy: error getting iframe content for url=%s : %s",
r->m_url,mstrerror(g_errno)); r->ptr_url,mstrerror(g_errno));
// sanity check // sanity check
if ( reply && reply[replySize-1] != '\0') { char *xx=NULL;*xx=0; } if ( reply && reply[replySize-1] != '\0') { char *xx=NULL;*xx=0; }
// pass back the error we had, if any // pass back the error we had, if any
@ -2270,7 +2288,7 @@ void scanHammerQueue ( int fd , void *state ) {
// debug // debug
//log("spider: downloading %s from crawldelay queue " //log("spider: downloading %s from crawldelay queue "
// "waited=%llims crawldelay=%lims", // "waited=%llims crawldelay=%lims",
// r->m_url,waited,r->m_crawlDelayMS); // r->ptr_url,waited,r->m_crawlDelayMS);
// good to go // good to go
downloadTheDocForReals ( r ); downloadTheDocForReals ( r );

15
Msg13.h
View File

@ -80,16 +80,23 @@ public:
long long m_cacheKey; long long m_cacheKey;
char m_testDir[32]; char m_testDir[32];
// msg13 sets this too, so you don't have to worry about setting it // msg13 sets this too, so you don't have to worry about setting it
long m_urlLen; //long m_urlLen;
// includes \0 termination // includes \0 termination
char m_url[MAX_URL_LEN+1]; //char m_url[MAX_URL_LEN+1];
char *ptr_url;
char *ptr_cookie;
long size_url;
long size_cookie;
long getSize() { long getSize() {
return ((char *)m_url-(char *)this) +m_urlLen +1;}; return ((char *)ptr_url-(char *)this) +size_url+size_cookie;};
// zero it all out // zero it all out
void reset() { void reset() {
memset (this,0,(char *)m_url - (char *)this + 1); //memset (this,0,(char *)m_url - (char *)this + 1);
memset (this,0,sizeof(Msg13Request));
m_maxTextDocLen = -1; // no limit m_maxTextDocLen = -1; // no limit
m_maxOtherDocLen = -1; // no limit m_maxOtherDocLen = -1; // no limit
m_crawlDelayMS = -1; // unknown or none m_crawlDelayMS = -1; // unknown or none

16
Url.cpp
View File

@ -195,14 +195,14 @@ void Url::set ( char *t , long tlen , bool addWWW , bool stripSessionId ,
s[len]='\0'; s[len]='\0';
// make http:////www.xyz.com into http://www.xyz.com // make http:////www.xyz.com into http://www.xyz.com
if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" , 9 ) ) { // if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" ,9) ){
memcpy (s+7,s+9,len-9+1); // memcpy (s+7,s+9,len-9+1);
len -= 2; // len -= 2;
} // }
if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////", 10 ) ) { // if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////",10)){
memcpy (s+8,s+10,len-9+1); // memcpy (s+8,s+10,len-9+1);
len -= 2; // len -= 2;
} // }
// . remove session ids from s // . remove session ids from s
// . ';' most likely preceeds a session id // . ';' most likely preceeds a session id

View File

@ -9412,6 +9412,23 @@ Url **XmlDoc::getRedirUrl() {
// breathe // breathe
QUICKPOLL(m_niceness); QUICKPOLL(m_niceness);
// get cookie for redirect to fix nyt.com
char *cookie = mime.getCookie();
// find end of cookie at the semicolon
char *s = cookie;
for ( ; s && *s && *s != ';' ; s++ );
if ( s && *s == ';' ) {
// do not include ;
long clen = s - cookie;
m_redirCookieBuf.reset();
m_redirCookieBuf.safeMemcpy ( cookie , clen );
m_redirCookieBuf.nullTerm();
m_redirCookieBufValid = true;
}
// mdw23
//log("http: reply=%s",m_httpReply);
// a hack for removing session ids already in there. for // a hack for removing session ids already in there. for
// brilliantshopper's bs4 collection and gk0 cluster // brilliantshopper's bs4 collection and gk0 cluster
//bool forceRedirect = false; //bool forceRedirect = false;
@ -9520,7 +9537,7 @@ Url **XmlDoc::getRedirUrl() {
// . if we followed too many then bail // . if we followed too many then bail
// . www.motorolamobility.com www.outlook.com ... failed when we // . www.motorolamobility.com www.outlook.com ... failed when we
// had >= 4 here // had >= 4 here
if ( ++m_numRedirects >= 7 ) { if ( ++m_numRedirects >= 10 ) {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS; if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr; return &m_redirUrlPtr;
} }
@ -14711,7 +14728,9 @@ char **XmlDoc::getHttpReply2 ( ) {
// clear it first // clear it first
r->reset(); r->reset();
// and set the url // and set the url
strcpy ( r->m_url , cu->getUrl() ); //strcpy ( r->m_url , cu->getUrl() );
r->ptr_url = cu->getUrl();
r->size_url = cu->getUrlLen()+1;
// sanity check // sanity check
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; } if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
// max to download in bytes. currently 1MB. // max to download in bytes. currently 1MB.
@ -14747,6 +14766,15 @@ char **XmlDoc::getHttpReply2 ( ) {
r->m_ifModifiedSince = 0; r->m_ifModifiedSince = 0;
r->m_skipHammerCheck = 0; r->m_skipHammerCheck = 0;
if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
r->ptr_cookie = m_redirCookieBuf.getBufStart();
r->size_cookie = m_redirCookieBuf.length() + 1;
// . only do once per redirect
// . do not invalidate because we might have to carry it
// through to the next redir... unless we change domain
// . this fixes the nyt.com bug some more
//m_redirCookieBufValid = false;
}
// . this is -1 if unknown. none found in robots.txt or provided // . this is -1 if unknown. none found in robots.txt or provided
// in the custom crawl parms. // in the custom crawl parms.

View File

@ -962,6 +962,7 @@ class XmlDoc {
Url m_redirUrl; Url m_redirUrl;
Url *m_redirUrlPtr; Url *m_redirUrlPtr;
SafeBuf m_redirCookieBuf;
Url m_metaRedirUrl; Url m_metaRedirUrl;
Url *m_metaRedirUrlPtr; Url *m_metaRedirUrlPtr;
Url m_canonicalRedirUrl; Url m_canonicalRedirUrl;
@ -1235,6 +1236,7 @@ class XmlDoc {
//bool m_tryAgainTimeDeltaValid; //bool m_tryAgainTimeDeltaValid;
//bool m_eliminateMenusValid; //bool m_eliminateMenusValid;
bool m_redirUrlValid; bool m_redirUrlValid;
bool m_redirCookieBufValid;
bool m_metaRedirUrlValid; bool m_metaRedirUrlValid;
bool m_canonicalRedirUrlValid; bool m_canonicalRedirUrlValid;
bool m_statusMsgValid; bool m_statusMsgValid;