Fix nyt.com cookie redirect bug.

Fix bug when POSTing an injection request with multipart/form-data.
2024-10-04 12:17:35 +03:00 · 2014-08-05 17:04:11 -07:00 · 2014-08-05 17:04:11 -07:00 · cc1ceaaac2
commit cc1ceaaac2
parent 146e45db56
9 changed files with 145 additions and 66 deletions
--- a/HttpMime.cpp
+++ b/HttpMime.cpp
@ -159,9 +159,10 @@ bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) {
 		}
 		else if ( strncasecmp ( p , "Content-Type:"   ,13) == 0 ) 
 			m_contentType = getContentTypePrivate ( p + 13 );
-		else if ( strncasecmp ( p , "Set-Cookie: "   ,11) == 0 ) {
+		else if ( strncasecmp ( p , "Set-Cookie:"   ,10) == 0 ) {
 			m_cookie = p + 11;
-			m_cookieLen = gbstrlen ( p + 11 );
+			if ( m_cookie[0] == ' ' ) m_cookie++;
+			m_cookieLen = gbstrlen ( m_cookie );
 		}
 		else if ( strncasecmp ( p , "Location:"       , 9) == 0 ) {
 			// point to it
--- a/HttpRequest.cpp
+++ b/HttpRequest.cpp
@ -181,6 +181,10 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
 	if ( size == 0 ) cmd = "HEAD";
 	if ( doPost    ) cmd = "POST";

+	// nyt.com can't be spidered with our plain HTTP/1.0 requests, so stay
+	// on 1.0 but also send "Connection: Close\r\n" when making requests
+	//proto = "HTTP/1.1";
+
 	 // . now use "Accept-Language: en" to tell servers we prefer english
 	 // . i removed keep-alive connection since some connections close on
 	 //   non-200 ok http statuses and we think they're open since close
@ -212,6 +216,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
 			   "Host: %s\r\n"
 			   "%s"
 			   "User-Agent: %s\r\n"
+			   "Connection: Close\r\n"
 			   //"Connection: Keep-Alive\r\n" 
 			   "Accept-Language: en\r\n"
 			   //"Accept: */*\r\n\r\n" ,
@ -226,6 +231,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
 			   "Host: %s\r\n"
 			   "%s"
 			   "User-Agent: %s\r\n"
+			   "Connection: Close\r\n"
 			   //"Connection: Keep-Alive\r\n"
 			   "Accept-Language: en\r\n"
 			   //"Accept: */*\r\n"
@ -246,6 +252,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
 			   "Host: %s\r\n"
 			   "%s"
 			   "User-Agent: %s\r\n"
+			   "Connection: Close\r\n"
 			   //"Connection: Keep-Alive\r\n"
 			   "Accept-Language: en\r\n"
 			   //"Accept: */*\r\n"
@ -275,6 +282,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
 			   "Accept: */*\r\n" 
 			   "Host: %s\r\n"
 			   "%s"
+			   "Connection: Close\r\n"
 			   //"Connection: Keep-Alive\r\n"
 			   //"Accept-Language: en\r\n"
 				"%s",
@ -417,6 +425,12 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
 		 else log("http: Got POST request without \\r\\n\\r\\n.");
 	 }

+	 bool multipart = false;
+	 if ( m_requestType == 2 ) { // is POST?
+		 char *cd =strcasestr(req,"Content-Type: multipart/form-data");
+		 if ( cd ) multipart = true;
+	 }
+
 	 // . point to the file path 
 	 // . skip over the "GET "
 	 long filenameStart = 4 ;
@ -812,7 +826,8 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
 	 }

 	 // Put '\0' back into the HttpRequest buffer...
-	 if (m_cgiBuf){
+	 // ...but skip this for multipart/form-data POSTs, whose content is unencoded
+	 if ( m_cgiBuf && ! multipart ) {
 		 // do not mangle the "ucontent"!
 		 long cgiBufLen = m_cgiBufLen;
 		 cgiBufLen -= m_ucontentLen;
--- a/HttpServer.cpp
+++ b/HttpServer.cpp
@ -190,8 +190,14 @@ bool HttpServer::getDoc ( char   *url      ,
 	char *host = getHostFast ( url , &hostLen , &port );
 	

+	// mdw23
 	//if ( g_conf.m_logDebugSpider )
-	//	log("spider: httprequest = %s", req );
+	// {
+	// 	SafeBuf tmp;
+	// 	tmp.safeMemcpy ( req , reqSize );
+	// 	tmp.nullTerm();
+	// 	log("spider: httprequest = %s", tmp.getBufStart() );
+	// }


 	// do we have an ip to send to? assume not
--- a/Images.cpp
+++ b/Images.cpp
@ -692,7 +692,9 @@ bool Images::downloadImage ( ) {
 		r->m_addToTestCache = 1;
 	}
 	// url is the most important
-	strcpy(r->m_url,m_imageUrl.getUrl());
+	//strcpy(r->m_url,m_imageUrl.getUrl());
+	r-> ptr_url = m_imageUrl.getUrl();
+	r->size_url = m_imageUrl.getUrlLen()+1; // include \0
 	// . try to download it
 	// . i guess we are ignoring hammers at this point
 	if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper)) 
--- a/Msg13.cpp
+++ b/Msg13.cpp
@ -164,8 +164,8 @@ bool Msg13::getDoc ( Msg13Request *r,
 	if ( r->m_urlIp == -1 ) { char *xx = NULL; *xx = 0; }

 	// set this
-	r->m_urlLen    = gbstrlen ( r->m_url );
-	r->m_urlHash64 = hash64 ( r->m_url , r->m_urlLen );
+	//r->m_urlLen    = gbstrlen ( r->ptr_url );
+	r->m_urlHash64 = hash64 ( r->ptr_url , r->size_url-1);//m_urlLen );

 	// sanity check, if spidering the test coll make sure one of 
 	// these is true!! this prevents us from mistakenly turning it off
@ -186,8 +186,8 @@ bool Msg13::getDoc ( Msg13Request *r,
 	//	r->m_testParserEnabled = true;

 	// is this a /robots.txt url?
-	if ( r->m_urlLen > 12 && 
-	     ! strncmp ( r->m_url + r->m_urlLen - 11,"/robots.txt",11))
+	if ( r->size_url - 1 > 12 && 
+	     ! strncmp ( r->ptr_url + r->size_url -1 -11,"/robots.txt",11))
 		r->m_isRobotsTxt = true;

 	// force caching if getting robots.txt so is compressed in cache
@ -195,7 +195,7 @@ bool Msg13::getDoc ( Msg13Request *r,
 		r->m_compressReply = true;

 	// do not get .google.com/ crap
-	//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
+	//if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }

 	// set it for this too
 	//if ( g_conf.m_useCompressionProxy ) {
@ -261,19 +261,33 @@ bool Msg13::forwardRequest ( ) {
 		logf ( LOG_DEBUG, 
 		       "spider: sending download request of %s firstIp=%s "
 		       "uh48=%llu to "
-		       "host %li (child=%li)", r->m_url, iptoa(r->m_firstIp), 
+		       "host %li (child=%li)", r->ptr_url, iptoa(r->m_firstIp), 
 		       r->m_urlHash48, hostId,
 		       r->m_skipHammerCheck);


 	// fill up the request
-	long requestSize = r->getSize();
+	long requestBufSize = r->getSize();
+
+	// we have to serialize it now because it has cookies as well as
+	// the url.
+	char *requestBuf = serializeMsg ( sizeof(Msg39Request),
+					  &r->size_url,
+					  &r->size_cookie,
+					  &r->ptr_url,
+					  r,
+					  &requestBufSize ,
+					  NULL , 
+					  0,//RBUF_SIZE , 
+					  false );
+	// g_errno should be set in this case, most likely to ENOMEM
+	if ( ! requestBuf ) return true;

 	// . otherwise, send the request to the key host
 	// . returns false and sets g_errno on error
 	// . now wait for 2 minutes before timing out
-	if ( ! g_udpServer.sendRequest ( (char *)r    ,
-					 requestSize  , 
+	if ( ! g_udpServer.sendRequest ( requestBuf, // (char *)r    ,
+					 requestBufSize  , 
 					 0x13         , // msgType 0x13
 					 h->m_ip      ,
 					 h->m_port    ,
@ -309,7 +323,8 @@ void gotForwardedReplyWrapper ( void *state , UdpSlot *slot ) {

 bool Msg13::gotForwardedReply ( UdpSlot *slot ) {
 	// don't let udpserver free the request, it's our m_request[]
-	slot->m_sendBufAlloc = NULL;
+	// no, now let him free it because it was serialized into there
+	//slot->m_sendBufAlloc = NULL;
 	// what did he give us?
 	char *reply          = slot->m_readBuf;
 	long  replySize      = slot->m_readBufSize;
@ -343,7 +358,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){

 	if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
 		logf(LOG_DEBUG,"spider: FINALIZED %s firstIp=%s",
-		     r->m_url,iptoa(r->m_firstIp));
+		     r->ptr_url,iptoa(r->m_firstIp));


 	// . if timed out probably the host is now dead so try another one!
@ -351,7 +366,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
 	if ( g_errno == EUDPTIMEDOUT ) {
 		// try again
 		log("spider: retrying1. had error for %s : %s",
-		    r->m_url,mstrerror(g_errno));
+		    r->ptr_url,mstrerror(g_errno));
 		// return if that blocked
 		if ( ! forwardRequest ( ) ) return false;
 		// a different g_errno should be set now!
@ -362,7 +377,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
 		// for it here
 		if ( g_conf.m_logDebugSpider )
 			log("spider: error for %s: %s",
-			    r->m_url,mstrerror(g_errno));
+			    r->ptr_url,mstrerror(g_errno));
 		return true;
 	}

@ -435,7 +450,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
 	// log it for now
 	if ( g_conf.m_logDebugSpider )
 		log("http: got doc %s %li to %li",
-		    r->m_url,(long)replySize,(long)uncompressedLen);
+		    r->ptr_url,(long)replySize,(long)uncompressedLen);

 	return true;
 }
@ -458,9 +473,9 @@ void handleRequest13 ( UdpSlot *slot , long niceness  ) {
 	//if ( niceness == 0 ) { char *xx=NULL;*xx=0; }

 	// make sure we do not download gigablast.com admin pages!
-	if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->m_urlLen >= 7 ) {
+	if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->size_url-1 >= 7 ) {
 		Url url;
-		url.set ( r->m_url );
+		url.set ( r->ptr_url );
 		// . never download /master urls from ips of hosts in cluster
 		// . TODO: FIX! the pages might be in another cluster!
 		if ( ( strncasecmp ( url.getPath() , "/master/" , 8 ) == 0 ||
@ -500,7 +515,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness  ) {
 		// helpful for debugging. even though you may see a robots.txt
 		// redirect and think we are downloading that each time,
 		// we are not... the redirect is cached here as well.
-		//log("spider: %s was in cache",r->m_url);
+		//log("spider: %s was in cache",r->ptr_url);
 		// . send the cached reply back
 		// . this will free send/read bufs on completion/g_errno
 		g_udpServer.sendReply_ass ( rec , recSize , rec, recSize,slot);
@ -510,7 +525,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness  ) {
 	// log it so we can see if we are hammering
 	if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
 		logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
-		     r->m_url,iptoa(r->m_firstIp));
+		     r->ptr_url,iptoa(r->m_firstIp));

 	// temporary hack
 	if ( r->m_parent ) { char *xx=NULL;*xx=0; }
@ -559,7 +574,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness  ) {
 		//   which will store maybe a -1 if currently downloading...
 		if ( queueIt ) {
 			// debug
-			//log("spider: adding %s to crawldelayqueue",r->m_url);
+			//log("spider: adding %s to crawldelayqueue",r->ptr_url);
 			// save this
 			r->m_udpSlot = slot;
 			r->m_nextLink = NULL;
@ -580,7 +595,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness  ) {
 		if ( last > 0 && waited < r->m_crawlDelayMS ) {
 			log("spider: hammering firstIp=%s url=%s "
 			    "only waited %lli ms of %li ms",
-			    iptoa(r->m_firstIp),r->m_url,waited,
+			    iptoa(r->m_firstIp),r->ptr_url,waited,
 			    r->m_crawlDelayMS);
 			// this guy has too many redirects and it fails us...
 			// BUT do not core if running live, only if for test
@ -598,14 +613,14 @@ void handleRequest13 ( UdpSlot *slot , long niceness  ) {
 		//	    "firstIp=%s "
 		//	    "url=%s "
 		//	    "to msg13::hammerCache",
-		//	    nowms,iptoa(r->m_firstIp),r->m_url);
+		//	    nowms,iptoa(r->m_firstIp),r->ptr_url);
 		// clear error from that if any, not important really
 		g_errno = 0;
 	}

 	// try to get it from the test cache?
 	TcpSocket ts;
-	if ( r->m_useTestCache && getTestDoc ( r->m_url, &ts , r ) ) {
+	if ( r->m_useTestCache && getTestDoc ( r->ptr_url, &ts , r ) ) {
 		// save this
 		r->m_udpSlot = slot;
 		// store the request so gotHttpReply can reply to it
@ -672,7 +687,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness  ) {


 	// do not get .google.com/ crap
-	//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
+	//if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }

 	downloadTheDocForReals ( r );
 }
@ -689,7 +704,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {

 	// this means our callback will be called
 	if ( ! firstInLine ) {
-		//log("spider: inlining %s",r->m_url);
+		//log("spider: inlining %s",r->ptr_url);
 		return;
 	}

@ -715,7 +730,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
 		    "firstIp=%s "
 		    "url=%s "
 		    "to msg13::hammerCache",
-		    -1LL,iptoa(r->m_firstIp),r->m_url);
+		    -1LL,iptoa(r->m_firstIp),r->ptr_url);


 	// flag this
@ -723,7 +738,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
 	// note it here
 	if ( g_conf.m_logDebugSpider )
 		log("spider: downloading %s (%s) (skiphammercheck=%li)",
-		    r->m_url,iptoa(r->m_urlIp) ,
+		    r->ptr_url,iptoa(r->m_urlIp) ,
 		    (long)r->m_skipHammerCheck);

 	// use the default agent unless scraping
@ -755,7 +770,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {


 	// download it
-	if ( ! g_httpServer.getDoc ( r->m_url             ,
+	if ( ! g_httpServer.getDoc ( r->ptr_url             ,
 				     r->m_urlIp           ,
 				     0                    , // offset
 				     -1                   ,
@ -767,7 +782,10 @@ void downloadTheDocForReals ( Msg13Request *r ) {
 				     r->m_httpProxyPort   ,
 				     r->m_maxTextDocLen   ,
 				     r->m_maxOtherDocLen  ,
-				     agent                ) )
+				     agent                ,
+				     "HTTP/1.0"           , // protocol
+				     false                , // do POST?
+				     r->ptr_cookie        ) )
 		// return false if blocked
 		return;
 	// . log this so i know about it
@ -818,7 +836,7 @@ void gotHttpReply2 ( void *state ,
 	if ( g_errno && g_conf.m_logDebugSpider )
 		log("spider: http reply (msg13) had error = %s "
 		    "for %s at ip %s",
-		    mstrerror(g_errno),r->m_url,iptoa(r->m_urlIp));
+		    mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));

 	// get time now
 	long long nowms = gettimeofdayInMilliseconds();
@ -832,7 +850,7 @@ void gotHttpReply2 ( void *state ,
 		    "firstIp=%s "
 		    "url=%s "
 		    "to msg13::hammerCache",
-		    nowms,iptoa(r->m_firstIp),r->m_url);
+		    nowms,iptoa(r->m_firstIp),r->ptr_url);


 	// sanity. this was happening from iframe download
@ -859,7 +877,7 @@ void gotHttpReply2 ( void *state ,
 	// note it
 	if ( r->m_useTestCache && g_conf.m_logDebugSpider )
 		logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%llu",
-		     r->m_url,iptoa(r->m_firstIp),r->m_urlHash48);
+		     r->ptr_url,iptoa(r->m_firstIp),r->m_urlHash48);

 	long niceness = r->m_niceness;

@ -986,7 +1004,7 @@ void gotHttpReply2 ( void *state ,
 	     !r->m_isRobotsTxt && 
 	     r->m_compressReply ) {
 		long cs = getCharsetFast ( &mime,
-					   r->m_url,
+					   r->ptr_url,
 					   content,
 					   contentLen,
 					   niceness);
@ -1088,7 +1106,7 @@ void gotHttpReply2 ( void *state ,
 		// ok, did we have an error?
 		if ( g_errno )
 			log("scproxy: xml set for %s had error: %s",
-			    r->m_url,mstrerror(g_errno));
+			    r->ptr_url,mstrerror(g_errno));
 		// otherwise, i guess we had no iframes worthy of expanding
 		// so pretend we do not have any iframes
 		hasIframe2 = false;
@ -1128,12 +1146,12 @@ void gotHttpReply2 ( void *state ,
 	}

 	// nuke the content if from flurbit.com website!!
-	if ( r->m_url &&
+	if ( r->ptr_url &&
 	     replySize>0 &&
 	     goodStatus &&
-	     strstr ( r->m_url,"flurbit.com/" ) ) {
+	     strstr ( r->ptr_url,"flurbit.com/" ) ) {
 		// note it in log
-		log("msg13: got flurbit url: %s",r->m_url);
+		log("msg13: got flurbit url: %s",r->ptr_url);
 		// record in the stats
 		docsPtr     = &g_stats.m_compressUnchangedDocs;
 		bytesInPtr  = &g_stats.m_compressUnchangedBytesIn;
@ -1366,7 +1384,7 @@ void gotHttpReply2 ( void *state ,
 				log("proxy: msg13: sending back error: %s "
 				    "for url %s with ip %s",
 				    mstrerror(err),
-				    r2->m_url,
+				    r2->ptr_url,
 				    iptoa(r2->m_urlIp));
 			g_udpServer.sendErrorReply ( slot , err );
 			continue;
@ -1412,7 +1430,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {

 	if ( g_errno ) {
 		log("spider: error from proxy for %s: %s",
-		    r->m_url,mstrerror(g_errno));
+		    r->ptr_url,mstrerror(g_errno));
 		g_udpServer.sendErrorReply(r->m_udpSlot, g_errno);
 		return;
 	}
@ -2014,8 +2032,8 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
 	// make a fake spider request so we can do it
 	SpiderRequest sreq;
 	sreq.reset();
-	strcpy(sreq.m_url,r->m_url);
-	long firstIp = hash32n(r->m_url);
+	strcpy(sreq.m_url,r->ptr_url);
+	long firstIp = hash32n(r->ptr_url);
 	if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
 	sreq.setKey( firstIp,0LL, false );
 	sreq.m_isInjecting   = 1; 
@ -2027,7 +2045,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {

 	// log it now
 	if ( g_conf.m_logDebugBuild ) 
-		log("scproxy: expanding iframes for %s",r->m_url);
+		log("scproxy: expanding iframes for %s",r->ptr_url);

 	// . use the enormous power of our new XmlDoc class
 	// . this returns false with g_errno set on error
@ -2108,7 +2126,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
 	char **ec = xd->getExpandedUtf8Content();
 	// this means it blocked
 	if ( ec == (void *)-1 ) {
-		//log("scproxy: waiting for %s",r->m_url);
+		//log("scproxy: waiting for %s",r->ptr_url);
 		return false;
 	}
 	// return true with g_errno set
@ -2128,7 +2146,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
 	// so i'd think indicative of something special
 	if ( g_conf.m_logDebugBuild ) 
 		log("scproxy: got iframe expansion without blocking for url=%s"
-		    " err=%s",r->m_url,mstrerror(g_errno));
+		    " err=%s",r->ptr_url,mstrerror(g_errno));

 	// save g_errno for returning
 	long saved = g_errno;
@ -2169,11 +2187,11 @@ void gotIframeExpandedContent ( void *state ) {
 	// this was stored in xd
 	Msg13Request *r = xd->m_r;

-	//log("scproxy: done waiting for %s",r->m_url);
+	//log("scproxy: done waiting for %s",r->ptr_url);

 	// note it
 	if ( g_conf.m_logDebugBuild ) 
-		log("scproxy: got iframe expansion for url=%s",r->m_url);
+		log("scproxy: got iframe expansion for url=%s",r->ptr_url);

 	// assume we had no expansion or there was an error
 	char *reply          = NULL;
@ -2212,7 +2230,7 @@ void gotIframeExpandedContent ( void *state ) {
 	// on the main cluster!
 	if ( g_errno )
 		log("scproxy: error getting iframe content for url=%s : %s",
-		    r->m_url,mstrerror(g_errno));
+		    r->ptr_url,mstrerror(g_errno));
 	// sanity check
 	if ( reply && reply[replySize-1] != '\0') { char *xx=NULL;*xx=0; }
 	// pass back the error we had, if any
@ -2270,7 +2288,7 @@ void scanHammerQueue ( int fd , void *state ) {
 		// debug
 		//log("spider: downloading %s from crawldelay queue "
 		//    "waited=%llims crawldelay=%lims", 
-		//    r->m_url,waited,r->m_crawlDelayMS);
+		//    r->ptr_url,waited,r->m_crawlDelayMS);

 		// good to go
 		downloadTheDocForReals ( r );
--- a/Msg13.h
+++ b/Msg13.h
@ -80,16 +80,23 @@ public:
 	long long m_cacheKey;
 	char      m_testDir[32];
 	// msg13 sets this too, so you don't have to worry about setting it
-	long      m_urlLen;
+	//long      m_urlLen;
 	// includes \0 termination
-	char      m_url[MAX_URL_LEN+1];
+	//char      m_url[MAX_URL_LEN+1];
+
+	char *ptr_url;
+	char *ptr_cookie;
+
+	long  size_url;
+	long  size_cookie;

 	long getSize() {
-		return ((char *)m_url-(char *)this) +m_urlLen +1;};
+		return ((char *)ptr_url-(char *)this) +size_url+size_cookie;};

 	// zero it all out
 	void reset() {
-		memset (this,0,(char *)m_url - (char *)this + 1); 
+		//memset (this,0,(char *)m_url - (char *)this + 1); 
+		memset (this,0,sizeof(Msg13Request));
 		m_maxTextDocLen  = -1; // no limit
 		m_maxOtherDocLen = -1; // no limit
 		m_crawlDelayMS   = -1; // unknown or none
--- a/Url.cpp
+++ b/Url.cpp
@ -195,14 +195,14 @@ void Url::set ( char *t , long tlen , bool addWWW , bool stripSessionId ,
 	s[len]='\0';

 	// make http:////www.xyz.com into http://www.xyz.com
-	if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" , 9 ) ) {
-		memcpy (s+7,s+9,len-9+1);
-		len -= 2;
-	}
-	if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////", 10 ) ) {
-		memcpy (s+8,s+10,len-9+1);
-		len -= 2;
-	}
+	// if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" ,9) ){
+	// 	memcpy (s+7,s+9,len-9+1);
+	// 	len -= 2;
+	// }
+	// if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////",10)){
+	// 	memcpy (s+8,s+10,len-9+1);
+	// 	len -= 2;
+	// }

 	// . remove session ids from s
 	// . ';' most likely precedes a session id
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -9412,6 +9412,23 @@ Url **XmlDoc::getRedirUrl() {
 	// breathe
 	QUICKPOLL(m_niceness);

+	// get cookie for redirect to fix nyt.com
+	char *cookie = mime.getCookie();
+	// find end of cookie at the semicolon
+	char *s = cookie;
+	for ( ; s && *s && *s != ';' ; s++ );
+	if ( s && *s == ';' ) {
+		// do not include ;
+		long clen = s - cookie;
+		m_redirCookieBuf.reset();
+		m_redirCookieBuf.safeMemcpy ( cookie , clen );
+		m_redirCookieBuf.nullTerm();
+		m_redirCookieBufValid = true;
+	}
+
+	// mdw23
+	//log("http: reply=%s",m_httpReply);
+
 	// a hack for removing session ids already in there. for 
 	// brilliantshopper's bs4 collection and gk0 cluster
 	//bool forceRedirect = false;
@ -9520,7 +9537,7 @@ Url **XmlDoc::getRedirUrl() {
 	// . if we followed too many then bail
 	// . www.motorolamobility.com www.outlook.com ... failed when we 
 	//   had >= 4 here
-	if ( ++m_numRedirects >= 7 ) {
+	if ( ++m_numRedirects >= 10 ) {
 		if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
 		return &m_redirUrlPtr;
 	}
@ -14711,7 +14728,9 @@ char **XmlDoc::getHttpReply2 ( ) {
 	// clear it first
 	r->reset();
 	// and set the url
-	strcpy ( r->m_url , cu->getUrl() );
+	//strcpy ( r->m_url , cu->getUrl() );
+	r->ptr_url  = cu->getUrl();
+	r->size_url = cu->getUrlLen()+1;
 	// sanity check
 	if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
 	// max to download in bytes. currently 1MB.
@ -14747,6 +14766,15 @@ char **XmlDoc::getHttpReply2 ( ) {
 	r->m_ifModifiedSince        = 0;
 	r->m_skipHammerCheck        = 0;

+	if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
+		r->ptr_cookie  = m_redirCookieBuf.getBufStart();
+		r->size_cookie = m_redirCookieBuf.length() + 1;
+		// . only do once per redirect
+		// . do not invalidate because we might have to carry it
+		//   through to the next redir... unless we change domain
+		// . this fixes the nyt.com bug some more
+		//m_redirCookieBufValid = false;
+	}

 	// . this is -1 if unknown. none found in robots.txt or provided
 	//   in the custom crawl parms.
--- a/XmlDoc.h
+++ b/XmlDoc.h
@ -962,6 +962,7 @@ class XmlDoc {

 	Url        m_redirUrl;
 	Url       *m_redirUrlPtr;
+	SafeBuf    m_redirCookieBuf;
 	Url        m_metaRedirUrl;
 	Url       *m_metaRedirUrlPtr;
 	Url        m_canonicalRedirUrl;
@ -1235,6 +1236,7 @@ class XmlDoc {
 	//bool m_tryAgainTimeDeltaValid;
 	//bool m_eliminateMenusValid;
 	bool m_redirUrlValid;
+	bool m_redirCookieBufValid;
 	bool m_metaRedirUrlValid;
 	bool m_canonicalRedirUrlValid;
 	bool m_statusMsgValid;