mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Fix nyt.com cookie redirect bug.
Fix bug when POSTing an injection request with multipart/form-data.
This commit is contained in:
parent
146e45db56
commit
cc1ceaaac2
@ -159,9 +159,10 @@ bool HttpMime::parse ( char *mime , long mimeLen , Url *url ) {
|
||||
}
|
||||
else if ( strncasecmp ( p , "Content-Type:" ,13) == 0 )
|
||||
m_contentType = getContentTypePrivate ( p + 13 );
|
||||
else if ( strncasecmp ( p , "Set-Cookie: " ,11) == 0 ) {
|
||||
else if ( strncasecmp ( p , "Set-Cookie:" ,10) == 0 ) {
|
||||
m_cookie = p + 11;
|
||||
m_cookieLen = gbstrlen ( p + 11 );
|
||||
if ( m_cookie[0] == ' ' ) m_cookie++;
|
||||
m_cookieLen = gbstrlen ( m_cookie );
|
||||
}
|
||||
else if ( strncasecmp ( p , "Location:" , 9) == 0 ) {
|
||||
// point to it
|
||||
|
@ -181,6 +181,10 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
if ( size == 0 ) cmd = "HEAD";
|
||||
if ( doPost ) cmd = "POST";
|
||||
|
||||
// crap, can't spider nyt.com with a plain HTTP/1.0 request, so stay on
|
||||
// 1.0 but also send "Connection: Close\r\n" when making requests
|
||||
//proto = "HTTP/1.1";
|
||||
|
||||
// . now use "Accept-Language: en" to tell servers we prefer english
|
||||
// . i removed keep-alive connection since some connections close on
|
||||
// non-200 ok http statuses and we think they're open since close
|
||||
@ -212,6 +216,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
"Host: %s\r\n"
|
||||
"%s"
|
||||
"User-Agent: %s\r\n"
|
||||
"Connection: Close\r\n"
|
||||
//"Connection: Keep-Alive\r\n"
|
||||
"Accept-Language: en\r\n"
|
||||
//"Accept: */*\r\n\r\n" ,
|
||||
@ -226,6 +231,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
"Host: %s\r\n"
|
||||
"%s"
|
||||
"User-Agent: %s\r\n"
|
||||
"Connection: Close\r\n"
|
||||
//"Connection: Keep-Alive\r\n"
|
||||
"Accept-Language: en\r\n"
|
||||
//"Accept: */*\r\n"
|
||||
@ -246,6 +252,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
"Host: %s\r\n"
|
||||
"%s"
|
||||
"User-Agent: %s\r\n"
|
||||
"Connection: Close\r\n"
|
||||
//"Connection: Keep-Alive\r\n"
|
||||
"Accept-Language: en\r\n"
|
||||
//"Accept: */*\r\n"
|
||||
@ -275,6 +282,7 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
"Accept: */*\r\n"
|
||||
"Host: %s\r\n"
|
||||
"%s"
|
||||
"Connection: Close\r\n"
|
||||
//"Connection: Keep-Alive\r\n"
|
||||
//"Accept-Language: en\r\n"
|
||||
"%s",
|
||||
@ -417,6 +425,12 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
else log("http: Got POST request without \\r\\n\\r\\n.");
|
||||
}
|
||||
|
||||
bool multipart = false;
|
||||
if ( m_requestType == 2 ) { // is POST?
|
||||
char *cd =strcasestr(req,"Content-Type: multipart/form-data");
|
||||
if ( cd ) multipart = true;
|
||||
}
|
||||
|
||||
// . point to the file path
|
||||
// . skip over the "GET "
|
||||
long filenameStart = 4 ;
|
||||
@ -812,7 +826,8 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
}
|
||||
|
||||
// Put '\0' back into the HttpRequest buffer...
|
||||
if (m_cgiBuf){
|
||||
// crap, not if we are multi-part unencoded stuff...
|
||||
if ( m_cgiBuf && ! multipart ) {
|
||||
// do not mangle the "ucontent"!
|
||||
long cgiBufLen = m_cgiBufLen;
|
||||
cgiBufLen -= m_ucontentLen;
|
||||
|
@ -190,8 +190,14 @@ bool HttpServer::getDoc ( char *url ,
|
||||
char *host = getHostFast ( url , &hostLen , &port );
|
||||
|
||||
|
||||
// mdw23
|
||||
//if ( g_conf.m_logDebugSpider )
|
||||
// log("spider: httprequest = %s", req );
|
||||
// {
|
||||
// SafeBuf tmp;
|
||||
// tmp.safeMemcpy ( req , reqSize );
|
||||
// tmp.nullTerm();
|
||||
// log("spider: httprequest = %s", tmp.getBufStart() );
|
||||
// }
|
||||
|
||||
|
||||
// do we have an ip to send to? assume not
|
||||
|
@ -692,7 +692,9 @@ bool Images::downloadImage ( ) {
|
||||
r->m_addToTestCache = 1;
|
||||
}
|
||||
// url is the most important
|
||||
strcpy(r->m_url,m_imageUrl.getUrl());
|
||||
//strcpy(r->m_url,m_imageUrl.getUrl());
|
||||
r-> ptr_url = m_imageUrl.getUrl();
|
||||
r->size_url = m_imageUrl.getUrlLen()+1; // include \0
|
||||
// . try to download it
|
||||
// . i guess we are ignoring hammers at this point
|
||||
if ( ! m_msg13.getDoc(r,false,this,downloadImageWrapper))
|
||||
|
112
Msg13.cpp
112
Msg13.cpp
@ -164,8 +164,8 @@ bool Msg13::getDoc ( Msg13Request *r,
|
||||
if ( r->m_urlIp == -1 ) { char *xx = NULL; *xx = 0; }
|
||||
|
||||
// set this
|
||||
r->m_urlLen = gbstrlen ( r->m_url );
|
||||
r->m_urlHash64 = hash64 ( r->m_url , r->m_urlLen );
|
||||
//r->m_urlLen = gbstrlen ( r->ptr_url );
|
||||
r->m_urlHash64 = hash64 ( r->ptr_url , r->size_url-1);//m_urlLen );
|
||||
|
||||
// sanity check, if spidering the test coll make sure one of
|
||||
// these is true!! this prevents us from mistakenly turning it off
|
||||
@ -186,8 +186,8 @@ bool Msg13::getDoc ( Msg13Request *r,
|
||||
// r->m_testParserEnabled = true;
|
||||
|
||||
// is this a /robots.txt url?
|
||||
if ( r->m_urlLen > 12 &&
|
||||
! strncmp ( r->m_url + r->m_urlLen - 11,"/robots.txt",11))
|
||||
if ( r->size_url - 1 > 12 &&
|
||||
! strncmp ( r->ptr_url + r->size_url -1 -11,"/robots.txt",11))
|
||||
r->m_isRobotsTxt = true;
|
||||
|
||||
// force caching if getting robots.txt so is compressed in cache
|
||||
@ -195,7 +195,7 @@ bool Msg13::getDoc ( Msg13Request *r,
|
||||
r->m_compressReply = true;
|
||||
|
||||
// do not get .google.com/ crap
|
||||
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||
//if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// set it for this too
|
||||
//if ( g_conf.m_useCompressionProxy ) {
|
||||
@ -261,19 +261,33 @@ bool Msg13::forwardRequest ( ) {
|
||||
logf ( LOG_DEBUG,
|
||||
"spider: sending download request of %s firstIp=%s "
|
||||
"uh48=%llu to "
|
||||
"host %li (child=%li)", r->m_url, iptoa(r->m_firstIp),
|
||||
"host %li (child=%li)", r->ptr_url, iptoa(r->m_firstIp),
|
||||
r->m_urlHash48, hostId,
|
||||
r->m_skipHammerCheck);
|
||||
|
||||
|
||||
// fill up the request
|
||||
long requestSize = r->getSize();
|
||||
long requestBufSize = r->getSize();
|
||||
|
||||
// we have to serialize it now because it has cookies as well as
|
||||
// the url.
|
||||
char *requestBuf = serializeMsg ( sizeof(Msg39Request),
|
||||
&r->size_url,
|
||||
&r->size_cookie,
|
||||
&r->ptr_url,
|
||||
r,
|
||||
&requestBufSize ,
|
||||
NULL ,
|
||||
0,//RBUF_SIZE ,
|
||||
false );
|
||||
// g_errno should be set in this case, most likely to ENOMEM
|
||||
if ( ! requestBuf ) return true;
|
||||
|
||||
// . otherwise, send the request to the key host
|
||||
// . returns false and sets g_errno on error
|
||||
// . now wait for 2 minutes before timing out
|
||||
if ( ! g_udpServer.sendRequest ( (char *)r ,
|
||||
requestSize ,
|
||||
if ( ! g_udpServer.sendRequest ( requestBuf, // (char *)r ,
|
||||
requestBufSize ,
|
||||
0x13 , // msgType 0x13
|
||||
h->m_ip ,
|
||||
h->m_port ,
|
||||
@ -309,7 +323,8 @@ void gotForwardedReplyWrapper ( void *state , UdpSlot *slot ) {
|
||||
|
||||
bool Msg13::gotForwardedReply ( UdpSlot *slot ) {
|
||||
// don't let udpserver free the request, it's our m_request[]
|
||||
slot->m_sendBufAlloc = NULL;
|
||||
// no, now let him free it because it was serialized into there
|
||||
//slot->m_sendBufAlloc = NULL;
|
||||
// what did he give us?
|
||||
char *reply = slot->m_readBuf;
|
||||
long replySize = slot->m_readBufSize;
|
||||
@ -343,7 +358,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
||||
|
||||
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
|
||||
logf(LOG_DEBUG,"spider: FINALIZED %s firstIp=%s",
|
||||
r->m_url,iptoa(r->m_firstIp));
|
||||
r->ptr_url,iptoa(r->m_firstIp));
|
||||
|
||||
|
||||
// . if timed out probably the host is now dead so try another one!
|
||||
@ -351,7 +366,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
||||
if ( g_errno == EUDPTIMEDOUT ) {
|
||||
// try again
|
||||
log("spider: retrying1. had error for %s : %s",
|
||||
r->m_url,mstrerror(g_errno));
|
||||
r->ptr_url,mstrerror(g_errno));
|
||||
// return if that blocked
|
||||
if ( ! forwardRequest ( ) ) return false;
|
||||
// a different g_errno should be set now!
|
||||
@ -362,7 +377,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
||||
// for it here
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: error for %s: %s",
|
||||
r->m_url,mstrerror(g_errno));
|
||||
r->ptr_url,mstrerror(g_errno));
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -435,7 +450,7 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
||||
// log it for now
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("http: got doc %s %li to %li",
|
||||
r->m_url,(long)replySize,(long)uncompressedLen);
|
||||
r->ptr_url,(long)replySize,(long)uncompressedLen);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -458,9 +473,9 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
//if ( niceness == 0 ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// make sure we do not download gigablast.com admin pages!
|
||||
if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->m_urlLen >= 7 ) {
|
||||
if ( g_hostdb.isIpInNetwork ( r->m_firstIp ) && r->size_url-1 >= 7 ) {
|
||||
Url url;
|
||||
url.set ( r->m_url );
|
||||
url.set ( r->ptr_url );
|
||||
// . never download /master urls from ips of hosts in cluster
|
||||
// . TODO: FIX! the pages might be in another cluster!
|
||||
if ( ( strncasecmp ( url.getPath() , "/master/" , 8 ) == 0 ||
|
||||
@ -500,7 +515,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
// helpful for debugging. even though you may see a robots.txt
|
||||
// redirect and think we are downloading that each time,
|
||||
// we are not... the redirect is cached here as well.
|
||||
//log("spider: %s was in cache",r->m_url);
|
||||
//log("spider: %s was in cache",r->ptr_url);
|
||||
// . send the cached reply back
|
||||
// . this will free send/read bufs on completion/g_errno
|
||||
g_udpServer.sendReply_ass ( rec , recSize , rec, recSize,slot);
|
||||
@ -510,7 +525,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
// log it so we can see if we are hammering
|
||||
if ( g_conf.m_logDebugRobots || g_conf.m_logDebugDownloads )
|
||||
logf(LOG_DEBUG,"spider: DOWNLOADING %s firstIp=%s",
|
||||
r->m_url,iptoa(r->m_firstIp));
|
||||
r->ptr_url,iptoa(r->m_firstIp));
|
||||
|
||||
// temporary hack
|
||||
if ( r->m_parent ) { char *xx=NULL;*xx=0; }
|
||||
@ -559,7 +574,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
// which will store maybe a -1 if currently downloading...
|
||||
if ( queueIt ) {
|
||||
// debug
|
||||
//log("spider: adding %s to crawldelayqueue",r->m_url);
|
||||
//log("spider: adding %s to crawldelayqueue",r->ptr_url);
|
||||
// save this
|
||||
r->m_udpSlot = slot;
|
||||
r->m_nextLink = NULL;
|
||||
@ -580,7 +595,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
if ( last > 0 && waited < r->m_crawlDelayMS ) {
|
||||
log("spider: hammering firstIp=%s url=%s "
|
||||
"only waited %lli ms of %li ms",
|
||||
iptoa(r->m_firstIp),r->m_url,waited,
|
||||
iptoa(r->m_firstIp),r->ptr_url,waited,
|
||||
r->m_crawlDelayMS);
|
||||
// this guy has too many redirects and it fails us...
|
||||
// BUT do not core if running live, only if for test
|
||||
@ -598,14 +613,14 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
// "firstIp=%s "
|
||||
// "url=%s "
|
||||
// "to msg13::hammerCache",
|
||||
// nowms,iptoa(r->m_firstIp),r->m_url);
|
||||
// nowms,iptoa(r->m_firstIp),r->ptr_url);
|
||||
// clear error from that if any, not important really
|
||||
g_errno = 0;
|
||||
}
|
||||
|
||||
// try to get it from the test cache?
|
||||
TcpSocket ts;
|
||||
if ( r->m_useTestCache && getTestDoc ( r->m_url, &ts , r ) ) {
|
||||
if ( r->m_useTestCache && getTestDoc ( r->ptr_url, &ts , r ) ) {
|
||||
// save this
|
||||
r->m_udpSlot = slot;
|
||||
// store the request so gotHttpReply can reply to it
|
||||
@ -672,7 +687,7 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
|
||||
|
||||
// do not get .google.com/ crap
|
||||
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||
//if ( strstr(r->ptr_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
downloadTheDocForReals ( r );
|
||||
}
|
||||
@ -689,7 +704,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
|
||||
// this means our callback will be called
|
||||
if ( ! firstInLine ) {
|
||||
//log("spider: inlining %s",r->m_url);
|
||||
//log("spider: inlining %s",r->ptr_url);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -715,7 +730,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
"firstIp=%s "
|
||||
"url=%s "
|
||||
"to msg13::hammerCache",
|
||||
-1LL,iptoa(r->m_firstIp),r->m_url);
|
||||
-1LL,iptoa(r->m_firstIp),r->ptr_url);
|
||||
|
||||
|
||||
// flag this
|
||||
@ -723,7 +738,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
// note it here
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: downloading %s (%s) (skiphammercheck=%li)",
|
||||
r->m_url,iptoa(r->m_urlIp) ,
|
||||
r->ptr_url,iptoa(r->m_urlIp) ,
|
||||
(long)r->m_skipHammerCheck);
|
||||
|
||||
// use the default agent unless scraping
|
||||
@ -755,7 +770,7 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
|
||||
|
||||
// download it
|
||||
if ( ! g_httpServer.getDoc ( r->m_url ,
|
||||
if ( ! g_httpServer.getDoc ( r->ptr_url ,
|
||||
r->m_urlIp ,
|
||||
0 , // offset
|
||||
-1 ,
|
||||
@ -767,7 +782,10 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
r->m_httpProxyPort ,
|
||||
r->m_maxTextDocLen ,
|
||||
r->m_maxOtherDocLen ,
|
||||
agent ) )
|
||||
agent ,
|
||||
"HTTP/1.0" , // protocol
|
||||
false , // do POST?
|
||||
r->ptr_cookie ) )
|
||||
// return false if blocked
|
||||
return;
|
||||
// . log this so i know about it
|
||||
@ -818,7 +836,7 @@ void gotHttpReply2 ( void *state ,
|
||||
if ( g_errno && g_conf.m_logDebugSpider )
|
||||
log("spider: http reply (msg13) had error = %s "
|
||||
"for %s at ip %s",
|
||||
mstrerror(g_errno),r->m_url,iptoa(r->m_urlIp));
|
||||
mstrerror(g_errno),r->ptr_url,iptoa(r->m_urlIp));
|
||||
|
||||
// get time now
|
||||
long long nowms = gettimeofdayInMilliseconds();
|
||||
@ -832,7 +850,7 @@ void gotHttpReply2 ( void *state ,
|
||||
"firstIp=%s "
|
||||
"url=%s "
|
||||
"to msg13::hammerCache",
|
||||
nowms,iptoa(r->m_firstIp),r->m_url);
|
||||
nowms,iptoa(r->m_firstIp),r->ptr_url);
|
||||
|
||||
|
||||
// sanity. this was happening from iframe download
|
||||
@ -859,7 +877,7 @@ void gotHttpReply2 ( void *state ,
|
||||
// note it
|
||||
if ( r->m_useTestCache && g_conf.m_logDebugSpider )
|
||||
logf(LOG_DEBUG,"spider: got reply for %s firstIp=%s uh48=%llu",
|
||||
r->m_url,iptoa(r->m_firstIp),r->m_urlHash48);
|
||||
r->ptr_url,iptoa(r->m_firstIp),r->m_urlHash48);
|
||||
|
||||
long niceness = r->m_niceness;
|
||||
|
||||
@ -986,7 +1004,7 @@ void gotHttpReply2 ( void *state ,
|
||||
!r->m_isRobotsTxt &&
|
||||
r->m_compressReply ) {
|
||||
long cs = getCharsetFast ( &mime,
|
||||
r->m_url,
|
||||
r->ptr_url,
|
||||
content,
|
||||
contentLen,
|
||||
niceness);
|
||||
@ -1088,7 +1106,7 @@ void gotHttpReply2 ( void *state ,
|
||||
// ok, did we have an error?
|
||||
if ( g_errno )
|
||||
log("scproxy: xml set for %s had error: %s",
|
||||
r->m_url,mstrerror(g_errno));
|
||||
r->ptr_url,mstrerror(g_errno));
|
||||
// otherwise, i guess we had no iframes worthy of expanding
|
||||
// so pretend we do not have any iframes
|
||||
hasIframe2 = false;
|
||||
@ -1128,12 +1146,12 @@ void gotHttpReply2 ( void *state ,
|
||||
}
|
||||
|
||||
// nuke the content if from flurbit.com website!!
|
||||
if ( r->m_url &&
|
||||
if ( r->ptr_url &&
|
||||
replySize>0 &&
|
||||
goodStatus &&
|
||||
strstr ( r->m_url,"flurbit.com/" ) ) {
|
||||
strstr ( r->ptr_url,"flurbit.com/" ) ) {
|
||||
// note it in log
|
||||
log("msg13: got flurbit url: %s",r->m_url);
|
||||
log("msg13: got flurbit url: %s",r->ptr_url);
|
||||
// record in the stats
|
||||
docsPtr = &g_stats.m_compressUnchangedDocs;
|
||||
bytesInPtr = &g_stats.m_compressUnchangedBytesIn;
|
||||
@ -1366,7 +1384,7 @@ void gotHttpReply2 ( void *state ,
|
||||
log("proxy: msg13: sending back error: %s "
|
||||
"for url %s with ip %s",
|
||||
mstrerror(err),
|
||||
r2->m_url,
|
||||
r2->ptr_url,
|
||||
iptoa(r2->m_urlIp));
|
||||
g_udpServer.sendErrorReply ( slot , err );
|
||||
continue;
|
||||
@ -1412,7 +1430,7 @@ void passOnReply ( void *state , UdpSlot *slot ) {
|
||||
|
||||
if ( g_errno ) {
|
||||
log("spider: error from proxy for %s: %s",
|
||||
r->m_url,mstrerror(g_errno));
|
||||
r->ptr_url,mstrerror(g_errno));
|
||||
g_udpServer.sendErrorReply(r->m_udpSlot, g_errno);
|
||||
return;
|
||||
}
|
||||
@ -2014,8 +2032,8 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
||||
// make a fake spider request so we can do it
|
||||
SpiderRequest sreq;
|
||||
sreq.reset();
|
||||
strcpy(sreq.m_url,r->m_url);
|
||||
long firstIp = hash32n(r->m_url);
|
||||
strcpy(sreq.m_url,r->ptr_url);
|
||||
long firstIp = hash32n(r->ptr_url);
|
||||
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
||||
sreq.setKey( firstIp,0LL, false );
|
||||
sreq.m_isInjecting = 1;
|
||||
@ -2027,7 +2045,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
||||
|
||||
// log it now
|
||||
if ( g_conf.m_logDebugBuild )
|
||||
log("scproxy: expanding iframes for %s",r->m_url);
|
||||
log("scproxy: expanding iframes for %s",r->ptr_url);
|
||||
|
||||
// . use the enormous power of our new XmlDoc class
|
||||
// . this returns false with g_errno set on error
|
||||
@ -2108,7 +2126,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
||||
char **ec = xd->getExpandedUtf8Content();
|
||||
// this means it blocked
|
||||
if ( ec == (void *)-1 ) {
|
||||
//log("scproxy: waiting for %s",r->m_url);
|
||||
//log("scproxy: waiting for %s",r->ptr_url);
|
||||
return false;
|
||||
}
|
||||
// return true with g_errno set
|
||||
@ -2128,7 +2146,7 @@ bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts ) {
|
||||
// so i'd think indicative of something special
|
||||
if ( g_conf.m_logDebugBuild )
|
||||
log("scproxy: got iframe expansion without blocking for url=%s"
|
||||
" err=%s",r->m_url,mstrerror(g_errno));
|
||||
" err=%s",r->ptr_url,mstrerror(g_errno));
|
||||
|
||||
// save g_errno for returning
|
||||
long saved = g_errno;
|
||||
@ -2169,11 +2187,11 @@ void gotIframeExpandedContent ( void *state ) {
|
||||
// this was stored in xd
|
||||
Msg13Request *r = xd->m_r;
|
||||
|
||||
//log("scproxy: done waiting for %s",r->m_url);
|
||||
//log("scproxy: done waiting for %s",r->ptr_url);
|
||||
|
||||
// note it
|
||||
if ( g_conf.m_logDebugBuild )
|
||||
log("scproxy: got iframe expansion for url=%s",r->m_url);
|
||||
log("scproxy: got iframe expansion for url=%s",r->ptr_url);
|
||||
|
||||
// assume we had no expansion or there was an error
|
||||
char *reply = NULL;
|
||||
@ -2212,7 +2230,7 @@ void gotIframeExpandedContent ( void *state ) {
|
||||
// on the main cluster!
|
||||
if ( g_errno )
|
||||
log("scproxy: error getting iframe content for url=%s : %s",
|
||||
r->m_url,mstrerror(g_errno));
|
||||
r->ptr_url,mstrerror(g_errno));
|
||||
// sanity check
|
||||
if ( reply && reply[replySize-1] != '\0') { char *xx=NULL;*xx=0; }
|
||||
// pass back the error we had, if any
|
||||
@ -2270,7 +2288,7 @@ void scanHammerQueue ( int fd , void *state ) {
|
||||
// debug
|
||||
//log("spider: downloading %s from crawldelay queue "
|
||||
// "waited=%llims crawldelay=%lims",
|
||||
// r->m_url,waited,r->m_crawlDelayMS);
|
||||
// r->ptr_url,waited,r->m_crawlDelayMS);
|
||||
|
||||
// good to go
|
||||
downloadTheDocForReals ( r );
|
||||
|
15
Msg13.h
15
Msg13.h
@ -80,16 +80,23 @@ public:
|
||||
long long m_cacheKey;
|
||||
char m_testDir[32];
|
||||
// msg13 sets this too, so you don't have to worry about setting it
|
||||
long m_urlLen;
|
||||
//long m_urlLen;
|
||||
// includes \0 termination
|
||||
char m_url[MAX_URL_LEN+1];
|
||||
//char m_url[MAX_URL_LEN+1];
|
||||
|
||||
char *ptr_url;
|
||||
char *ptr_cookie;
|
||||
|
||||
long size_url;
|
||||
long size_cookie;
|
||||
|
||||
long getSize() {
|
||||
return ((char *)m_url-(char *)this) +m_urlLen +1;};
|
||||
return ((char *)ptr_url-(char *)this) +size_url+size_cookie;};
|
||||
|
||||
// zero it all out
|
||||
void reset() {
|
||||
memset (this,0,(char *)m_url - (char *)this + 1);
|
||||
//memset (this,0,(char *)m_url - (char *)this + 1);
|
||||
memset (this,0,sizeof(Msg13Request));
|
||||
m_maxTextDocLen = -1; // no limit
|
||||
m_maxOtherDocLen = -1; // no limit
|
||||
m_crawlDelayMS = -1; // unknown or none
|
||||
|
16
Url.cpp
16
Url.cpp
@ -195,14 +195,14 @@ void Url::set ( char *t , long tlen , bool addWWW , bool stripSessionId ,
|
||||
s[len]='\0';
|
||||
|
||||
// make http:////www.xyz.com into http://www.xyz.com
|
||||
if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" , 9 ) ) {
|
||||
memcpy (s+7,s+9,len-9+1);
|
||||
len -= 2;
|
||||
}
|
||||
if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////", 10 ) ) {
|
||||
memcpy (s+8,s+10,len-9+1);
|
||||
len -= 2;
|
||||
}
|
||||
// if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" ,9) ){
|
||||
// memcpy (s+7,s+9,len-9+1);
|
||||
// len -= 2;
|
||||
// }
|
||||
// if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////",10)){
|
||||
// memcpy (s+8,s+10,len-9+1);
|
||||
// len -= 2;
|
||||
// }
|
||||
|
||||
// . remove session ids from s
|
||||
// . ';' most likely precedes a session id
|
||||
|
32
XmlDoc.cpp
32
XmlDoc.cpp
@ -9412,6 +9412,23 @@ Url **XmlDoc::getRedirUrl() {
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
|
||||
// get cookie for redirect to fix nyt.com
|
||||
char *cookie = mime.getCookie();
|
||||
// find end of cookie at the semicolon
|
||||
char *s = cookie;
|
||||
for ( ; s && *s && *s != ';' ; s++ );
|
||||
if ( s && *s == ';' ) {
|
||||
// do not include ;
|
||||
long clen = s - cookie;
|
||||
m_redirCookieBuf.reset();
|
||||
m_redirCookieBuf.safeMemcpy ( cookie , clen );
|
||||
m_redirCookieBuf.nullTerm();
|
||||
m_redirCookieBufValid = true;
|
||||
}
|
||||
|
||||
// mdw23
|
||||
//log("http: reply=%s",m_httpReply);
|
||||
|
||||
// a hack for removing session ids already in there. for
|
||||
// brilliantshopper's bs4 collection and gk0 cluster
|
||||
//bool forceRedirect = false;
|
||||
@ -9520,7 +9537,7 @@ Url **XmlDoc::getRedirUrl() {
|
||||
// . if we followed too many then bail
|
||||
// . www.motorolamobility.com www.outlook.com ... failed when we
|
||||
// had >= 4 here
|
||||
if ( ++m_numRedirects >= 7 ) {
|
||||
if ( ++m_numRedirects >= 10 ) {
|
||||
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
|
||||
return &m_redirUrlPtr;
|
||||
}
|
||||
@ -14711,7 +14728,9 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
// clear it first
|
||||
r->reset();
|
||||
// and set the url
|
||||
strcpy ( r->m_url , cu->getUrl() );
|
||||
//strcpy ( r->m_url , cu->getUrl() );
|
||||
r->ptr_url = cu->getUrl();
|
||||
r->size_url = cu->getUrlLen()+1;
|
||||
// sanity check
|
||||
if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
|
||||
// max to download in bytes. currently 1MB.
|
||||
@ -14747,6 +14766,15 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
r->m_ifModifiedSince = 0;
|
||||
r->m_skipHammerCheck = 0;
|
||||
|
||||
if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) {
|
||||
r->ptr_cookie = m_redirCookieBuf.getBufStart();
|
||||
r->size_cookie = m_redirCookieBuf.length() + 1;
|
||||
// . only do once per redirect
|
||||
// . do not invalidate because we might have to carry it
|
||||
// through to the next redir... unless we change domain
|
||||
// . this fixes the nyt.com bug some more
|
||||
//m_redirCookieBufValid = false;
|
||||
}
|
||||
|
||||
// . this is -1 if unknown. none found in robots.txt or provided
|
||||
// in the custom crawl parms.
|
||||
|
2
XmlDoc.h
2
XmlDoc.h
@ -962,6 +962,7 @@ class XmlDoc {
|
||||
|
||||
Url m_redirUrl;
|
||||
Url *m_redirUrlPtr;
|
||||
SafeBuf m_redirCookieBuf;
|
||||
Url m_metaRedirUrl;
|
||||
Url *m_metaRedirUrlPtr;
|
||||
Url m_canonicalRedirUrl;
|
||||
@ -1235,6 +1236,7 @@ class XmlDoc {
|
||||
//bool m_tryAgainTimeDeltaValid;
|
||||
//bool m_eliminateMenusValid;
|
||||
bool m_redirUrlValid;
|
||||
bool m_redirCookieBufValid;
|
||||
bool m_metaRedirUrlValid;
|
||||
bool m_canonicalRedirUrlValid;
|
||||
bool m_statusMsgValid;
|
||||
|
Loading…
Reference in New Issue
Block a user