try to fix some redirect issues

This commit is contained in:
mwells 2014-07-31 10:34:03 -07:00
parent 343f783592
commit 146e45db56
2 changed files with 24 additions and 2 deletions

10
Url.cpp
View File

@ -194,6 +194,16 @@ void Url::set ( char *t , long tlen , bool addWWW , bool stripSessionId ,
memcpy ( s , t , tlen );
s[len]='\0';
// Collapse a doubled "//" after the scheme so that
// http:////www.xyz.com becomes http://www.xyz.com (same for https).
// . the source and destination ranges overlap inside s, so memmove()
//   is required; memcpy() on overlapping ranges is undefined behavior
// . the byte count is the tail that follows the four slashes plus
//   the terminating '\0' stored at s[len]
if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" , 9 ) ) {
	// tail starts at s+9: len-9 chars plus the '\0'
	memmove ( s+7 , s+9 , len-9+1 );
	len -= 2;
}
if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////", 10 ) ) {
	// tail starts at s+10: len-10 chars plus the '\0'. copying
	// len-9+1 bytes would read one byte beyond the terminator.
	memmove ( s+8 , s+10 , len-10+1 );
	len -= 2;
}
// . remove session ids from s
// . ';' most likely precedes a session id
// . http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?pp=1

View File

@ -9520,7 +9520,7 @@ Url **XmlDoc::getRedirUrl() {
// . if we followed too many then bail
// . www.motorolamobility.com www.outlook.com ... failed when we
// had >= 4 here
if ( ++m_numRedirects >= 5 ) {
if ( ++m_numRedirects >= 7 ) {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
@ -9643,6 +9643,18 @@ Url **XmlDoc::getRedirUrl() {
// login page ... so add this check here
if ( ! f->isRoot() )
simplifiedRedir = true;
bool allowSimplifiedRedirs = m_allowSimplifiedRedirs;
// follow redirects if injecting so we do not return
// EDOCSIMPLIFIEDREDIR
if ( getIsInjecting ( ) )
allowSimplifiedRedirs = true;
// or if disabled then follow the redirect
if ( ! cr->m_useSimplifiedRedirects )
allowSimplifiedRedirs = true;
// . if the redir url is simpler, but has no hostname we
// prepend a "www." to it
// . this should prevent www.russ.ru and russ.ru from being
@ -9663,7 +9675,7 @@ Url **XmlDoc::getRedirUrl() {
// . 301 means moved PERMANENTLY...
// . many people use 301 on their root pages though, so treat
// it like a temporary redirect, like exclusivelyequine.com
if ( simplifiedRedir && ! m_allowSimplifiedRedirs &&
if ( simplifiedRedir && ! allowSimplifiedRedirs &&
// for custom BULK clients don't like this i guess
// AND for custom crawl it was messing up the processing
// url format for a nytimes blog subsite which was redirecting