mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
try to fix some redirect issues
This commit is contained in:
parent
343f783592
commit
146e45db56
10
Url.cpp
10
Url.cpp
@ -194,6 +194,16 @@ void Url::set ( char *t , long tlen , bool addWWW , bool stripSessionId ,
|
||||
memcpy ( s , t , tlen );
|
||||
s[len]='\0';
|
||||
|
||||
// make http:////www.xyz.com into http://www.xyz.com
|
||||
if ( len > 14 && s[7]=='/' && ! strncasecmp ( s , "http:////" , 9 ) ) {
|
||||
// shift the tail (including the NUL) left by two; the regions overlap, so use memmove, not memcpy
memmove (s+7,s+9,len-9+1);
|
||||
len -= 2;
|
||||
}
|
||||
if ( len > 14 && s[8]=='/' && ! strncasecmp ( s ,"https:////", 10 ) ) {
|
||||
// "https:////" is 10 chars, so the tail is len-10 bytes plus the NUL; the regions overlap, so use memmove
memmove (s+8,s+10,len-10+1);
|
||||
len -= 2;
|
||||
}
|
||||
|
||||
// . remove session ids from s
|
||||
// . ';' most likely precedes a session id
|
||||
// . http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?pp=1
|
||||
|
16
XmlDoc.cpp
16
XmlDoc.cpp
@ -9520,7 +9520,7 @@ Url **XmlDoc::getRedirUrl() {
|
||||
// . if we followed too many then bail
|
||||
// . www.motorolamobility.com www.outlook.com ... failed when we
|
||||
// had >= 4 here
|
||||
if ( ++m_numRedirects >= 5 ) {
|
||||
if ( ++m_numRedirects >= 7 ) {
|
||||
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
|
||||
return &m_redirUrlPtr;
|
||||
}
|
||||
@ -9643,6 +9643,18 @@ Url **XmlDoc::getRedirUrl() {
|
||||
// login page ... so add this check here
|
||||
if ( ! f->isRoot() )
|
||||
simplifiedRedir = true;
|
||||
|
||||
bool allowSimplifiedRedirs = m_allowSimplifiedRedirs;
|
||||
|
||||
// follow redirects if injecting so we do not return
|
||||
// EDOCSIMPLIFIEDREDIR
|
||||
if ( getIsInjecting ( ) )
|
||||
allowSimplifiedRedirs = true;
|
||||
|
||||
// or if disabled then follow the redirect
|
||||
if ( ! cr->m_useSimplifiedRedirects )
|
||||
allowSimplifiedRedirs = true;
|
||||
|
||||
// . if the redir url is simpler, but has no hostname we
|
||||
// prepend a "www." to it
|
||||
// . this should prevent www.russ.ru and russ.ru from being
|
||||
@ -9663,7 +9675,7 @@ Url **XmlDoc::getRedirUrl() {
|
||||
// . 301 means moved PERMANENTLY...
|
||||
// . many people use 301 on their root pages though, so treat
|
||||
// it like a temporary redirect, like exclusivelyequine.com
|
||||
if ( simplifiedRedir && ! m_allowSimplifiedRedirs &&
|
||||
if ( simplifiedRedir && ! allowSimplifiedRedirs &&
|
||||
// for custom BULK clients don't like this i guess
|
||||
// AND for custom crawl it was messing up the processing
|
||||
// url format for a nytimes blog subsite which was redirecting
|
||||
|
Loading…
Reference in New Issue
Block a user