no longer add www to a url's domain if it is just

a domain. was messing up tmblr.co, where www.tmblr.co
has no IP.
Matt Wells 2014-06-05 17:00:12 -07:00
parent 3c6a8bf87e
commit 7f10fca234

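A minimal sketch of the behavior being turned off here, using a hypothetical helper (normalizeHost is not the real Url::set, just an assumed simplification of its addWWW flag): with addWWW = true a bare domain like tmblr.co is rewritten to www.tmblr.co, which has no IP, so injection fails; with addWWW = false the original host is kept.

#include <string>
#include <iostream>

// hypothetical simplification of the addWWW flag: prepend "www." to any
// host that does not already start with it
static std::string normalizeHost ( const std::string &host , bool addWWW ) {
	if ( addWWW && host.rfind ( "www." , 0 ) != 0 )
		return "www." + host;
	return host;
}

int main ( ) {
	// tmblr.co/ZHw5yo1E5TAaW injection: the www form has no IP
	std::cout << normalizeHost ( "tmblr.co" , true  ) << "\n"; // www.tmblr.co
	std::cout << normalizeHost ( "tmblr.co" , false ) << "\n"; // tmblr.co
	return 0;
}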

@@ -1315,7 +1315,9 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
}
else {
// add www is now REQUIRED for all!
setFirstUrl ( sreq->m_url , true ); // false );
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
// www.tmblr.co has no IP
setFirstUrl ( sreq->m_url , false );//true ); // false );
// you can't call this from a docid based url until you
// know the uh48
//setSpideredTime();
@@ -1934,7 +1936,9 @@ bool XmlDoc::injectDoc ( char *url ,
// normalize url
Url uu;
uu.set(url,gbstrlen(url),true);
// do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
// which has no www.tmblr.co IP!
uu.set(url,gbstrlen(url),false);//true);
// remove >'s i guess and store in st1->m_url[] buffer
char cleanUrl[MAX_URL_LEN+1];
@@ -12300,6 +12304,7 @@ long *XmlDoc::getIp ( ) {
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
if ( m_sreqValid && m_sreq.m_isScraping ) delay = 0;
if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0;
// . don't do the delay when downloading extra doc, robots.txt etc.
// . this also reports a status msg of "getting new doc" when it
// really means "delaying spider"
@@ -12363,7 +12368,6 @@ long *XmlDoc::gotIp ( bool save ) {
if ( g_errno ) return NULL;
// this is bad too
//if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
// note it
//log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());
setStatus ("got ip");
@@ -12371,6 +12375,11 @@ long *XmlDoc::gotIp ( bool save ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// note it for crawlbot
if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) )
log("db: got ip %li for %s",
m_ip,getCurrentUrl()->getUrl());
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
@@ -23716,8 +23725,10 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// before it was not!
//if ( flags & LF_OLDLINK ) continue;
// set it. addWWW = true!
Url url; url.set ( s , slen , true );
// set it. addWWW = true! no.. make it false because of issues
// like tmblr.co/ZHw5yo1E5TAaW injection where
// www.tmblr.co has no IP
Url url; url.set ( s , slen , false ); // true );
// if hostname length is <= 2 then SILENTLY reject it
if ( url.getHostLen() <= 2 ) continue;
@@ -24879,6 +24890,7 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
Url iu;
// use "pageUrl" as the baseUrl
Url *cu = getCurrentUrl();
// we can addwww to normalize since this is for deduping kinda
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
char *u = iu.getUrl ();
long ulen = iu.getUrlLen();
@@ -25785,6 +25797,8 @@ bool XmlDoc::hashLinks ( HashTableX *tt ) {
// it's to cnn.com or www.cnn.com.
// Every now and then we add new session ids to our list in
// Url.cpp, too, so we have to version that.
// Since this is just for hashing, it shouldn't matter that
// www.tmblr.co has no IP whereas only tmblr.co does.
link.set ( m_links.m_linkPtrs[i] ,
m_links.m_linkLens[i] ,
true , // addWWW?
@@ -27450,8 +27464,9 @@ Url *XmlDoc::getBaseUrl ( ) {
if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (Url *)cu;
// set it
m_baseUrl.set ( cu , true ); // addWWW = true
// no longer set addWWW to true since tmblr.co has an IP but
// www.tmblr.co does not
m_baseUrl.set ( cu , false ); // addWWW = true
// look for base url
for ( long i=0 ; i < xml->getNumNodes() ; i++ ) {
// 12 is the <base href> tag id