From 7f10fca2343c712bc9af81d541273b91fb55714c Mon Sep 17 00:00:00 2001
From: Matt Wells
Date: Thu, 5 Jun 2014 17:00:12 -0700
Subject: [PATCH] no longer add www to url domain if it is just a domain. was
 messing up tmblr.co, where www.tmblr.co has no IP.

---
 XmlDoc.cpp | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/XmlDoc.cpp b/XmlDoc.cpp
index 137f3410..7c397921 100644
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@@ -1315,7 +1315,9 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
         }
         else {
                 // add www is now REQUIRED for all!
-                setFirstUrl ( sreq->m_url , true ); // false );
+                // crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
+                // www.tmblr.co has no IP
+                setFirstUrl ( sreq->m_url , false );//true ); // false );
                 // you can't call this from a docid based url until you
                 // know the uh48
                 //setSpideredTime();
@@ -1934,7 +1936,9 @@ bool XmlDoc::injectDoc ( char *url ,
 
         // normalize url
         Url uu;
-        uu.set(url,gbstrlen(url),true);
+        // do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
+        // which has no www.tmblr.co IP!
+        uu.set(url,gbstrlen(url),false);//true);
 
         // remove >'s i guess and store in st1->m_url[] buffer
         char cleanUrl[MAX_URL_LEN+1];
@@ -12300,6 +12304,7 @@ long *XmlDoc::getIp ( ) {
         if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
         if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
         if ( m_sreqValid && m_sreq.m_isScraping  ) delay = 0;
+        if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0;
         // . don't do the delay when downloading extra doc, robots.txt etc.
         // . this also reports a status msg of "getting new doc" when it
         //   really means "delaying spider"
@@ -12363,7 +12368,6 @@ long *XmlDoc::gotIp ( bool save ) {
         if ( g_errno ) return NULL;
         // this is bad too
         //if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
-        // note it
         //log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());
         setStatus ("got ip");
 
@@ -12371,6 +12375,11 @@ long *XmlDoc::gotIp ( bool save ) {
         CollectionRec *cr = getCollRec();
         if ( ! cr ) return NULL;
 
+        // note it for crawlbot
+        if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) )
+                log("db: got ip %li for %s",
+                    m_ip,getCurrentUrl()->getUrl());
+
         bool useTestCache = false;
         if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
         // unless its the pagesubmit.cpp event submission tool
@@ -23716,8 +23725,10 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
                 // before it was not!
                 //if ( flags & LF_OLDLINK ) continue;
 
-                // set it. addWWW = true!
-                Url url; url.set ( s , slen , true );
+                // set it. addWWW = true! no.. make it false because of issues
+                // like tmblr.co/ZHw5yo1E5TAaW injection where
+                // www.tmblr.co has no IP
+                Url url; url.set ( s , slen , false ); // true );
 
                 // if hostname length is <= 2 then SILENTLY reject it
                 if ( url.getHostLen() <= 2 ) continue;
@@ -24879,6 +24890,7 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
         Url iu;
         // use "pageUrl" as the baseUrl
         Url *cu = getCurrentUrl();
+        // we can addwww to normalize since this is for deduping kinda
         iu.set ( cu , src , srcLen , true ); // addWWW? yes...
         char *u    = iu.getUrl   ();
         long  ulen = iu.getUrlLen();
@@ -25785,6 +25797,8 @@ bool XmlDoc::hashLinks ( HashTableX *tt ) {
                 //   it's to cnn.com or www.cnn.com.
                 //   Every now and then we add new session ids to our list in
                 //   Url.cpp, too, so we have to version that.
+                // . Since this is just for hashing, it shouldn't matter that
+                //   www.tmblr.co has no IP whereas only tmblr.co does.
                 link.set ( m_links.m_linkPtrs[i] ,
                            m_links.m_linkLens[i] ,
                            true                  , // addWWW?
@@ -27450,8 +27464,9 @@ Url *XmlDoc::getBaseUrl ( ) {
         if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
         Url *cu = getCurrentUrl();
         if ( ! cu || cu == (void *)-1 ) return (Url *)cu;
-        // set it
-        m_baseUrl.set ( cu , true ); // addWWW = true
+        // no longer set addWWW to true since tmblr.co has an IP but
+        // www.tmblr.co does not
+        m_baseUrl.set ( cu , false ); // addWWW = true
         // look for base url
         for ( long i=0 ; i < xml->getNumNodes() ; i++ ) {
                 // 12 is the tag id
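
A note on the pattern behind the diff, for readers outside the codebase: the patch turns addWWW off only in the paths where the URL will later be resolved or fetched (set4, injectDoc, getBaseUrl, and the outlink spider recs), while the pure hashing/dedup paths (hashNoSplit, hashLinks) keep addWWW=true, since a canonical form used only for hashing never hits DNS. The sketch below is not Gigablast code; it illustrates the underlying DNS asymmetry with the standard getaddrinfo() call. resolves() and pickFetchHost() are hypothetical helper names invented for this example, and Gigablast's own Url::set()/IP-lookup path works differently.

// Minimal standalone sketch (assumes g++/POSIX; not Gigablast code).
// Shows why forcing "www." before a DNS lookup can break domains like
// tmblr.co, where only the bare domain has an A record.
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <string.h>
#include <stdio.h>
#include <string>

// return true if "host" has at least one A/AAAA record
static bool resolves ( const std::string &host ) {
        struct addrinfo hints;
        memset ( &hints , 0 , sizeof(hints) );
        hints.ai_family   = AF_UNSPEC;
        hints.ai_socktype = SOCK_STREAM;
        struct addrinfo *res = NULL;
        int rc = getaddrinfo ( host.c_str() , NULL , &hints , &res );
        if ( res ) freeaddrinfo ( res );
        return rc == 0;
}

// hypothetical helper: prefer the host exactly as given, and only fall
// back to the "www." form if the bare host does not resolve
static std::string pickFetchHost ( const std::string &host ) {
        if ( resolves ( host ) ) return host;
        std::string www = "www." + host;
        if ( resolves ( www ) ) return www;
        // neither resolves; return the original and let the fetch fail
        return host;
}

int main ( ) {
        // tmblr.co resolved while www.tmblr.co did not, so hard-coding
        // addWWW=true before the IP lookup broke injection of its urls
        printf ( "fetch host: %s\n" , pickFetchHost ( "tmblr.co" ).c_str() );
        return 0;
}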