no longer add www to a url's domain if it is just

a domain. was messing up tmblr.co, where www.tmblr.co
has no IP.
Matt Wells 2014-06-05 17:00:12 -07:00
parent 3c6a8bf87e
commit 7f10fca234

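A minimal sketch of the behavior being turned off here, using a hypothetical helper (normalizeHost is not the real Url::set, just an assumed simplification of its addWWW flag): with addWWW = true a bare domain like tmblr.co is rewritten to www.tmblr.co, which has no IP, so injection fails; with addWWW = false the original host is kept.

#include <string>
#include <iostream>

// hypothetical simplification of the addWWW flag: prepend "www." to any
// host that does not already start with it
static std::string normalizeHost ( const std::string &host , bool addWWW ) {
	if ( addWWW && host.rfind ( "www." , 0 ) != 0 )
		return "www." + host;
	return host;
}

int main ( ) {
	// tmblr.co/ZHw5yo1E5TAaW injection: the www form has no IP
	std::cout << normalizeHost ( "tmblr.co" , true  ) << "\n"; // www.tmblr.co
	std::cout << normalizeHost ( "tmblr.co" , false ) << "\n"; // tmblr.co
	return 0;
}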

@@ -1315,7 +1315,9 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
}
else {
// add www is now REQUIRED for all!
setFirstUrl ( sreq->m_url , true ); // false );
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
// www.tmblr.co has no IP
setFirstUrl ( sreq->m_url , false );//true ); // false );
// you can't call this from a docid based url until you
// know the uh48
//setSpideredTime();
@@ -1934,7 +1936,9 @@ bool XmlDoc::injectDoc ( char *url ,
// normalize url
Url uu;
uu.set(url,gbstrlen(url),true);
// do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
// which has no www.tmblr.co IP!
uu.set(url,gbstrlen(url),false);//true);
// remove >'s i guess and store in st1->m_url[] buffer
char cleanUrl[MAX_URL_LEN+1];
@@ -12300,6 +12304,7 @@ long *XmlDoc::getIp ( ) {
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
if ( m_sreqValid && m_sreq.m_isScraping ) delay = 0;
if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0;
// . don't do the delay when downloading extra doc, robots.txt etc.
// . this also reports a status msg of "getting new doc" when it
// really means "delaying spider"
@@ -12363,7 +12368,6 @@ long *XmlDoc::gotIp ( bool save ) {
if ( g_errno ) return NULL;
// this is bad too
//if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
// note it
//log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());
setStatus ("got ip");
@@ -12371,6 +12375,11 @@ long *XmlDoc::gotIp ( bool save ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
// note it for crawlbot
if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) )
log("db: got ip %li for %s",
m_ip,getCurrentUrl()->getUrl());
bool useTestCache = false;
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
// unless its the pagesubmit.cpp event submission tool
@@ -23716,8 +23725,10 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
// before it was not!
//if ( flags & LF_OLDLINK ) continue;
// set it. addWWW = true!
Url url; url.set ( s , slen , true );
// set it. addWWW = true! no.. make it false because of issues
// like tmblr.co/ZHw5yo1E5TAaW injection where
// www.tmblr.co has no IP
Url url; url.set ( s , slen , false ); // true );
// if hostname length is <= 2 then SILENTLY reject it
if ( url.getHostLen() <= 2 ) continue;
@@ -24879,6 +24890,7 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
Url iu;
// use "pageUrl" as the baseUrl
Url *cu = getCurrentUrl();
// we can addwww to normalize since this is for deduping kinda
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
char *u = iu.getUrl ();
long ulen = iu.getUrlLen();
@@ -25785,6 +25797,8 @@ bool XmlDoc::hashLinks ( HashTableX *tt ) {
// it's to cnn.com or www.cnn.com.
// Every now and then we add new session ids to our list in
// Url.cpp, too, so we have to version that.
// Since this is just for hashing, it shouldn't matter that
// www.tmblr.co has no IP whereas only tmblr.co does.
link.set ( m_links.m_linkPtrs[i] ,
m_links.m_linkLens[i] ,
true , // addWWW?
@@ -27450,8 +27464,9 @@ Url *XmlDoc::getBaseUrl ( ) {
if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
Url *cu = getCurrentUrl();
if ( ! cu || cu == (void *)-1 ) return (Url *)cu;
// set it
m_baseUrl.set ( cu , true ); // addWWW = true
// no longer set addWWW to true since tmblr.co has an IP but
// www.tmblr.co does not
m_baseUrl.set ( cu , false ); // addWWW = true
// look for base url
for ( long i=0 ; i < xml->getNumNodes() ; i++ ) {
// 12 is the <base href> tag id