mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
no longer force adding www to the url domain if it is just
a domain. was messing up tmblr.co where www.tmblr.co has no IP.
This commit is contained in:
parent
3c6a8bf87e
commit
7f10fca234
29
XmlDoc.cpp
29
XmlDoc.cpp
@ -1315,7 +1315,9 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
}
|
||||
else {
|
||||
// add www is now REQUIRED for all!
|
||||
setFirstUrl ( sreq->m_url , true ); // false );
|
||||
// crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because
|
||||
// www.tmblr.co has no IP
|
||||
setFirstUrl ( sreq->m_url , false );//true ); // false );
|
||||
// you can't call this from a docid based url until you
|
||||
// know the uh48
|
||||
//setSpideredTime();
|
||||
@ -1934,7 +1936,9 @@ bool XmlDoc::injectDoc ( char *url ,
|
||||
|
||||
// normalize url
|
||||
Url uu;
|
||||
uu.set(url,gbstrlen(url),true);
|
||||
// do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
|
||||
// which has no www.tmblr.co IP!
|
||||
uu.set(url,gbstrlen(url),false);//true);
|
||||
|
||||
// remove >'s i guess and store in st1->m_url[] buffer
|
||||
char cleanUrl[MAX_URL_LEN+1];
|
||||
@ -12300,6 +12304,7 @@ long *XmlDoc::getIp ( ) {
|
||||
if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0;
|
||||
if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0;
|
||||
if ( m_sreqValid && m_sreq.m_isScraping ) delay = 0;
|
||||
if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0;
|
||||
// . don't do the delay when downloading extra doc, robots.txt etc.
|
||||
// . this also reports a status msg of "getting new doc" when it
|
||||
// really means "delaying spider"
|
||||
@ -12363,7 +12368,6 @@ long *XmlDoc::gotIp ( bool save ) {
|
||||
if ( g_errno ) return NULL;
|
||||
// this is bad too
|
||||
//if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP;
|
||||
// note it
|
||||
//log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl());
|
||||
|
||||
setStatus ("got ip");
|
||||
@ -12371,6 +12375,11 @@ long *XmlDoc::gotIp ( bool save ) {
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
// note it for crawlbot
|
||||
if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) )
|
||||
log("db: got ip %li for %s",
|
||||
m_ip,getCurrentUrl()->getUrl());
|
||||
|
||||
bool useTestCache = false;
|
||||
if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true;
|
||||
// unless its the pagesubmit.cpp event submission tool
|
||||
@ -23716,8 +23725,10 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
|
||||
// before it was not!
|
||||
//if ( flags & LF_OLDLINK ) continue;
|
||||
|
||||
// set it. addWWW = true!
|
||||
Url url; url.set ( s , slen , true );
|
||||
// set it. addWWW = true! no.. make it false because of issues
|
||||
// like tmblr.co/ZHw5yo1E5TAaW injection where
|
||||
// www.tmblr.co has no IP
|
||||
Url url; url.set ( s , slen , false ); // true );
|
||||
|
||||
// if hostname length is <= 2 then SILENTLY reject it
|
||||
if ( url.getHostLen() <= 2 ) continue;
|
||||
@ -24879,6 +24890,7 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
|
||||
Url iu;
|
||||
// use "pageUrl" as the baseUrl
|
||||
Url *cu = getCurrentUrl();
|
||||
// we can addwww to normalize since this is for deduping kinda
|
||||
iu.set ( cu , src , srcLen , true ); // addWWW? yes...
|
||||
char *u = iu.getUrl ();
|
||||
long ulen = iu.getUrlLen();
|
||||
@ -25785,6 +25797,8 @@ bool XmlDoc::hashLinks ( HashTableX *tt ) {
|
||||
// it's to cnn.com or www.cnn.com.
|
||||
// Every now and then we add new session ids to our list in
|
||||
// Url.cpp, too, so we have to version that.
|
||||
// Since this is just for hashing, it shouldn't matter that
|
||||
// www.tmblr.co has no IP whereas only tmblr.co does.
|
||||
link.set ( m_links.m_linkPtrs[i] ,
|
||||
m_links.m_linkLens[i] ,
|
||||
true , // addWWW?
|
||||
@ -27450,8 +27464,9 @@ Url *XmlDoc::getBaseUrl ( ) {
|
||||
if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml;
|
||||
Url *cu = getCurrentUrl();
|
||||
if ( ! cu || cu == (void *)-1 ) return (Url *)cu;
|
||||
// set it
|
||||
m_baseUrl.set ( cu , true ); // addWWW = true
|
||||
// no longer set addWWW to true since tmblr.co has an IP but
|
||||
// www.tmblr.co does not
|
||||
m_baseUrl.set ( cu , false ); // addWWW = false
|
||||
// look for base url
|
||||
for ( long i=0 ; i < xml->getNumNodes() ; i++ ) {
|
||||
// 12 is the <base href> tag id
|
||||
|
Loading…
Reference in New Issue
Block a user