From ad88ea8ba91767a71372ab45575be5088de0ef2b Mon Sep 17 00:00:00 2001 From: Matt Wells Date: Thu, 30 Apr 2015 11:11:27 -0700 Subject: [PATCH] fix gbss related cores. fix bn.com crawling redir bug. --- XmlDoc.cpp | 51 +++++++++++++++++++++++++++++++++++++++------------ XmlDoc.h | 2 ++ 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/XmlDoc.cpp b/XmlDoc.cpp index 008cb94e..8cb06d3e 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -187,8 +187,13 @@ XmlDoc::~XmlDoc() { static int64_t s_lastTimeStart = 0LL; +// for debugging +class XmlDoc *g_xd; + void XmlDoc::reset ( ) { + m_redirUrl.reset(); + m_ipStartTime = 0; m_ipEndTime = 0; m_diffbotReplyRetries = 0; @@ -10029,6 +10034,19 @@ int64_t XmlDoc::getFirstUrlHash64() { return m_firstUrlHash64; } +Url **XmlDoc::getLastRedirUrl() { + + Url **ru = getRedirUrl(); + if ( ! ru || ru == (void *)-1 ) return ru; + + // m_redirUrlPtr will be NULL in all cases, however, the + // last redir url we actually got will be set in + // m_redirUrl.m_url so return that. + m_lastRedirUrlPtr = &m_redirUrl; + return &m_lastRedirUrlPtr; +} + + // . operates on the latest m_httpReply Url **XmlDoc::getRedirUrl() { if ( m_redirUrlValid ) return &m_redirUrlPtr; @@ -25476,7 +25494,9 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) { //isInSeedBuf(cr,m_firstUrl.getUrl(),m_firstUrl.getUrlLen() ) && m_hopCount == 0 && m_redirUrlValid && - m_redirUrlPtr && + ptr_redirUrl && + //m_redirUrlPtr && (this gets reset to NULL as being LAST redir) + // this is the last non-empty redir here: m_redirUrl.getUrlLen() > 0 ) { log("build: seed REDIR: %s",m_redirUrl.getUrl()); redirDomHash32 = m_redirUrl.getDomainHash32(); @@ -27511,11 +27531,12 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) { ":%.01f,\n", m_percentChanged); - jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n", - *priority); + if ( ! m_isDiffbotJSONObject ) + jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n", + *priority); // this could be -1, careful - if ( *ufn >= 0 ) + if ( *ufn >= 0 && ! m_isDiffbotJSONObject ) jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n", cr->m_regExs[*ufn].getBufStart()); @@ -27534,31 +27555,36 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) { // do not show the -1 any more, just leave it out then // to make things look prettier - if ( m_crawlDelayValid && m_crawlDelay >= 0 ) + if ( m_crawlDelayValid && m_crawlDelay >= 0 && + ! m_isDiffbotJSONObject ) // -1 if none? jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n", (int32_t)m_crawlDelay); // was this url ever sent to diffbot either now or at a previous // spider time? - jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n", - (int)m_sentToDiffbot); + if ( ! m_isDiffbotJSONObject ) { + jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n", + (int)m_sentToDiffbot); - // sent to diffbot? - jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n", - (int)m_sentToDiffbotThisTime); + // sent to diffbot? + jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n", + (int)m_sentToDiffbotThisTime); + } // page must have been downloaded for this one if ( cr->m_isCustomCrawl && m_utf8ContentValid && + ! m_isDiffbotJSONObject && m_content && + m_contentValid && cr->m_diffbotPageProcessPattern.getBufStart() && cr->m_diffbotPageProcessPattern.getBufStart()[0] ) { char match = doesPageContentMatchDiffbotProcessPattern(); jd.safePrintf("\"gbssMatchesPageProcessPattern\":%i,\n", (int)match); } - if ( cr->m_isCustomCrawl && m_firstUrlValid ) { + if ( cr->m_isCustomCrawl && m_firstUrlValid && !m_isDiffbotJSONObject){ char *url = getFirstUrl()->getUrl(); @@ -27602,7 +27628,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) { - if ( m_diffbotReplyValid && m_sentToDiffbotThisTime ) { + if ( m_diffbotReplyValid && m_sentToDiffbotThisTime && + ! m_isDiffbotJSONObject ) { jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n", m_diffbotReplyError); jd.safePrintf("\"gbssDiffbotReplyMsg\":\""); diff --git a/XmlDoc.h b/XmlDoc.h index 04accb76..b055b201 100644 --- a/XmlDoc.h +++ b/XmlDoc.h @@ -600,6 +600,7 @@ class XmlDoc { class Url *getFirstUrl() ; int64_t getFirstUrlHash48(); int64_t getFirstUrlHash64(); + class Url **getLastRedirUrl() ; class Url **getRedirUrl() ; class Url **getMetaRedirUrl() ; class Url **getCanonicalRedirUrl ( ) ; @@ -1009,6 +1010,7 @@ class XmlDoc { Url m_redirUrl; Url *m_redirUrlPtr; + Url *m_lastRedirUrlPtr; SafeBuf m_redirCookieBuf; Url m_metaRedirUrl; Url *m_metaRedirUrlPtr;