fix gbss related cores. fix bn.com crawling redir bug.

This commit is contained in:
Matt Wells 2015-04-30 11:11:27 -07:00
parent 29a6d7a085
commit ad88ea8ba9
2 changed files with 41 additions and 12 deletions

View File

@ -187,8 +187,13 @@ XmlDoc::~XmlDoc() {
static int64_t s_lastTimeStart = 0LL;
// for debugging
class XmlDoc *g_xd;
void XmlDoc::reset ( ) {
m_redirUrl.reset();
m_ipStartTime = 0;
m_ipEndTime = 0;
m_diffbotReplyRetries = 0;
@ -10029,6 +10034,19 @@ int64_t XmlDoc::getFirstUrlHash64() {
return m_firstUrlHash64;
}
Url **XmlDoc::getLastRedirUrl() {
Url **ru = getRedirUrl();
if ( ! ru || ru == (void *)-1 ) return ru;
// m_redirUrlPtr will be NULL in all cases, however, the
// last redir url we actually got will be set in
// m_redirUrl.m_url so return that.
m_lastRedirUrlPtr = &m_redirUrl;
return &m_lastRedirUrlPtr;
}
// . operates on the latest m_httpReply
Url **XmlDoc::getRedirUrl() {
if ( m_redirUrlValid ) return &m_redirUrlPtr;
@ -25476,7 +25494,9 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
//isInSeedBuf(cr,m_firstUrl.getUrl(),m_firstUrl.getUrlLen() ) &&
m_hopCount == 0 &&
m_redirUrlValid &&
m_redirUrlPtr &&
ptr_redirUrl &&
//m_redirUrlPtr && (this gets reset to NULL as being LAST redir)
// this is the last non-empty redir here:
m_redirUrl.getUrlLen() > 0 ) {
log("build: seed REDIR: %s",m_redirUrl.getUrl());
redirDomHash32 = m_redirUrl.getDomainHash32();
@ -27511,11 +27531,12 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
":%.01f,\n",
m_percentChanged);
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
*priority);
if ( ! m_isDiffbotJSONObject )
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
*priority);
// this could be -1, careful
if ( *ufn >= 0 )
if ( *ufn >= 0 && ! m_isDiffbotJSONObject )
jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
cr->m_regExs[*ufn].getBufStart());
@ -27534,31 +27555,36 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
// do not show the -1 any more, just leave it out then
// to make things look prettier
if ( m_crawlDelayValid && m_crawlDelay >= 0 )
if ( m_crawlDelayValid && m_crawlDelay >= 0 &&
! m_isDiffbotJSONObject )
// -1 if none?
jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n",
(int32_t)m_crawlDelay);
// was this url ever sent to diffbot either now or at a previous
// spider time?
jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n",
(int)m_sentToDiffbot);
if ( ! m_isDiffbotJSONObject ) {
jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n",
(int)m_sentToDiffbot);
// sent to diffbot?
jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n",
(int)m_sentToDiffbotThisTime);
// sent to diffbot?
jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n",
(int)m_sentToDiffbotThisTime);
}
// page must have been downloaded for this one
if ( cr->m_isCustomCrawl &&
m_utf8ContentValid &&
! m_isDiffbotJSONObject &&
m_content &&
m_contentValid &&
cr->m_diffbotPageProcessPattern.getBufStart() &&
cr->m_diffbotPageProcessPattern.getBufStart()[0] ) {
char match = doesPageContentMatchDiffbotProcessPattern();
jd.safePrintf("\"gbssMatchesPageProcessPattern\":%i,\n",
(int)match);
}
if ( cr->m_isCustomCrawl && m_firstUrlValid ) {
if ( cr->m_isCustomCrawl && m_firstUrlValid && !m_isDiffbotJSONObject){
char *url = getFirstUrl()->getUrl();
@ -27602,7 +27628,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
if ( m_diffbotReplyValid && m_sentToDiffbotThisTime ) {
if ( m_diffbotReplyValid && m_sentToDiffbotThisTime &&
! m_isDiffbotJSONObject ) {
jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n",
m_diffbotReplyError);
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");

View File

@ -600,6 +600,7 @@ class XmlDoc {
class Url *getFirstUrl() ;
int64_t getFirstUrlHash48();
int64_t getFirstUrlHash64();
class Url **getLastRedirUrl() ;
class Url **getRedirUrl() ;
class Url **getMetaRedirUrl() ;
class Url **getCanonicalRedirUrl ( ) ;
@ -1009,6 +1010,7 @@ class XmlDoc {
Url m_redirUrl;
Url *m_redirUrlPtr;
Url *m_lastRedirUrlPtr;
SafeBuf m_redirCookieBuf;
Url m_metaRedirUrl;
Url *m_metaRedirUrlPtr;