mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
fix gbss related cores. fix bn.com crawling redir bug.
This commit is contained in:
parent
29a6d7a085
commit
ad88ea8ba9
51
XmlDoc.cpp
51
XmlDoc.cpp
@ -187,8 +187,13 @@ XmlDoc::~XmlDoc() {
|
||||
|
||||
static int64_t s_lastTimeStart = 0LL;
|
||||
|
||||
// for debugging
|
||||
class XmlDoc *g_xd;
|
||||
|
||||
void XmlDoc::reset ( ) {
|
||||
|
||||
m_redirUrl.reset();
|
||||
|
||||
m_ipStartTime = 0;
|
||||
m_ipEndTime = 0;
|
||||
m_diffbotReplyRetries = 0;
|
||||
@ -10029,6 +10034,19 @@ int64_t XmlDoc::getFirstUrlHash64() {
|
||||
return m_firstUrlHash64;
|
||||
}
|
||||
|
||||
// . returns the address of m_lastRedirUrlPtr, which is pointed at m_redirUrl
// . if getRedirUrl() hands back NULL or -1 that value is returned unchanged
//   (presumably the usual error / would-block sentinels -- confirm against
//   getRedirUrl() itself, which is not fully visible here)
Url **XmlDoc::getLastRedirUrl() {

	// run the normal redirect logic first
	Url **redirPtrPtr = getRedirUrl();

	// NULL and -1 are passed straight through to the caller
	if ( redirPtrPtr == NULL ) return redirPtrPtr;
	if ( redirPtrPtr == (void *)-1 ) return redirPtrPtr;

	// per the original note: m_redirUrlPtr will be NULL in all cases
	// at this point, but the last redir url we actually got is still
	// stored in m_redirUrl, so expose that one instead
	m_lastRedirUrlPtr = &m_redirUrl;

	return &m_lastRedirUrlPtr;
}
|
||||
|
||||
|
||||
// . operates on the latest m_httpReply
|
||||
Url **XmlDoc::getRedirUrl() {
|
||||
if ( m_redirUrlValid ) return &m_redirUrlPtr;
|
||||
@ -25476,7 +25494,9 @@ char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) {
|
||||
//isInSeedBuf(cr,m_firstUrl.getUrl(),m_firstUrl.getUrlLen() ) &&
|
||||
m_hopCount == 0 &&
|
||||
m_redirUrlValid &&
|
||||
m_redirUrlPtr &&
|
||||
ptr_redirUrl &&
|
||||
//m_redirUrlPtr && (this gets reset to NULL as being LAST redir)
|
||||
// this is the last non-empty redir here:
|
||||
m_redirUrl.getUrlLen() > 0 ) {
|
||||
log("build: seed REDIR: %s",m_redirUrl.getUrl());
|
||||
redirDomHash32 = m_redirUrl.getDomainHash32();
|
||||
@ -27511,11 +27531,12 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
||||
":%.01f,\n",
|
||||
m_percentChanged);
|
||||
|
||||
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
|
||||
*priority);
|
||||
if ( ! m_isDiffbotJSONObject )
|
||||
jd.safePrintf("\"gbssSpiderPriority\":%"INT32",\n",
|
||||
*priority);
|
||||
|
||||
// this could be -1, careful
|
||||
if ( *ufn >= 0 )
|
||||
if ( *ufn >= 0 && ! m_isDiffbotJSONObject )
|
||||
jd.safePrintf("\"gbssMatchingUrlFilter\":\"%s\",\n",
|
||||
cr->m_regExs[*ufn].getBufStart());
|
||||
|
||||
@ -27534,31 +27555,36 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
||||
|
||||
// do not show the -1 any more, just leave it out then
|
||||
// to make things look prettier
|
||||
if ( m_crawlDelayValid && m_crawlDelay >= 0 )
|
||||
if ( m_crawlDelayValid && m_crawlDelay >= 0 &&
|
||||
! m_isDiffbotJSONObject )
|
||||
// -1 if none?
|
||||
jd.safePrintf("\"gbssCrawlDelayMS\":%"INT32",\n",
|
||||
(int32_t)m_crawlDelay);
|
||||
|
||||
// was this url ever sent to diffbot either now or at a previous
|
||||
// spider time?
|
||||
jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n",
|
||||
(int)m_sentToDiffbot);
|
||||
if ( ! m_isDiffbotJSONObject ) {
|
||||
jd.safePrintf("\"gbssSentToDiffbotAtSomeTime\":%i,\n",
|
||||
(int)m_sentToDiffbot);
|
||||
|
||||
// sent to diffbot?
|
||||
jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n",
|
||||
(int)m_sentToDiffbotThisTime);
|
||||
// sent to diffbot?
|
||||
jd.safePrintf("\"gbssSentToDiffbotThisTime\":%i,\n",
|
||||
(int)m_sentToDiffbotThisTime);
|
||||
}
|
||||
|
||||
// page must have been downloaded for this one
|
||||
if ( cr->m_isCustomCrawl &&
|
||||
m_utf8ContentValid &&
|
||||
! m_isDiffbotJSONObject &&
|
||||
m_content &&
|
||||
m_contentValid &&
|
||||
cr->m_diffbotPageProcessPattern.getBufStart() &&
|
||||
cr->m_diffbotPageProcessPattern.getBufStart()[0] ) {
|
||||
char match = doesPageContentMatchDiffbotProcessPattern();
|
||||
jd.safePrintf("\"gbssMatchesPageProcessPattern\":%i,\n",
|
||||
(int)match);
|
||||
}
|
||||
if ( cr->m_isCustomCrawl && m_firstUrlValid ) {
|
||||
if ( cr->m_isCustomCrawl && m_firstUrlValid && !m_isDiffbotJSONObject){
|
||||
|
||||
char *url = getFirstUrl()->getUrl();
|
||||
|
||||
@ -27602,7 +27628,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
||||
|
||||
|
||||
|
||||
if ( m_diffbotReplyValid && m_sentToDiffbotThisTime ) {
|
||||
if ( m_diffbotReplyValid && m_sentToDiffbotThisTime &&
|
||||
! m_isDiffbotJSONObject ) {
|
||||
jd.safePrintf("\"gbssDiffbotReplyCode\":%"INT32",\n",
|
||||
m_diffbotReplyError);
|
||||
jd.safePrintf("\"gbssDiffbotReplyMsg\":\"");
|
||||
|
2
XmlDoc.h
2
XmlDoc.h
@ -600,6 +600,7 @@ class XmlDoc {
|
||||
class Url *getFirstUrl() ;
|
||||
int64_t getFirstUrlHash48();
|
||||
int64_t getFirstUrlHash64();
|
||||
class Url **getLastRedirUrl() ;
|
||||
class Url **getRedirUrl() ;
|
||||
class Url **getMetaRedirUrl() ;
|
||||
class Url **getCanonicalRedirUrl ( ) ;
|
||||
@ -1009,6 +1010,7 @@ class XmlDoc {
|
||||
|
||||
Url m_redirUrl;
|
||||
Url *m_redirUrlPtr;
|
||||
Url *m_lastRedirUrlPtr;
|
||||
SafeBuf m_redirCookieBuf;
|
||||
Url m_metaRedirUrl;
|
||||
Url *m_metaRedirUrlPtr;
|
||||
|
Loading…
Reference in New Issue
Block a user