if the old title rec was corrupted we would get a random docid

getting a random docid when re-spidering the url caused some chaos. now
things should return to normal and we should overwrite the corrupted
titlerec on the next spidering. also, we no longer do titlerec lookups
for robots.txt urls; that was silly.
Matt Wells 2016-03-15 23:26:57 -07:00
parent 58993dbbf9
commit 0b5f417349
4 changed files with 69 additions and 9 deletions
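The gist of the main change, before the per-file hunks: when the stored title rec fails to uncompress, the docid is salvaged from the titledb key instead of letting the doc get a fresh (random) docid, and the result is sanity-checked against the probable-docid range for the url. Below is a simplified standalone sketch of that control flow, not the actual gb code; chooseDocId() and its parameters are hypothetical stand-ins for the XmlDoc members and g_titledb calls that appear in the getDocId() hunk further down.

#include <cstdint>
#include <cstdio>

// Sketch of the fallback added to XmlDoc::getDocId(): prefer the docid from a
// successfully parsed old title rec, otherwise take the avail docid from the
// Msg22 lookup, and if that is zero but a (corrupt) title rec was present,
// salvage the docid from the rec's key. Finally verify the docid lands in the
// probable-docid range [d1,d2] derived from the url hash. In the real code the
// inputs come from m_oldTitleRec, m_msg22a and g_titledb; here they are plain
// parameters so the sketch compiles on its own.
static int64_t chooseDocId(int64_t docIdFromOldDoc,   // 0 if the old doc was unusable
                           int64_t availDocId,        // from the Msg22 lookup
                           int64_t docIdFromRecKey,   // decoded from the titledb key
                           bool haveCorruptTitleRec,  // rec present but failed to parse
                           int64_t d1, int64_t d2) {  // probable-docid range for the url
	int64_t docId = docIdFromOldDoc ? docIdFromOldDoc : availDocId;
	// the corrupt rec's body would not uncompress, but its key (and so its
	// docid) is intact, so reuse it rather than spidering under a new docid
	if (docId == 0 && haveCorruptTitleRec) {
		docId = docIdFromRecKey;
		fprintf(stderr, "salvaged docid %lld from corrupt title rec\n",
		        (long long)docId);
	}
	// out of range means something else is wrong; the diff crashes here on
	// purpose (and skips this check entirely when the time axis is enabled)
	if (docId < d1 || docId > d2) return -1;
	return docId;
}

int main() {
	// old doc unusable, corrupt rec present: salvage docid 1234, in range
	printf("%lld\n", (long long)chooseDocId(0, 0, 1234, true, 1000, 2000));
	return 0;
}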

View File

@@ -77,11 +77,12 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
int32_t timeout ,
bool doLoadBalancing ) {
m_availDocId = 0;
// sanity
if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
if ( getAvailDocIdOnly && url ) { char *xx=NULL;*xx=0; }
//if ( m_url ) log(LOG_DEBUG,"build: getting TitleRec for %s",m_url);
//if ( url ) log(LOG_DEBUG,"build: getting TitleRec for %s",url);
// sanity checks
if ( url && docId!=0LL ) { char *xx=NULL;*xx=0; }
if ( url && !url[0] ) { char *xx=NULL;*xx=0; }
@@ -857,6 +858,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
// set probable docid
int64_t pd = 0LL;
if ( r->m_url[0] ) {
//log("msg22: url= %s",r->m_url);
pd = g_titledb.getProbableDocId(r->m_url);
if ( pd != st->m_pd ) {
log("db: crap probable docids do not match! u=%s",

View File

@@ -315,6 +315,8 @@ void XmlDoc::reset ( ) {
m_sentToDiffbot = 0;
m_gotDiffbotSuccessfulReply = 0;
// we need to reset this to false
m_useTimeAxis = false;
m_sentToDiffbotThisTime = false;
@@ -11405,8 +11407,7 @@ Url **XmlDoc::getRedirUrl() {
Url *loc = NULL;
// quickly see if we are a robots.txt url originally
Url *fu = getFirstUrl();
bool isRobotsTxt = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
bool isRobotsTxt = isFirstUrlRobotsTxt ( );
//
// check for <meta http-equiv="Refresh" content="1; URL=contact.htm">
@@ -12737,8 +12738,32 @@ int64_t *XmlDoc::getDocId ( ) {
setStatus ("getting docid");
// . set our docid
// . *od is NULL if no title rec found with that docid in titledb
if ( *od ) m_docId = *(*od)->getDocId();
else m_docId = m_msg22a.getAvailDocId();
if ( *od ) {
m_docId = *(*od)->getDocId();
m_docIdValid = true;
return &m_docId;
}
m_docId = m_msg22a.getAvailDocId();
// if the titlerec was there but od is NULL, it had an error uncompressing
// because of the corruption bug in RdbMem.cpp when dumping to disk.
if ( m_docId == 0 && m_oldTitleRec && m_oldTitleRecSize > 12 ) {
m_docId = g_titledb.getDocIdFromKey ( (key_t *)m_oldTitleRec );
log("build: salvaged docid %"INT64" from corrupt title rec "
"for %s",m_docId,m_firstUrl.m_url);
}
// ensure it is within probable range
if ( ! getUseTimeAxis () ) {
char *u = getFirstUrl()->getUrl();
int64_t pd = g_titledb.getProbableDocId(u);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
if ( m_docId < d1 || m_docId > d2 ) {
char *xx=NULL;*xx=0; }
}
// if docid is zero, none is available!!!
//if ( m_docId == 0LL ) m_indexCode = ENODOCID;
m_docIdValid = true;
@@ -14999,6 +15024,14 @@ int32_t *XmlDoc::getFinalCrawlDelay() {
return &m_finalCrawlDelay;
}
bool XmlDoc::isFirstUrlRobotsTxt ( ) {
if ( m_isRobotsTxtUrlValid )
return m_isRobotsTxtUrl;
Url *fu = getFirstUrl();
m_isRobotsTxtUrl = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
m_isRobotsTxtUrlValid = true;
return m_isRobotsTxtUrl;
}
// . get the Robots.txt and see if we are allowed
// . returns NULL and sets g_errno on error
@@ -15056,9 +15089,7 @@ bool *XmlDoc::getIsAllowed ( ) {
// . if WE are robots.txt that is always allowed!!!
// . check the *first* url since these often redirect to weird things
Url *fu = getFirstUrl();
bool isRobotsTxt = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
if ( isRobotsTxt ) {
if ( isFirstUrlRobotsTxt() ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
@@ -15080,6 +15111,7 @@ bool *XmlDoc::getIsAllowed ( ) {
int32_t *ip = getIp ();
// error? or blocked?
if ( ! ip || ip == (void *)-1 ) return (bool *)ip;
Url *fu = getFirstUrl();
// if ip does not exist on the dns, do not try to download robots.txt
// it is pointless... this can happen in the dir coll and we basically
// have "m_siteInCatdb" set to true
@@ -17281,7 +17313,9 @@ char **XmlDoc::getHttpReply2 ( ) {
//if ( ! hc || hc == (void *)-1 ) return (char **)hc;
XmlDoc *od = NULL;
if ( ! m_isSpiderProxy ) {
if ( ! m_isSpiderProxy &&
// don't lookup xyz.com/robots.txt in titledb
! isFirstUrlRobotsTxt() ) {
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (char **)pod;
// get ptr to old xml doc, could be NULL if none exists
@@ -22098,6 +22132,16 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
if ( m_docIdValid )
sb->safePrintf("docid=%"UINT64" ",m_docId);
char *u = getFirstUrl()->getUrl();
int64_t pd = g_titledb.getProbableDocId(u);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
sb->safePrintf("probdocid=%"UINT64" ",pd);
sb->safePrintf("probdocidmin=%"UINT64" ",d1);
sb->safePrintf("probdocidmax=%"UINT64" ",d2);
sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
if ( m_siteNumInlinksValid ) {
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
// sb->safePrintf("siteipinlinks=%"INT32" ",
@@ -30738,6 +30782,12 @@ bool XmlDoc::getUseTimeAxis ( ) {
if ( ! cr ) return false;
m_useTimeAxis = cr->m_useTimeAxis;
m_useTimeAxisValid = true;
// sanity check
// if ( cr->m_isCustomCrawl && m_useTimeAxis ) {
// log("build: custom crawls can't use time axis");
// char *xx=NULL;*xx=0;
// m_useTimeAxis = false;
// }
return m_useTimeAxis;
}

View File

@@ -1204,6 +1204,9 @@ class XmlDoc {
HashTableX m_vctab;
HashTableX m_vcduptab;
bool isFirstUrlRobotsTxt();
bool m_isRobotsTxtUrl;
Images m_images;
HashTableX m_countTable;
HttpMime m_mime;
@@ -1260,6 +1263,7 @@ class XmlDoc {
char m_filteredRootTitleBufValid;
char m_titleBufValid;
char m_fragBufValid;
char m_isRobotsTxtUrlValid;
char m_inlineSectionVotingBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;

View File

@@ -6564,6 +6564,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
"cs=%04d "
"lang=%02d "
"sni=%03"INT32" "
"usetimeaxis=%i "
//"cats=%"INT32" "
"lastspidered=%s "
"ip=%s "
@@ -6589,6 +6590,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
xd->m_charset,//tr.getCharset(),
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDo
xd->m_useTimeAxis,
//nc,
ppp,
iptoa(xd->m_ip),//ipbuf ,
@@ -6692,6 +6694,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
"ctype=%s "
"lang=%02d "
"sni=%03"INT32" "
"usetimeaxis=%i "
//"cats=%"INT32" "
"lastspidered=%s "
"ip=%s "
@@ -6718,6 +6721,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
g_contentTypeStrings[xd->m_contentType],
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
xd->m_useTimeAxis,
//nc,
ppp,
iptoa(xd->m_ip),//ipbuf ,