if the old title rec was corrupted we would get a random docid

getting a random docid when re-spidering the url caused some chaos. now
things should return to normal and we should overwrite the corrupted
titlerec on the next spidering. also, we no longer do titlerec lookups
for robots.txt urls; that was silly.
Matt Wells 2016-03-15 23:26:57 -07:00
parent 58993dbbf9
commit 0b5f417349
4 changed files with 69 additions and 9 deletions
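The gist of the main change, before the per-file hunks: when the stored title rec fails to uncompress, the docid is salvaged from the titledb key instead of letting the doc get a fresh (random) docid, and the result is sanity-checked against the probable-docid range for the url. Below is a simplified standalone sketch of that control flow, not the actual gb code; chooseDocId() and its parameters are hypothetical stand-ins for the XmlDoc members and g_titledb calls that appear in the getDocId() hunk further down.

#include <cstdint>
#include <cstdio>

// Sketch of the fallback added to XmlDoc::getDocId(): prefer the docid from a
// successfully parsed old title rec, otherwise take the avail docid from the
// Msg22 lookup, and if that is zero but a (corrupt) title rec was present,
// salvage the docid from the rec's key. Finally verify the docid lands in the
// probable-docid range [d1,d2] derived from the url hash. In the real code the
// inputs come from m_oldTitleRec, m_msg22a and g_titledb; here they are plain
// parameters so the sketch compiles on its own.
static int64_t chooseDocId(int64_t docIdFromOldDoc,   // 0 if the old doc was unusable
                           int64_t availDocId,        // from the Msg22 lookup
                           int64_t docIdFromRecKey,   // decoded from the titledb key
                           bool haveCorruptTitleRec,  // rec present but failed to parse
                           int64_t d1, int64_t d2) {  // probable-docid range for the url
	int64_t docId = docIdFromOldDoc ? docIdFromOldDoc : availDocId;
	// the corrupt rec's body would not uncompress, but its key (and so its
	// docid) is intact, so reuse it rather than spidering under a new docid
	if (docId == 0 && haveCorruptTitleRec) {
		docId = docIdFromRecKey;
		fprintf(stderr, "salvaged docid %lld from corrupt title rec\n",
		        (long long)docId);
	}
	// out of range means something else is wrong; the diff crashes here on
	// purpose (and skips this check entirely when the time axis is enabled)
	if (docId < d1 || docId > d2) return -1;
	return docId;
}

int main() {
	// old doc unusable, corrupt rec present: salvage docid 1234, in range
	printf("%lld\n", (long long)chooseDocId(0, 0, 1234, true, 1000, 2000));
	return 0;
}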

View File

@@ -77,11 +77,12 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
int32_t timeout ,
bool doLoadBalancing ) {
m_availDocId = 0;
// sanity
if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
if ( getAvailDocIdOnly && url ) { char *xx=NULL;*xx=0; }
//if ( m_url ) log(LOG_DEBUG,"build: getting TitleRec for %s",m_url);
//if ( url ) log(LOG_DEBUG,"build: getting TitleRec for %s",url);
// sanity checks
if ( url && docId!=0LL ) { char *xx=NULL;*xx=0; }
if ( url && !url[0] ) { char *xx=NULL;*xx=0; }
@@ -857,6 +858,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
// set probable docid
int64_t pd = 0LL;
if ( r->m_url[0] ) {
//log("msg22: url= %s",r->m_url);
pd = g_titledb.getProbableDocId(r->m_url);
if ( pd != st->m_pd ) {
log("db: crap probable docids do not match! u=%s",

View File

@@ -315,6 +315,8 @@ void XmlDoc::reset ( ) {
m_sentToDiffbot = 0;
m_gotDiffbotSuccessfulReply = 0;
// we need to reset this to false
m_useTimeAxis = false;
m_sentToDiffbotThisTime = false;
@@ -11405,8 +11407,7 @@ Url **XmlDoc::getRedirUrl() {
Url *loc = NULL;
// quickly see if we are a robots.txt url originally
Url *fu = getFirstUrl();
bool isRobotsTxt = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
bool isRobotsTxt = isFirstUrlRobotsTxt ( );
//
// check for <meta http-equiv="Refresh" content="1; URL=contact.htm">
@@ -12737,8 +12738,32 @@ int64_t *XmlDoc::getDocId ( ) {
setStatus ("getting docid");
// . set our docid
// . *od is NULL if no title rec found with that docid in titledb
if ( *od ) m_docId = *(*od)->getDocId();
else m_docId = m_msg22a.getAvailDocId();
if ( *od ) {
m_docId = *(*od)->getDocId();
m_docIdValid = true;
return &m_docId;
}
m_docId = m_msg22a.getAvailDocId();
// if the titlerec was there but od is NULL, it had an error uncompressing
// because of the corruption bug in RdbMem.cpp when dumping to disk.
if ( m_docId == 0 && m_oldTitleRec && m_oldTitleRecSize > 12 ) {
m_docId = g_titledb.getDocIdFromKey ( (key_t *)m_oldTitleRec );
log("build: salvaged docid %"INT64" from corrupt title rec "
"for %s",m_docId,m_firstUrl.m_url);
}
// ensure it is within probable range
if ( ! getUseTimeAxis () ) {
char *u = getFirstUrl()->getUrl();
int64_t pd = g_titledb.getProbableDocId(u);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
if ( m_docId < d1 || m_docId > d2 ) {
char *xx=NULL;*xx=0; }
}
// if docid is zero, none is available!!!
//if ( m_docId == 0LL ) m_indexCode = ENODOCID;
m_docIdValid = true;
@@ -14999,6 +15024,14 @@ int32_t *XmlDoc::getFinalCrawlDelay() {
return &m_finalCrawlDelay;
}
bool XmlDoc::isFirstUrlRobotsTxt ( ) {
if ( m_isRobotsTxtUrlValid )
return m_isRobotsTxtUrl;
Url *fu = getFirstUrl();
m_isRobotsTxtUrl = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
m_isRobotsTxtUrlValid = true;
return m_isRobotsTxtUrl;
}
// . get the Robots.txt and see if we are allowed
// . returns NULL and sets g_errno on error
@@ -15056,9 +15089,7 @@ bool *XmlDoc::getIsAllowed ( ) {
// . if WE are robots.txt that is always allowed!!!
// . check the *first* url since these often redirect to weird things
Url *fu = getFirstUrl();
bool isRobotsTxt = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
if ( isRobotsTxt ) {
if ( isFirstUrlRobotsTxt() ) {
m_isAllowed = true;
m_isAllowedValid = true;
m_crawlDelayValid = true;
@@ -15080,6 +15111,7 @@ bool *XmlDoc::getIsAllowed ( ) {
int32_t *ip = getIp ();
// error? or blocked?
if ( ! ip || ip == (void *)-1 ) return (bool *)ip;
Url *fu = getFirstUrl();
// if ip does not exist on the dns, do not try to download robots.txt
// it is pointless... this can happen in the dir coll and we basically
// have "m_siteInCatdb" set to true
@@ -17281,7 +17313,9 @@ char **XmlDoc::getHttpReply2 ( ) {
//if ( ! hc || hc == (void *)-1 ) return (char **)hc;
XmlDoc *od = NULL;
if ( ! m_isSpiderProxy ) {
if ( ! m_isSpiderProxy &&
// don't lookup xyz.com/robots.txt in titledb
! isFirstUrlRobotsTxt() ) {
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod || pod == (XmlDoc **)-1 ) return (char **)pod;
// get ptr to old xml doc, could be NULL if none exists
@@ -22098,6 +22132,16 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
if ( m_docIdValid )
sb->safePrintf("docid=%"UINT64" ",m_docId);
char *u = getFirstUrl()->getUrl();
int64_t pd = g_titledb.getProbableDocId(u);
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
sb->safePrintf("probdocid=%"UINT64" ",pd);
sb->safePrintf("probdocidmin=%"UINT64" ",d1);
sb->safePrintf("probdocidmax=%"UINT64" ",d2);
sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
if ( m_siteNumInlinksValid ) {
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
// sb->safePrintf("siteipinlinks=%"INT32" ",
@@ -30738,6 +30782,12 @@ bool XmlDoc::getUseTimeAxis ( ) {
if ( ! cr ) return false;
m_useTimeAxis = cr->m_useTimeAxis;
m_useTimeAxisValid = true;
// sanity check
// if ( cr->m_isCustomCrawl && m_useTimeAxis ) {
// log("build: custom crawls can't use time axis");
// char *xx=NULL;*xx=0;
// m_useTimeAxis = false;
// }
return m_useTimeAxis;
}

View File

@@ -1204,6 +1204,9 @@ class XmlDoc {
HashTableX m_vctab;
HashTableX m_vcduptab;
bool isFirstUrlRobotsTxt();
bool m_isRobotsTxtUrl;
Images m_images;
HashTableX m_countTable;
HttpMime m_mime;
@@ -1260,6 +1263,7 @@ class XmlDoc {
char m_filteredRootTitleBufValid;
char m_titleBufValid;
char m_fragBufValid;
char m_isRobotsTxtUrlValid;
char m_inlineSectionVotingBufValid;
char m_wordSpamBufValid;
char m_finalSummaryBufValid;

View File

@@ -6564,6 +6564,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
"cs=%04d "
"lang=%02d "
"sni=%03"INT32" "
"usetimeaxis=%i "
//"cats=%"INT32" "
"lastspidered=%s "
"ip=%s "
@@ -6589,6 +6590,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
xd->m_charset,//tr.getCharset(),
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDo
xd->m_useTimeAxis,
//nc,
ppp,
iptoa(xd->m_ip),//ipbuf ,
@@ -6692,6 +6694,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
"ctype=%s "
"lang=%02d "
"sni=%03"INT32" "
"usetimeaxis=%i "
//"cats=%"INT32" "
"lastspidered=%s "
"ip=%s "
@@ -6718,6 +6721,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
g_contentTypeStrings[xd->m_contentType],
xd->m_langId,//tr.getLanguage(),
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
xd->m_useTimeAxis,
//nc,
ppp,
iptoa(xd->m_ip),//ipbuf ,