mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
If the old title rec was corrupted, we would get a random docid
when re-spidering the URL, causing some chaos. Now things should return to normal, and we should overwrite the corrupted titlerec on the next spidering. Also, we no longer do robots.txt titlerec lookups, which were silly.
This commit is contained in:
parent
58993dbbf9
commit
0b5f417349
@ -77,11 +77,12 @@ bool Msg22::getTitleRec ( Msg22Request *r ,
|
||||
int32_t timeout ,
|
||||
bool doLoadBalancing ) {
|
||||
|
||||
m_availDocId = 0;
|
||||
// sanity
|
||||
if ( getAvailDocIdOnly && justCheckTfndb ) { char *xx=NULL;*xx=0; }
|
||||
if ( getAvailDocIdOnly && url ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
//if ( m_url ) log(LOG_DEBUG,"build: getting TitleRec for %s",m_url);
|
||||
//if ( url ) log(LOG_DEBUG,"build: getting TitleRec for %s",url);
|
||||
// sanity checks
|
||||
if ( url && docId!=0LL ) { char *xx=NULL;*xx=0; }
|
||||
if ( url && !url[0] ) { char *xx=NULL;*xx=0; }
|
||||
@ -857,6 +858,7 @@ void gotTitleList ( void *state , RdbList *list , Msg5 *msg5 ) {
|
||||
// set probable docid
|
||||
int64_t pd = 0LL;
|
||||
if ( r->m_url[0] ) {
|
||||
//log("msg22: url= %s",r->m_url);
|
||||
pd = g_titledb.getProbableDocId(r->m_url);
|
||||
if ( pd != st->m_pd ) {
|
||||
log("db: crap probable docids do not match! u=%s",
|
||||
|
66
XmlDoc.cpp
66
XmlDoc.cpp
@ -315,6 +315,8 @@ void XmlDoc::reset ( ) {
|
||||
|
||||
m_sentToDiffbot = 0;
|
||||
m_gotDiffbotSuccessfulReply = 0;
|
||||
// we need to reset this to false
|
||||
m_useTimeAxis = false;
|
||||
|
||||
m_sentToDiffbotThisTime = false;
|
||||
|
||||
@ -11405,8 +11407,7 @@ Url **XmlDoc::getRedirUrl() {
|
||||
Url *loc = NULL;
|
||||
|
||||
// quickly see if we are a robots.txt url originally
|
||||
Url *fu = getFirstUrl();
|
||||
bool isRobotsTxt = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
|
||||
bool isRobotsTxt = isFirstUrlRobotsTxt ( );
|
||||
|
||||
//
|
||||
// check for <meta http-equiv="Refresh" content="1; URL=contact.htm">
|
||||
@ -12737,8 +12738,32 @@ int64_t *XmlDoc::getDocId ( ) {
|
||||
setStatus ("getting docid");
|
||||
// . set our docid
|
||||
// . *od is NULL if no title rec found with that docid in titledb
|
||||
if ( *od ) m_docId = *(*od)->getDocId();
|
||||
else m_docId = m_msg22a.getAvailDocId();
|
||||
if ( *od ) {
|
||||
m_docId = *(*od)->getDocId();
|
||||
m_docIdValid = true;
|
||||
return &m_docId;
|
||||
}
|
||||
|
||||
m_docId = m_msg22a.getAvailDocId();
|
||||
|
||||
// if titlerec was there but not od it had an error uncompressing
|
||||
// because of the corruption bug in RdbMem.cpp when dumping to disk.
|
||||
if ( m_docId == 0 && m_oldTitleRec && m_oldTitleRecSize > 12 ) {
|
||||
m_docId = g_titledb.getDocIdFromKey ( (key_t *)m_oldTitleRec );
|
||||
log("build: salvaged docid %"INT64" from corrupt title rec "
|
||||
"for %s",m_docId,m_firstUrl.m_url);
|
||||
}
|
||||
|
||||
// ensure it is within probable range
|
||||
if ( ! getUseTimeAxis () ) {
|
||||
char *u = getFirstUrl()->getUrl();
|
||||
int64_t pd = g_titledb.getProbableDocId(u);
|
||||
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
|
||||
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
|
||||
if ( m_docId < d1 || m_docId > d2 ) {
|
||||
char *xx=NULL;*xx=0; }
|
||||
}
|
||||
|
||||
// if docid is zero, none is available!!!
|
||||
//if ( m_docId == 0LL ) m_indexCode = ENODOCID;
|
||||
m_docIdValid = true;
|
||||
@ -14999,6 +15024,14 @@ int32_t *XmlDoc::getFinalCrawlDelay() {
|
||||
return &m_finalCrawlDelay;
|
||||
}
|
||||
|
||||
bool XmlDoc::isFirstUrlRobotsTxt ( ) {
|
||||
if ( m_isRobotsTxtUrlValid )
|
||||
return m_isRobotsTxtUrl;
|
||||
Url *fu = getFirstUrl();
|
||||
m_isRobotsTxtUrl = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
|
||||
m_isRobotsTxtUrlValid = true;
|
||||
return m_isRobotsTxtUrl;
|
||||
}
|
||||
|
||||
// . get the Robots.txt and see if we are allowed
|
||||
// . returns NULL and sets g_errno on error
|
||||
@ -15056,9 +15089,7 @@ bool *XmlDoc::getIsAllowed ( ) {
|
||||
|
||||
// . if WE are robots.txt that is always allowed!!!
|
||||
// . check the *first* url since these often redirect to weird things
|
||||
Url *fu = getFirstUrl();
|
||||
bool isRobotsTxt = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() );
|
||||
if ( isRobotsTxt ) {
|
||||
if ( isFirstUrlRobotsTxt() ) {
|
||||
m_isAllowed = true;
|
||||
m_isAllowedValid = true;
|
||||
m_crawlDelayValid = true;
|
||||
@ -15080,6 +15111,7 @@ bool *XmlDoc::getIsAllowed ( ) {
|
||||
int32_t *ip = getIp ();
|
||||
// error? or blocked?
|
||||
if ( ! ip || ip == (void *)-1 ) return (bool *)ip;
|
||||
Url *fu = getFirstUrl();
|
||||
// if ip does not exist on the dns, do not try to download robots.txt
|
||||
// it is pointless... this can happen in the dir coll and we basically
|
||||
// have "m_siteInCatdb" set to true
|
||||
@ -17281,7 +17313,9 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
//if ( ! hc || hc == (void *)-1 ) return (char **)hc;
|
||||
|
||||
XmlDoc *od = NULL;
|
||||
if ( ! m_isSpiderProxy ) {
|
||||
if ( ! m_isSpiderProxy &&
|
||||
// don't lookup xyz.com/robots.txt in titledb
|
||||
! isFirstUrlRobotsTxt() ) {
|
||||
XmlDoc **pod = getOldXmlDoc ( );
|
||||
if ( ! pod || pod == (XmlDoc **)-1 ) return (char **)pod;
|
||||
// get ptr to old xml doc, could be NULL if non exists
|
||||
@ -22098,6 +22132,16 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
|
||||
if ( m_docIdValid )
|
||||
sb->safePrintf("docid=%"UINT64" ",m_docId);
|
||||
|
||||
char *u = getFirstUrl()->getUrl();
|
||||
int64_t pd = g_titledb.getProbableDocId(u);
|
||||
int64_t d1 = g_titledb.getFirstProbableDocId ( pd );
|
||||
int64_t d2 = g_titledb.getLastProbableDocId ( pd );
|
||||
sb->safePrintf("probdocid=%"UINT64" ",pd);
|
||||
sb->safePrintf("probdocidmin=%"UINT64" ",d1);
|
||||
sb->safePrintf("probdocidmax=%"UINT64" ",d2);
|
||||
sb->safePrintf("usetimeaxis=%i ",(int)m_useTimeAxis);
|
||||
|
||||
|
||||
if ( m_siteNumInlinksValid ) {
|
||||
sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks );
|
||||
// sb->safePrintf("siteipinlinks=%"INT32" ",
|
||||
@ -30738,6 +30782,12 @@ bool XmlDoc::getUseTimeAxis ( ) {
|
||||
if ( ! cr ) return false;
|
||||
m_useTimeAxis = cr->m_useTimeAxis;
|
||||
m_useTimeAxisValid = true;
|
||||
// sanity check
|
||||
// if ( cr->m_isCustomCrawl && m_useTimeAxis ) {
|
||||
// log("build: custom crawls can't use time axis");
|
||||
// char *xx=NULL;*xx=0;
|
||||
// m_useTimeAxis = false;
|
||||
// }
|
||||
return m_useTimeAxis;
|
||||
}
|
||||
|
||||
|
4
XmlDoc.h
4
XmlDoc.h
@ -1204,6 +1204,9 @@ class XmlDoc {
|
||||
HashTableX m_vctab;
|
||||
HashTableX m_vcduptab;
|
||||
|
||||
bool isFirstUrlRobotsTxt();
|
||||
bool m_isRobotsTxtUrl;
|
||||
|
||||
Images m_images;
|
||||
HashTableX m_countTable;
|
||||
HttpMime m_mime;
|
||||
@ -1260,6 +1263,7 @@ class XmlDoc {
|
||||
char m_filteredRootTitleBufValid;
|
||||
char m_titleBufValid;
|
||||
char m_fragBufValid;
|
||||
char m_isRobotsTxtUrlValid;
|
||||
char m_inlineSectionVotingBufValid;
|
||||
char m_wordSpamBufValid;
|
||||
char m_finalSummaryBufValid;
|
||||
|
4
main.cpp
4
main.cpp
@ -6564,6 +6564,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
|
||||
"cs=%04d "
|
||||
"lang=%02d "
|
||||
"sni=%03"INT32" "
|
||||
"usetimeaxis=%i "
|
||||
//"cats=%"INT32" "
|
||||
"lastspidered=%s "
|
||||
"ip=%s "
|
||||
@ -6589,6 +6590,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
|
||||
xd->m_charset,//tr.getCharset(),
|
||||
xd->m_langId,//tr.getLanguage(),
|
||||
(int32_t)xd->m_siteNumInlinks,//tr.getDo
|
||||
xd->m_useTimeAxis,
|
||||
//nc,
|
||||
ppp,
|
||||
iptoa(xd->m_ip),//ipbuf ,
|
||||
@ -6692,6 +6694,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
|
||||
"ctype=%s "
|
||||
"lang=%02d "
|
||||
"sni=%03"INT32" "
|
||||
"usetimeaxis=%i "
|
||||
//"cats=%"INT32" "
|
||||
"lastspidered=%s "
|
||||
"ip=%s "
|
||||
@ -6718,6 +6721,7 @@ void dumpTitledb (char *coll,int32_t startFileNum,int32_t numFiles,bool includeT
|
||||
g_contentTypeStrings[xd->m_contentType],
|
||||
xd->m_langId,//tr.getLanguage(),
|
||||
(int32_t)xd->m_siteNumInlinks,//tr.getDocQuality(),
|
||||
xd->m_useTimeAxis,
|
||||
//nc,
|
||||
ppp,
|
||||
iptoa(xd->m_ip),//ipbuf ,
|
||||
|
Loading…
Reference in New Issue
Block a user