fix bugs to try to get sharding working

on crawlbot today
This commit is contained in:
Matt Wells 2014-01-21 13:58:21 -08:00
parent 7065b0ae0c
commit 45cb5c9a0c
8 changed files with 72 additions and 40 deletions

View File

@ -515,7 +515,14 @@ bool Collectiondb::addNewColl ( char *coll ,
}
return registerCollRec ( cr , true );
if ( ! registerCollRec ( cr , true ) )
return false;
// add the rdbbases for this coll, CollectionRec::m_bases[]
if ( ! addRdbBasesForCollRec ( cr ) )
return false;
return true;
}
// . called only by addNewColl() and by addExistingColl()
@ -533,12 +540,12 @@ bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
CollectionRec *cr = m_recs[i];
if ( ! cr ) continue;
// add rdb base files etc. for it
addRdbBaseForCollRec ( cr );
addRdbBasesForCollRec ( cr );
}
return true;
}
bool Collectiondb::addRdbBaseForCollRec ( CollectionRec *cr ) {
bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
char *coll = cr->m_coll;

View File

@ -125,7 +125,7 @@ class Collectiondb {
bool registerCollRec ( CollectionRec *cr , bool isNew ) ;
bool addRdbBaseToAllRdbsForEachCollRec ( ) ;
bool addRdbBaseForCollRec ( CollectionRec *cr ) ;
bool addRdbBasesForCollRec ( CollectionRec *cr ) ;
bool setRecPtr ( collnum_t collnum , CollectionRec *cr ) ;

View File

@ -334,7 +334,8 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *tbase;
if ( ! (tbase=getRdbBase(RDB_TITLEDB,coll) ) ) {
log("db: Could not get title rec in collection \"%s\".",
log("db: Could not get title rec in collection \"%s\" "
"because rdbbase is null.",
coll);
g_errno = EBADENGINEER;
us->sendErrorReply ( slot , g_errno );

View File

@ -1562,10 +1562,11 @@ void RdbBase::gotTokenForMerge ( ) {
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// sanity check
if ( m_isMerging || m->isMerging() ) {
if ( m_doLog )
log(LOG_INFO,
"merge: Someone already merging. Waiting for merge token "
"in order to merge %s.",m_dbname);
//if ( m_doLog )
//log(LOG_INFO,
//"merge: Someone already merging. Waiting for "
//"merge token "
//"in order to merge %s.",m_dbname);
return;
}
// clear for take-off
@ -2258,8 +2259,9 @@ bool RdbBase::verifyFileSharding ( ) {
if ( ++printed > 100 ) continue;
log ( "db: Found bad key in list belongs to shard %li",
shardNum);
// avoid log spam... comment this out
//log ( "db: Found bad key in list belongs to shard %li",
// shardNum);
}
g_threads.enableThreads();
@ -2275,8 +2277,8 @@ bool RdbBase::verifyFileSharding ( ) {
log ("db: Out of first %li records in %s for %s, only %li belong "
"to our group.",count,m_dbname,m_coll,got);
// exit if NONE, we probably got the wrong data
if ( got == 0 ) log("db: Are you sure you have the "
"right data in the right directory? ");
//if ( got == 0 ) log("db: Are you sure you have the "
// "right data in the right directory? ");
//log ( "db: Exiting due to Posdb inconsistency." );
g_threads.enableThreads();

View File

@ -150,7 +150,7 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
if ( m_isWWWSubdomain ) sb->safePrintf("WWWSUBDOMAIN ");
if ( m_avoidSpiderLinks ) sb->safePrintf("AVOIDSPIDERLINKS ");
@ -235,11 +235,11 @@ long SpiderReply::print ( SafeBuf *sbarg ) {
if ( m_isPermalink ) sb->safePrintf("ISPERMALINK ");
if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER ");
//if ( m_deleted ) sb->safePrintf("DELETED ");
if ( m_isIndexed ) sb->safePrintf("ISINDEXED ");
if ( m_isIndexed && ! m_isIndexedINValid) sb->safePrintf("ISINDEXED ");
if ( m_hasAddress ) sb->safePrintf("HASADDRESS ");
if ( m_hasTOD ) sb->safePrintf("HASTOD ");
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
//sb->safePrintf("url=%s",m_url);
@ -344,7 +344,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
//if ( m_inOrderTree ) sb->safePrintf("INORDERTREE ");
@ -3439,6 +3439,7 @@ bool SpiderColl::scanListForWinners ( ) {
if ( srepUh48 == m_lastRepUh48 ) continue;
m_lastRepUh48 = srepUh48;
//if ( ! srep ) continue;
// TODO: what if srep->m_isIndexedINValid is set????
if ( ! srep->m_isIndexed ) continue;
// keep count per site and firstip
m_localTable.addScore(&sreq->m_firstIp,1);
@ -9415,6 +9416,7 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
/*
if ( *p=='h' && strncmp(p,"hassitevenue",12) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
@ -9435,6 +9437,7 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
p += 2;
goto checkNextRule;
}
*/
if ( *p != 'i' ) goto skipi;
@ -9573,6 +9576,11 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
//if ( ! srep ) continue;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if reply does not KNOW because of an error
// since XmlDoc::indexDoc() called
// XmlDoc::getNewSpiderReply() and did not have this
// info...
if ( srep && (bool)srep->m_isIndexedINValid ) continue;
// if no match continue
if ( srep && (bool)srep->m_isIndexed==val ) continue;
// allow "!isindexed" if no SpiderReply at all
@ -10866,7 +10874,7 @@ void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) {
sreq->m_inGoogle = old->m_inGoogle;
sreq->m_hasAuthorityInlink = old->m_hasAuthorityInlink;
sreq->m_hasContactInfo = old->m_hasContactInfo;
sreq->m_hasSiteVenue = old->m_hasSiteVenue;
//sreq->m_hasSiteVenue = old->m_hasSiteVenue;
}
// if we are not the same url as last request, add it

View File

@ -847,7 +847,11 @@ class SpiderReply {
long m_isContacty :1;
long m_hasAddress :1;
long m_hasTOD :1;
long m_hasSiteVenue :1;
// name this bit "INValid" (rather than "Valid") because old records
// had it set to 0, and we want to stay backwards compatible
long m_isIndexedINValid :1;
//long m_hasSiteVenue :1;
// expires after a certain time or if ownership changed
long m_inGoogleValid :1;
@ -856,7 +860,8 @@ class SpiderReply {
long m_isContactyValid :1;
long m_hasAddressValid :1;
long m_hasTODValid :1;
long m_hasSiteVenueValid :1;
//long m_hasSiteVenueValid :1;
long m_reserved2 :1;
long m_siteNumInlinksValid :1;
// was the request an injection request
long m_fromInjectionRequest :1;

View File

@ -1590,7 +1590,7 @@ bool XmlDoc::set2 ( char *titleRec ,
m_isLinkSpam2 = m_isLinkSpam;
m_hasAddress2 = m_hasAddress;
m_hasTOD2 = m_hasTOD;
m_hasSiteVenue2 = m_hasSiteVenue;
//m_hasSiteVenue2 = m_hasSiteVenue;
m_hasContactInfo2 = m_hasContactInfo;
//m_skipIndexingByte = m_skipIndexing;
m_isSiteRoot2 = m_isSiteRoot;
@ -11410,6 +11410,7 @@ bool *XmlDoc::getHasTOD ( ) {
return &m_hasTOD2;
}
/*
bool *XmlDoc::getHasSiteVenue ( ) {
if ( m_hasSiteVenueValid ) return &m_hasSiteVenue2;
// get the tag rec
@ -11423,7 +11424,7 @@ bool *XmlDoc::getHasSiteVenue ( ) {
m_hasSiteVenueValid = true;
return &m_hasSiteVenue2;
}
*/
// do not include addresses that are always in the header/footer of every page!
@ -21540,7 +21541,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
m_httpStatus = od->m_httpStatus;
m_hasAddress = od->m_hasAddress;
m_hasTOD = od->m_hasTOD;
m_hasSiteVenue = od->m_hasSiteVenue;
//m_hasSiteVenue = od->m_hasSiteVenue;
m_isRSS = od->m_isRSS;
m_isPermalink = od->m_isPermalink;
m_hasContactInfo= od->m_hasContactInfo;
@ -21550,7 +21551,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
// do not forget the shadow members of the bit members
m_hasAddress2 = m_hasAddress;
m_hasTOD2 = m_hasTOD;
m_hasSiteVenue2 = m_hasSiteVenue;
//m_hasSiteVenue2 = m_hasSiteVenue;
m_isRSS2 = m_isRSS;
m_isPermalink2 = m_isPermalink;
@ -21561,7 +21562,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
m_httpStatusValid = true;
m_hasAddressValid = true;
m_hasTODValid = true;
m_hasSiteVenueValid = true;
//m_hasSiteVenueValid = true;
m_isRSSValid = true;
m_isPermalinkValid = true;
m_hasContactInfoValid= true;
@ -21808,9 +21809,11 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
else
m_srep.m_hadDiffbotError = false;
// sanity
if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
// sanity. if being called directly from indexDoc() because of
// an error like out of memory, then we do not know if it is
// indexed or not or was indexed...
//if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
// were we already in titledb before we started spidering?
m_srep.m_wasIndexed = m_wasInIndex;
@ -21822,11 +21825,17 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// this is an EFAKEFIRSTIP error or something similar where we
// basically just add this reply and we're done.
// NOTE: this also pertains to SpiderReply::m_isIndexed.
m_srep.m_wasIndexedValid = true;
m_srep.m_wasIndexedValid = m_wasInIndexValid;
// assume no change
m_srep.m_isIndexed = m_isInIndex;
// we need to know if the m_isIndexed bit is valid or not
// because sometimes like if we are being called directly from
// indexDoc() because of an error situation, we do not know!
if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false;
else m_srep.m_isIndexedINValid = true;
// likewise, we need to know if we deleted it so we can decrement the
// quota count for this subdomain/host in SpiderColl::m_quotaTable
//if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true;
@ -21922,12 +21931,12 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
m_srep.m_hasAddress = m_oldDoc->m_hasAddress;
m_srep.m_hasTOD = m_oldDoc->m_hasTOD;
m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
//m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks;
// they're all valid
m_srep.m_hasAddressValid = true;
m_srep.m_hasTODValid = true;
m_srep.m_hasSiteVenueValid = true;
//m_srep.m_hasSiteVenueValid = true;
m_srep.m_siteNumInlinksValid = true;
}
// do special things if
@ -21967,9 +21976,9 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( ! hasTOD || hasTOD == (void *)-1 )
return (SpiderReply *)hasTOD;
// does it have a venue address?
bool *hasSiteVenue = getHasSiteVenue();
if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
return (SpiderReply *)hasSiteVenue;
//bool *hasSiteVenue = getHasSiteVenue();
//if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
// return (SpiderReply *)hasSiteVenue;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
@ -22047,7 +22056,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( ! m_hasAddressValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasTODValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasContactInfoValid) { char *xx=NULL;*xx=0; }
// . we use this to store "bad" spider recs to keep from respidering
@ -22080,7 +22089,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// *isRoot ,
// m_niceness );
m_srep.m_hasTOD = *hasTOD;
m_srep.m_hasSiteVenue = *hasSiteVenue;
//m_srep.m_hasSiteVenue = *hasSiteVenue;
// validate all
m_srep.m_inGoogleValid = 1;
@ -22089,7 +22098,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
m_srep.m_isContactyValid = 1;
m_srep.m_hasAddressValid = 1;
m_srep.m_hasTODValid = 1;
m_srep.m_hasSiteVenueValid = 1;
//m_srep.m_hasSiteVenueValid = 1;
// validate
m_srepValid = true;

View File

@ -325,7 +325,7 @@ class XmlDoc {
uint16_t m_isLinkSpam:1;
uint16_t m_hasAddress:1;
uint16_t m_hasTOD:1;
uint16_t m_hasSiteVenue:1;
uint16_t m_reserved_sv:1;//hasSiteVenue:1;
uint16_t m_hasContactInfo:1;
uint16_t m_isSiteRoot:1;
@ -1220,7 +1220,7 @@ class XmlDoc {
bool m_isAdultValid;
bool m_hasAddressValid;
bool m_hasTODValid;
bool m_hasSiteVenueValid;
//bool m_hasSiteVenueValid;
bool m_catRecValid;
bool m_urlPubDateValid;
bool m_isUrlPermalinkFormatValid;
@ -1342,7 +1342,7 @@ class XmlDoc {
char m_isLinkSpam2;
bool m_hasAddress2;
bool m_hasTOD2;
bool m_hasSiteVenue2;
//bool m_hasSiteVenue2;
char m_hasContactInfo2;
char m_isSiteRoot2;