fix bugs to try to get sharding working

on crawlbot today
This commit is contained in:
Matt Wells 2014-01-21 13:58:21 -08:00
parent 7065b0ae0c
commit 45cb5c9a0c
8 changed files with 72 additions and 40 deletions

View File

@ -515,7 +515,14 @@ bool Collectiondb::addNewColl ( char *coll ,
}
return registerCollRec ( cr , true );
if ( ! registerCollRec ( cr , true ) )
return false;
// add the rdbbases for this coll, CollectionRec::m_bases[]
if ( ! addRdbBasesForCollRec ( cr ) )
return false;
return true;
}
// . called only by addNewColl() and by addExistingColl()
@ -533,12 +540,12 @@ bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
CollectionRec *cr = m_recs[i];
if ( ! cr ) continue;
// add rdb base files etc. for it
addRdbBaseForCollRec ( cr );
addRdbBasesForCollRec ( cr );
}
return true;
}
bool Collectiondb::addRdbBaseForCollRec ( CollectionRec *cr ) {
bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
char *coll = cr->m_coll;

View File

@ -125,7 +125,7 @@ class Collectiondb {
bool registerCollRec ( CollectionRec *cr , bool isNew ) ;
bool addRdbBaseToAllRdbsForEachCollRec ( ) ;
bool addRdbBaseForCollRec ( CollectionRec *cr ) ;
bool addRdbBasesForCollRec ( CollectionRec *cr ) ;
bool setRecPtr ( collnum_t collnum , CollectionRec *cr ) ;

View File

@ -334,7 +334,8 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
RdbBase *tbase;
if ( ! (tbase=getRdbBase(RDB_TITLEDB,coll) ) ) {
log("db: Could not get title rec in collection \"%s\".",
log("db: Could not get title rec in collection \"%s\" "
"because rdbbase is null.",
coll);
g_errno = EBADENGINEER;
us->sendErrorReply ( slot , g_errno );

View File

@ -1562,10 +1562,11 @@ void RdbBase::gotTokenForMerge ( ) {
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// sanity check
if ( m_isMerging || m->isMerging() ) {
if ( m_doLog )
log(LOG_INFO,
"merge: Someone already merging. Waiting for merge token "
"in order to merge %s.",m_dbname);
//if ( m_doLog )
//log(LOG_INFO,
//"merge: Someone already merging. Waiting for "
//"merge token "
//"in order to merge %s.",m_dbname);
return;
}
// clear for take-off
@ -2258,8 +2259,9 @@ bool RdbBase::verifyFileSharding ( ) {
if ( ++printed > 100 ) continue;
log ( "db: Found bad key in list belongs to shard %li",
shardNum);
// avoid log spam... comment this out
//log ( "db: Found bad key in list belongs to shard %li",
// shardNum);
}
g_threads.enableThreads();
@ -2275,8 +2277,8 @@ bool RdbBase::verifyFileSharding ( ) {
log ("db: Out of first %li records in %s for %s, only %li belong "
"to our group.",count,m_dbname,m_coll,got);
// exit if NONE, we probably got the wrong data
if ( got == 0 ) log("db: Are you sure you have the "
"right data in the right directory? ");
//if ( got == 0 ) log("db: Are you sure you have the "
// "right data in the right directory? ");
//log ( "db: Exiting due to Posdb inconsistency." );
g_threads.enableThreads();

View File

@ -150,7 +150,7 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
if ( m_isWWWSubdomain ) sb->safePrintf("WWWSUBDOMAIN ");
if ( m_avoidSpiderLinks ) sb->safePrintf("AVOIDSPIDERLINKS ");
@ -235,11 +235,11 @@ long SpiderReply::print ( SafeBuf *sbarg ) {
if ( m_isPermalink ) sb->safePrintf("ISPERMALINK ");
if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER ");
//if ( m_deleted ) sb->safePrintf("DELETED ");
if ( m_isIndexed ) sb->safePrintf("ISINDEXED ");
if ( m_isIndexed && ! m_isIndexedINValid) sb->safePrintf("ISINDEXED ");
if ( m_hasAddress ) sb->safePrintf("HASADDRESS ");
if ( m_hasTOD ) sb->safePrintf("HASTOD ");
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
//sb->safePrintf("url=%s",m_url);
@ -344,7 +344,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
//if ( m_inOrderTree ) sb->safePrintf("INORDERTREE ");
@ -3439,6 +3439,7 @@ bool SpiderColl::scanListForWinners ( ) {
if ( srepUh48 == m_lastRepUh48 ) continue;
m_lastRepUh48 = srepUh48;
//if ( ! srep ) continue;
// TODO: what if srep->m_isIndexedINValid is set????
if ( ! srep->m_isIndexed ) continue;
// keep count per site and firstip
m_localTable.addScore(&sreq->m_firstIp,1);
@ -9415,6 +9416,7 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
/*
if ( *p=='h' && strncmp(p,"hassitevenue",12) == 0 ) {
// if we do not have enough info for outlink, all done
if ( isOutlink ) return -1;
@ -9435,6 +9437,7 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
p += 2;
goto checkNextRule;
}
*/
if ( *p != 'i' ) goto skipi;
@ -9573,6 +9576,11 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
//if ( ! srep ) continue;
// skip for msg20
if ( isForMsg20 ) continue;
// skip if reply does not KNOW because of an error
// since XmlDoc::indexDoc() called
// XmlDoc::getNewSpiderReply() and did not have this
// info...
if ( srep && (bool)srep->m_isIndexedINValid ) continue;
// if no match continue
if ( srep && (bool)srep->m_isIndexed==val ) continue;
// allow "!isindexed" if no SpiderReply at all
@ -10866,7 +10874,7 @@ void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) {
sreq->m_inGoogle = old->m_inGoogle;
sreq->m_hasAuthorityInlink = old->m_hasAuthorityInlink;
sreq->m_hasContactInfo = old->m_hasContactInfo;
sreq->m_hasSiteVenue = old->m_hasSiteVenue;
//sreq->m_hasSiteVenue = old->m_hasSiteVenue;
}
// if we are not the same url as last request, add it

View File

@ -847,7 +847,11 @@ class SpiderReply {
long m_isContacty :1;
long m_hasAddress :1;
long m_hasTOD :1;
long m_hasSiteVenue :1;
// name this bit "INValid" (rather than "Valid") because old records
// had it set to 0, and we want to stay backwards compatible
long m_isIndexedINValid :1;
//long m_hasSiteVenue :1;
// expires after a certain time or if ownership changed
long m_inGoogleValid :1;
@ -856,7 +860,8 @@ class SpiderReply {
long m_isContactyValid :1;
long m_hasAddressValid :1;
long m_hasTODValid :1;
long m_hasSiteVenueValid :1;
//long m_hasSiteVenueValid :1;
long m_reserved2 :1;
long m_siteNumInlinksValid :1;
// was the request an injection request
long m_fromInjectionRequest :1;

View File

@ -1590,7 +1590,7 @@ bool XmlDoc::set2 ( char *titleRec ,
m_isLinkSpam2 = m_isLinkSpam;
m_hasAddress2 = m_hasAddress;
m_hasTOD2 = m_hasTOD;
m_hasSiteVenue2 = m_hasSiteVenue;
//m_hasSiteVenue2 = m_hasSiteVenue;
m_hasContactInfo2 = m_hasContactInfo;
//m_skipIndexingByte = m_skipIndexing;
m_isSiteRoot2 = m_isSiteRoot;
@ -11410,6 +11410,7 @@ bool *XmlDoc::getHasTOD ( ) {
return &m_hasTOD2;
}
/*
bool *XmlDoc::getHasSiteVenue ( ) {
if ( m_hasSiteVenueValid ) return &m_hasSiteVenue2;
// get the tag rec
@ -11423,7 +11424,7 @@ bool *XmlDoc::getHasSiteVenue ( ) {
m_hasSiteVenueValid = true;
return &m_hasSiteVenue2;
}
*/
// do not include addresses that are always in the header/footer of every page!
@ -21540,7 +21541,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
m_httpStatus = od->m_httpStatus;
m_hasAddress = od->m_hasAddress;
m_hasTOD = od->m_hasTOD;
m_hasSiteVenue = od->m_hasSiteVenue;
//m_hasSiteVenue = od->m_hasSiteVenue;
m_isRSS = od->m_isRSS;
m_isPermalink = od->m_isPermalink;
m_hasContactInfo= od->m_hasContactInfo;
@ -21550,7 +21551,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
// do not forget the shadow members of the bit members
m_hasAddress2 = m_hasAddress;
m_hasTOD2 = m_hasTOD;
m_hasSiteVenue2 = m_hasSiteVenue;
//m_hasSiteVenue2 = m_hasSiteVenue;
m_isRSS2 = m_isRSS;
m_isPermalink2 = m_isPermalink;
@ -21561,7 +21562,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
m_httpStatusValid = true;
m_hasAddressValid = true;
m_hasTODValid = true;
m_hasSiteVenueValid = true;
//m_hasSiteVenueValid = true;
m_isRSSValid = true;
m_isPermalinkValid = true;
m_hasContactInfoValid= true;
@ -21808,9 +21809,11 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
else
m_srep.m_hadDiffbotError = false;
// sanity
if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
// sanity. if being called directly from indexDoc() because of
// an error like out of memory, then we do not know if it is
// indexed or not or was indexed...
//if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
// were we already in titledb before we started spidering?
m_srep.m_wasIndexed = m_wasInIndex;
@ -21822,11 +21825,17 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// this is an EFAKEFIRSTIP error or something similar where we
// basically just add this reply and we're done.
// NOTE: this also pertains to SpiderReply::m_isIndexed.
m_srep.m_wasIndexedValid = true;
m_srep.m_wasIndexedValid = m_wasInIndexValid;
// assume no change
m_srep.m_isIndexed = m_isInIndex;
// we need to know if the m_isIndexed bit is valid or not
// because sometimes like if we are being called directly from
// indexDoc() because of an error situation, we do not know!
if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false;
else m_srep.m_isIndexedINValid = true;
// likewise, we need to know if we deleted it so we can decrement the
// quota count for this subdomain/host in SpiderColl::m_quotaTable
//if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true;
@ -21922,12 +21931,12 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
m_srep.m_hasAddress = m_oldDoc->m_hasAddress;
m_srep.m_hasTOD = m_oldDoc->m_hasTOD;
m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
//m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks;
// they're all valid
m_srep.m_hasAddressValid = true;
m_srep.m_hasTODValid = true;
m_srep.m_hasSiteVenueValid = true;
//m_srep.m_hasSiteVenueValid = true;
m_srep.m_siteNumInlinksValid = true;
}
// do special things if
@ -21967,9 +21976,9 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( ! hasTOD || hasTOD == (void *)-1 )
return (SpiderReply *)hasTOD;
// does it have a venue address?
bool *hasSiteVenue = getHasSiteVenue();
if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
return (SpiderReply *)hasSiteVenue;
//bool *hasSiteVenue = getHasSiteVenue();
//if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
// return (SpiderReply *)hasSiteVenue;
// get the content type
uint8_t *ct = getContentType();
if ( ! ct ) return NULL;
@ -22047,7 +22056,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
if ( ! m_hasAddressValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasTODValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
//if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
if ( ! m_hasContactInfoValid) { char *xx=NULL;*xx=0; }
// . we use this to store "bad" spider recs to keep from respidering
@ -22080,7 +22089,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
// *isRoot ,
// m_niceness );
m_srep.m_hasTOD = *hasTOD;
m_srep.m_hasSiteVenue = *hasSiteVenue;
//m_srep.m_hasSiteVenue = *hasSiteVenue;
// validate all
m_srep.m_inGoogleValid = 1;
@ -22089,7 +22098,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
m_srep.m_isContactyValid = 1;
m_srep.m_hasAddressValid = 1;
m_srep.m_hasTODValid = 1;
m_srep.m_hasSiteVenueValid = 1;
//m_srep.m_hasSiteVenueValid = 1;
// validate
m_srepValid = true;

View File

@ -325,7 +325,7 @@ class XmlDoc {
uint16_t m_isLinkSpam:1;
uint16_t m_hasAddress:1;
uint16_t m_hasTOD:1;
uint16_t m_hasSiteVenue:1;
uint16_t m_reserved_sv:1;//hasSiteVenue:1;
uint16_t m_hasContactInfo:1;
uint16_t m_isSiteRoot:1;
@ -1220,7 +1220,7 @@ class XmlDoc {
bool m_isAdultValid;
bool m_hasAddressValid;
bool m_hasTODValid;
bool m_hasSiteVenueValid;
//bool m_hasSiteVenueValid;
bool m_catRecValid;
bool m_urlPubDateValid;
bool m_isUrlPermalinkFormatValid;
@ -1342,7 +1342,7 @@ class XmlDoc {
char m_isLinkSpam2;
bool m_hasAddress2;
bool m_hasTOD2;
bool m_hasSiteVenue2;
//bool m_hasSiteVenue2;
char m_hasContactInfo2;
char m_isSiteRoot2;