mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
fix bugs to try to get sharding working
on crawlbot today
This commit is contained in:
parent
7065b0ae0c
commit
45cb5c9a0c
@ -515,7 +515,14 @@ bool Collectiondb::addNewColl ( char *coll ,
|
||||
}
|
||||
|
||||
|
||||
return registerCollRec ( cr , true );
|
||||
if ( ! registerCollRec ( cr , true ) )
|
||||
return false;
|
||||
|
||||
// add the rdbbases for this coll, CollectionRec::m_bases[]
|
||||
if ( ! addRdbBasesForCollRec ( cr ) )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// . called only by addNewColl() and by addExistingColl()
|
||||
@ -533,12 +540,12 @@ bool Collectiondb::addRdbBaseToAllRdbsForEachCollRec ( ) {
|
||||
CollectionRec *cr = m_recs[i];
|
||||
if ( ! cr ) continue;
|
||||
// add rdb base files etc. for it
|
||||
addRdbBaseForCollRec ( cr );
|
||||
addRdbBasesForCollRec ( cr );
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Collectiondb::addRdbBaseForCollRec ( CollectionRec *cr ) {
|
||||
bool Collectiondb::addRdbBasesForCollRec ( CollectionRec *cr ) {
|
||||
|
||||
char *coll = cr->m_coll;
|
||||
|
||||
|
@ -125,7 +125,7 @@ class Collectiondb {
|
||||
bool registerCollRec ( CollectionRec *cr , bool isNew ) ;
|
||||
|
||||
bool addRdbBaseToAllRdbsForEachCollRec ( ) ;
|
||||
bool addRdbBaseForCollRec ( CollectionRec *cr ) ;
|
||||
bool addRdbBasesForCollRec ( CollectionRec *cr ) ;
|
||||
|
||||
bool setRecPtr ( collnum_t collnum , CollectionRec *cr ) ;
|
||||
|
||||
|
@ -334,7 +334,8 @@ void handleRequest22 ( UdpSlot *slot , long netnice ) {
|
||||
// get base, returns NULL and sets g_errno to ENOCOLLREC on error
|
||||
RdbBase *tbase;
|
||||
if ( ! (tbase=getRdbBase(RDB_TITLEDB,coll) ) ) {
|
||||
log("db: Could not get title rec in collection \"%s\".",
|
||||
log("db: Could not get title rec in collection \"%s\" "
|
||||
"because rdbbase is null.",
|
||||
coll);
|
||||
g_errno = EBADENGINEER;
|
||||
us->sendErrorReply ( slot , g_errno );
|
||||
|
18
RdbBase.cpp
18
RdbBase.cpp
@ -1562,10 +1562,11 @@ void RdbBase::gotTokenForMerge ( ) {
|
||||
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
|
||||
// sanity check
|
||||
if ( m_isMerging || m->isMerging() ) {
|
||||
if ( m_doLog )
|
||||
log(LOG_INFO,
|
||||
"merge: Someone already merging. Waiting for merge token "
|
||||
"in order to merge %s.",m_dbname);
|
||||
//if ( m_doLog )
|
||||
//log(LOG_INFO,
|
||||
//"merge: Someone already merging. Waiting for "
|
||||
//"merge token "
|
||||
//"in order to merge %s.",m_dbname);
|
||||
return;
|
||||
}
|
||||
// clear for take-off
|
||||
@ -2258,8 +2259,9 @@ bool RdbBase::verifyFileSharding ( ) {
|
||||
|
||||
if ( ++printed > 100 ) continue;
|
||||
|
||||
log ( "db: Found bad key in list belongs to shard %li",
|
||||
shardNum);
|
||||
// avoid log spam... comment this out
|
||||
//log ( "db: Found bad key in list belongs to shard %li",
|
||||
// shardNum);
|
||||
}
|
||||
|
||||
g_threads.enableThreads();
|
||||
@ -2275,8 +2277,8 @@ bool RdbBase::verifyFileSharding ( ) {
|
||||
log ("db: Out of first %li records in %s for %s, only %li belong "
|
||||
"to our group.",count,m_dbname,m_coll,got);
|
||||
// exit if NONE, we probably got the wrong data
|
||||
if ( got == 0 ) log("db: Are you sure you have the "
|
||||
"right data in the right directory? ");
|
||||
//if ( got == 0 ) log("db: Are you sure you have the "
|
||||
// "right data in the right directory? ");
|
||||
|
||||
//log ( "db: Exiting due to Posdb inconsistency." );
|
||||
g_threads.enableThreads();
|
||||
|
18
Spider.cpp
18
Spider.cpp
@ -150,7 +150,7 @@ long SpiderRequest::print ( SafeBuf *sbarg ) {
|
||||
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
|
||||
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
|
||||
|
||||
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
|
||||
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
|
||||
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
|
||||
if ( m_isWWWSubdomain ) sb->safePrintf("WWWSUBDOMAIN ");
|
||||
if ( m_avoidSpiderLinks ) sb->safePrintf("AVOIDSPIDERLINKS ");
|
||||
@ -235,11 +235,11 @@ long SpiderReply::print ( SafeBuf *sbarg ) {
|
||||
if ( m_isPermalink ) sb->safePrintf("ISPERMALINK ");
|
||||
if ( m_isPingServer ) sb->safePrintf("ISPINGSERVER ");
|
||||
//if ( m_deleted ) sb->safePrintf("DELETED ");
|
||||
if ( m_isIndexed ) sb->safePrintf("ISINDEXED ");
|
||||
if ( m_isIndexed && ! m_isIndexedINValid) sb->safePrintf("ISINDEXED ");
|
||||
|
||||
if ( m_hasAddress ) sb->safePrintf("HASADDRESS ");
|
||||
if ( m_hasTOD ) sb->safePrintf("HASTOD ");
|
||||
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
|
||||
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
|
||||
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
|
||||
|
||||
//sb->safePrintf("url=%s",m_url);
|
||||
@ -344,7 +344,7 @@ long SpiderRequest::printToTable ( SafeBuf *sb , char *status ,
|
||||
if ( m_hasAuthorityInlink ) sb->safePrintf("HASAUTHORITYINLINK ");
|
||||
if ( m_hasContactInfo ) sb->safePrintf("HASCONTACTINFO ");
|
||||
|
||||
if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
|
||||
//if ( m_hasSiteVenue ) sb->safePrintf("HASSITEVENUE ");
|
||||
if ( m_isContacty ) sb->safePrintf("CONTACTY ");
|
||||
|
||||
//if ( m_inOrderTree ) sb->safePrintf("INORDERTREE ");
|
||||
@ -3439,6 +3439,7 @@ bool SpiderColl::scanListForWinners ( ) {
|
||||
if ( srepUh48 == m_lastRepUh48 ) continue;
|
||||
m_lastRepUh48 = srepUh48;
|
||||
//if ( ! srep ) continue;
|
||||
// TODO: what if srep->m_isIndexedINValid is set?
|
||||
if ( ! srep->m_isIndexed ) continue;
|
||||
// keep count per site and firstip
|
||||
m_localTable.addScore(&sreq->m_firstIp,1);
|
||||
@ -9415,6 +9416,7 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
goto checkNextRule;
|
||||
}
|
||||
|
||||
/*
|
||||
if ( *p=='h' && strncmp(p,"hassitevenue",12) == 0 ) {
|
||||
// if we do not have enough info for outlink, all done
|
||||
if ( isOutlink ) return -1;
|
||||
@ -9435,6 +9437,7 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
p += 2;
|
||||
goto checkNextRule;
|
||||
}
|
||||
*/
|
||||
|
||||
if ( *p != 'i' ) goto skipi;
|
||||
|
||||
@ -9573,6 +9576,11 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
//if ( ! srep ) continue;
|
||||
// skip for msg20
|
||||
if ( isForMsg20 ) continue;
|
||||
// skip if reply does not KNOW because of an error
|
||||
// since XmlDoc::indexDoc() called
|
||||
// XmlDoc::getNewSpiderReply() and did not have this
|
||||
// info...
|
||||
if ( srep && (bool)srep->m_isIndexedINValid ) continue;
|
||||
// if no match continue
|
||||
if ( srep && (bool)srep->m_isIndexed==val ) continue;
|
||||
// allow "!isindexed" if no SpiderReply at all
|
||||
@ -10866,7 +10874,7 @@ void dedupSpiderdbList ( RdbList *list , long niceness , bool removeNegRecs ) {
|
||||
sreq->m_inGoogle = old->m_inGoogle;
|
||||
sreq->m_hasAuthorityInlink = old->m_hasAuthorityInlink;
|
||||
sreq->m_hasContactInfo = old->m_hasContactInfo;
|
||||
sreq->m_hasSiteVenue = old->m_hasSiteVenue;
|
||||
//sreq->m_hasSiteVenue = old->m_hasSiteVenue;
|
||||
}
|
||||
|
||||
// if we are not the same url as last request, add it
|
||||
|
9
Spider.h
9
Spider.h
@ -847,7 +847,11 @@ class SpiderReply {
|
||||
long m_isContacty :1;
|
||||
long m_hasAddress :1;
|
||||
long m_hasTOD :1;
|
||||
long m_hasSiteVenue :1;
|
||||
|
||||
// make this "INvalid" not valid since it was set to 0 before
|
||||
// and we want to be backwards compatible
|
||||
long m_isIndexedINValid :1;
|
||||
//long m_hasSiteVenue :1;
|
||||
|
||||
// expires after a certain time or if ownership changed
|
||||
long m_inGoogleValid :1;
|
||||
@ -856,7 +860,8 @@ class SpiderReply {
|
||||
long m_isContactyValid :1;
|
||||
long m_hasAddressValid :1;
|
||||
long m_hasTODValid :1;
|
||||
long m_hasSiteVenueValid :1;
|
||||
//long m_hasSiteVenueValid :1;
|
||||
long m_reserved2 :1;
|
||||
long m_siteNumInlinksValid :1;
|
||||
// was the request an injection request
|
||||
long m_fromInjectionRequest :1;
|
||||
|
43
XmlDoc.cpp
43
XmlDoc.cpp
@ -1590,7 +1590,7 @@ bool XmlDoc::set2 ( char *titleRec ,
|
||||
m_isLinkSpam2 = m_isLinkSpam;
|
||||
m_hasAddress2 = m_hasAddress;
|
||||
m_hasTOD2 = m_hasTOD;
|
||||
m_hasSiteVenue2 = m_hasSiteVenue;
|
||||
//m_hasSiteVenue2 = m_hasSiteVenue;
|
||||
m_hasContactInfo2 = m_hasContactInfo;
|
||||
//m_skipIndexingByte = m_skipIndexing;
|
||||
m_isSiteRoot2 = m_isSiteRoot;
|
||||
@ -11410,6 +11410,7 @@ bool *XmlDoc::getHasTOD ( ) {
|
||||
return &m_hasTOD2;
|
||||
}
|
||||
|
||||
/*
|
||||
bool *XmlDoc::getHasSiteVenue ( ) {
|
||||
if ( m_hasSiteVenueValid ) return &m_hasSiteVenue2;
|
||||
// get the tag rec
|
||||
@ -11423,7 +11424,7 @@ bool *XmlDoc::getHasSiteVenue ( ) {
|
||||
m_hasSiteVenueValid = true;
|
||||
return &m_hasSiteVenue2;
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
|
||||
// do not include addresses that are always in the header/footer of every page!
|
||||
@ -21540,7 +21541,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
|
||||
m_httpStatus = od->m_httpStatus;
|
||||
m_hasAddress = od->m_hasAddress;
|
||||
m_hasTOD = od->m_hasTOD;
|
||||
m_hasSiteVenue = od->m_hasSiteVenue;
|
||||
//m_hasSiteVenue = od->m_hasSiteVenue;
|
||||
m_isRSS = od->m_isRSS;
|
||||
m_isPermalink = od->m_isPermalink;
|
||||
m_hasContactInfo= od->m_hasContactInfo;
|
||||
@ -21550,7 +21551,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
|
||||
// do not forget the shadow members of the bit members
|
||||
m_hasAddress2 = m_hasAddress;
|
||||
m_hasTOD2 = m_hasTOD;
|
||||
m_hasSiteVenue2 = m_hasSiteVenue;
|
||||
//m_hasSiteVenue2 = m_hasSiteVenue;
|
||||
m_isRSS2 = m_isRSS;
|
||||
m_isPermalink2 = m_isPermalink;
|
||||
|
||||
@ -21561,7 +21562,7 @@ void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
|
||||
m_httpStatusValid = true;
|
||||
m_hasAddressValid = true;
|
||||
m_hasTODValid = true;
|
||||
m_hasSiteVenueValid = true;
|
||||
//m_hasSiteVenueValid = true;
|
||||
m_isRSSValid = true;
|
||||
m_isPermalinkValid = true;
|
||||
m_hasContactInfoValid= true;
|
||||
@ -21808,9 +21809,11 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
else
|
||||
m_srep.m_hadDiffbotError = false;
|
||||
|
||||
// sanity
|
||||
if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
|
||||
// sanity. if being called directly from indexDoc() because of
|
||||
// an error like out of memory, then we do not know if it is
|
||||
// indexed or not or was indexed...
|
||||
//if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; }
|
||||
//if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// were we already in titledb before we started spidering?
|
||||
m_srep.m_wasIndexed = m_wasInIndex;
|
||||
@ -21822,11 +21825,17 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
// this is an EFAKEFIRSTIP error or something similar where we
|
||||
// basically just add this reply and we're done.
|
||||
// NOTE: this also pertains to SpiderReply::m_isIndexed.
|
||||
m_srep.m_wasIndexedValid = true;
|
||||
m_srep.m_wasIndexedValid = m_wasInIndexValid;
|
||||
|
||||
// assume no change
|
||||
m_srep.m_isIndexed = m_isInIndex;
|
||||
|
||||
// we need to know if the m_isIndexed bit is valid or not
|
||||
// because sometimes like if we are being called directly from
|
||||
// indexDoc() because of an error situation, we do not know!
|
||||
if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false;
|
||||
else m_srep.m_isIndexedINValid = true;
|
||||
|
||||
// likewise, we need to know if we deleted it so we can decrement the
|
||||
// quota count for this subdomain/host in SpiderColl::m_quotaTable
|
||||
//if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true;
|
||||
@ -21922,12 +21931,12 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
m_srep.m_isPermalink = m_oldDoc->m_isPermalink;
|
||||
m_srep.m_hasAddress = m_oldDoc->m_hasAddress;
|
||||
m_srep.m_hasTOD = m_oldDoc->m_hasTOD;
|
||||
m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
|
||||
//m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue;
|
||||
m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks;
|
||||
// they're all valid
|
||||
m_srep.m_hasAddressValid = true;
|
||||
m_srep.m_hasTODValid = true;
|
||||
m_srep.m_hasSiteVenueValid = true;
|
||||
//m_srep.m_hasSiteVenueValid = true;
|
||||
m_srep.m_siteNumInlinksValid = true;
|
||||
}
|
||||
// do special things if
|
||||
@ -21967,9 +21976,9 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
if ( ! hasTOD || hasTOD == (void *)-1 )
|
||||
return (SpiderReply *)hasTOD;
|
||||
// does it have a venue address?
|
||||
bool *hasSiteVenue = getHasSiteVenue();
|
||||
if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
|
||||
return (SpiderReply *)hasSiteVenue;
|
||||
//bool *hasSiteVenue = getHasSiteVenue();
|
||||
//if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 )
|
||||
// return (SpiderReply *)hasSiteVenue;
|
||||
// get the content type
|
||||
uint8_t *ct = getContentType();
|
||||
if ( ! ct ) return NULL;
|
||||
@ -22047,7 +22056,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
|
||||
if ( ! m_hasAddressValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! m_hasTODValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
|
||||
//if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; }
|
||||
if ( ! m_hasContactInfoValid) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// . we use this to store "bad" spider recs to keep from respidering
|
||||
@ -22080,7 +22089,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
// *isRoot ,
|
||||
// m_niceness );
|
||||
m_srep.m_hasTOD = *hasTOD;
|
||||
m_srep.m_hasSiteVenue = *hasSiteVenue;
|
||||
//m_srep.m_hasSiteVenue = *hasSiteVenue;
|
||||
|
||||
// validate all
|
||||
m_srep.m_inGoogleValid = 1;
|
||||
@ -22089,7 +22098,7 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
|
||||
m_srep.m_isContactyValid = 1;
|
||||
m_srep.m_hasAddressValid = 1;
|
||||
m_srep.m_hasTODValid = 1;
|
||||
m_srep.m_hasSiteVenueValid = 1;
|
||||
//m_srep.m_hasSiteVenueValid = 1;
|
||||
|
||||
// validate
|
||||
m_srepValid = true;
|
||||
|
6
XmlDoc.h
6
XmlDoc.h
@ -325,7 +325,7 @@ class XmlDoc {
|
||||
uint16_t m_isLinkSpam:1;
|
||||
uint16_t m_hasAddress:1;
|
||||
uint16_t m_hasTOD:1;
|
||||
uint16_t m_hasSiteVenue:1;
|
||||
uint16_t m_reserved_sv:1;//hasSiteVenue:1;
|
||||
uint16_t m_hasContactInfo:1;
|
||||
uint16_t m_isSiteRoot:1;
|
||||
|
||||
@ -1220,7 +1220,7 @@ class XmlDoc {
|
||||
bool m_isAdultValid;
|
||||
bool m_hasAddressValid;
|
||||
bool m_hasTODValid;
|
||||
bool m_hasSiteVenueValid;
|
||||
//bool m_hasSiteVenueValid;
|
||||
bool m_catRecValid;
|
||||
bool m_urlPubDateValid;
|
||||
bool m_isUrlPermalinkFormatValid;
|
||||
@ -1342,7 +1342,7 @@ class XmlDoc {
|
||||
char m_isLinkSpam2;
|
||||
bool m_hasAddress2;
|
||||
bool m_hasTOD2;
|
||||
bool m_hasSiteVenue2;
|
||||
//bool m_hasSiteVenue2;
|
||||
char m_hasContactInfo2;
|
||||
char m_isSiteRoot2;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user