// open-source-search-engine/Repair.cpp
// Copyright 2007, Gigablast Inc.
#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500
#include "gb-include.h"
#include "Repair.h"
#include "Rdb.h"
#include "Spider.h"
#include "Msg1.h"
//#include "Datedb.h"
#include "Pages.h"
#include "PingServer.h"
#include "Spider.h"
#include "Process.h"
#include "Tagdb.h"
//#include "Placedb.h"
#include "Sections.h"
//#include "Revdb.h"
//#include "Tfndb.h"
static void repairWrapper ( int fd , void *state ) ;
static void loopWrapper ( void *state , RdbList *list , Msg5 *msg5 ) ;
//static void loopWrapper2 ( void *state );
//static void loopWrapper3 ( void *state );
//static void injectCompleteWrapper ( void *state );
static bool saveAllRdbs ( void *state , void (* callback)(void *state) ) ;
static bool anyRdbNeedsSave ( ) ;
static void doneSavingRdb ( void *state );
char g_repairMode = 0;
// the global class
Repair g_repair;
Rdb **getSecondaryRdbs ( long *nsr ) {
static Rdb *s_rdbs[50];
static long s_nsr = 0;
static bool s_init = false;
if ( ! s_init ) {
s_init = true;
s_nsr = 0;
//s_rdbs[s_nsr++] = g_tfndb2.getRdb ();
s_rdbs[s_nsr++] = g_titledb2.getRdb ();
//s_rdbs[s_nsr++] = g_indexdb2.getRdb ();
s_rdbs[s_nsr++] = g_posdb2.getRdb ();
//s_rdbs[s_nsr++] = g_datedb2.getRdb ();
s_rdbs[s_nsr++] = g_spiderdb2.getRdb ();
s_rdbs[s_nsr++] = g_clusterdb2.getRdb ();
s_rdbs[s_nsr++] = g_linkdb2.getRdb ();
s_rdbs[s_nsr++] = g_tagdb2.getRdb ();
//s_rdbs[s_nsr++] = g_placedb2.getRdb ();
//s_rdbs[s_nsr++] = g_sectiondb2.getRdb ();
//s_rdbs[s_nsr++] = g_revdb2.getRdb ();
}
*nsr = s_nsr;
return s_rdbs;
}
Rdb **getAllRdbs ( long *nsr ) {
static Rdb *s_rdbs[50];
static long s_nsr = 0;
static bool s_init = false;
if ( ! s_init ) {
s_init = true;
s_nsr = 0;
//s_rdbs[s_nsr++] = g_tfndb.getRdb ();
s_rdbs[s_nsr++] = g_titledb.getRdb ();
//s_rdbs[s_nsr++] = g_indexdb.getRdb ();
s_rdbs[s_nsr++] = g_posdb.getRdb ();
//s_rdbs[s_nsr++] = g_datedb.getRdb ();
s_rdbs[s_nsr++] = g_spiderdb.getRdb ();
s_rdbs[s_nsr++] = g_clusterdb.getRdb ();
s_rdbs[s_nsr++] = g_linkdb.getRdb ();
s_rdbs[s_nsr++] = g_tagdb.getRdb ();
//s_rdbs[s_nsr++] = g_placedb.getRdb ();
//s_rdbs[s_nsr++] = g_sectiondb.getRdb ();
//s_rdbs[s_nsr++] = g_revdb.getRdb ();
//s_rdbs[s_nsr++] = g_tfndb2.getRdb ();
s_rdbs[s_nsr++] = g_titledb2.getRdb ();
//s_rdbs[s_nsr++] = g_indexdb2.getRdb ();
s_rdbs[s_nsr++] = g_posdb2.getRdb ();
//s_rdbs[s_nsr++] = g_datedb2.getRdb ();
s_rdbs[s_nsr++] = g_spiderdb2.getRdb ();
s_rdbs[s_nsr++] = g_clusterdb2.getRdb ();
s_rdbs[s_nsr++] = g_linkdb2.getRdb ();
s_rdbs[s_nsr++] = g_tagdb2.getRdb ();
//s_rdbs[s_nsr++] = g_placedb2.getRdb ();
//s_rdbs[s_nsr++] = g_sectiondb2.getRdb ();
//s_rdbs[s_nsr++] = g_revdb2.getRdb ();
}
*nsr = s_nsr;
return s_rdbs;
}
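// Illustrative caller pattern for the two accessors above (a minimal sketch,
// mirroring how resetSecondaryRdbs() and dumpLoop() below use them):
//
//   long nsr;
//   Rdb **rdbs = getSecondaryRdbs ( &nsr );
//   for ( long i = 0 ; i < nsr ; i++ ) {
//           Rdb *rdb = rdbs[i];
//           // e.g. see if anything is still sitting in the tree/buckets
//           if ( rdb->getNumUsedNodes() ) { /* still has unsaved recs */ }
//   }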
Repair::Repair() {
}
// main.cpp calls g_repair.init()
bool Repair::init ( ) {
//logf(LOG_DEBUG,"repair: TODO: alloc s_docs[] on demand to save mem");
m_msg5InUse = false;
m_isSuspended = false;
m_saveRepairState = false;
m_isRetrying = false;
m_needsCallback = false;
m_completed = false;
if( ! g_loop.registerSleepCallback( 1 , NULL , repairWrapper ) )
return log("repair: Failed register callback.");
return true;
}
bool Repair::isRepairActive() {
return g_repairMode >= 4;
}
// . call this once every second
// . this is responsible for advancing from one g_repairMode to the next
void repairWrapper ( int fd , void *state ) {
g_errno = 0;
// . all hosts should have their g_conf.m_repairMode parm set
// . it is global now, not collection based, since we need to
// lock down titledb for the scan and there could be recs from
// the collection we are repairing in titledb's rdbtree, which,
// when dumped, would mess up our scan.
if ( ! g_conf.m_repairingEnabled ) return;
// if the power went off
if ( ! g_process.m_powerIsOn ) return;
// if it got turned back on after being suspended, start where
// we left off, this is how we re-enter Repair::loop()
if ( g_repair.m_isSuspended && g_repairMode == 4 ) {
// unsuspend it
g_repair.m_isSuspended = false;
// note it
log("repair: Resuming repair scan after suspension.");
// try to read another title rec, or whatever
g_repair.loop();
return;
}
// if we are in retry mode
if ( g_repair.m_isRetrying && g_repairMode == 4 ) {
// reset it
g_repair.m_isRetrying = false;
// try to read another title rec, or whatever
g_repair.loop();
return;
}
//
// ok, repairing is enabled at this point
//
// are we just starting?
if ( g_repairMode == 0 ) {
// turn spiders off since repairing is enabled
g_conf.m_spideringEnabled = false;
//g_conf.m_injectionEnabled = false;
// wait for a previous repair to finish?
//if ( g_pingServer.getMinRepairMode() != 0 ) return;
// if some are not done yet with the previous repair, wait...
// no because we are trying to load up repair.dat
//if ( g_pingServer.getMaxRepairMode() == 8 ) return;
g_repair.m_startTime = gettimeofdayInMilliseconds();
// enter repair mode level 1
g_repairMode = 1;
// note it
log("repair: Waiting for all writing operations to stop.");
}
// we can only enter repairMode 2 once all "writing" has stopped
if ( g_repairMode == 1 ) {
// wait for all merging to stop just to be on the safe side
if ( g_merge.isMerging () ) return;
if ( g_merge2.isMerging() ) return;
// the >= 0 here is correct; -1 means no outstanding spiders
if ( g_spiderLoop.m_maxUsed >= 0 ) return;
// wait for any outstanding unlinks or renames to finish
if ( g_unlinkRenameThreads > 0 ) return;
// . make sure all Msg4s are done and have completely added all
// recs they were supposed to
// . PROBLEM: if resuming a repair after re-starting, we can
// not turn on repairing
// . SOLVED: saveState() for msg4 uses different filename
if ( hasAddsInQueue() ) return;
// . ok, go to level 2
// . we can only get to level *3* once PingServer.cpp sees
// that all hosts in the cluster are in level 2. that way we
// guarantee not to add or delete any recs from any rdb,
// because that could damage the repair. PingServer will
// call g_repair.allHostsReady() when they all report they
// have a repair mode of 2.
g_repairMode = 2;
// note it
log("repair: All oustanding writing operations stopped. ");
log("repair: Waiting for all other hosts to stop, too.");
}
// we can only enter mode 3 once all hosts are in 2 or higher
if ( g_repairMode == 2 ) {
// we are still waiting on some guy if this is <= 1
if ( g_pingServer.getMinRepairMode() <= 1 ) return;
// wait for others to sync clocks, lest xmldoc cores when
// it calls getTimeGlobal() like in getNewTagBuf()
if ( ! isClockInSync() ) return;
// . this will return true if everything is saved to disk that
// needs to be, otherwise false if waiting on an rdb to finish
// saving
// . do this after all hosts are done writing, otherwise
// they might add data to our rdbs!
if ( ! saveAllRdbs ( NULL , NULL ) ) return;
// note it
//log("repair: Initializing the new Rdbs and scan parameters.");
// reset scan info BEFORE calling Repair::load()
g_repair.resetForNewCollection();
// before calling loop for the first time, init the scan,
// this will block and only return when it is done
g_repair.initScan();
// on error this sets g_repairingEnabled to false
if ( ! g_conf.m_repairingEnabled ) return;
// save "addsinprogress" file now so that the file will be
// saved as essentially an empty file at this point.
saveAddsInProgress ( NULL );
// sanity check
//char *xx = NULL; *xx = 0;
// hey, everyone is done "writing"
g_repairMode = 3;
// note it
log("repair: All data saved and clock synced.");
log("repair: Waiting for all hosts to save and sync clocks.");
}
if ( g_repairMode == 3 ) {
// wait for others to save everything
if ( g_pingServer.getMinRepairMode() <= 2 ) return;
// start the loop
log("repair: All hosts saved.");
log("repair: Loading repair-addsinprogress.dat");
// . tell Msg4 to load state using the new filename now
// . load "repair-addsinprogress" file
loadAddsInProgress ( "repair-" );
//log("repair: Scanning titledb file #%li.", g_repair.m_fn );
log("repair: Starting repair scan.");
// advance
g_repairMode = 4;
// now start calling the loop. returns false if blocks
if ( ! g_repair.loop() ) return;
}
// we can only advance to mode 5 once the repair scan loops
// have completed
if ( g_repairMode == 4 ) {
// special case
if ( g_repair.m_needsCallback ) {
// only do once
g_repair.m_needsCallback = false;
// note it in log
log("repair: calling needed callback for msg4");
// and call the loop then. returns false if blocks..
if ( ! g_repair.loop() ) return;
}
// wait for scan loops to complete
if ( ! g_repair.m_completedFirstScan ) return;
if ( ! g_repair.m_completedSpiderdbScan ) return;
// note it
log("repair: Scan completed.");
log("repair: Waiting for other hosts to complete scan.");
// ok, we are ready to update the data files
g_repairMode = 5;
}
// we can only enter mode 5 once all hosts are in 4 or higher
if ( g_repairMode == 5 ) {
// if add queues still adding, wait, otherwise they will not
// be able to add to our rebuild collection
if ( hasAddsInQueue() ) return;
// note it
log("repair: All adds have been flushed.");
log("repair: Waiting for all other hosts to flush out their "
"add operations.");
// update repair mode
g_repairMode = 6;
}
if ( g_repairMode == 6 ) {
// wait for everyone to get to mode 6 before we dump, otherwise
// data might arrive in the middle of the dumping and it stays
// in the in-memory RdbTree!
if ( g_pingServer.getMinRepairMode() < 6 ) return;
// do not dump if we are doing a full rebuild or a
// no split list rebuild -- why?
//if(! g_repair.m_fullRebuild && ! g_repair.m_rebuildNoSplits){
//if ( ! g_repair.m_rebuildNoSplits ) {
// we might have to dump again
g_repair.dumpLoop();
// are we done dumping?
if ( ! g_repair.dumpsCompleted() ) return;
//}
// wait for all merging to stop just to be on the safe side
if ( g_merge.isMerging () ) return;
if ( g_merge2.isMerging() ) return;
// wait for any outstanding unlinks or renames to finish
if ( g_unlinkRenameThreads > 0 ) return;
// note it
log("repair: Final dump completed.");
log("repair: Updating rdbs to use newly repaired data.");
// everyone is ready
g_repairMode = 7;
}
// we can only advance to mode 8 once we are done updating the original
// rdbs with the rebuilt/repaired data. we move the old rdb data files
// into the trash and replace them with the new data.
if ( g_repairMode == 7 ) {
// wait for autosave...
if ( g_process.m_mode ) return; // = SAVE_MODE;
// save to disk so it zeroes out indexdbRebuild-saved.dat
// which should have 0 records in it cuz we dumped it above
// in g_repair.dumpLoop()
if ( ! saveAllRdbs ( NULL , NULL ) ) return;
// . this blocks and gets the job done
// . this will move the old *.dat and *-saved.dat files into
// a subdir in the trash subdir
// . it will rename the rebuilt files to remove the "Rebuild"
// from their filenames
// . it will then restart the primary rdbs using those newly
// rebuilt and renamed files
// . this will not allow itself to be called more than once
// per scan/repair process
g_repair.updateRdbs();
// note this
log("repair: resetting secondary rdbs.");
// . only do this after indexdbRebuild-saved.dat has had a
// chance to save to "zero-out" its file on disk
// . all done with these guys, free their mem
g_repair.resetSecondaryRdbs();
// save "repair-addsinprogress" now so that the file will
// be saved as essentially an empty file at this
// point.
saveAddsInProgress ( "repair-" );
// reset it again in case it gets saved again later
g_repair.resetForNewCollection();
// unlink the repair.dat file, in case we core and are unable
// to save the freshly-reset repair.dat file
log("repair: unlinking repair.dat");
char tmp[1024];
sprintf ( tmp, "%s/repair.dat", g_hostdb.m_dir );
::unlink ( tmp );
// do not save it again! we just unlinked it!!
g_repair.m_saveRepairState = false;
// note it
log("repair: Waiting for other hosts to complete update.");
// ready to reset
g_repairMode = 8;
// mark it
g_repair.m_completed = true;
}
// go back to 0 once all hosts have reached mode 8 (or are back at 0)
if ( g_repairMode == 8 ) {
// nobody can still be below 8 (though some may be at 0!)
if ( g_pingServer.getMinRepairModeBesides0() != 8 ) return;
// note it
log("repair: Exiting repair mode. took %lli ms",
gettimeofdayInMilliseconds() - g_repair.m_startTime);
// turn it off to prevent going back to mode 1 again
g_conf.m_repairingEnabled = false;
// ok reset
g_repairMode = 0;
}
}
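// Rough map of the g_repairMode values advanced by repairWrapper() above,
// summarized from the checks in that function:
//
//   0 - idle; repair kicks off when g_conf.m_repairingEnabled is set
//   1 - waiting for local writing (merges, spiders, unlinks, Msg4 adds) to stop
//   2 - waiting for every host to reach mode 2, then saving all rdbs and
//       calling initScan()
//   3 - waiting for every host to save, then loading repair-addsinprogress.dat
//       and starting the scan
//   4 - running the titledb (and formerly spiderdb) repair scan via loop()
//   5 - scan done locally; waiting for add queues to flush
//   6 - waiting for every host, then dumping the secondary rdb trees to disk
//   7 - swapping the primary rdbs over to the rebuilt files (updateRdbs())
//   8 - done; waiting for every other host before dropping back to mode 0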
void Repair::resetForNewCollection ( ) {
m_stage = 0;
m_lastDocId = 0;
m_prevDocId = 0;
m_completedFirstScan = false;
m_completedSpiderdbScan = false;
//m_completedIndexdbScan = false;
}
// . PingServer.cpp will call this g_repair.allHostsReady() when all hosts
// have completely stopped spidering and merging
// . returns false if blocked, true otherwise
//void Repair::allHostsReady () {
void Repair::initScan ( ) {
// reset some stuff for the titledb scan
//m_nextRevdbKey.setMin ();
m_nextTitledbKey.setMin();
m_nextSpiderdbKey.setMin();
m_lastSpiderdbKey.setMin();
//m_nextIndexdbKey.setMin ();
m_nextPosdbKey.setMin ();
//m_nextDatedbKey.setMin ();
m_nextLinkdbKey.setMin ();
//m_nextPlacedbKey.setMin ();
m_endKey.setMax();
m_titleRecList.reset();
//m_fn = 0;
m_count = 0;
// allow Repair::updateRdbs() to be called
m_updated = false;
// titledb scan stats
m_recsScanned = 0;
m_recsNegativeKeys = 0;
m_recsOutOfOrder = 0;
m_recsetErrors = 0;
m_recsCorruptErrors = 0;
m_recsXmlErrors = 0;
m_recsDupDocIds = 0;
m_recsOverwritten = 0;
m_recsUnassigned = 0;
m_recsWrongGroupId = 0;
m_noTitleRecs = 0;
m_spiderRecsScanned = 0;
m_spiderRecSetErrors = 0;
m_spiderRecNotAssigned = 0;
m_spiderRecBadTLD = 0;
m_rebuildTitledb = g_conf.m_rebuildTitledb;
//m_rebuildIndexdb = g_conf.m_rebuildIndexdb;
m_rebuildPosdb = g_conf.m_rebuildPosdb;
//m_rebuildNoSplits = g_conf.m_rebuildNoSplits;
//m_rebuildDatedb = g_conf.m_rebuildDatedb;
//m_rebuildTfndb = g_conf.m_rebuildTfndb;
//m_rebuildChecksumdb = g_conf.m_rebuildChecksumdb;
m_rebuildClusterdb = g_conf.m_rebuildClusterdb;
m_rebuildSpiderdb = g_conf.m_rebuildSpiderdb;
m_rebuildLinkdb = g_conf.m_rebuildLinkdb;
//m_rebuildTagdb = g_conf.m_rebuildTagdb;
//m_rebuildPlacedb = g_conf.m_rebuildPlacedb;
//m_rebuildSectiondb = g_conf.m_rebuildSectiondb;
//m_rebuildRevdb = g_conf.m_rebuildRevdb;
m_fullRebuild = g_conf.m_fullRebuild;
//m_removeBadPages = g_conf.m_removeBadPages;
m_rebuildRoots = g_conf.m_rebuildRoots;
m_rebuildNonRoots = g_conf.m_rebuildNonRoots;
m_numOutstandingInjects = 0;
// we call Msg14::injectUrl() directly and that will add to ALL the
// necessary secondary rdbs automatically
if ( m_fullRebuild ) {
// why rebuild titledb? its the base. no we need to
// rebuild it for new event displays.
m_rebuildTitledb = true;//false;
//m_rebuildTfndb = true;//false;
m_rebuildSpiderdb = false;
//m_removeBadPages = false;
//m_rebuildIndexdb = true;
m_rebuildPosdb = true;
//m_rebuildNoSplits = true;
//m_rebuildDatedb = true;
m_rebuildClusterdb = true;
m_rebuildLinkdb = true;
//m_rebuildTagdb = true;
//m_rebuildPlacedb = true;
//m_rebuildSectiondb = true;
//m_rebuildRevdb = true;
}
// not supported right now
//m_rebuildTfndb = false;
// . what does it mean to rebuild titledb?
// . we need to rebuild titledb so the eventdisplays are updated!
// they have the title descriptions etc.!!
//m_rebuildTitledb = false;
// never rebuild this for now, we'll lose our firstips...
//m_rebuildTagdb = false;
// don't rebuild placedb because i think we add to it from titlerecs
// that we do not store into titledb... yeah we only store the
// title rec in XmlDoc.cpp if we got valid events...
//m_rebuildPlacedb = false;
// and sectiondb votes are added for root urls even if they don't
// have a valid event, and a title rec... so we can't really build
// it just from title recs either... maybe from revdb recs?
//m_rebuildSectiondb = false;
// . if rebuilding tfndb, only do that...
// . because rebuilding titledb requires that we do a
// lookup on tfndb to see if the title rec we got is
// the real deal, i.e. from the correct tfn, which
// is stored in the tfndb rec. otherwise, it is an
// older version of the same url probably.
/*
if ( m_rebuildTfndb ) {
m_rebuildTitledb = false;
m_rebuildIndexdb = false;
//m_rebuildNoSplits = false;
m_rebuildDatedb = false;
//m_rebuildChecksumdb = false;
m_rebuildClusterdb = false;
m_rebuildSpiderdb = false;
m_rebuildLinkdb = false;
m_rebuildTagdb = false;
m_rebuildPlacedb = false;
m_rebuildSectiondb = false;
m_rebuildRevdb = false;
m_fullRebuild = false;
//m_removeBadPages = false;
}
*/
/*
// only this can be on by itself
if ( m_rebuildNoSplits ) {
m_rebuildTitledb = false;
m_rebuildIndexdb = false;
m_rebuildDatedb = false;
m_rebuildTfndb = false;
//m_rebuildChecksumdb = false;
m_rebuildClusterdb = false;
m_rebuildSpiderdb = false;
m_rebuildLinkdb = false;
m_rebuildTagdb = false;
m_rebuildPlacedb = false;
m_rebuildSectiondb = false;
m_rebuildRevdb = false;
m_fullRebuild = false;
m_removeBadPages = false;
}
*/
/*
// i forgot what this was for!
if ( m_removeBadPages ) {
m_rebuildTitledb = false;
m_rebuildIndexdb = false;
//m_rebuildNoSplits = false;
m_rebuildDatedb = false;
m_rebuildTfndb = false;
//m_rebuildChecksumdb = false;
m_rebuildClusterdb = false;
m_rebuildSpiderdb = false;
//m_rebuildSitedb = false;
m_rebuildLinkdb = false;
m_rebuildTagdb = false;
m_rebuildPlacedb = false;
m_rebuildSectiondb = false;
m_rebuildRevdb = false;
m_fullRebuild = false;
}
*/
// force reverse indexdb (revdb) rebuild if you are rebuilding any
// of these dbs
/*
if ( m_rebuildIndexdb ||
m_rebuildDatedb ||
m_rebuildClusterdb ||
m_rebuildLinkdb ||
m_rebuildPlacedb ||
m_rebuildSectiondb )
m_rebuildRevdb = true;
*/
// rebuilding spiderdb means we must rebuild tfndb, too
if ( m_rebuildSpiderdb ) {
logf(LOG_DEBUG,"repair: Not rebuilding tfndb like "
"we should because it is broken!");
// TODO: put this back when it is fixed!
// see the comment in addToTfndb2() below
// YOU HAVE TO REBUILD spiderdb first then rebuild
// tfndb when that is done...
//m_rebuildTfndb = true;
}
// rebuilding titledb means we must rebuild tfndb, which means
// we must rebuild spiderdb, too!
//if ( m_rebuildTitledb ) {
// //m_rebuildTfndb = true;
// m_rebuildSpiderdb = true;
//}
// . set the list of ptrs to the collections we have to repair
// . should be comma or space separated in g_conf.m_collsToRepair
// . none listed means to repair all collections
char *s = g_conf.m_collsToRepair;
char *cbuf = g_conf.m_collsToRepair;
// reset the list of ptrs to colls to repair
m_numColls = 0;
// scan through the collections in the string, if there are any
collLoop:
// skip non alnum chars
while ( *s && !is_alnum_a(*s) ) s++;
// if not at the end of the string, grab the collection
if ( *s ) {
m_collOffs[m_numColls] = s - cbuf;
// hold it
char *begin = s;
// find the length
while ( *s && *s != ',' && !is_wspace_a(*s) ) s++;
// store that, too
m_collLens[m_numColls] = s - begin;
// advance the number of collections
m_numColls++;
// get the next collection if under 100 collections still
if ( m_numColls < 100 ) goto collLoop;
}
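// Illustrative example of the parse above: if g_conf.m_collsToRepair were
// "main, test" then after collLoop finishes we would have
//   m_numColls    = 2
//   m_collOffs[0] = 0 , m_collLens[0] = 4   ("main")
//   m_collOffs[1] = 6 , m_collLens[1] = 4   ("test")
// An empty string leaves m_numColls at 0, which means repair every collection.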
// split the mem we have available among the rdbs
m_totalMem = g_conf.m_repairMem;
// 30MB min
if ( m_totalMem < 30000000 ) m_totalMem = 30000000;
//
// try to get some more mem.
//
// weight factors
float weight = 0;
if ( m_rebuildTitledb ) weight += 100.0;
//if ( m_rebuildTfndb ) weight += 1.0;
//if ( m_rebuildIndexdb ) weight += 100.0;
if ( m_rebuildPosdb ) weight += 100.0;
//if ( m_rebuildDatedb ) weight += 80.0;
if ( m_rebuildClusterdb ) weight += 1.0;
//if ( m_rebuildChecksumdb ) weight += 1.0;
if ( m_rebuildSpiderdb ) weight += 5.0;
if ( m_rebuildLinkdb ) weight += 20.0;
if ( m_rebuildTagdb ) weight += 5.0;
//if ( m_rebuildPlacedb ) weight += 20.0;
//if ( m_rebuildSectiondb ) weight += 5.0;
//if ( m_rebuildRevdb ) weight += 80.0;
// assign memory based on weight
long titledbMem = 0;
//long tfndbMem = 0;
//long indexdbMem = 0;
long posdbMem = 0;
//long datedbMem = 0;
long clusterdbMem = 0;
//long checksumdbMem = 0;
long spiderdbMem = 0;
long linkdbMem = 0;
//long tagdbMem = 0;
//long placedbMem = 0;
//long sectiondbMem = 0;
//long revdbMem = 0;
float tt = (float)m_totalMem;
if ( m_rebuildTitledb ) titledbMem = (long)((100.0 * tt)/weight);
//if(m_rebuildTfndb ) tfndbMem = (long)(( 1.0 * tt)/weight);
// HACK FIX CORE:
//if ( m_rebuildTfndb ) tfndbMem = 100*1024*1024;
//if(m_rebuildIndexdb ) indexdbMem = (long)((100.0 * tt)/weight);
if ( m_rebuildPosdb ) posdbMem = (long)((100.0 * tt)/weight);
//if(m_rebuildDatedb ) datedbMem = (long)(( 80.0 * tt)/weight);
if ( m_rebuildClusterdb ) clusterdbMem = (long)(( 1.0 * tt)/weight);
//if(m_rebuildChecksumdb ) checksumdbMem = (long)(( 1.0 * tt)/weight);
if ( m_rebuildSpiderdb ) spiderdbMem = (long)(( 5.0 * tt)/weight);
if ( m_rebuildLinkdb ) linkdbMem = (long)(( 20.0 * tt)/weight);
//if ( m_rebuildTagdb ) tagdbMem = (long)(( 5.0 * tt)/weight);
//if(m_rebuildPlacedb ) placedbMem = (long)(( 20.0 * tt)/weight);
//if(m_rebuildSectiondb ) sectiondbMem = (long)(( 5.0 * tt)/weight);
//if(m_rebuildRevdb ) revdbMem = (long)(( 80.0 * tt)/weight);
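// Worked example of the split above, assuming only titledb and posdb are
// being rebuilt: weight = 100.0 + 100.0 = 200.0, so with the 30MB minimum
// for m_totalMem each of the two rdbs gets (100.0 * 30000000) / 200.0 =
// 15000000 bytes for its in-memory tree before init2() is called below.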
// debug hack
//posdbMem = 10000000;
// init secondary rdbs
if ( m_rebuildTitledb )
if ( ! g_titledb2.init2 ( titledbMem ) ) goto hadError;
//if ( m_rebuildTfndb )
// if ( ! g_tfndb2.init2 ( tfndbMem ) ) goto hadError;
//if ( m_rebuildIndexdb )
// if ( ! g_indexdb2.init2 ( indexdbMem ) ) goto hadError;
if ( m_rebuildPosdb )
if ( ! g_posdb2.init2 ( posdbMem ) ) goto hadError;
//if ( m_rebuildDatedb )
// if ( ! g_datedb2.init2 ( datedbMem ) ) goto hadError;
if ( m_rebuildClusterdb )
if ( ! g_clusterdb2.init2 ( clusterdbMem ) ) goto hadError;
//if ( m_rebuildChecksumdb )
// if ( ! g_checksumdb2.init2 ( checksumdbMem ) ) goto hadError;
if ( m_rebuildSpiderdb )
if ( ! g_spiderdb2.init2 ( spiderdbMem ) ) goto hadError;
//if ( m_rebuildSitedb )
// if ( ! g_tagdb2.init2 ( spiderdbMem ) ) goto hadError;
if ( m_rebuildLinkdb )
if ( ! g_linkdb2.init2 ( linkdbMem ) ) goto hadError;
//if ( m_rebuildTagdb )
// if ( ! g_tagdb2.init2 ( tagdbMem ) ) goto hadError;
//if ( m_rebuildPlacedb )
// if ( ! g_placedb2.init2 ( placedbMem ) ) goto hadError;
//if ( m_rebuildSectiondb )
// if ( ! g_sectiondb2.init2 ( sectiondbMem ) ) goto hadError;
//if ( m_rebuildRevdb )
// if ( ! g_revdb2.init2 ( revdbMem ) ) goto hadError;
g_errno = 0;
// reset current coll we are repairing
m_coll = NULL;
m_colli = -1;
m_completedFirstScan = false;
// . tell it to advance to the next collection
// . this will call addColl() on the appropriate Rdbs
// . it will call addColl() on the primary rdbs for m_fullRebuild
getNextCollToRepair();
// if could not get any, bail
if ( ! m_coll ) goto hadError;
g_errno = 0;
// load the old repair state if on disk, this will block
load();
// now we can save if we need to
m_saveRepairState = true;
// if error loading, ignore it
g_errno = 0;
//if ( ! loop() ) return;
// was there an error
//if ( g_errno ) log("repair: loop: %s.",mstrerror(g_errno));
return;
// on any init2() error, reset all and return true
hadError:
long saved = g_errno;
// all done with these guys
resetSecondaryRdbs();
// pull back g_errno
g_errno = saved;
// note it
log("repair: Had error in repair init. %s. Exiting.",
mstrerror(g_errno));
// back to step 0
g_repairMode = 0;
// a mode of 5 means we are done repairing and waiting to go
// back to mode 0, but PingServer.cpp will only set our
// mode to 0 once it has verified all other hosts are in
// mode 5 or 0.
//g_repairMode = 5;
// reset current coll we are repairing
m_coll = NULL;
m_colli = -1;
g_conf.m_repairingEnabled = false;
return;
}
// . sets m_coll/m_collLen to the next collection to repair
// . sets m_coll to NULL when none are left (we are done)
void Repair::getNextCollToRepair ( ) {
// . advance index into collections
// . can be index into m_colls or into g_collectiondb
m_colli++;
// ptr to first coll
if ( m_numColls ) {
if ( m_colli >= m_numColls ) {
m_coll = NULL;
m_collLen = 0;
return;
}
m_coll = g_conf.m_collsToRepair + m_collOffs [m_colli];
m_collLen = m_collLens[m_colli];
m_cr = g_collectiondb.getRec (m_coll, m_collLen);
// if DNE, set m_coll to NULL to stop repairing
if ( ! m_cr ) { m_coll = NULL; g_errno = ENOCOLLREC; return; }
}
// otherwise, we are repairing every collection by default
else {
m_cr = NULL;
// loop m_colli over all the possible collnums
while ( ! m_cr && m_colli < g_collectiondb.m_numRecs )
m_cr = g_collectiondb.m_recs [ ++m_colli ];
if ( ! m_cr ) {
m_coll = NULL;
m_collLen = 0;
g_errno = ENOCOLLREC;
return;
}
m_coll = m_cr->m_coll;
m_collLen = m_cr->m_collLen;
}
// collection cannot be deleted while we are in repair mode...
m_collnum = m_cr->m_collnum;
/*
if ( m_fullRebuild ) {
// set the new collection name...
m_newCollLen = sprintf(m_newColl,"%sRebuild",m_coll);
// . add the new collection
// . copy parms from m_coll
if ( ! g_collectiondb.addRec ( m_newColl ,
m_coll ,
m_collLen ,
true , // isNew?
-1 , // collnum
false , // isdump?
true ) && // save it?
// if already there, just keep going
g_errno != EEXIST )// is dump?
goto hadError;
// assign m_newCollnum as well
m_newCollnum = g_collectiondb.getCollnum ( m_newColl );
CollectionRec *newRec = g_collectiondb.m_recs[m_newCollnum];
// turn off link spidering on the new coll so we do
// not add more than what is there
newRec->m_spiderLinks = false;
// increase min files to merge
newRec->m_indexdbMinFilesToMerge = 100;
newRec->m_datedbMinFilesToMerge = 100;
newRec->m_spiderdbMinFilesToMerge = 100;
//newRec->m_checksumdbMinFilesToMerge = 100;
newRec->m_clusterdbMinFilesToMerge = 100;
// do we use this one?
newRec->m_linkdbMinFilesToMerge = 10;
// and USUALLY we don't want to re-dedup...
// it wastes a ton of disk seeks especially with indexdb
// with so many files!
newRec->m_dedupingEnabled = false;
newRec->m_dupCheckWWW = false;
return;
}
*/
// add collection to secondary rdbs
if ( m_rebuildTitledb ) {
if ( //! g_titledb2.addColl ( m_coll ) &&
! g_titledb2.getRdb()->addRdbBase1(m_coll) &&
g_errno != EEXIST ) goto hadError;
}
//if ( m_rebuildTfndb ) {
// if ( ! g_tfndb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
//if ( m_rebuildIndexdb ) {
// if ( ! g_indexdb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
if ( m_rebuildPosdb ) {
if ( ! g_posdb2.getRdb()->addRdbBase1 ( m_coll ) &&
g_errno != EEXIST ) goto hadError;
}
//if ( m_rebuildDatedb ) {
// if ( ! g_datedb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
if ( m_rebuildClusterdb ) {
if ( ! g_clusterdb2.getRdb()->addRdbBase1 ( m_coll ) &&
g_errno != EEXIST ) goto hadError;
}
//if ( m_rebuildChecksumdb ) {
// if ( ! g_checksumdb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
if ( m_rebuildSpiderdb ) {
if ( ! g_spiderdb2.getRdb()->addRdbBase1 ( m_coll ) &&
g_errno != EEXIST ) goto hadError;
}
//if ( m_rebuildSitedb ) {
// if ( ! g_tagdb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
if ( m_rebuildLinkdb ) {
if ( ! g_linkdb2.getRdb()->addRdbBase1 ( m_coll ) &&
g_errno != EEXIST ) goto hadError;
}
//if ( m_rebuildTagdb ) {
// if ( ! g_tagdb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
//if ( m_rebuildPlacedb ) {
// if ( ! g_placedb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
//if ( m_rebuildSectiondb ) {
// if ( ! g_sectiondb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
//if ( m_rebuildRevdb ) {
// if ( ! g_revdb2.addColl ( m_coll ) &&
// g_errno != EEXIST ) goto hadError;
//}
return;
hadError:
// note it
log("repair: Had error getting next coll to repair: %s. Exiting.",
mstrerror(g_errno));
// a mode of 5 means we are done repairing and waiting to go back to
// mode 0, but PingServer.cpp will only set our mode to 0 once
// it has verified all other hosts are in mode 5 or 0.
//g_repairMode = 5;
return;
}
void loopWrapper ( void *state , RdbList *list , Msg5 *msg5 ) {
Repair *THIS = (Repair *)state;
THIS->m_msg5InUse = false;
THIS->loop(NULL);
}
void loopWrapper2 ( void *state ) {
g_repair.loop(NULL);
}
//void loopWrapper3 ( void *state ) {
// //Repair *THIS = (Repair *)state;
// // this hold "tr" in one case
// g_repair.loop(state);
//}
enum {
STAGE_TITLEDB_0 = 0 ,
STAGE_TITLEDB_1 ,
STAGE_TITLEDB_2 ,
STAGE_TITLEDB_3 ,
STAGE_TITLEDB_4 ,
/*
STAGE_TITLEDB_5 ,
STAGE_TITLEDB_6 ,
*/
STAGE_SPIDERDB_0
/*
STAGE_SPIDERDB_1 ,
STAGE_SPIDERDB_2A ,
STAGE_SPIDERDB_2B ,
STAGE_SPIDERDB_3 ,
STAGE_SPIDERDB_4 ,
STAGE_INDEXDB_0 ,
STAGE_INDEXDB_1 ,
STAGE_INDEXDB_2 ,
STAGE_DATEDB_0 ,
STAGE_DATEDB_1 ,
STAGE_DATEDB_2
*/
};
bool Repair::save ( ) {
// do not do a blocking save for auto save if
// we never entered repair mode
if ( ! m_saveRepairState ) return true;
// log it
log("repair: saving repair.dat");
char tmp[1024];
sprintf ( tmp , "%s/repair.dat", g_hostdb.m_dir );
File ff;
ff.set ( tmp );
if ( ! ff.open ( O_RDWR | O_CREAT | O_TRUNC ) )
return log("repair: Could not open %s : %s",
ff.getFilename(),mstrerror(g_errno));
// write out the raw state image spanning m_SAVE_START to m_SAVE_END
g_errno = 0;
long size = &m_SAVE_END - &m_SAVE_START;
long long offset = 0LL;
ff.write ( &m_SAVE_START , size , offset ) ;
ff.close();
return true;
}
bool Repair::load ( ) {
char tmp[1024];
sprintf ( tmp , "%s/repair.dat", g_hostdb.m_dir );
File ff;
ff.set ( tmp );
logf(LOG_INIT,"repair: Loading %s to resume repair.",tmp);
if ( ! ff.open ( O_RDONLY ) )
return log("repair: Could not open %s : %s",
ff.getFilename(),mstrerror(g_errno));
// read back the raw state image spanning m_SAVE_START to m_SAVE_END
g_errno = 0;
long size = &m_SAVE_END - &m_SAVE_START;
long long offset = 0LL;
ff.read ( &m_SAVE_START, size , offset ) ;
ff.close();
// resume titledb scan?
//m_nextRevdbKey = m_lastRevdbKey;
m_nextTitledbKey = m_lastTitledbKey;
// resume spiderdb scan?
m_nextSpiderdbKey = m_lastSpiderdbKey;
// reinstate the valuable vars
m_cr = g_collectiondb.m_recs [ m_collnum ];
m_coll = m_cr->m_coll;
m_stage = STAGE_TITLEDB_0;
if ( m_completedFirstScan ) m_stage = STAGE_SPIDERDB_0;
//if ( m_completedSpiderdbScan ) m_stage = STAGE_INDEXDB_0;
//m_isSuspended = true;
// HACK FORCE FOR BUZZ
// point to offset of collection we are rebuilding
// . "main" collection
// . offset into g_conf.m_collsToRepair
//m_collOffs[0] = 0;
//m_collLens[0] = 4;
//m_numColls = 1;
return true;
}
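// save()/load() above persist the repair state as a raw memory image: every
// member declared between m_SAVE_START and m_SAVE_END in Repair.h is written
// and read back verbatim. A minimal sketch of the same idiom, using a
// hypothetical class (not the real Repair layout):
//
//   class Foo {
//   public:
//           char      m_SAVE_START;
//           long long m_lastKeyProcessed; // plain-old-data members only
//           long      m_recsScanned;
//           char      m_SAVE_END;
//   };
//   // image size = &m_SAVE_END - &m_SAVE_START, exactly as in save()/load()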
// . this is the main repair loop
// . this is responsible for calling all the repair functions
// . all repair callbacks given come back into this loop
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool Repair::loop ( void *state ) {
m_allowInjectToLoop = false;
// if the power went off
if ( ! g_process.m_powerIsOn ) {
// sleep 1 second and retry
m_isRetrying = true;
return true;
}
// was repairing turned off all of a sudden?
if ( ! g_conf.m_repairingEnabled ) {
//log("repair: suspending repair.");
// when it gets turned back on, the sleep callback above
// will notice it was suspended and call loop() again to
// resume where we left off...
m_isSuspended = true;
return true;
}
// if we re-entered this loop from doneWithIndexDocWrapper
// do not launch another msg5 if it is currently out!
if ( m_msg5InUse ) return false;
// set this to on
g_process.m_repairNeedsSave = true;
// . titledb scan
// . build the secondary rdbs (g_posdb2, g_clusterdb2, g_spiderdb2, etc.)
loop1:
if ( m_stage == STAGE_TITLEDB_0 ) {
m_stage++;
if ( ! scanRecs() ) return false;
}
if ( m_stage == STAGE_TITLEDB_1 ) {
m_stage++;
if ( ! gotScanRecList() ) return false;
}
if ( m_stage == STAGE_TITLEDB_2 ) {
m_stage++;
// skip this for now
//if ( ! gotTfndbList() ) return false;
// get title rec for revdb. if none, then we'll just
// re-add this revdb rec into the new revdb RDB2_REVDB2
//if ( ! getTitleRec() ) return false;
}
// get the site rec to see if it is banned first, before injecting it
if ( m_stage == STAGE_TITLEDB_3 ) {
// if we have maxed out our injects, wait for one to come back
if ( m_numOutstandingInjects >= g_conf.m_maxRepairSpiders ) {
m_allowInjectToLoop = true;
return false;
}
m_stage++;
// BEGIN NEW STUFF
bool status = injectTitleRec();
//return false; // (state)
// try to launch another
if ( m_numOutstandingInjects<g_conf.m_maxRepairSpiders ) {
m_stage = STAGE_TITLEDB_0;
goto loop1;
}
// if we are full and it blocked... wait now
if ( ! status ) return false;
}
if ( m_stage == STAGE_TITLEDB_4 ) {
m_stage++;
//if ( ! addToTfndb2() ) return false;
}
// if we are not done with the titledb scan loop back up
if ( ! m_completedFirstScan ) {
m_stage = STAGE_TITLEDB_0;
goto loop1;
}
// if we are waiting for injects to come back, return
if ( m_numOutstandingInjects > 0 ) {
// tell injection complete wrapper to call us back, otherwise
// we never end up moving on to the spider phase
g_repair.m_allowInjectToLoop = true;
return false;
}
// reset list
//m_list.reset();
// . spiderdb scan
// . put new spider recs into g_spiderdb2
/*
loop2:
if ( m_stage == STAGE_SPIDERDB_0 ) {
m_stage++;
if ( ! scanSpiderdb() ) return false;
}
if ( m_stage == STAGE_SPIDERDB_1 ) {
m_stage++;
if ( ! getTfndbListPart2() ) return false;
}
if ( m_stage == STAGE_SPIDERDB_2A ) {
m_stage++;
if ( ! getTagRecPart2() ) return false;
}
if ( m_stage == STAGE_SPIDERDB_2B ) {
m_stage++;
if ( ! getRootQualityPart2() ) return false;
}
if ( m_stage == STAGE_SPIDERDB_3 ) {
m_stage++;
if ( ! addToSpiderdb2Part2() ) return false;
}
if ( m_stage == STAGE_SPIDERDB_4 ) {
m_stage++;
if ( ! addToTfndb2Part2() ) return false;
}
// if we are not done with the titledb scan loop back up
if ( ! m_completedSpiderdbScan ) {
m_stage = STAGE_SPIDERDB_0;
goto loop2;
}
*/
// reset list
m_titleRecList.reset();
// . indexdb scan
// . delete indexdb recs whose docid is not in tfndb
// . delete duplicate docid in same termlist docids
// . turn this off for now to get buzz ready faster
/*
loop3:
if ( m_stage == STAGE_INDEXDB_0 ) {
m_stage++;
if ( ! scanIndexdb() ) return false;
}
if ( m_stage == STAGE_INDEXDB_1 ) {
m_stage++;
if ( ! gotIndexRecList() ) return false;
}
if ( m_stage == STAGE_INDEXDB_2 ) {
m_stage++;
if ( ! addToIndexdb2() ) return false;
}
// if we are not done with the titledb scan loop back up
if ( ! m_completedIndexdbScan ) {
m_stage = STAGE_INDEXDB_0;
goto loop3;
}
*/
// in order for dump to work we must be in mode 4 because
// Rdb::dumpTree() checks that
g_repairMode = 4;
// force dump to disk of the newly rebuilt rdbs, because we need to
// make sure their trees are empty when the primary rdbs assume
// the data and map files of the secondary rdbs. i don't want to
// have to mess with tree data as well.
// if we do not complete the dump here it will be monitored above
// in the sleep wrapper, repairWrapper(), and that will call
// Repair::loop() (this function) again when the dump is done
// and we will be able to advance past this m_stage
// . dump the trees of all secondary rdbs that need it
//dumpLoop();
// are we done dumping?
//if ( ! dumpsCompleted() ) return false;
// we are all done with the repair loop
return true;
}
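// Rough picture of the stage machine in Repair::loop() above: each pass
// through loop1 advances m_stage and may block on a disk read or an inject.
//
//   STAGE_TITLEDB_0 -> scanRecs()       read the next chunk of titledb via Msg5
//   STAGE_TITLEDB_1 -> gotScanRecList() pick the next title rec and filter it
//                                       by shard / twin assignment
//   STAGE_TITLEDB_2 -> (no-op now; old tfndb/revdb lookup)
//   STAGE_TITLEDB_3 -> injectTitleRec() re-index the doc into the secondary
//                                       rdbs, throttled by m_maxRepairSpiders
//   STAGE_TITLEDB_4 -> (no-op now; old addToTfndb2)
//
// When the titledb scan is exhausted m_completedFirstScan is set and the
// (currently disabled) spiderdb stages are skipped entirely.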
// this blocks
void Repair::updateRdbs ( ) {
if ( m_updated ) return;
// do not double call
m_updated = true;
// . we can only perform the update once every host is in mode 5
// . a host can only go to mode 5 once every host has gone to mode 4
//if ( g_hostdb.m_maxRepairMode < 5 ) return false;
// . replace old rdbs with the new ones
// . these calls must all block otherwise things will get out of sync
Rdb *rdb1;
Rdb *rdb2;
if ( m_rebuildTitledb ) {
rdb1 = g_titledb.getRdb ();
rdb2 = g_titledb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_coll );
}
//if ( m_rebuildTfndb ) {
// rdb1 = g_tfndb.getRdb();
// rdb2 = g_tfndb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
//if ( m_rebuildIndexdb ) {
// rdb1 = g_indexdb.getRdb();
// rdb2 = g_indexdb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
if ( m_rebuildPosdb ) {
rdb1 = g_posdb.getRdb();
rdb2 = g_posdb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_coll );
}
//if ( m_rebuildDatedb ) {
// rdb1 = g_datedb.getRdb();
// rdb2 = g_datedb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
if ( m_rebuildClusterdb ) {
rdb1 = g_clusterdb.getRdb();
rdb2 = g_clusterdb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_coll );
}
//if ( m_rebuildChecksumdb ) {
// rdb1 = g_checksumdb.getRdb();
// rdb2 = g_checksumdb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
if ( m_rebuildSpiderdb ) {
rdb1 = g_spiderdb.getRdb();
rdb2 = g_spiderdb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_coll );
}
//if ( m_rebuildSitedb ) {
// rdb1 = g_tagdb.getRdb();
// rdb2 = g_tagdb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
if ( m_rebuildLinkdb ) {
rdb1 = g_linkdb.getRdb();
rdb2 = g_linkdb2.getRdb();
rdb1->updateToRebuildFiles ( rdb2 , m_coll );
}
//if ( m_rebuildTagdb ) {
// rdb1 = g_tagdb.getRdb();
// rdb2 = g_tagdb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
//if ( m_rebuildPlacedb ) {
// rdb1 = g_placedb.getRdb();
// rdb2 = g_placedb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
//if ( m_rebuildSectiondb ) {
// rdb1 = g_sectiondb.getRdb();
// rdb2 = g_sectiondb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
//if ( m_rebuildRevdb ) {
// rdb1 = g_revdb.getRdb();
// rdb2 = g_revdb2.getRdb();
// rdb1->updateToRebuildFiles ( rdb2 , m_coll );
//}
// reset scan info
//resetForNewCollection();
// all done with these guys, free their mem
//resetSecondaryRdbs();
/*
// now go to the next collection
//getNextCollToRepair();
m_coll = NULL;
m_colli = -1;
g_conf.m_repairingEnabled = false;
// if we got another collection, repair/rebuild it now
if ( m_coll ) {
// back to mode 3
g_repairMode = 3;
// and scan titledb for this coll
goto loop1;
}
// all done with these guys, free their mem
//resetSecondaryRdbs();
// note it
log("repair: Repairs completed. Exiting repair mode.");
// a mode of 5 means we are done repairing and waiting to go back to
// mode 0, but only PingServer.cpp will only set our mode to 0 once
// it has verified all other hosts are in mode 5 or 0.
g_repairMode = 5;
// . all done for good
// . return true because we did not block this caller
return true;
*/
}
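// Note on updateToRebuildFiles() as used above: for each rebuilt rdb it is
// expected to block, move the primary rdb's old *.dat and *-saved.dat files
// into the trash subdir, strip the "Rebuild" suffix from the new files and
// restart the primary rdb on them (see the comments in the mode-7 block of
// repairWrapper() above).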
void Repair::resetSecondaryRdbs ( ) {
long nsr;
Rdb **rdbs = getSecondaryRdbs ( &nsr );
for ( long i = 0 ; i < nsr ; i++ ) {
Rdb *rdb = rdbs[i];
// use niceness of 1
rdb->reset();
}
}
bool Repair::dumpLoop ( ) {
long nsr;
Rdb **rdbs = getSecondaryRdbs ( &nsr );
for ( long i = 0 ; i < nsr ; i++ ) {
Rdb *rdb = rdbs[i];
// don't dump tfndb...
if ( rdb->m_rdbId == RDB2_TFNDB2 ) continue;
// use niceness of 1
rdb->dumpTree ( 1 );
}
g_errno = 0;
// . register sleep wrapper to check when dumping is done
// . it will call Repair::loop() when done
return false;
}
bool Repair::dumpsCompleted ( ) {
long nsr;
Rdb **rdbs = getSecondaryRdbs ( &nsr );
for ( long i = 0 ; i < nsr ; i++ ) {
Rdb *rdb = rdbs[i];
// we don't dump tfndb...
if ( rdb->m_rdbId == RDB2_TFNDB2 ) continue;
// anything in tree/buckets?
if ( rdb->getNumUsedNodes() ) return false;
// still dumping?
if ( rdb->isDumping () ) return false;
}
// no more dump activity
return true;
}
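// dumpLoop()/dumpsCompleted() above form a simple polling pair: in mode 6
// repairWrapper() keeps calling dumpLoop() once per second and only advances
// to mode 7 once dumpsCompleted() reports that every secondary rdb has an
// empty tree and no dump in progress. Sketch of the caller side:
//
//   g_repair.dumpLoop();                       // kick off / continue dumps
//   if ( ! g_repair.dumpsCompleted() ) return; // check again next second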
// . this is only called from repairLoop()
// . returns false if blocked, true otherwise
// . grab the next scan record
bool Repair::scanRecs ( ) {
// just the tree?
//long nf = 1;
//bool includeTree = false;
RdbBase *base = g_titledb.getRdb()->getBase ( m_collnum );
//if ( m_fn == base->getNumFiles() ) { nf = 0; includeTree = true; }
// always clear last bit of g_nextKey
m_nextTitledbKey.n0 &= 0xfffffffffffffffeLL;
// for saving
m_lastTitledbKey = m_nextTitledbKey;
log(LOG_DEBUG,"repair: nextKey=%s endKey=%s"
"coll=%s collnum=%li "
"bnf=%li",//fn=%li nf=%li",
KEYSTR(&m_nextTitledbKey,sizeof(key_t)),
KEYSTR(&m_endKey,sizeof(key_t)),
m_coll,
(long)m_collnum,
(long)base->getNumFiles());//,m_fn,nf);
// sanity check
if ( m_msg5InUse ) {
char *xx = NULL; *xx = 0; }
// when building anything but tfndb we can get the rec
// from the twin in case of data corruption on disk
bool fixErrors = true;
//if ( m_rebuildTfndb ) fixErrors = false;
// get the list of recs
g_errno = 0;
if ( m_msg5.getList ( RDB_TITLEDB ,
m_collnum ,
&m_titleRecList ,
m_nextTitledbKey ,
m_endKey , // should be maxed!
1024 , // min rec sizes
true , // include tree?
false , // includeCache
false , // addToCache
0 , // startFileNum
-1 , // m_numFiles
this , // state
loopWrapper , // callback
MAX_NICENESS , // niceness
fixErrors , // do error correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&m_msg5b ))
return true;
m_msg5InUse = true;
return false;
}
// . this is only called from repairLoop()
// . returns false if blocked, true otherwise
bool Repair::gotScanRecList ( ) {
QUICKPOLL(MAX_NICENESS);
// get the base
//RdbBase *base = g_titledb.getRdb()->getBase ( m_collnum );
if ( g_errno == ECORRUPTDATA ) {
log("repair: Encountered corruption1 in titledb. "
"NextKey=%s",
KEYSTR(&m_nextTitledbKey,sizeof(key_t)));
/*
// get map for this file
RdbMap *map = base->getMap(m_fn);
// what page has this key?
long page = map->getPage ( (char *)&m_nextTitledbKey );
// advance the page number
advancePage:
page++;
// if no more pages, we are done!
if ( page >= map->getNumPages() ) {
log("repair: No more pages in rdb map, done with "
"titledb file.");
g_errno = 0; m_recsCorruptErrors++;
goto fileDone;
}
// get key from that page
key_t next = *(key_t *)map->getKeyPtr ( page );
// keep advancing if its the same key!
if ( next == m_nextTitledbKey ) goto advancePage;
// ok, we got a new key, use it
m_nextTitledbKey = next;
*/
// get the docid
//long long dd = g_titledb.getDocIdFromKey(&m_nextTitledbKey);
// inc it
//dd++;
// re-make key
//m_nextTitledbKey = g_titledb.makeFirstTitleRecKey ( dd );
// advance one if positive, must always start on a neg
if ( (m_nextTitledbKey.n0 & 0x01) == 0x01 )
m_nextTitledbKey += (unsigned long)1;
// count as error
m_recsCorruptErrors++;
}
// was there an error? list will probably be empty
if ( g_errno ) {
log("repair: Got error reading title rec: %s.",
mstrerror(g_errno));
// keep retrying, might be OOM
m_stage = STAGE_TITLEDB_0 ;
// sleep 1 second and retry
m_isRetrying = true;
// exit the loop code, Repair::loop() will be re-called
return false;
}
/*
// a hack
if ( m_count > 100 ) { // && m_fn == 0 ) {
logf(LOG_INFO,"repair: hacking titledb complete.");
//m_completedFirstScan = true;
//m_stage = STAGE_SPIDERDB_0;
m_list.reset();
//return true;
}
*/
// all done with this bigfile if this list is empty
if ( m_titleRecList.isEmpty() ) { //||m_recsScanned > 10 ) {
// note it
//logf(LOG_INFO,"repair: Scanning ledb file #%li.", m_fn );
m_completedFirstScan = true;
logf(LOG_INFO,"repair: Completed titledb scan of "
"%lli records.",m_recsScanned);
//logf(LOG_INFO,"repair: Starting spiderdb scan.");
m_stage = STAGE_SPIDERDB_0;
// force spider scan completed now too!
m_completedSpiderdbScan = true;
g_repair.m_allowInjectToLoop = true;
return true;
}
// nextRec2:
key_t tkey = m_titleRecList.getCurrentKey();
long long docId = g_titledb.getDocId ( &tkey );
// save it
//m_currentTitleRecKey = tkey;
// save it
m_docId = docId;
// is it a delete?
m_isDelete = false;
// we need this to compute the tfndb key to add/delete
//m_ext = -1;
m_uh48 = 0LL;
// count the title recs we scan
m_recsScanned++;
// skip if bad... CORRUPTION
if ( tkey < m_nextTitledbKey ) {
log("repair: Encountered corruption2 in titledb. "
"key=%s < NextKey=%s"
"FirstDocId=%llu.",
//p1-1,
KEYSTR(&tkey,sizeof(key_t)),
KEYSTR(&m_nextTitledbKey,sizeof(key_t)),
docId);
m_nextTitledbKey += (unsigned long)1;
// advance one if positive, must always start on a negative key
if ( (m_nextTitledbKey.n0 & 0x01) == 0x01 )
m_nextTitledbKey += (unsigned long)1;
m_stage = STAGE_TITLEDB_0;
return true;
}
else {
// advance m_nextTitledbKey to get next titleRec
m_nextTitledbKey = m_titleRecList.getCurrentKey();
m_nextTitledbKey += (unsigned long)1;
// advance one if positive, must always start on a negative key
if ( (m_nextTitledbKey.n0 & 0x01) == 0x01 )
m_nextTitledbKey += (unsigned long)1;
}
// are we the host this url is meant for?
//uint32_t gid = getGroupId ( RDB_TITLEDB , &tkey );
unsigned long shardNum = getShardNum (RDB_TITLEDB , &tkey );
if ( shardNum != getMyShardNum() ) {
m_recsWrongGroupId++;
m_stage = STAGE_TITLEDB_0;
return true;
}
// . if one of our twins is responsible for it...
// . is it assigned to us? taken from assignedToUs() in SpiderCache.cpp
// . get our group from our hostId
long numHosts;
//Host *hosts = g_hostdb.getGroup ( g_hostdb.m_groupId, &numHosts);
Host *hosts = g_hostdb.getShard ( shardNum , &numHosts );
long ii = docId % numHosts ;
// . are we the host this url is meant for?
// . however, if you are rebuilding tfndb, each twin must scan all
// title recs and make individual entries for those title recs
if ( hosts[ii].m_hostId != g_hostdb.m_hostId ){//&&!m_rebuildTfndb ) {
m_recsUnassigned++;
m_stage = STAGE_TITLEDB_0;
return true;
}
/*
// is the list from the tree in memory?
long id2;
if ( m_fn == base->getNumFiles() ) id2 = 255;
else id2 = base->m_fileIds2[m_fn];
// that is the tfn...
m_tfn = id2;
*/
// is it a negative titledb key?
if ( (tkey.n0 & 0x01) == 0x00 ) {
// count it
m_recsNegativeKeys++;
// otherwise, we need to delete this
// docid from tfndb...
m_isDelete = true;
}
// if not rebuilding tfndb, skip this
//if ( ! m_rebuildTfndb && m_isDelete ) {
if ( m_isDelete ) {
m_stage = STAGE_TITLEDB_0;
return true;
}
return true;
}
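// Illustrative example of the assignment filter above: suppose a shard has
// numHosts = 2 twins. A title rec with docId 1000003 gives
// ii = 1000003 % 2 = 1, so only hosts[1] of that shard re-indexes it; the
// other twin counts it in m_recsUnassigned and moves on. (Which physical
// host ends up as hosts[1] depends on g_hostdb's ordering of the shard.)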
/*
// if rebuilding tfndb only, always add this to tfndb
if ( m_rebuildTfndb && ! m_isDelete ) {
// get raw rec from list
char *rec = m_list.getCurrentRec();
// use this first
m_doc.reset();
//long recSize = m_list.getCurrentRecSize();
//TitleRec *tr = m_doc.getTitleRec();
if ( ! m_doc.set2 ( rec, -1, m_coll, NULL, MAX_NICENESS ) ) {
m_recsetErrors++;
m_stage = STAGE_TITLEDB_0; // 0
return true;
}
// remember this
m_prevDocId = m_docId;
// set the titleRec we got
//if ( ! tr->set ( rec , recSize , false ) ) {
// m_recsetErrors++;
// m_stage = STAGE_TITLEDB_0; // 0
// return true;
//}
Url *fu = m_doc.getFirstUrl();
// set the extended hash, m_ext
//m_ext = g_tfndb.makeExt ( fu ); // tr->getUrl() );
m_uh48 = hash64b(fu->getUrl()) & 0x0000ffffffffffffLL;
// addToTfndb2()
//m_stage = STAGE_TITLEDB_6;
m_stage = STAGE_TITLEDB_4;
return true;
}
// if previous titledb key was positive and had the
// same docid as us, then this negative key probably has
// different "content hash" bits and is meant to delete a
// previous version of this titlerec.
if ( m_rebuildTfndb && m_prevDocId == m_docId ) {
// just ignore it
m_stage = STAGE_TITLEDB_0;
return true;
}
// assume normal tfndb lookup
char rdbId = RDB_TFNDB;
// if a negative titledb key, then we need to lookup in the
// REBUILT tfndb to see/ what the ext hash bits are so we
// can delete that key from the new rebuilt tfndb! these
// hash bits are not in the title rec key unfortunately
if ( m_rebuildTfndb && m_isDelete )
rdbId = RDB2_TFNDB2;
//
// look up this docid in tfndb
//
// . make the keys for getting recs from tfndb
// . url recs map docid to the title file # that contains the titleRec
key_t uk1 ;
key_t uk2 ;
// . if docId was explicitly specified...
// . we may get multiple tfndb recs
// . for this we know the docid, so get it exactly
uk1 = g_tfndb.makeMinKey ( docId );
uk2 = g_tfndb.makeMaxKey ( docId );
// sanity check
if ( m_msg5InUse ) {
char *xx = NULL; *xx = 0; }
// . get the list of url recs for this docid range
// . this should not block, tfndb SHOULD all be in memory all the time
// . use 500 million for min recsizes to get all in range
// . no, using 500MB causes problems for RdbTree::getList, so use
// 100k. how many recs can there be?
if ( m_msg5.getList ( rdbId , // RDB_TFNDB
m_coll ,
&m_ulist ,
uk1 , // startKey
uk2 , // endKey
// use 0x7fffffff preceisely because it
// will determine eactly how long the
// tree list needs to allocate in Msg5.cpp
0x7fffffff , // minRecSizes
true , // includeTree?
false , // addToCache?
0 , // max cache age
0 , // startFileNum
-1 , // numFiles (-1 =all)
this ,
loopWrapper ,
MAX_NICENESS ,
true ))// error correction?
return true;
m_msg5InUse = true;
return false;
}
// . if no recs in the list have a matching tfn, skip the title rec
// . if one matches and one does not, skip the title rec
// . if has multiple and all match that is ok
bool Repair::gotTfndbList ( ) {
// was there an error? list will probably be empty
if ( g_errno )
log("repair: Got error reading tfndb list: %s.",
mstrerror(g_errno));
// sanity check
if ( m_rebuildTfndb && ! m_isDelete ) { char *xx=NULL;*xx=0;}
// just in case
m_ulist.resetListPtr();
// did we have a matchf or our docid?
bool matched = false;
// check for our docid
// we may have multiple tfndb recs but we should NEVER have to read
// multiple titledb files...
for ( ; ! m_ulist.isExhausted() ; m_ulist.skipCurrentRecord() ) {
// yield
QUICKPOLL(MAX_NICENESS);
// get first rec
key_t k = m_ulist.getCurrentKey();
// some titledbs have incorrect extension hashes in the
// buzzlogic collection, so ignore that for now
//if ( st->m_url[0] ) {
// if ( g_tfndb.getExt ( k ) != e ) continue;
//}
// docid must match! cuz these include probable docids
// in the range of [uk1,uk2]
if ( g_tfndb.getDocId(&k) != m_docId ) continue;
// . skip it if it is a delete and we are not touching tfndb
// . remember this is the newly rebuilt tfndb we are accessing
// here since we set rdbId to RDB2_TFNDB2 just for rebuilding
// tfndb exclusively
if ( m_rebuildTfndb && m_isDelete ) {
// addToTfndb2()
//m_ext = g_tfndb.getExt(k);
m_uh48 = g_tfndb.getUrlHash48(&k);
//m_stage = STAGE_TITLEDB_6;
m_stage = STAGE_TITLEDB_4;
return true;
}
// . get file num this rec is stored
// . this is updated right after the file num is merged by
// scanning all records in tfndb. this is very quick if all
// of tfndb is in memory, otherwise, it might take a few
// seconds. update call done in RdbMerge::incorporateMerge().
// . 255 means just in spiderdb OR titleRec is in tree
long tfn = g_tfndb.getTfn ( &k );
// set "matched" to true if this titlerec is the latest
if ( tfn == m_tfn ) matched = true;
// break now that we've matched the title rec's docid
break;
}
// check if in tree
RdbTree *tree = &g_titledb.m_rdb.m_tree;
long node = tree->getNode ( m_collnum,(char *)&m_currentTitleRecKey );
// if there, that's a match
if ( node >= 0 ) matched = true;
// if not matched in tfndb and not in tree it must have been deleted!
if ( ! matched ) {
m_stage = STAGE_TITLEDB_0;
m_recsOverwritten++;
return true;
}
return true;
}
*/
/*
bool Repair::getTitleRec ( ) {
key_t key = m_scanList.getCurrentKey();
// that's a revdb record, get the docid
m_docId = g_revdb.getDocId (&key);
// make it
key_t tk1 = g_titledb.makeFirstKey ( m_docId );
key_t tk2 = g_titledb.makeLastKey ( m_docId );
// use msg22
return m_msg5.getList ( RDB_TITLEDB ,
m_coll,
&m_titleRecList ,
&tk1 ,
&tk2 ,
32 ,
true , // include tree
false , // add to cache
0 , // max cache age
0 , // start file #
-1 , // numfiles
this ,
loopWrapper ,
MAX_NICENESS , // niceness
true , // do error correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
&m_msg5b );
}
*/
// TODO: allocate these on demand!!!!!!
//#define MAX_OUT_REPAIR 10
//static char s_inUse [ MAX_OUT_REPAIR ];
//static XmlDoc s_docs [ MAX_OUT_REPAIR ];
void doneWithIndexDoc ( XmlDoc *xd ) {
// preserve
long saved = g_errno;
// nuke it
mdelete ( xd , sizeof(XmlDoc) , "xdprnuke");
delete ( xd );
// reduce the count
g_repair.m_numOutstandingInjects--;
// error?
if ( saved ) {
g_repair.m_recsetErrors++;
g_repair.m_stage = STAGE_TITLEDB_0; // 0
return;
}
QUICKPOLL(MAX_NICENESS);
/*
// find the i
long i ; for ( i = 0 ; i < MAX_OUT_REPAIR ; i++ ) {
if ( ! s_inUse[i] ) continue;
if ( xd == &s_docs[i] ) break;
}
if ( i >= MAX_OUT_REPAIR ) { char *xx=NULL;*xx=0; }
// reset it i guess
xd->reset();
// give back the tr
s_inUse[i] = 0;
*/
}
void doneWithIndexDocWrapper ( void *state ) {
// clean up
doneWithIndexDoc ( (XmlDoc *)state );
// and re-enter the loop to get next title rec
g_repair.loop ( NULL );
}
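// The inject bookkeeping above pairs with injectTitleRec() below: each
// launched XmlDoc bumps m_numOutstandingInjects, and STAGE_TITLEDB_3 stops
// launching new ones once that count reaches g_conf.m_maxRepairSpiders.
// Every completed inject comes back through doneWithIndexDocWrapper(), which
// frees the XmlDoc, decrements the count and re-enters Repair::loop() so the
// titledb scan can continue.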
//bool Repair::getTagRec ( void **state ) {
bool Repair::injectTitleRec ( ) {
// no, now we specify in call to indexDoc() which
// dbs we want to update
//if ( ! m_fullRebuild && ! m_removeBadPages ) return true;
QUICKPOLL(MAX_NICENESS);
// scan for our docid in the title rec list
char *titleRec = NULL;
long titleRecSize = 0;
// convenience var
RdbList *tlist = &m_titleRecList;
// scan the titleRecs in the list
for ( ; ! tlist->isExhausted() ; tlist->skipCurrentRecord ( ) ) {
// breathe
QUICKPOLL ( MAX_NICENESS );
// get the rec
char *rec = tlist->getCurrentRec();
long recSize = tlist->getCurrentRecSize();
// get that key
key_t *k = (key_t *)rec;
// skip negative recs, first one should not be negative however
if ( ( k->n0 & 0x01 ) == 0x00 ) continue;
// get docid of that guy
long long dd = g_titledb.getDocId(k);
// compare that
if ( m_docId != dd ) continue;
// we got it!
titleRec = rec;
titleRecSize = recSize;
break;
}
/*
// title rec for this doc was not found...
if ( ! titleRec ) {
// don't bother with revdb?
if ( ! m_rebuildRevdb ) return true;
// so just add the revdb rec into the new revdb. it was
// probably an eventless url and we just added sectiondb or
// placedb entries for it...
char *rec = m_scanList.getCurrentRec();
long recSize = m_scanList.getCurrentRecSize();
if ( recSize <= 0 ) { char *xx=NULL;*xx=0; }
if ( ! m_msg4.addMetaList ( rec ,
recSize ,
m_coll ,
this ,
loopWrapper2 ,
MAX_NICENESS ,
RDB2_REVDB2 ) ) {
// note it for debugging
log("repair: msg4 returned false");
// it will call our callback!
return false;
}
// crap, gotta retry adding this if it returned false
//g_repair.m_stage = STAGE_TITLEDB_3;
// ask repair wrapper to call us back
//g_repair.m_needsCallback = true;
// sleep away!
//return false;
//}
// no title recs
m_noTitleRecs++;
// we're all done and did not block, per se
return true;
}
*/
// make sure this is on
//g_conf.m_injectionEnabled = true;
// get raw rec from list
//char *rec = m_titleRecList.getCurrentRec();
//long recSize = m_titleRecList.getCurrentRecSize();
/*
// claim a title rec
bool static s_init = false;
if ( ! s_init ) { memset (s_inUse,0,MAX_OUT_REPAIR); s_init = true; }
//TitleRec *tr = NULL;
XmlDoc *xd = NULL;
long i ;
for ( i = 0 ; i < MAX_OUT_REPAIR ; i++ ) {
if ( s_inUse[i] ) continue;
//tr = &s_trs[i];
xd = &s_docs[i];
break;
}
*/
XmlDoc *xd = NULL;
try { xd = new ( XmlDoc ); }
catch ( ... ) {
g_errno = ENOMEM;
m_recsetErrors++;
m_stage = STAGE_TITLEDB_0; // 0
return true;
}
mnew ( xd , sizeof(XmlDoc),"xmldocpr");
// clear out first since set2 no longer does
//xd->reset();
if ( ! xd->set2 ( titleRec , -1 , m_coll , NULL , MAX_NICENESS ) ) {
m_recsetErrors++;
m_stage = STAGE_TITLEDB_0; // 0
return true;
}
// set callback
xd->setCallback ( xd , doneWithIndexDocWrapper );
// clear any error involved with cache, it doesn't matter so much
g_errno = 0;
// set the titleRec we got
//if ( ! tr->set ( rec , recSize , false /*own data?*/ ) ) {
// m_recsetErrors++;
// m_stage = STAGE_TITLEDB_0;
// return true;
//}
//Url *fu = xd->getFirstUrl();
// . determine which host in our group should spider this
// . just use the host that should dole it
// . if we are not responsible for this url, skip it
// . usually this uses m_firstIp of SpiderRequest but just use
// a hash of the url as the ip! HACK!
// . no.. no.. we already have a docid based assignment filter
// in gotScanRecList(), it mods the docid with the # of hosts
// in our group
//long hh = hash32n(fu->getUrl());
//long hostId = getHostIdToDole ( hh );
//if ( ! isAssignedToUs ( hh ) ) {
// m_stage = STAGE_TITLEDB_0;
// return true;
//}
// skip if root and not doing roots
//if ( ! m_rebuildRoots && tr->getUrl()->isRoot() ) {
// m_recsRoot++;
// m_stage = STAGE_TITLEDB_0;
// return true;
//}
// skip if non-root and not doing non roots
//if ( ! m_rebuildNonRoots && ! tr->getUrl()->isRoot() ) {
// m_recsNonRoot++;
// m_stage = STAGE_TITLEDB_0;
// return true;
//}
// invalidate certain things to recompute!
// we are now setting from docid
xd->m_tagRecValid = false;
// rebuild the title rec! otherwise we re-add the old one!!!!!!!
xd->m_titleRecBufValid = false;
// free it since set2() should have uncompressed it!
//mfree ( titleRec , titleRecSize, "repair" );
// and so xd doesn't free it
xd->m_titleRecBuf.purge();// = NULL;
// use the ptr_utf8Content that we have
xd->m_recycleContent = true;
// rebuild the content hash since we change that function sometimes
xd->m_contentHash32Valid = false;
// hmmm... take these out to see if it fixes the core
//xd->m_linkInfo1Valid = false;
//xd->m_linkInfo2Valid = false;
// claim it, so "tr" is not overwritten
m_numOutstandingInjects++;
//s_inUse[i] = 1;
bool addToSecondaryRdbs = true;
//if ( m_fullRebuild ) addToSecondaryRdbs = false;
//if ( m_removeBadPages ) addToSecondaryRdbs = false;
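	// route the rebuilt meta list into the secondary (*2) rdbs; the
	// m_useXXX flags below control which rdbs actually receive data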
xd->m_usePosdb = m_rebuildPosdb;
//xd->m_useDatedb = m_rebuildDatedb;
xd->m_useClusterdb = m_rebuildClusterdb;
xd->m_useLinkdb = m_rebuildLinkdb;
xd->m_useSpiderdb = m_rebuildSpiderdb;
xd->m_useTitledb = m_rebuildTitledb;
//xd->m_usePlacedb = m_rebuildPlacedb;
//xd->m_useSectiondb = m_rebuildSectiondb;
//xd->m_useRevdb = m_rebuildRevdb;
xd->m_useSecondaryRdbs = addToSecondaryRdbs;
// always use tagdb because if we update the sitenuminlinks
// or whatever, we want to add that to tagdb
xd->m_useTagdb = true;
// not if rebuilding link info though! we assume the old link info is
// bad...
	if ( m_rebuildLinkdb ) {
		xd->m_useTagdb = false;
		// also need to preserve the "lost link" flag somehow
		// from the old linkdb...
		//log("repair: would lose linkdb lost flag.");
		// core until we find a way to preserve the old discovery
		// date from the old linkdb!
		//log("repair: fix linkdb rebuild. coring.");
		//char *xx=NULL;*xx=0;
	}
if ( ! g_conf.m_rebuildRecycleLinkInfo ) {
// then recompute link info as well!
xd->m_linkInfo1Valid = false;
// make null to be safe
xd->ptr_linkInfo1 = NULL;
xd->size_linkInfo1 = 0;
}
// . also lookup site rank again!
// . this will use the value in tagdb if less than 48 hours otherwise
// it will recompute it
// . CRAP! this makes the data undeletable if siterank changes!
// so we have to be able to re-save our title rec with the new
// site rank info...
if ( xd->m_useTitledb ) {
// save for logging
xd->m_logLangId = xd->m_langId;
xd->m_logSiteNumInlinks = xd->m_siteNumInlinks;
// recompute site, no more domain sites allowed
xd->m_siteValid = false;
xd->ptr_site = NULL;
xd->size_site = 0;
// recalculate the sitenuminlinks
xd->m_siteNumInlinksValid = false;
// recalculate the langid
xd->m_langIdValid = false;
		// recalculate and store the link info
xd->m_linkInfo1Valid = false;
// make null to be safe
xd->ptr_linkInfo1 = NULL;
xd->size_linkInfo1 = 0;
//xd->m_linkInfo2Valid = false;
// re-get the tag rec from tagdb
xd->m_tagRecValid = false;
xd->m_tagRecDataValid = false;
}
xd->m_priority = -1;
xd->m_priorityValid = true;
// this makes sense now that we set from docid using set3()?
//xd->m_recycleContent = true;
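	// we recycle the utf8 content already stored in the title rec (no
	// re-download); size_utf8Content includes the terminating NUL,
	// hence the -1 on the length below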
xd->m_contentValid = true;
xd->m_content = xd->ptr_utf8Content;
xd->m_contentLen = xd->size_utf8Content - 1;
// . get the meta list to add
// . sets m_usePosdb, m_useTitledb, etc.
bool status = xd->indexDoc ( );
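	// if indexDoc() blocked it returns false and the
	// doneWithIndexDocWrapper callback set above will be called when
	// the doc finishes; otherwise we clean up synchronously below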
// blocked?
if ( ! status ) return false;
// give it back
doneWithIndexDoc ( xd );
return true;
}
/*
bool Repair::addToTfndb2 ( ) {
// only do this for adding recs to tfndb
if ( ! m_rebuildTfndb ) return true;
// if doing a full rebuild, skip this, already done
//if ( m_fullRebuild ) return true;
// . this is broken!!! figure out why the rebuild doesn't work...
// . seems like the tfns are off...
//char *xx = NULL; *xx = 0;
QUICKPOLL(MAX_NICENESS);
// sanity check, must have a valid m_ext
//if ( m_ext == -1 ) { char *xx = NULL; *xx = 0; }
if ( ! m_uh48 ) { char *xx = NULL; *xx = 0; }
// m_docId should already have been set!
m_tfndbKey = g_tfndb.makeKey ( m_docId , // tr->getDocId()
m_uh48 ,
m_tfn ,
m_isDelete );// isDelete?
// set the list from the buffer
m_addlist.set ( (char *)&m_tfndbKey ,
sizeof(key_t) ,
(char *)&m_tfndbKey ,
sizeof(key_t) ,
(char *)&m_tfndbKey , // start key
(char *)&m_tfndbKey , // end key
0 , // fixedDataSize
false , // ownData?
false , // use half keys? not when adding.
12 );// tfndb key size
// this returns false if it blocks
g_errno = 0;
// . keep these local, because the tfn in the tfndb rec
	// may not be the same between twins!!
// . returns true on success, so go on to next stage
if ( g_tfndb2.getRdb()->addList(m_collnum,&m_addlist,
MAX_NICENESS) )
return true;
// keep retrying, might be OOM, auto-saving, etc.
m_stage = STAGE_TITLEDB_4 ;
// sleep 1 second and retry
m_isRetrying = true;
// must need to dump, so wait for that!
return log("repair: addToTfndb2: %s",mstrerror(g_errno));
}
*/
// . returns false if it fails because the buffer cannot be grown (OOM)
// . this is called by Parms.cpp
bool Repair::printRepairStatus ( SafeBuf *sb , long fromIp ) {
// default is a repairMode of 0, "not running"
char *status = "not running";
if ( g_repairMode == 0 && g_conf.m_repairingEnabled )
status = "waiting for previous repair to complete";
if ( g_repairMode == 1 )
status = "waiting for spiders or merge to stop";
if ( g_repairMode == 2 )
status = "waiting for all hosts in network to stop "
"spidering and merging";
if ( g_repairMode == 3 )
status = "waiting for all hosts to save";
if ( g_repairMode == 4 ) {
if ( m_completedFirstScan )
status = "scanning old spiderdb";
else
status = "scanning old records";
}
if ( g_repairMode == 5 )
status = "waiting for final dump to complete";
if ( g_repairMode == 6 )
status = "waiting for others to finish scan and dump";
if ( g_repairMode == 7 )
status = "updating rdbs with new data";
if ( g_repairMode == 8 )
status = "waiting for all hosts to complete update";
if ( ! g_process.m_powerIsOn && g_conf.m_repairingEnabled )
status = "waiting for power to return";
// the titledb scan stats (phase 1)
long long ns = m_recsScanned ;
long long nr = g_titledb.getRdb()->getNumTotalRecs() ;
	float ratio = nr ? (((float)ns * 100.0) / (float)nr) : 0.0;
long long errors =
m_recsOutOfOrder +
m_recsetErrors +
m_recsCorruptErrors +
m_recsXmlErrors +
m_recsDupDocIds ;
// the spiderdb scan stats (phase 2)
long long ns2 = m_spiderRecsScanned ;
long long nr2 = g_spiderdb.getRdb()->getNumTotalRecs() ;
	float ratio2 = nr2 ? (((float)ns2 * 100.0) / (float)nr2) : 0.0;
long long errors2 =
m_spiderRecSetErrors;
char *newColl = " &nbsp; ";
//if ( m_fullRebuild ) newColl = m_newColl;
char *oldColl = " &nbsp; ";
if ( m_coll ) oldColl = m_coll;
Host *mh = g_pingServer.m_minRepairModeHost;
long minHostId = -1;
char minIpBuf[64];
minIpBuf[0] = '\0';
short minPort = 80;
if ( mh ) {
minHostId = mh->m_hostId;
long minHostIp = g_hostdb.getBestIp ( mh , fromIp );
strcpy(minIpBuf,iptoa(minHostIp));
minPort = mh->m_httpPort;
}
// now show the rebuild status
sb->safePrintf (
"<table%s"
" id=\"repairstatustable\">"
"<tr class=hdrow><td colspan=2><b><center>"
"Repair Status</center></b></td></tr>\n"
"<tr bgcolor=#%s><td colspan=2>"
"<font size=-2>"
"Use this to rebuild a database or to reindex "
"all pages to pick up new link text."
"</font>"
"</td></tr>"
// status (see list of above statuses)
"<tr bgcolor=#%s><td width=50%%><b>status</b></td>"
"<td>%s</td></tr>\n"
"<tr bgcolor=#%s><td width=50%%><b>repair mode</b>"
"</td>"
"<td>%li</td></tr>\n"
"<tr bgcolor=#%s>"
"<td width=50%%><b>min repair mode</b></td>"
"<td>%li</td></tr>\n"
"<tr bgcolor=#%s>"
"<td width=50%%><b>host ID with min repair mode"
"</b></td>"
"<td><a href=\"http://%s:%hu/admin/repair\">"
"%li</a></td></tr>\n"
"<tr bgcolor=#%s><td><b>old collection</b></td>"
"<td>%s</td></tr>"
"<tr bgcolor=#%s><td><b>new collection</b></td>"
"<td>%s</td></tr>"
,
TABLE_STYLE ,
LIGHT_BLUE ,
LIGHT_BLUE ,
status ,
LIGHT_BLUE ,
(long)g_repairMode,
LIGHT_BLUE ,
(long)g_pingServer.m_minRepairMode,
LIGHT_BLUE ,
minIpBuf, // ip string
minPort, // port
(long)minHostId,
LIGHT_BLUE ,
oldColl ,
LIGHT_BLUE ,
newColl
);
sb->safePrintf (
// docs done, includes overwritten title recs
"<tr bgcolor=#%s><td><b>titledb recs scanned</b></td>"
"<td>%lli of %lli</td></tr>\n"
// percent complete
"<tr bgcolor=#%s><td><b>titledb recs scanned "
"progress</b></td>"
"<td>%.2f%%</td></tr>\n"
// title recs set errors, parsing errors, etc.
//"<tr bgcolor=#%s><td><b>title recs injected</b></td>"
//"<td>%lli</td></tr>\n"
// title recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>titledb rec error count</b></td>"
"<td>%lli</td></tr>\n"
// sub errors
"<tr bgcolor=#%s><td> &nbsp; key out of order</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; set errors</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; corrupt errors</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; xml errors</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; dup docid errors</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; negative keys</b></td>"
"<td>%lli</td></tr>\n"
//"<tr bgcolor=#%s><td> &nbsp; overwritten recs</b></td>"
//"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; twin's "
"respsponsibility</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; wrong shard</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; root urls</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; non-root urls</b></td>"
"<td>%lli</td></tr>\n"
"<tr bgcolor=#%s><td> &nbsp; no title rec</b></td>"
"<td>%lli</td></tr>\n"
//"<tr><td><b> &nbsp; Other errors</b></td>"
//"<td>%lli</td></tr>\n"
// time left in hours
//"<tr><td><b>Time Left in Phase %li</b></td>"
//"<td>%.2f hrs</td></tr>\n"
,
DARK_BLUE,
ns ,
nr ,
DARK_BLUE,
ratio ,
//DARK_BLUE,
//m_recsInjected ,
DARK_BLUE,
errors ,
DARK_BLUE,
m_recsOutOfOrder ,
DARK_BLUE,
m_recsetErrors ,
DARK_BLUE,
m_recsCorruptErrors ,
DARK_BLUE,
m_recsXmlErrors ,
DARK_BLUE,
m_recsDupDocIds ,
DARK_BLUE,
m_recsNegativeKeys ,
//DARK_BLUE,
//m_recsOverwritten ,
DARK_BLUE,
m_recsUnassigned ,
DARK_BLUE,
m_recsWrongGroupId ,
DARK_BLUE,
m_recsRoot ,
DARK_BLUE,
m_recsNonRoot ,
DARK_BLUE,
m_noTitleRecs
);
sb->safePrintf(
// spider recs done
"<tr bgcolor=#%s><td><b>spider recs scanned</b></td>"
"<td>%lli of %lli</td></tr>\n"
// percent complete
"<tr bgcolor=#%s><td><b>spider recs scanned "
"progress</b></td>"
"<td>%.2f%%</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec not "
"assigned to us</b></td>"
"<td>%li</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec errors</b></td>"
"<td>%lli</td></tr>\n"
// spider recs set errors, parsing errors, etc.
"<tr bgcolor=#%s><td><b>spider rec bad tld</b></td>"
"<td>%li</td></tr>\n"
// time left in hours
//"<tr bgcolor=#%s><td><b>"
//"Time Left in Phase %li</b></td>"
//"<td>%.2f hrs</td></tr>\n"
,
LIGHT_BLUE ,
ns2 ,
nr2 ,
LIGHT_BLUE ,
ratio2 ,
LIGHT_BLUE ,
m_spiderRecNotAssigned ,
LIGHT_BLUE ,
errors2,
LIGHT_BLUE ,
m_spiderRecBadTLD
);
long nsr;
Rdb **rdbs = getSecondaryRdbs ( &nsr );
// . count the recs in each secondary rdb
// . those are the rdbs we are adding the recs to
for ( long i = 0 ; i < nsr ; i++ ) {
char *bg = DARK_BLUE;
Rdb *rdb = rdbs[i];
long long tr = rdb->getNumTotalRecs();
		// skip if init2() was not called on it; it will have no
		// records and its m_dbname will be unset
		if ( tr == 0 ) continue;
sb->safePrintf(
"<tr bgcolor=#%s><td><b>%s2 recs</b></td>"
"<td>%lli</td></tr>\n" ,
bg,
rdb->m_dbname,
			       tr);
}
// close up that table
sb->safePrintf("</table>\n<br>");
// print a table
char *rr[23];
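	// rr[] is deliberately sparse: the skipped slots correspond to
	// settings whose rows are commented out below (tfndb, indexdb,
	// datedb, checksumdb, sitedb, recycle-link-info, tagdb, placedb,
	// sectiondb, revdb), so only the slots used in the safePrintf()
	// are filled in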
if ( m_fullRebuild ) rr[0] = "Y";
else rr[0] = "N";
if ( m_rebuildTitledb ) rr[1] = "Y";
else rr[1] = "N";
//if ( m_rebuildTfndb ) rr[2] = "Y";
//else rr[2] = "N";
//if ( m_rebuildIndexdb ) rr[3] = "Y";
//else rr[3] = "N";
if ( m_rebuildPosdb ) rr[3] = "Y";
else rr[3] = "N";
//if ( m_rebuildDatedb ) rr[4] = "Y";
//else rr[4] = "N";
if ( m_rebuildClusterdb ) rr[5] = "Y";
else rr[5] = "N";
//if ( m_rebuildChecksumdb ) rr[6] = "Y";
//else rr[6] = "N";
if ( m_rebuildSpiderdb ) rr[7] = "Y";
else rr[7] = "N";
//if ( m_rebuildSitedb ) rr[8] = "Y";
//else rr[8] = "N";
if ( m_rebuildLinkdb ) rr[9] = "Y";
else rr[9] = "N";
//if ( g_conf.m_rebuildRecycleLinkInfo ) rr[10] = "Y";
//else rr[10] = "N";
if ( m_rebuildRoots ) rr[11] = "Y";
else rr[11] = "N";
if ( m_rebuildNonRoots ) rr[12] = "Y";
else rr[12] = "N";
//if ( m_rebuildTagdb ) rr[13] = "Y";
//else rr[13] = "N";
//if ( m_rebuildPlacedb ) rr[14] = "Y";
//else rr[14] = "N";
//if ( m_rebuildSectiondb ) rr[16] = "Y";
//else rr[16] = "N";
//if ( m_rebuildRevdb ) rr[17] = "Y";
//else rr[17] = "N";
sb->safePrintf (
"<table %s "
"id=\"repairstatustable2\">"
// current collection being repaired
"<tr class=hdrow><td colspan=2><b><center>"
"Repair Settings In Use</center></b></td></tr>"
// . print parms for this repair
// . they may differ than current controls because
// the current controls were changed after the
// repair started
"<tr bgcolor=#%s>"
"<td width=50%%><b>full rebuild</b></td>"
"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>recycle link info</b></td>"
//"<td>%s</td></tr>\n"
"<tr bgcolor=#%s><td><b>rebuild titledb</b></td>"
"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>rebuild tfndb</b></td>"
//"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>rebuild indexdb</b></td>"
//"<td>%s</td></tr>\n"
"<tr bgcolor=#%s><td><b>rebuild posdb</b></td>"
"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>rebuild datedb</b></td>"
//"<td>%s</td></tr>\n"
"<tr bgcolor=#%s><td><b>rebuild clusterdb</b></td>"
"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>rebuild checksumdb</b></td>"
//"<td>%s</td></tr>\n"
"<tr bgcolor=#%s><td><b>rebuild spiderdb</b></td>"
"<td>%s</td></tr>\n"
"<tr bgcolor=#%s><td><b>rebuild linkdb</b></td>"
"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>rebuild tagdb</b></td>"
//"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>rebuild placedb</b></td>"
//"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>rebuild sectiondb</b></td>"
//"<td>%s</td></tr>\n"
//"<tr bgcolor=#%s><td><b>rebuild revdb</b></td>"
//"<td>%s</td></tr>\n"
"<tr bgcolor=#%s><td><b>rebuild root urls</b></td>"
"<td>%s</td></tr>\n"
"<tr bgcolor=#%s>"
"<td><b>rebuild non-root urls</b></td>"
"<td>%s</td></tr>\n"
"</table>\n"
"<br>\n"
,
TABLE_STYLE,
LIGHT_BLUE,
rr[0],
//rr[10],
LIGHT_BLUE,
rr[1],
//rr[2],
LIGHT_BLUE,
rr[3],
//rr[4],
LIGHT_BLUE,
rr[5],
//rr[6],
LIGHT_BLUE,
rr[7],
//rr[8],
LIGHT_BLUE,
rr[9],
//rr[13],
//rr[14],
//rr[15],
//rr[16],
//rr[17],
LIGHT_BLUE,
rr[11],
LIGHT_BLUE,
rr[12]
);
return true;
}
static bool s_savingAll = false;
static void (*s_saveCallback)(void *state) ;
static void *s_saveState;
// . return false if blocked, true otherwise
// . will call the callback when all have been saved
// . used by Repair.cpp to save all rdbs before doing repair work
bool saveAllRdbs ( void *state , void (* callback)(void *state) ) {
// only call once
if ( s_savingAll ) {
//log("db: Already saving all.");
// let them know their callback will not be called even
// though we returned false
if ( callback ) { char *xx = NULL; *xx = 0; }
return false;
}
	// set it
	s_savingAll = true;
	// remember who to notify once the last rdb finishes saving
	s_saveCallback = callback;
	s_saveState    = state;
// TODO: why is this called like 100x per second when a merge is
// going on? why don't we sleep longer in between?
//bool close ( void *state ,
// void (* callback)(void *state ) ,
// bool urgent ,
// bool exitAfterClosing );
long nsr;
Rdb **rdbs = getAllRdbs ( &nsr );
for ( long i = 0 ; i < nsr ; i++ ) {
Rdb *rdb = rdbs[i];
// skip if not initialized
if ( ! rdb->isInitialized() ) continue;
// save/close it
rdb->close(NULL,doneSavingRdb,false,false);
}
// return if still waiting on one to close
if ( anyRdbNeedsSave() ) return false;
// all done
return true;
}
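// a minimal usage sketch (hypothetical caller and callback names), assuming
// the repair loop wants to park until every rdb has flushed to disk:
//
//   if ( ! saveAllRdbs ( someState , resumeRepairWrapper ) ) {
//     // one or more rdbs is still saving; doneSavingRdb() will invoke
//     // resumeRepairWrapper(someState) after the last one finishes
//     return;
//   }
//   // nothing needed saving; continue synchronously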
// returns true if one or more rdbs still needs to be saved
bool anyRdbNeedsSave ( ) {
long count = 0;
long nsr;
Rdb **rdbs = getAllRdbs ( &nsr );
for ( long i = 0 ; i < nsr ; i++ ) {
Rdb *rdb = rdbs[i];
count += rdb->needsSave();
}
if ( count ) return true;
s_savingAll = false;
return false;
}
// called each time an rdb finishes saving; fires the saved callback once
// none of them still needs to be saved
void doneSavingRdb ( void *state ) {
	// still waiting on one or more rdbs to finish saving?
	if ( anyRdbNeedsSave() ) return;
// all done
s_savingAll = false;
// call callback
if ( s_saveCallback ) s_saveCallback ( s_saveState );
}