Keep track of how many times the host exited/cored, shown as an exponent

on the 'x' in the hosts table. This way we can detect hosts that
have restarted many times and fix them.
This commit is contained in:
Matt 2015-04-01 16:28:58 -06:00
parent e583850e40
commit 2ce107e4be
4 changed files with 32 additions and 6 deletions

View File

@ -115,7 +115,7 @@ class PingInfo {
char m_gbVersionStr[21];
char m_repairMode;
char m_kernelErrors;
uint8_t m_recoveryLevel;
};
class Host {

View File

@ -521,9 +521,18 @@ skipReplaceHost:
}
// recovery mode? recovered from coring?
if ((flags & PFLAG_RECOVERYMODE)&& format == FORMAT_HTML )
if ((flags & PFLAG_RECOVERYMODE)&& format == FORMAT_HTML ) {
fb.safePrintf("<b title=\"Recovered from core"
"\">x</b>");
// this is only 8-bits at the moment so it's capped
// at 255. this level is 1 the first time we core
// and are restarted.
if ( h->m_pingInfo.m_recoveryLevel > 1 )
fb.safePrintf("<sup>%"INT32"</sup>",
(int32_t)
h->m_pingInfo.m_recoveryLevel);
}
if ((flags & PFLAG_RECOVERYMODE)&& format != FORMAT_HTML )
fb.safePrintf("Recovered from core");
@ -1463,7 +1472,8 @@ skipReplaceHost:
"<td>x (status flag)</td>"
"<td>Indicates host has abruptly exited due to a fatal "
"error (cored) and "
"restarted itself."
"restarted itself. The exponent is how many times it has "
"done this. If no exponent, it only did it once."
"</td>"
"</tr>\n"

View File

@ -28,6 +28,7 @@ int32_t klogctl( int, char *,int ) { return 0; }
// from main.cpp. when keepalive script restarts us this is true
extern bool g_recoveryMode;
extern int32_t g_recoveryLevel;
// a global class extern'd in .h file
PingServer g_pingServer;
@ -491,6 +492,10 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
flags |= PFLAG_MERGEMODE0OR6;
if ( ! isClockInSync() ) flags |= PFLAG_OUTOFSYNC;
uint8_t rv8 = (uint8_t)g_recoveryLevel;
if ( g_recoveryLevel > 255 ) rv8 = 255;
pi->m_recoveryLevel = rv8;
//*(int32_t *)p = flags; p += 4; // 4 bytes
pi->m_flags = flags;

View File

@ -195,6 +195,8 @@ void dumpLinkdb ( char *coll,int32_t sfn,int32_t numFiles,bool includeT
// Takes an opaque state pointer (unused here) and terminates the process
// with exit status 0.
void exitWrapper ( void *state ) {
	exit ( 0 );
}
bool g_recoveryMode = false;
int32_t g_recoveryLevel = 0;
bool isRecoveryFutile ( ) ;
@ -1116,8 +1118,15 @@ int main2 ( int argc , char *argv[] ) {
//send an email on startup for -r, like if we are recovering from an
//unclean shutdown.
g_recoveryMode = false;
if ( strcmp ( cmd , "-r" ) == 0 ) g_recoveryMode = true;
if ( strcmp ( cmd2 , "-r" ) == 0 ) g_recoveryMode = true;
char *cc = NULL;
if ( strncmp ( cmd , "-r" ,2 ) == 0 ) cc = cmd;
if ( strncmp ( cmd2 , "-r",2 ) == 0 ) cc = cmd2;
if ( cc ) {
g_recoveryMode = true;
g_recoveryLevel = 1;
if ( cc[2] ) g_recoveryLevel = atoi(cc+2);
if ( g_recoveryLevel < 0 ) g_recoveryLevel = 0;
}
// run as daemon? then we have to fork
if ( strcmp ( cmd , "-d" ) == 0 ) g_conf.m_runAsDaemon = true;
@ -5170,6 +5179,7 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
"export MALLOC_CHECK_=0;"
"cp -f gb gb.oldsave ; "
"ADDARGS='' "
"INC=1 "
"EXITSTATUS=1 ; "
"while [ \\$EXITSTATUS != 0 ]; do "
"{ "
@ -5191,7 +5201,8 @@ int install ( install_flag_konst_t installFlag , int32_t hostId , char *dir ,
" ;"
"EXITSTATUS=\\$? ; "
"ADDARGS='-r' ; "
"ADDARGS='-r'\\$INC ; "
"INC=\\$((INC+1));"
"} "
"done >& /dev/null & \" %s",
//"\" %s",