added host disk usage redbox and stats.

This commit is contained in:
Matt Wells 2014-02-12 09:47:44 -07:00
parent eb044c765c
commit ca4aafa8a6
6 changed files with 117 additions and 10 deletions

View File

@ -53,7 +53,7 @@ enum {
#define PFLAG_RECOVERYMODE 0x80
// added slow disk reads to it, 4 bytes (was 52)
#define MAX_PING_SIZE (44+4)
#define MAX_PING_SIZE (44+4+4)
#define HT_GRUNT 0x01
#define HT_SPARE 0x02
@ -144,6 +144,8 @@ class Host {
// cpu usage
float m_cpuUsage;
float m_diskUsage;
long m_slowDiskReads;
// doc count

View File

@ -23,6 +23,7 @@ static int dgramsFromSort ( const void *i1, const void *i2 );
//static int loadAvgSort ( const void *i1, const void *i2 );
static int memUsedSort ( const void *i1, const void *i2 );
static int cpuUsageSort ( const void *i1, const void *i2 );
static int diskUsageSort ( const void *i1, const void *i2 );
long generatePingMsg( Host *h, long long nowms, char *buffer );
@ -222,6 +223,9 @@ skipReplaceHost:
"<td><a href=\"/master/hosts?c=%s&sort=10\">"
"<b>cpu</a></td>"
"<td><a href=\"/master/hosts?c=%s&sort=17\">"
"<b>disk</a></td>"
"<td><a href=\"/master/hosts?c=%s&sort=14\">"
"<b>max ping1</a></td>"
@ -257,6 +261,7 @@ skipReplaceHost:
coll,
coll,
coll,
coll,
shotcol );
// loop through each host we know and print its stats
@ -295,6 +300,7 @@ skipReplaceHost:
case 14:gbsort ( hostSort, nh, sizeof(long), pingMaxSort ); break;
case 15:gbsort ( hostSort, nh, sizeof(long), slowDiskSort ); break;
case 16:gbsort ( hostSort, nh, sizeof(long), defaultSort ); break;
case 17:gbsort ( hostSort, nh, sizeof(long), diskUsageSort ); break;
}
// we are the only one that uses these flags, so set them now
@ -379,6 +385,12 @@ skipReplaceHost:
if ( cpu > 100.0 ) cpu = 100.0;
if ( cpu < 0.0 ) cpu = -1.0;
char diskUsageMsg[64];
sprintf(diskUsageMsg,"%.1f%%",h->m_diskUsage);
if ( h->m_diskUsage < 0.0 )
sprintf(diskUsageMsg,"???");
// split time, don't divide by zero!
long splitTime = 0;
if ( h->m_splitsDone )
@ -494,6 +506,8 @@ skipReplaceHost:
"<td>%s%.1f%%%s</td>"
// cpu usage
"<td>%.1f%%</td>"
// disk usage
"<td>%s</td>"
// ping max
"<td>%s</td>"
@ -547,6 +561,7 @@ skipReplaceHost:
h->m_percentMemUsed, // float
fontTagBack,
cpu, // float
diskUsageMsg,
// ping max
pms,
@ -1156,3 +1171,11 @@ int cpuUsageSort ( const void *i1, const void *i2 ) {
if ( h1->m_cpuUsage < h2->m_cpuUsage ) return 1;
return 0;
}
int diskUsageSort ( const void *i1, const void *i2 ) {
Host *h1 = g_hostdb.getHost ( *(long*)i1 );
Host *h2 = g_hostdb.getHost ( *(long*)i2 );
if ( h1->m_diskUsage > h2->m_diskUsage ) return -1;
if ( h1->m_diskUsage < h2->m_diskUsage ) return 1;
return 0;
}

View File

@ -2586,7 +2586,7 @@ bool printRedBox ( SafeBuf *mb ) {
if ( g_conf.m_numConnectIps == 0 && g_conf.m_numMasterPwds == 0 ) {
if ( adds ) mb->safePrintf("<br>");
if ( adds ) mb->safePrintf("<br><br>");
adds++;
mb->safePrintf("URGENT. Please specify a password "
"or IP address in the "
@ -2595,9 +2595,26 @@ bool printRedBox ( SafeBuf *mb ) {
"to access the Gigablast admin controls.");
}
// out of disk space?
long out = 0;
for ( long i = 0 ; i < g_hostdb.m_numHosts ; i++ ) {
Host *h = &g_hostdb.m_hosts[i];
if ( h->m_diskUsage < 98.0 ) continue;
out++;
}
if ( out > 0 ) {
if ( adds ) mb->safePrintf("<br><br>");
adds++;
char *s = "s are";
if ( out == 1 ) s = " is";
mb->safePrintf("%li host%s over 98%% disk usage. "
"See the <a href=/admin/hosts>"
"hosts</a> table.",out,s);
}
if ( g_pingServer.m_hostsConfInDisagreement ) {
if ( adds ) mb->safePrintf("<br>");
if ( adds ) mb->safePrintf("<br><br>");
adds++;
mb->safePrintf("The hosts.conf or localhosts.conf file "
"is not the same over all hosts.");

View File

@ -416,7 +416,7 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
//*(long*)p = (long)g_test.m_urlsIndexed;
//p += sizeof(long);
// our num recs, eventsIndexed
//*(long*)p = g_timedb.getNumTotalEvents();//g_coldb.m_numEventsAllColls;
//*(long*)p = g_timedb.getNumTotalEvents();//g_coldb.m_numEventsAllColl
//*(long *)p = 0;
//p += sizeof(long);
// slow disk reads
@ -428,6 +428,8 @@ void PingServer::pingHost ( Host *h , uint32_t ip , uint16_t port ) {
// ensure crc is legit
if ( g_hostdb.getCRC() == 0 ) { char *xx=NULL;*xx=0; }
// disk usage (df -ka)
*(float *)p = g_process.m_diskUsage; p += 4;
// flags indicating our state
long flags = 0;
@ -895,6 +897,10 @@ void handleRequest11 ( UdpSlot *slot , long niceness ) {
// sanity
if ( h->m_hostsConfCRC == 0 ) { char *xx=NULL;*xx=0; }
// disk usage
h->m_diskUsage = *(float *)p;
p += sizeof(float);
// put the state flags
h->m_flags = *(long *)(p);
p += sizeof(long);

View File

@ -386,7 +386,7 @@ static void powerMonitorWrapper ( int fd , void *state ) ;
static void fanSwitchCheckWrapper ( int fd , void *state ) ;
static void gotPowerWrapper ( void *state , TcpSocket *s ) ;
static void doneCmdWrapper ( void *state ) ;
//static void hdtempWrapper ( int fd , void *state ) ;
static void hdtempWrapper ( int fd , void *state ) ;
static void hdtempDoneWrapper ( void *state , ThreadEntry *t ) ;
static void *hdtempStartWrapper_r ( void *state , ThreadEntry *t ) ;
static void heartbeatWrapper ( int fd , void *state ) ;
@ -400,6 +400,8 @@ Process::Process ( ) {
}
bool Process::init ( ) {
// -1 means unknown
m_diskUsage = -1.0;
// we do not know if the fans are turned off or on
m_currentFanState = -1;
m_threadOut = false;
@ -492,8 +494,9 @@ bool Process::init ( ) {
// . hard drive temperature
// . now that we use intel ssds that do not support smart, ignore this
//if ( ! g_loop.registerSleepCallback(10000,NULL,hdtempWrapper,0))
// return false;
// . well use it for disk usage i guess
if ( ! g_loop.registerSleepCallback(10000,NULL,hdtempWrapper,0))
return false;
// power monitor, every 30 seconds
if ( ! g_loop.registerSleepCallback(30000,NULL,powerMonitorWrapper,0))
@ -871,10 +874,65 @@ void hdtempDoneWrapper ( void *state , ThreadEntry *t ) {
s_lasttime = now;
}
// set Process::m_diskUsage
// . returns the disk usage percentage reported by "df -ka" for
//   g_hostdb.m_dir (the "Use%" column of the last output line)
// . returns -1.0 if the value could not be determined for any reason
// . shells out via system() and round-trips through a file in /tmp
float getDiskUsage ( ) {
	// file the shell pipeline writes its single-value output into
	const char *out = "/tmp/diskusage";
	// build the command: df's last line, 5th column is the usage like "45%"
	char cmd[10048];
	int clen = snprintf(cmd,sizeof(cmd),
			    "df -ka %s | tail -1 | awk '{print $5}' > %s",
			    g_hostdb.m_dir,
			    out);
	// do not run a truncated command
	if ( clen < 0 || clen >= (int)sizeof(cmd) ) {
		log("build: disk usage command too long.");
		return -1.0; // unknown
	}
	int err = system ( cmd );
	if ( err == 127 ) {
		log("build: /bin/sh does not exist. can not get disk usage.");
		return -1.0; // unknown
	}
	// this will happen if you don't upgrade glibc to 2.2.4-32 or above
	if ( err != 0 ) {
		log("build: Call to system(\"%s\") had error.",cmd);
		return -1.0; // unknown
	}
	// read the usage value back in from the file
	int fd = open ( out , O_RDONLY );
	if ( fd < 0 ) {
		//m_errno = errno;
		log("build: Could not open %s for reading: %s.",
		    out,mstrerror(errno));
		return -1.0; // unknown
	}
	char buf[2000];
	// leave room for the terminating \0 that sscanf() requires
	long r = read ( fd , buf , sizeof(buf) - 1 );
	// did we get an error
	if ( r <= 0 ) {
		//m_errno = errno;
		log("build: Error reading %s: %s.",out,mstrerror(errno));
		close ( fd );
		return -1.0; // unknown
	}
	// clean up shop
	close ( fd );
	// read() does not null terminate
	buf[r] = '\0';
	// a non-numeric value (e.g. "-" or a header word) is unknown, too,
	// rather than returning an uninitialized float
	float usage = -1.0;
	if ( sscanf(buf,"%f",&usage) != 1 ) {
		log("build: Could not parse disk usage from %s.",out);
		return -1.0; // unknown
	}
	return usage;
}
// . sets m_errno on error
// . taken from Msg16.cpp
void *hdtempStartWrapper_r ( void *state , ThreadEntry *t ) {
// run the df -ka cmd
g_process.m_diskUsage = getDiskUsage();
// ignore temps now. ssds don't have it
return NULL;
static char *s_parm = "ata";
// make a system call to /usr/sbin/hddtemp /dev/sda,b,c,d
//char *cmd =
@ -884,9 +942,9 @@ void *hdtempStartWrapper_r ( void *state , ThreadEntry *t ) {
// "/usr/sbin/hddtemp /dev/sdd >> /tmp/hdtemp ";
retry:
// linux 2.4 does not seem to like hddtemp
char cmd[10048];
char *path = g_hostdb.m_dir;
//char *path = "/usr/sbin/";
char cmd[10048];
sprintf ( cmd ,
"%ssmartctl -Ad %s /dev/sda | grep Temp | awk '{print $10}' > /tmp/hdtemp2;"
"%ssmartctl -Ad %s /dev/sdb | grep Temp | awk '{print $10}' >> /tmp/hdtemp2;"
@ -912,8 +970,8 @@ void *hdtempStartWrapper_r ( void *state , ThreadEntry *t ) {
//m_errno = EBADENGINEER;
log("build: Call to system(\"%s\") had error.",cmd);
//s_flag = 1;
// wait an hour
s_nextTime = getTime() + 3600;
// wait 5 minutes
s_nextTime = getTime() + 300; // 3600;
return NULL;
}
// read in temperatures from file

View File

@ -89,6 +89,7 @@ class Process {
float m_roofTemp;
long m_currentFanState;
long m_desiredFanState;
float m_diskUsage;
};
extern Process g_process;