2013-08-03 00:12:24 +04:00
// Matt Wells, copyright Feb 2003
#include "gb-include.h"
#include <errno.h>
#include <time.h>
#include <sys/time.h>
#include <netdb.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "PingServer.h"
#include "HttpServer.h"
#include "Dns.h"
#include <sys/time.h>
#include <sys/resource.h>
// Loop.h defines sleep/usleep as macros that core, so remove those macros
// here. #undef takes only the bare macro name; a parameter list after the
// name is extra tokens that the preprocessor diagnoses.
#undef sleep
#undef usleep
// resolver error code; passed to hstrerror() in connectSock()
extern int h_errno;
// Stub shutdown hook: ignores `urgent` and always reports success.
bool mainShutdown ( bool urgent ) {
	(void)urgent;
	return true;
}
// Stub: neither `state` nor `callback` is used (the callback is never
// invoked); always returns true.
bool closeAll ( void *state , void (* callback)(void *state) ) {
	(void)state;
	(void)callback;
	return true;
}
// Stub: always returns true.
bool allExit ( ) {
	return true;
}
// ---- forward declarations ----
void checkPage    (char *host, unsigned short port, char *path);
bool getPage      (char *host, unsigned short port, char *path);
int  connectSock  (char *host, unsigned short port);
bool sendEmail    (char *errmsg);
void sleepWrapper (int fd, void *state);

// ---- file-local state ----

// holds the text of the most recent error
static char s_errbuf[50024];

// sampling period in seconds (was 120; use 2 when debugging)
#define WAIT 30
static long s_wait = WAIT;

// number of consecutive errors seen so far
static long s_count = 0;

// when the last alert email went out
static time_t s_lastTime = 0;

// ---- settings filled in by main() ----

// host to monitor and its port
char *g_host = NULL;
long  g_port = 80;

// set by the -t command line switch
bool g_montest = false;

// which kind of site g_host is
bool g_isFlurbit = false;
bool g_isProCog  = false;

// value of the -h command line switch
char *g_fqhn = "";
// Entry point of the monitor tool.
// . scans argv[2..] for "-t" (sets g_montest and drops s_wait to 1) and for
//   "-h" (stores the following argument in g_fqhn)
// . takes argv[1] as g_host and splits an optional ":port" suffix off into
//   g_port
// . fills in g_conf (email alert switches, dns servers, email addresses),
//   initializes g_httpServer as a client, registers sleepWrapper as a sleep
//   callback, initializes g_dns and then enters g_loop.runLoop()
// . returns 1 if g_dns.init() or g_loop.runLoop() fails
// NOTE(review): this listing appears to have lost lines (closing braces, the
// tail of the usage message, the contents of several string literals).
// Confirm against the original file before changing any code here.
int main ( int argc , char *argv[] ) {
// debug test
//sendEmail("hey you!");
//return 0;
//if(argc > 1 && strcmp(argv[1], "buzz") == 0) {
// s_buzz = true;
bool badArgs = false;
// options start at argv[2]; argv[1] is the host to monitor (see g_host below)
for ( long i = 2 ; i < argc ; i++ ) {
// -t : test mode, sample every second
if ( argv[i][0]=='-' &&
argv[i][1]=='t' ) {
g_montest = true;
s_wait = 1;
// -h : the next argument becomes g_fqhn
if ( i+1 < argc &&
argv[i][0]=='-' &&
argv[i][1]=='h' ) {
g_fqhn = argv[i+1];
// NOTE(review): presumably only reached for unrecognized arguments; the
// braces that would show this are missing from the listing
badArgs = true;
if ( argc < 2 || badArgs ) {
fprintf(stderr,"Usage: monitor [-h FQHN] [-t]\n");
fprintf(stderr,"FQHN defaults to, but if you are"
" not monitoring on gigablast's network then you "
"need to set this to like "
"or whatever your hostname is so verizon accepts our "
g_host = argv[1];
// scan for port
char *portStr = strstr(g_host,":");
g_port = 80;
if ( portStr ) {
g_port = atoi(portStr+1);
// cut the ":port" suffix off of g_host (this writes into argv[1])
*portStr = 0;
// NOTE(review): the needles below are empty strings in this listing.
// strstr() with an empty needle returns its first argument, so as shown
// these tests are always true. Presumably host names were stripped; confirm.
g_isFlurbit = (bool)strstr(g_host,"");
if ( ! g_isFlurbit )
g_isFlurbit = (bool)strstr(g_host,"");
if ( ! g_isFlurbit )
g_isFlurbit = (bool)strstr(g_host,"");
if ( ! g_isFlurbit )
g_isFlurbit = (bool)strstr(g_host,"");
// let's ensure our core file can dump
struct rlimit lim;
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
if ( setrlimit(RLIMIT_CORE,&lim) )
log("monitor: setrlimit: %s.", mstrerror(errno) );
g_isProCog = (bool)strstr(g_host,"");
g_conf.m_sendEmailAlerts = true;
g_conf.m_sendEmailAlertsToSysadmin = true;
//g_conf.m_sendEmailAlertsToMattAlltell = false;
//g_conf.m_sendEmailAlertsToJavier = false;
//g_conf.m_sendEmailAlertsToPartap = false;
//g_conf.m_sendEmailAlertsToZak = false;
g_conf.m_delayNonCriticalEmailAlerts = false;
// avoid any local dns, just go directly to the roots
// no, try asking local bind9 !! seems safer... the root dns
// seem to go down forever after a while...
g_conf.m_askRootNameservers = false;//true;
// hack # of dns servers
// NOTE(review): m_numDns is 2 but a port is also stored in slot [2] below
g_conf.m_numDns = 2;//3;
g_conf.m_dnsPorts[0] = 53;
g_conf.m_dnsPorts[1] = 53;
g_conf.m_dnsPorts[2] = 53;
// local bind9 server
//g_conf.m_dnsIps [2] = atoip ( "",9);
// google open dns server
// NOTE(review): the ip strings are empty here while the length arguments
// are 7 and 9; presumably the addresses were stripped -- confirm
g_conf.m_dnsIps [1] = atoip ( "",7);
//g_conf.m_dnsIps [0] = atoip ( "",7);
g_conf.m_dnsIps [0] = atoip ( "",9);
g_conf.m_dnsMaxCacheMem = 1024*10;
g_conf.m_logDebugDns = false;//true;
// matt wells
// call alltel mail server directly to send to matt in case
// is down
// hey, it already goes directly in m_sendEmailAlertsToMattAlltell
// so set this to false.
g_conf.m_sendEmailAlertsToEmail1 = true;
// verizon bought us out...
// was
strcpy ( g_conf.m_email1Addr , "");
strcpy ( g_conf.m_email1From , "");
// use our email server
// nah, our mailserver is probably down...
// result of "dig MX", make this canonical
// in case they change their IP one of these days!
//strcpy ( g_conf.m_email1MX , "" );
//strcpy ( g_conf.m_email1MX , "");
// . result of "dig MX" =
// . they changed this on 4/19/2013!!
// . prepend gbmxrec- to hostname to do an mxrec lookup
// . fix it :
// 1366500952283 000 DEBUG dns Got CNAME alias for
// 1366500952283 000 DEBUG dns Got CNAME alias for
// got ip for ''
//strcpy ( g_conf.m_email1MX , "");
// this name is what sleepWrapper() hands to g_dns.getIp()
strcpy ( g_conf.m_email1MX , "");
// send to zak directly
g_conf.m_sendEmailAlertsToEmail2 = false;//true;
//strcpy ( g_conf.m_email2Addr , "" );
//strcpy ( g_conf.m_email2Addr , "" );
//strcpy ( g_conf.m_email2From , "");
// got this ip from the cmd:
// dig mx
//strcpy ( g_conf.m_email2MX , "" );
//strcpy ( g_conf.m_email2MX , "" );
//strcpy ( g_conf.m_email2MX , "");
// to sabino right to his email server
//g_conf.m_sendEmailAlertsToEmail3 = true;
//strcpy ( g_conf.m_email3Addr , "");
//strcpy ( g_conf.m_email3From , "" );
// ""
//strcpy ( g_conf.m_email3MX , "");
g_conf.m_sendEmailAlertsToEmail3 = false;//true;
//strcpy ( g_conf.m_email3Addr , "");
strcpy ( g_conf.m_email3Addr , "" );
strcpy ( g_conf.m_email3From , "" );
//strcpy ( g_conf.m_email3MX , "");
strcpy ( g_conf.m_email3MX , "");
g_conf.m_httpMaxSockets = 100;
// use -1 or 0 for port to avoid setting up the server, just client
g_httpServer.init ( -1 , -1 );
// first tick uses a period of 10; sleepWrapper() then re-registers itself
// with s_wait * 1000
g_loop.registerSleepCallback ( 10 , // WAIT * 1000 ,
sleepWrapper );
if ( ! g_dns.init( 9532 ) ) return 1;
if ( ! g_loop.runLoop() ) return 1;
// Tables describing the test queries the monitor cycles through.
// . g_qn is the index of the current query; getPage() resets it to 0 when
//   g_queries[g_qn] is NULL
// . g_type[g_qn] is TYPE_GB or TYPE_BUZZ; sleepWrapper() switches to port 8000
//   for TYPE_BUZZ
// . g_substrings[g_qn] is a NULL-terminated list (at most MAX_SUBSTRINGS
//   entries) of which getPage() requires at least one to occur in the reply
// NOTE(review): the initializers below are truncated in this listing (query
// strings, most substring rows and the closing braces are missing) and
// MAX_SUBSTRINGS is not defined in the visible lines.
// have a list of queries we cycle through
long g_qn = 0;
char *g_queries[] = {
#define TYPE_GB 0
#define TYPE_BUZZ 1
char g_type[] = {
//1, // buzzlogic
//0, // test
0 , // broncos
0, // ibm
0, // yahoo
//1, // buzzlogic
// . need to contain these substrings in the result page
// . added quote because they need to be in an href="xxx" tag
char *g_substrings[][MAX_SUBSTRINGS] = {
// just for buzz
// broncos
// test
// "","","",
// "" , NULL } ,
// ibm
"", NULL},
// yahoo
// just for buzz
// google
// amazon
// ebay
// ... add more here...
// last ip handed to gotMXIpWrapper() that was neither 0 nor -1; passed as the
// mx ip to g_pingServer.sendEmail() in sleepWrapper()
static long s_goodMXIp = 0;
// Callback given to g_dns.getIp() by sleepWrapper().
// . state: not used in the visible lines
// . ip   : the lookup result; 0 and -1 are not saved
// Switches dns debug logging back off (sleepWrapper() turns it on right
// before the lookup) and remembers a usable ip in s_goodMXIp.
// NOTE(review): the arguments of the first log() call and the closing braces
// are missing from this listing.
void gotMXIpWrapper ( void *state , long ip ) {
g_conf.m_logDebugDns = false;
log("monitor: gotmxipwrapper ip of %s for %s",
if ( ip != 0 && ip != -1 ) {
// make sure ping server just uses the last legit ip
// if it's lookup fails
//g_conf.m_email1MXIPBackup = ip;
s_goodMXIp = ip;
log("monitor: saving good mxip %s",iptoa(ip));
// Sleep callback: tries to load the page on every tick.
// . re-registers itself with a period of s_wait * 1000
// . once every 30 minutes starts a dns lookup of g_conf.m_email1MX whose
//   result is delivered to gotMXIpWrapper()
// . in test mode (g_montest) fabricates a failure, otherwise calls getPage()
// . on success prints a line to stderr and resets s_count
// . on failure prints the error, and sends an alert through
//   g_pingServer.sendEmail() unless an email was already sent within the last
//   10 minutes or s_count is still below 3
// . fd is not used in the visible lines
// NOTE(review): several statements and closing braces are missing from this
// listing (e.g. the sprintf that fills `query`, the bodies after
// "if ( g_isProCog )" and after the g_dns.getIp() call, and the place where
// s_count is incremented). Confirm against the original file.
void sleepWrapper ( int fd , void *state ) {
// unregister ourselves
g_loop.unregisterSleepCallback ( state , sleepWrapper );
// re-register with the current period, which differs from the initial one
// used in main() and is 1 second while in test mode
g_loop.registerSleepCallback ( s_wait * 1000 ,
sleepWrapper );
// the main monitoring loop
// loop:
// check port 80
// make the request
char query[1024];
// "code=gbmonitor",g_queries[g_qn]);
//char *host = "";
// we need to use port 8002 if running from titan, but on voyager2
// we can still use port 80
short port = 80;
// hack for testing on titan
//port = 8002;
// for debugging
//port = 5699;
if( g_type[g_qn]== TYPE_BUZZ) {
//host = "";
port = 8000;
if ( g_isFlurbit )
sprintf(query,"/NM/Albuquerque" );
if ( g_isProCog )
// launch dns lookup every 30 minutes
long now = getTimeLocal();
static long s_lastDnsTime = 0;
static long s_mxip1;
//static DnsState s_ds;
if ( now - s_lastDnsTime > 30*60 ) {
s_lastDnsTime = now;
// note it
log("dns: calling g_dns.getIp()");
// gotMXIpWrapper() turns this back off
g_conf.m_logDebugDns = true;
// .
// . this will store it in the cache to keep it on hand
// NOTE(review): the inline comment on the timeout argument says 30 seconds
// but the value passed is 80 -- confirm the units and the intended value
if ( ! g_dns.getIp ( g_conf.m_email1MX ,
&s_mxip1 ,
gotMXIpWrapper ,
80, // 30 second timeout
false, // dnslookup?
// if we lose our internet connection
// temporarily we do not want to cache a
// bad ip address for the cellphone's
// mx server!!!
false // CACHE NOT FOUNDS? NO!!!!!!!!
) )
// return if it blocked
// we got it without blocking
log("dns: got ip without blocking");
// save this
long old = g_qn;
long long startTime = gettimeofdayInMilliseconds();
bool status;
if ( g_montest ) {
// test mode: pretend the fetch failed so the alert path gets exercised
status = false;
strcpy(s_errbuf,"monitor: test error");
// need raw=9 so robot checking doesn't cut us off
status = getPage( g_host,
// php front-end now redirects raw=x
// to
query );
// check all hosts
//for ( unsigned short port = 8000 ; port < 8032 ; port++ )
time_t t = time(NULL);
char *s = ctime(&t);
// chop the trailing newline that ctime() appends
s[strlen(s)-1] = '\0';
char buf [ 30024 ];
long long took = gettimeofdayInMilliseconds() - startTime;
// if ok, loop back
if ( status ) {
if ( ! g_isFlurbit )
fprintf(stderr,"monitor: %s got page ok in %lli ms "
fprintf(stderr,"monitor: %s got page ok in %lli ms "
s_count = 0;
//sleep ( WAIT );
//goto loop;
// cap the error text so the message built below fits into buf
if ( strlen(s_errbuf) > 20000 ) s_errbuf[20000] = '\0';
// make a pretty error msg
sprintf ( buf , "monitor %s:%li: %s %s\n" , g_host,g_port,s,s_errbuf );
// log to console
//fprintf ( stderr , buf );
// there might %'s in the s_errbuf so do this!!
fprintf ( stderr , "monitor %s:%li: %s %s\n" , g_host,g_port,s,s_errbuf );
// count the error
// have we already sent an email in the last 10 mins?
if ( t - s_lastTime < 10*60 ) { s_count = 0; return; }
//sleep ( WAIT ); goto loop; }
// we need 3 errors in a row to send an email
if ( s_count < 3 ) return; // { sleep ( WAIT ); goto loop; }
// log to cell phone
//Host h;
//h.m_emailCode = 0;
//h.m_hostId = 1000;
log("sendEmail: sending email alert. mxip=%s",iptoa(s_goodMXIp));
//if ( ! g_pingServer.sendEmail ( NULL , buf ) ) // &h , buf ) )
g_pingServer.sendEmail ( NULL ,
buf ,
true , // sendtoAdmin? default
false, // oom? default
false, // kernelerrors? default
false, // parmchanged? default
false, // forceit? default
s_goodMXIp); // mx ip address
//g_fqhn ) ;
//fprintf ( stderr , "sendEmail: %s\n" , s_errbuf);
// so we don't send more than 1 email every 10 minutes
s_lastTime = t;
// loop back
s_count = 0;
//sleep ( WAIT );
//goto loop;
// leave test mode and go back to the normal sampling period
if ( g_montest ) {
g_montest = false;
s_wait = WAIT;
// Fetches `path` from host:port over a plain socket and checks the reply.
// . returns false and fills in s_errbuf on error
// . returns true if no error
// . connects with connectSock(), sends an HTTP/1.0 GET, makes the socket
//   non-blocking and reads for up to 25 seconds into a SafeBuf
// . the reply must be at least 3*1024 bytes long
// . for flurbit hosts the reply must contain "Next 25"; otherwise it must
//   contain at least one of g_substrings[g_qn]
// . on success wraps g_qn back to 0 when g_queries[g_qn] is NULL
// NOTE(review): the "loop:" label targeted by the gotos, the statement that
// advances g_qn, the sleep between read attempts, some sprintf arguments and
// the closing braces are missing from this listing.
// NOTE(review): the request is built with sprintf into tbuf without a bound
// on the length of `path`, and the Host: header carries no value as shown.
bool getPage ( char *host , unsigned short port , char *path ) {
// get the socket fd
int sd = connectSock ( host , port );
if ( sd < 0 ) return false;
SafeBuf sbuf;
// a tmp buf
char tbuf [ 1024*100+10 ];
tbuf[0] = '\0';
// send the request
sprintf ( tbuf ,
"GET %s HTTP/1.0\r\n"
"Connection: close\r\n"
"Host:\r\n\r\n" , path );
if ( send ( sd , tbuf , strlen(tbuf) , 0 ) == -1 ) {
sprintf ( s_errbuf ,"send: %s" , strerror(errno) );
close ( sd );
return false;
// read the reply
long nb;
//long sum = 0;
// make it non blocking in case we don't get reply
int flags = fcntl ( sd , F_GETFL ) ;
if ( flags < 0 ) {
sprintf ( s_errbuf,"fcntl: (F_GETFL): %s.",strerror(errno));
close ( sd );
return false;
if ( fcntl ( sd, F_SETFL, flags|O_NONBLOCK|O_ASYNC ) < 0 ) {
sprintf ( s_errbuf ,"fcntl: %s" , strerror(errno) );
close ( sd );
return false;
// try to fix core in Mem.cpp from trying to freee in thread
g_mem.setPid(); // s_pid = getpid();
// crazy?
if ( g_mem.getPid() == -1 ) {
// deliberately write through a NULL pointer to force a core dump
log("monitor: bad s_pid"); char *xx=NULL;*xx=0; }
// start time
long now = time(NULL);
long end = now + 25; // 25 seconds to read
// read up to 100KB per call; any error other than EAGAIN is fatal
if ( (nb = read ( sd , tbuf , 1024*100 ) ) == -1 &&
errno != EAGAIN ) {
sprintf ( s_errbuf ,"read: %s" , strerror(errno) );
close ( sd );
return false;
// -1 at this point means EAGAIN, i.e. the read would have blocked
if ( nb == -1 ) {
now = time(NULL);
if ( now >= end ) {
errno = ETIMEDOUT;
sprintf ( s_errbuf ,"read: timedout after 25 seconds "
"of trying to read reply" );
close ( sd );
return false;
// wait for data to be there! wait 5 ms
goto loop;
// copy into safebuf
sbuf.safeMemcpy ( tbuf , nb );
// keep going if we read something
if ( nb > 0 ) goto loop; // { sum += nb; goto loop; }
// add this just in case
//if ( sum > 1024*100 ) sum = 1024*100;
//if ( sum >= 0 ) buf[sum] = '\0';
// . must have read something, at least this for the 'test' query!!!
// . no results page is only
if ( sbuf.length() < 3*1024 ) {
sprintf ( s_errbuf ,"read: only read %li bytes for %s. "
"readbuf=%s" , sbuf.length()-1,
close ( sd );
return false;
// if flurbit. look for "Next " link
if ( g_isFlurbit && ! strstr(sbuf.getBufStart(),"Next 25") ) {
sprintf ( s_errbuf ,"monitor: did not read Next 25 link: "
"readbuf=%s", sbuf.getBufStart());
close ( sd );
return false;
// search for,,,...
// in the search results, at least one should be there!
char *p = NULL;
// it must contain at least one substring
for ( long i = 0 ; i < MAX_SUBSTRINGS && ! g_isFlurbit ; i++ ) {
// end of list...
if ( g_substrings[g_qn][i] == NULL ) break;
p = strstr(sbuf.getBufStart(), g_substrings[g_qn][i]);
if ( p ) break;
//if ( ! p ) p = strstr(buf,"");
//if ( ! p ) p = strstr(buf,"");
//if ( ! p ) p = strstr(buf,"");
//if ( ! p ) p = strstr(buf,"");
//if ( ! p ) p = strstr(buf,"");
//if ( ! p ) p = strstr(buf,"");
// debug it!
//p = NULL;
if ( ! p && ! g_isFlurbit ) { // && !s_buzz) {
long slen = sbuf.length();
char *pbuf = sbuf.getBufStart();
//if ( slen > 30000 ) pbuf[30000] = '\0';
// only keep the first 1000 bytes of the reply for the error message
if ( slen > 1000 ) pbuf[1000] = '\0';
"read: bad search results (len=%li) for %s "
close ( sd );
// do NOT inc g_qn cuz this query needs to work! the
// problem might be with just it and we need to
// strike out 3 times to send an email alert
return false;
// try next
// wrap around
if ( g_queries[g_qn] == NULL ) g_qn = 0;
close ( sd );
// success
return true;
// Opens a TCP connection to host:port.
// . returns -1 and fill s_errbuf on error
// . returns fd on success
// . resolves `host` with gethostbyname(), creates a non-blocking AF_INET
//   stream socket and calls connect(), treating EINPROGRESS as "still
//   connecting" and giving up once 10000 ms have elapsed
// NOTE(review): this listing is missing lines. `ip` is declared twice as
// shown (once from gethostbyname, once from a "dig" shell command), so
// presumably one of the two lookups was compiled out in the original. The
// "connectLoop:" label, the success return, the code that fills `filename`
// and `ipstring`, and the closing braces are also not visible. Confirm
// against the original file.
int connectSock ( char *host , unsigned short port ) {
// use the same socket connection
sethostent( 0 ) ; // stayopen? 0--> use udp
// say what we're monitoring
hostent *e = gethostbyname( host );
// close the connection, but not for buzz because they go by ip
//if(!s_buzz) endhostent ();
endhostent ();
// return false on error
if ( ! e ) {
sprintf ( s_errbuf ,"connectSock: gethostbyname (%s): %s",
host, hstrerror(h_errno) );
return -1;
// get first ip address
//long n = e->h_length;
unsigned long ip = *(long *)(e->h_addr_list[0]);
// alternative lookup through the external "dig" command, with the answer
// written to a per-pid file under /tmp
// NOTE(review): the arguments to this sprintf are cut off; if `host` is
// interpolated into the command it reaches the shell via system()
pid_t pid = getpid();
char cmd[256];
sprintf(cmd,"/usr/bin/dig +short %s | tail -1 > /tmp/ip.%lu",
system ( cmd );
char filename[256];
FILE *fd = fopen( filename,"r");
char ipstring[256];
long ip = atoip(ipstring,strlen(ipstring));
// print that
// now make a new socket descriptor
int sd = socket ( AF_INET , SOCK_STREAM , 0 ) ;
if ( sd < 0 ) {
sprintf ( s_errbuf ,"connectSock: socket: %s" ,
strerror(errno) );
return -1;
//fd_set set;
// make the socket non-blocking so connect() returns right away; the return
// value of fcntl() is not checked
fcntl ( sd, F_SETFL, O_NONBLOCK );
// set up for connection
struct sockaddr_in to;
to.sin_family = AF_INET;
// our ip's are always in network order, but ports are in host order
to.sin_addr.s_addr = ip;
to.sin_port = htons (port);
bzero ( &(to.sin_zero) , 8 );
long long start = gettimeofdayInMillisecondsLocal();
// connect to the socket, it returns 0 on success
int ret = connect ( sd, (sockaddr *)&to, sizeof(to) ) ;
// EINPROGRESS just means the non-blocking connect has not finished yet
if ( ret != 0 && errno != EINPROGRESS ) {
sprintf ( s_errbuf ,"connectSock: connect: %s" ,
strerror(errno) );
close ( sd );
return -1;
// if it has been 10 seconds, forget it!
long long now = gettimeofdayInMillisecondsLocal();
long long elapsed = now - start;
if ( elapsed >= 10000 ) {
// NOTE(review): the format string has no conversion for the strerror()
// argument passed here, so that argument is never printed
sprintf ( s_errbuf ,"connectSock: connect: TIMEDOUT!",
strerror(errno) );
close ( sd );
return -1;
// sleep for 10ms and try again
goto connectLoop;
// fake return for compiler
return 1;