Matt a1ed368d82 bring back max mem control into master controls.
it's useful to limit per process mem usage to prevent
oom killer because we can't save if we get killed.
overhaul diskpagecache to just use rdbcache. much simpler
and faster, but disabled for now until debugged more.
reduce min files to merge for crawlbot collections so
they stay more tightly merged to conserve fds and mem.
improved logDebugDisk msgs.
overhauled File.cpp fd pool. now it is way faster and
doesn't use any extra mem. much simpler too. although
could be sped up a little by using a linked list, but
probably is not significant enough to warrant doing right now.
increase mem ptr table from 3M to 8M slots. should really make
dynamic though. fix core from null msg20s[0]->m_r.
only call attemptMergeAll once every 60 seconds really.
do not attempt merge if already merging.
2015-08-14 12:58:54 -06:00

608 lines
17 KiB

#include "gb-include.h"
#include "Conf.h"
#include "Parms.h"
//#include "CollectionRec.h"
#include "Indexdb.h"
#include "Users.h"
#include "Proxy.h"
// the global Conf instance; the code in this file refers to it as g_conf
Conf g_conf;
// . constructor: seeds the members that must hold sane values before any
//   configuration has been read
// . m_maxMem starts out at 16GB; it is only a placeholder until the
//   configured limit is loaded from disk
Conf::Conf ( )
	: m_save             ( true        ) ,
	  m_doingCommandLine ( false       ) ,
	  m_maxMem           ( 16000000000 ) {
}
// . does this requester have ROOT admin privileges???
// . uses the root collection record!
// . master admin can administer ALL collections
// . use CollectionRec::hasPermission() to see if has permission
// to administer one particular collection
// . NOTE(review): a second Conf::isMasterAdmin(TcpSocket*,HttpRequest*) with
//   the same signature is defined further down in this file. this version
//   looks like a legacy implementation that was meant to be disabled
//   (presumably it sat inside a block comment) -- confirm and remove.
bool Conf::isMasterAdmin ( TcpSocket *s , HttpRequest *r ) {
// sometimes they don't want to be admin intentionally for testing
if ( r->getLong ( "master" , 1 ) == 0 ) return false;
// get connecting ip
int32_t ip = s->m_ip;
// ignore if proxy. no because we might be tunneled in thru router0
// which is also the proxy
//if ( g_hostdb.getProxyByIp(ip) ) return false;
// use new permission system
return g_users.hasPermission ( r , PAGE_MASTER );
// NOTE(review): the return above is unconditional, so nothing below this
// point can execute as the code is written here.
// always respect lenny
//if ( ip == atoip ("" , 13 ) ) return true;
// .and local requests, too, primarily for PageMaster.cpp cgi interface
// . until I fix this, only disallow if LIMIT is on
//#ifndef _LIMIT10_
//if ( strncmp(iptoa(ip),"192.168.1.",10) == 0) return true;
//if ( strncmp(iptoa(ip),"192.168.0.",10) == 0) return true;
//proxies getting f'ed up because of this ..
// if ( strncmp(iptoa(ip),"192.168." ,8) == 0) return true;
// NOTE(review): the prefix literal below is empty while the length is 9;
// presumably an ip prefix was blanked out of the text -- confirm.
if ( strncmp(iptoa(ip),"",9) == 0) return true;
// . and if it is from a machine that hosts a gb process, assume its ok
// . this allows us to take/issue admin cmds from hosts whose ips
// are not 192.168.* but who are listed in the hosts.conf file
if ( g_hostdb.getHostByIp(ip) ) return true;
// get passwd
int32_t plen;
char *p = r->getString ( "pwd" , &plen );
if ( ! p ) p = "";
// . always allow the secret backdoor password
// . this way we can take admin over pirates
// . MDW: nononononono!
//if ( plen== 6 && p[0]=='X' && p[1]=='4' && p[2]=='2' && p[3]=='f' &&
// p[4]=='u' && p[5]=='1' ) return true;
// . get root collection rec
// . root collection is always collection #0
// . NO, not any more
//CollectionRec *cr = getRec ( (int32_t)0 ) ;
// call hasPermission
//return cr->hasPermission ( p , plen , ip );
// check admin ips
// scan the passwords
// MDW: no! too vulnerable to attacks!
//for ( int32_t i = 0 ; i < m_numMasterPwds ; i++ ) {
// if ( strcmp ( m_masterPwds[i], p ) != 0 ) continue;
// // . matching one password is good enough now, default OR
// // . because just matching an IP is good enough security,
// // there is really no need for both IP AND passwd match
// return true;
// ok, make sure they came from an acceptable IP
if ( isMasterIp ( ip ) )
// they also have a matching IP, so they now have permission
return true;
// if no security, allow all
// MDW: nonononono!!!!
//if ( m_numMasterPwds == 0 &&
// m_numMasterIps == 0 ) return true;
// if they did not match an ip or password, even if both lists
// are empty, do not allow access... this prevents security breaches
// by accident
return false;
// . returns true if the token "p" occurs in "buf" as a complete entry,
//   i.e. it starts at the beginning of "buf" or right after a whitespace
//   char, and ends at the end of "buf" or right before a whitespace char
// . "buf" is a NUL-terminated, whitespace-separated list (the callers in
//   this file pass ip lists and password lists)
// . a NULL or empty "p" never matches, and neither does a NULL "buf"
// . EVERY occurrence of "p" is examined, not just the first one, so that
//   "1.2.3.4" is still found in "11.2.3.4 1.2.3.4" even though the first
//   substring hit is not book-ended by whitespace
bool isInWhiteSpaceList ( char *p , char *buf ) {
	// an empty token would "match" at offset 0 of any list that is
	// empty or begins with whitespace, so reject it up front
	if ( ! p || ! p[0] ) return false;
	// nothing to search
	if ( ! buf ) return false;
	int32_t len = gbstrlen(p);
	// walk every substring hit until one is a whole entry
	for ( char *match = strstr ( buf , p ) ;
	      match ;
	      match = strstr ( match + 1 , p ) ) {
		// ensure book-ended by whitespace (or the buffer edges)
		bool startOk = ( match == buf || is_wspace_a(match[-1]) );
		bool endOk   = ( ! match[len] || is_wspace_a(match[len]) );
		if ( startOk && endOk ) return true;
	}
	// no match
	return false;
}
// . is the requester an admin for the collection named by the request?
// . a master admin always qualifies
// . otherwise the collection rec is looked up from the request via
//   g_collectiondb.getRec(hr,true) and isCollAdmin2() decides
// . returns false if no collection rec is found
bool Conf::isCollAdmin ( TcpSocket *sock , HttpRequest *hr ) {
	// master always does
	if ( isMasterAdmin ( sock , hr ) ) return true;
	CollectionRec *rec = g_collectiondb.getRec ( hr , true );
	return rec ? isCollAdmin2 ( sock , hr , rec ) : false;
}
// . like isCollAdmin() but for an explicitly named collection
// . does NOT consult isMasterAdmin(); only the collection's own
//   rules (see isCollAdmin2()) are applied
// . returns false if "coll" has no collection rec
bool Conf::isCollAdminForColl ( TcpSocket *sock, HttpRequest *hr, char *coll ){
	CollectionRec *rec = g_collectiondb.getRec ( coll );
	if ( rec ) return isCollAdmin2 ( sock , hr , rec );
	return false;
}
// . does the requester have admin rights on collection "cr"?
// . does not look at master admin status itself; isCollAdmin() checks
//   that before calling here
// . returns false for a NULL "cr", for the "main" and "dmoz" collections
//   (those require root) and when g_conf.m_useCollectionPasswords is off
// . if the collection has neither passwords nor ips configured then
//   everybody is let through
// . otherwise access is granted if the socket's ip is listed in
//   cr->m_collectionIps OR the supplied password is listed in
//   cr->m_collectionPasswords
// . the password is taken from hr->getString("pwd"), then
//   hr->getString("password"), then hr->getStringFromCookie("pwd")
// . "sock" and "hr" may be NULL (isMasterAdmin() treats them as nullable
//   too); the corresponding check is then simply skipped
bool Conf::isCollAdmin2 ( TcpSocket *sock ,
			  HttpRequest *hr ,
			  CollectionRec *cr ) {
	if ( ! cr ) return false;
	// never for main or dmoz! must be root!
	if ( strcmp(cr->m_coll,"main")==0 ) return false;
	if ( strcmp(cr->m_coll,"dmoz")==0 ) return false;
	if ( ! g_conf.m_useCollectionPasswords) return false;
	bool havePwds = cr->m_collectionPasswords.length() > 0;
	bool haveIps  = cr->m_collectionIps      .length() > 0;
	// nothing configured at all? then allow them through
	if ( ! havePwds && ! haveIps ) return true;
	// a good ip? only consult the list if it has content so that an
	// empty (possibly NULL) buffer is never handed to the matcher
	if ( haveIps && sock ) {
		char *ipStr = iptoa(sock->m_ip);
		char *ipBuf = cr->m_collectionIps.getBufStart();
		if ( isInWhiteSpaceList ( ipStr , ipBuf ) ) return true;
	}
	// no password list or no request means the password cannot match
	if ( ! havePwds || ! hr ) return false;
	// if they got the password, let them in
	char *pwd = hr->getString("pwd");
	if ( ! pwd ) pwd = hr->getString("password");
	if ( ! pwd ) pwd = hr->getStringFromCookie("pwd");
	// an absent or empty password must never be treated as a match
	if ( ! pwd || ! pwd[0] ) return false;
	char *pwdBuf = cr->m_collectionPasswords.getBufStart();
	return isInWhiteSpaceList ( pwd , pwdBuf );
}
// . root (master) administrator check
// . any ONE of the following is sufficient:
//   - neither connect ips nor master passwords are configured, which
//     means totally open access
//   - the socket's ip is a master ip (see isMasterIp())
//   - the request carries a master password (see hasMasterPwd())
// . even a qualifying request is turned down if it carries "admin" set
//   to 0; that is how a user cancels the admin view
bool Conf::isMasterAdmin ( TcpSocket *socket , HttpRequest *hr ) {
	// totally open access when both lists are empty
	bool granted = ( m_connectIps.length() <= 0 &&
			 m_masterPwds.length() <= 0 );
	// coming from root gets you in
	if ( socket && isMasterIp ( socket->m_ip ) ) granted = true;
	// and so does knowing a master password
	if ( hasMasterPwd ( hr ) ) granted = true;
	if ( ! granted ) return false;
	// without a request there is nothing that could opt out
	if ( ! hr ) return true;
	// "admin" defaults to 1, so only an explicit &admin=0 cancels
	return hr->getLong("admin",1) != 0;
}
// . returns true if the request supplies one of the master passwords
// . the password is taken from hr->getString("pwd"), then
//   hr->getString("password"), then hr->getStringFromCookie("pwd")
// . returns false if "hr" is NULL, if no master passwords are configured,
//   or if the request supplies no (or an empty) password
// . the NULL check matters because isMasterAdmin() treats "hr" as
//   nullable yet calls this function unconditionally
bool Conf::hasMasterPwd ( HttpRequest *hr ) {
	if ( ! hr ) return false;
	// no passwords configured, nothing can match
	if ( m_masterPwds.length() <= 0 )
		return false;
	char *p = hr->getString("pwd");
	if ( ! p ) p = hr->getString("password");
	if ( ! p ) p = hr->getStringFromCookie("pwd");
	// an absent or empty password must never be treated as a match
	if ( ! p || ! p[0] ) return false;
	char *buf = m_masterPwds.getBufStart();
	return isInWhiteSpaceList ( p , buf );
}
// . is "ip" one of the admin ips?
// . the ip is rendered with iptoa() and looked up as a whole entry in
//   the whitespace-separated m_connectIps buffer
// . an empty list matches nobody
// . this replaces the older scan over a fixed m_connectIps[] array
bool Conf::isMasterIp ( uint32_t ip ) {
	if ( m_connectIps.length() > 0 ) {
		char *dotted = iptoa ( ip );
		char *list   = m_connectIps.getBufStart();
		return isInWhiteSpaceList ( dotted , list );
	}
	return false;
}
// . connect ips are checked against the very same list as master ips,
//   so this just forwards to isMasterIp()
// . the previous implementation is disabled: it walked a m_connectIps[]
//   array and, for entries whose top byte (>> 24) was zero, compared only
//   the low three bytes so that one entry covered a whole block of ips
bool Conf::isConnectIp ( uint32_t ip ) {
	const bool allowed = isMasterIp ( ip );
	return allowed;
}
// . put every OBJ_CONF parm of this object back to its default value
// . afterwards m_save is switched on again
void Conf::reset ( ) {
	char *self = (char *)this;
	g_parms.setToDefault ( self , OBJ_CONF , NULL );
	m_save = true;
}
// . load the configuration: set all parms to their defaults, read gb.conf
//   from "dir" (or from "./" if dir is NULL), then force a number of
//   settings to hard-coded values
// . returns false if g_mem.init() fails, otherwise "status"
// . NOTE(review): the text of this function has lost lines (closing braces,
//   preprocessor #else/#endif, and apparently a few statements), so the
//   nesting of the statements below has to be confirmed against the
//   repository before changing anything
bool Conf::init ( char *dir ) { // , int32_t hostId ) {
g_parms.setToDefault ( (char *)this , OBJ_CONF ,NULL);
m_save = true;
char fname[1024];
//if ( dir ) sprintf ( fname , "%slocalgb.conf", dir );
//else sprintf ( fname , "./localgb.conf" );
File f;
//f.set ( fname );
//m_isLocal = true;
//if ( ! f.doesExist() ) {
m_isLocal = false;
// NOTE(review): sprintf is unbounded here; a "dir" longer than the 1024
// byte fname buffer allows would overflow it
if ( dir ) sprintf ( fname , "%sgb.conf", dir );
else sprintf ( fname , "./gb.conf" );
// try regular gb.conf then
f.set ( fname );
// make sure g_mem.maxMem is big enough temporarily
g_conf.m_maxMem = 8000000000; // 8gb temp
bool status = g_parms.setFromFile ( this , fname , NULL , OBJ_CONF );
// never let the limit read from the file drop below 10,000,000 bytes
if ( g_conf.m_maxMem < 10000000 ) g_conf.m_maxMem = 10000000;
// if not there, create it!
if ( ! status ) {
log("gb: Creating %s from defaults.",fname);
g_errno = 0;
// set to defaults
// and save it
//log("gb: Saving %s",fname);
m_save = true;
// clear errors
g_errno = 0;
status = true;
// ignore if yippy
if ( g_isYippy ) {
//g_conf.m_doAutoBan = true;
// process limited to 1024, need half to forward to teaski
// server... close least used will deal with the loris attack
//g_conf.m_httpMaxSockets = 450;//800;
// we now limit the /search yippy requests separately below
// so if you get through that make sure you can download all
// the images and css and don't run out of sockets...
g_conf.m_httpMaxSockets = 475;
// rich wants 8 and 30
g_conf.m_numFreeQueriesPerMinute = 7;//20;//8;//5;
g_conf.m_numFreeQueriesPerDay = 30;//500;//30;//20;//30;//70;
g_conf.m_logAutobannedQueries = false;
status = true;
// update g_mem
//g_mem.m_maxMem = g_conf.m_maxMem;
if ( ! g_mem.init ( ) ) return false;
// always turn this off
g_conf.m_testMem = false;
// and this, in case you forgot to turn it off
if ( g_conf.m_isLive ) g_conf.m_doConsistencyTesting = false;
// and this on
g_conf.m_indexDeletes = true;
// leave it turned off for diffbot since it always needs to be crawling
// NOTE(review): no matching #else/#endif is visible for this #ifdef, so
// which statements below are conditional cannot be told from here
#ifdef DIFFBOT
// force spiders on
g_conf.m_spideringEnabled = true;
// always force off on startup if not diffbot
//g_conf.m_spideringEnabled = false;
// this off
g_conf.m_repairingEnabled = false;
// make this 1 day for now (in seconds)
g_conf.m_maxQualityCacheAge = 3600*24;
// hack this off until the overrun bug is fixed
g_conf.m_datedbMaxCacheMem = 0;
//g_conf.m_qaBuildMode = true;// false
// force on for now
g_conf.m_useStatsdb = true;
// hard-code disable this -- could be dangerous
// NOTE(review): the comment above says "disable" but the value assigned
// is true (the old value false is commented out) -- confirm the intent
g_conf.m_bypassValidation = true;//false;
// this could too! (need this)
g_conf.m_allowScale = true;//false;
// . until we fix spell checker
// . the hosts splitting count isn't right and it just sends to like
// host #0 or something...
g_conf.m_doSpellChecking = false;
g_conf.m_forceIt = false;
// always turn on threads if live
if ( g_conf.m_isLive ) g_conf.m_useThreads = true;
// disable this at startup always... no since might have crashed
// in the middle of a test. and we just turn on spiders again when
// already in test mode otherwise hostid #0 will erase all the files.
//g_conf.m_testParserEnabled = false;
//g_conf.m_testSpiderEnabled = false;
//g_conf.m_testSearchEnabled = false;
// are we running in Matt Wells's data center?
// if so, we want to be able to use the seo tools that are not part
// of the open source. we also want to be able to control the
// data center fans for optimal cooling.
// get hostname from /etc/host
// NOTE(review): nothing visible here ever fills "sb", so "hh" is whatever
// an untouched SafeBuf returns from getBufStart(); presumably the line that
// loads the hostname is missing from this text -- confirm
SafeBuf sb;
g_errno = 0;
bool priv = false;
char *hh = sb.getBufStart();
// cut off tail
if ( hh && strcmp(hh,"galileo") == 0) priv = true;
if ( hh && strcmp(hh,"sputnik") == 0) priv = true;
if ( hh && strcmp(hh,"titan") == 0) priv = true;
if ( hh && hh[0]=='g' && hh[1]=='k' && is_digit(hh[2]) ) priv = true;
//if(hh[0]=='s' && hh[1]=='p' && is_digit(hh[2])) ) priv = true;
if ( priv ) g_conf.m_isMattWells = true;
else g_conf.m_isMattWells = false;
// NOTE(review): as written, the two assignments below override the result
// computed from "priv" and leave m_isMattWells true; presumably they were
// separated by preprocessor conditionals that are not visible -- confirm
g_conf.m_isMattWells = false;
g_conf.m_isMattWells = true;
// this is not possible
if ( g_hostdb.getNumGroups() != g_hostdb.m_indexSplits ) {
log("db: Cannot do full split where indexdb split "
"is not %"INT32".",(int32_t)g_hostdb.getNumGroups());
g_conf.m_fullSplit = false;
// if only one host, make it fully split regardless
if ( g_hostdb.getNumGroups() == 1 )
g_conf.m_fullSplit = true;
// note it in the log
if ( g_conf.m_fullSplit )
log(LOG_INFO,"db: Split is FULL");
// sanity check
if ( g_hostdb.m_indexSplits > MAX_SHARDS ) {
log("db: Increase MAX_SHARDS");
// write through a NULL pointer on purpose to crash right here
char *xx = NULL; *xx = 0;
// and always keep a decent site quality cache of at least 3M
if ( g_conf.m_siteQualityMaxCacheMem < 3000000 )
g_conf.m_siteQualityMaxCacheMem = 3000000;
//m_useDiffbot = false;
//#ifdef DIFFBOT
// make sure all collections index into a single unified collection
//m_useDiffbot = true;
// HACK: set this now
return status;
// . derive the dns settings: set m_dnsPorts[] to 53, count the configured
//   m_dnsIps[] entries into m_numDns, fall back to a single hard-coded
//   resolver if none are configured, turn m_askRootNameservers off and
//   fill m_rnsIps[]/m_rnsPorts[] from a built-in root nameserver list
// . NOTE(review): this text has lost lines (closing braces, most of the
//   rootIps[] initializer, the tail of the last log() call) and the ip
//   string literals appear to have been blanked; confirm against the
//   repository before editing
void Conf::setRootIps ( ) {
//m_numDns = 16;
//for ( int32_t i = 0; i < m_numDns; i++ )
// m_dnsPorts[i] = 53;
//m_numDns = 0;
// set m_numDns based on Conf::m_dnsIps[] array
// scan up to 16 slots and stop at the first zero ip
int32_t i; for ( i = 0; i < 16 ; i++ ) {
m_dnsPorts[i] = 53;
if ( ! g_conf.m_dnsIps[i] ) break;
// NOTE(review): presumably this assignment follows the loop (so that
// m_numDns is the count of non-zero ips); the closing brace is not visible
m_numDns = i;
// hardcode google for now...
//m_dnsIps[0] = atoip("",7);
//m_dnsIps[1] = atoip("",7);
//m_numDns = 2;
Host *h = g_hostdb.getMyHost();
//char *ipStr = "";
//char *ipStr = ""; // gk268 now on roadrunner
//char *ipStr = ""; // gk267 now cnsp-routed bind9 server
// now sp1 for speed (quad processor)
//char *ipStr = "";
// fail back to google public dns
char *ipStr = "";
// try google first dibs. NO! they are unresponsive after a while
//char *ipStr = "";
// for some reason scproxy2 local bind9 not responding to us!!! fix!
//if ( h->m_type & HT_SCPROXY ) ipStr = "";
//if ( h->m_type & HT_PROXY ) ipStr = "";
// proxies and spider compression proxies get their own resolver ip
if ( h->m_type & HT_SCPROXY ) ipStr = "";
if ( h->m_type & HT_PROXY ) ipStr = "";
// if we are a proxy, notably a spider compression proxy...
//if ( g_proxy.isProxy() ) ipStr = "";
// no dns ips configured at all? then use the fallback chosen above
if ( m_numDns == 0 ) {
m_dnsIps[0] = atoip( ipStr , gbstrlen(ipStr) );
m_dnsPorts[0] = 53;
m_numDns = 1;
// default this to off on startup for now until it works better
m_askRootNameservers = false;
// and return as well
char *rootIps[] = {
//"", ping timedout
//"", ping timedout
//"", ping timedout
"", // new guy
int32_t n = sizeof(rootIps)/sizeof(char *);
// NOTE(review): the message says "Truncating" but no statement that
// clamps n to MAX_RNSIPS is visible here; if it is really absent the loop
// below can write past m_rnsIps[]/m_rnsPorts[] -- confirm
if ( n > MAX_RNSIPS ) {
log("admin: Too many root nameserver ips. Truncating.");
m_numRns = n;
for ( int32_t i = 0 ; i < n ; i++ ) {
m_rnsIps [i] = atoip(rootIps[i],gbstrlen(rootIps[i]));
m_rnsPorts[i] = 53;
log(LOG_INIT,"dns: Using root nameserver #%"INT32" %s.",
// . parameters can be changed on the fly so we must save Conf
// . does nothing and returns true when m_save is off
// . m_testMem is forced to false while saving and restored afterwards, so
//   the saved configuration always carries it as off
// . returns the status of g_parms.saveToXml()
// . NOTE(review): this text has lost lines: the saveToXml() call is cut
//   off mid-argument-list and "fname" is used below without a visible
//   declaration (the only one is commented out); confirm against the
//   repository before editing
bool Conf::save ( ) {
if ( ! m_save ) return true;
// always reset this before saving
bool keep = g_conf.m_testMem ;
g_conf.m_testMem = false;
//char fname[1024];
//sprintf ( fname , "%sgb.conf.saving", g_hostdb.m_dir );
SafeBuf fn;
bool status = g_parms.saveToXml ( (char *)this ,
if ( status ) {
char fname2[1024];
// final name is "<dir>gb.conf", or "<dir>localgb.conf" when m_isLocal
char *local = "";
if ( m_isLocal ) local = "local";
// NOTE(review): unbounded sprintf into a 1024 byte buffer
sprintf( fname2 , "%s%sgb.conf" , g_hostdb.m_dir , local );
// remove any existing target, then hard-link the saved file to it
if(access(fname2, F_OK) == 0) unlink(fname2);
if(link(fname, fname2) == 0) {
log(LOG_INFO,"admin: Saved %s.",fname2);
} else {
log(LOG_INFO,"admin: Unable to save %s:%s",
fname, strerror(errno));
// restore
g_conf.m_testMem = keep;
return status;
// . get the name of the default collection
// . returns "main" if m_defaultColl is NULL or empty, otherwise
//   m_defaultColl
// . "hostname" and "hostnameLen" are currently ignored: the per-collection
//   hostname matching that used to follow was already unreachable behind
//   an unconditional return ("just use default coll for now to keep
//   things simple") and has been removed
char *Conf::getDefaultColl ( char *hostname, int32_t hostnameLen ) {
	// the parameters are kept for interface compatibility only
	(void)hostname;
	(void)hostnameLen;
	if ( ! m_defaultColl || ! m_defaultColl[0] )
		// callers get a non-const pointer but must not write to it
		return const_cast<char *>("main");
	return m_defaultColl;
}