Merge branch 'testing' into diffbot-matt

Conflicts:
	Collectiondb.cpp
	HttpRequest.h
	PageBasic.cpp
	coll.main.0/coll.conf
commit 72dc660598
Author: mwells
Date:   2014-04-09 11:18:39 -07:00
25 changed files with 859 additions and 517 deletions

Collectiondb.cpp

@ -138,6 +138,19 @@ bool Collectiondb::loadAllCollRecs ( ) {
if ( ! addExistingColl ( coll , collnum ) )
return false;
}
// if no existing recs added... add coll.main.0 always at startup
if ( m_numRecs == 0 ) {
log("admin: adding main collection.");
addNewColl ( "main",
0 , // customCrawl ,
NULL,
0 ,
true , // bool saveIt ,
// Parms.cpp reserves this so it can be sure
// to add the same collnum to every shard
0 );
}
// note it
//log(LOG_INFO,"db: Loaded data for %li collections. Ranging from "
// "collection #0 to #%li.",m_numRecsUsed,m_numRecs-1);
@ -234,10 +247,10 @@ bool Collectiondb::addExistingColl ( char *coll, collnum_t collnum ) {
// load if not new
if ( ! cr->load ( coll , i ) ) {
mdelete ( cr, sizeof(CollectionRec), "CollectionRec" );
log("admin: Failed to load coll.%s.%li/coll.conf",coll,i);
delete ( cr );
m_recs[i] = NULL;
return log("admin: Failed to load conf for collection "
"\"%s\".",coll);
if ( m_recs ) m_recs[i] = NULL;
return false;
}
if ( ! registerCollRec ( cr , false ) ) return false;
@ -972,6 +985,39 @@ bool Collectiondb::setRecPtr ( collnum_t collnum , CollectionRec *cr ) {
return true;
}
// moves a file by first trying rename, then copying since cross device renaming doesn't work
// returns 0 on success
int mv(char* src, char* dest) {
int status = rename( src , dest );
if (status == 0)
return 0;
FILE *fsrc, *fdest;
fsrc = fopen(src, "r");
if (fsrc == NULL)
return -1;
fdest = fopen(dest, "w");
if (fdest == NULL) {
fclose(fsrc);
return -1;
}
const int BUF_SIZE = 1024;
char buf[BUF_SIZE];
while (!ferror(fdest) && !ferror(fsrc) && !feof(fsrc)) {
int read = fread(buf, 1, BUF_SIZE, fsrc);
fwrite(buf, 1, read, fdest);
}
// check the stream error flags BEFORE closing: calling ferror() on a
// closed FILE* is undefined behavior
int hadError = ferror(fsrc) || ferror(fdest);
fclose(fsrc);
// fclose() flushes buffered writes, so a failed close is an error too
if (fclose(fdest) != 0)
hadError = 1;
if (hadError)
return -1;
remove(src);
return 0;
}
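// For reference, the two call sites below discard mv()'s return value;
// a minimal error-checked call (hypothetical caller, using the log()
// convention of the surrounding code) would look like:
//
//   if ( mv ( oldbulkurlsname , tmpbulkurlsname ) != 0 )
//       log("admin: failed to move %s to %s.",
//           oldbulkurlsname , tmpbulkurlsname );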
// . returns false if we need a re-call, true if we completed
// . returns true with g_errno set on error
bool Collectiondb::resetColl2( collnum_t oldCollnum,
@ -1028,7 +1074,7 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);
if (cr->m_isCustomCrawl == 2)
rename( oldbulkurlsname , tmpbulkurlsname );
mv( oldbulkurlsname , tmpbulkurlsname );
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
@ -1141,7 +1187,7 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// be sure to copy back the bulk urls for bulk jobs
if (cr->m_isCustomCrawl == 2)
rename( tmpbulkurlsname, newbulkurlsname );
mv( tmpbulkurlsname, newbulkurlsname );
// and clear the robots.txt cache in case we recently spidered a
// robots.txt, we don't want to use it, we want to use the one we
@ -2280,7 +2326,7 @@ bool CollectionRec::hasSearchPermission ( TcpSocket *s , long encapIp ) {
}
bool expandRegExShortcuts ( SafeBuf *sb ) ;
bool updateSiteList ( collnum_t collnum , bool addSeeds );
bool updateSiteListTables ( collnum_t collnum,bool addSeeds,char *siteListArg);
void nukeDoledb ( collnum_t collnum );
// . anytime the url filters are updated, this function is called
@ -2343,11 +2389,14 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// maybe this is good enough
//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
CollectionRec *cr = sc->m_cr;
// . rebuild sitetable? in PageBasic.cpp.
// . re-adds seed spider requests using msg4
// . true = addSeeds
// . rebuilds url filters there too i think
updateSiteList ( m_collnum , true );
updateSiteListTables ( m_collnum ,
true ,
cr->m_siteListBuf.getBufStart() );
}

Conf.h

@ -46,7 +46,7 @@
class Conf {
public:
Conf();
bool isCollAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
@ -98,6 +98,8 @@ class Conf {
// a core dump saving them
char m_save;
bool m_runAsDaemon;
bool m_isLocal;
//director info (optional) (used iff m_isTrustedNet is false)

Hostdb.cpp

@ -61,6 +61,7 @@ Hostdb::Hostdb ( ) {
m_initialized = false;
m_crcValid = false;
m_crc = 0;
m_created = false;
}
Hostdb::~Hostdb () {
@ -97,8 +98,8 @@ char *Hostdb::getNetName ( ) {
// . gets filename that contains the hosts from the Conf file
// . return false on error
// . g_errno may NOT be set
bool Hostdb::init ( char *filename , long hostId , char *netName ,
bool proxyHost , char useTmpCluster ) {
bool Hostdb::init ( long hostId , char *netName ,
bool proxyHost , char useTmpCluster , char *cwd ) {
// reset my ip and port
m_myIp = 0;
m_myIpShotgun = 0;
@ -110,9 +111,13 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
m_useTmpCluster = useTmpCluster;
m_initialized = true;
char *dir = "./";
if ( cwd ) dir = cwd;
// try localhosts.conf first
if ( strcmp ( filename , "./hosts.conf" ) == 0 )
filename = "./localhosts.conf";
char *filename = "hosts.conf";
//if ( strcmp ( filename , "hosts.conf" ) == 0 )
// filename = "localhosts.conf";
retry:
@ -139,10 +144,10 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// set clock in sync in fctypes.cpp
//if ( m_hostId == 0 ) g_clockInSync = true;
// log it
if (this == &g_hostdb) logf(LOG_INIT,"conf: HostId is %li.",m_hostId);
//if(this == &g_hostdb) logf(LOG_INIT,"conf: HostId is %li.",m_hostId);
// . File::open() open old if it exists, otherwise,
File f;
f.set ( filename );
f.set ( dir , filename );
// . returns -1 on error and sets g_errno
// . returns false if does not exist, true otherwise
long status = f.doesExist();
@ -168,13 +173,20 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
// hosts2.conf is not necessary
if ( this == &g_hostdb2 ) return true;
g_errno = ENOHOSTSFILE;
log(
"conf: Filename %s does not exist." ,filename);
// if doing localhosts.conf now try hosts.conf
if ( strcmp(filename,"./localhosts.conf") == 0 ) {
filename = "./hosts.conf";
if ( strcmp(filename,"localhosts.conf") == 0 ) {
filename = "hosts.conf";
g_errno = 0;
goto retry;
}
// now we generate one if that is not there
if ( ! m_created ) {
m_created = true;
g_errno = 0;
createHostsConf( cwd );
goto retry;
}
log("conf: Filename %s does not exist." ,filename);
return false;
}
// get file size
@ -1131,6 +1143,10 @@ bool Hostdb::init ( char *filename , long hostId , char *netName ,
sprintf ( m_httpRootDir , "%shtml/" , m_dir );
sprintf ( m_logFilename , "%slog%03li", m_dir , hostId );
if ( ! g_conf.m_runAsDaemon )
sprintf(m_logFilename,"/dev/stderr");
long gcount = 0;
for ( long i = 0 ; i < MAX_KSLOTS && m_numHosts ; i++ ) {
// set its group id from groupNum, which is "gcount"
@ -2472,4 +2488,131 @@ long Hostdb::getCRC ( ) {
}
bool Hostdb::createHostsConf( char *cwd ) {
SafeBuf sb;
sb.safePrintf("# The Gigablast host configuration file.\n");
sb.safePrintf("# Tells us what hosts are participating in the distributed search engine.\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("# How many mirrors do you want? If this is 0 then your data\n");
sb.safePrintf("# will NOT be replicated. If it is 1 then each host listed\n");
sb.safePrintf("# below will have one host that mirrors it, thereby decreasing\n");
sb.safePrintf("# total index capacity, but increasing redundancy. If this is\n");
sb.safePrintf("# 1 then the first half of hosts will be replicated by the\n");
sb.safePrintf("# second half of the hosts listed below.\n");
sb.safePrintf("\n");
sb.safePrintf("num-mirrors: 0\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("# List of hosts. Limited to 512 from MAX_HOSTS in Hostdb.h. Increase that\n");
sb.safePrintf("# if you want more.\n");
sb.safePrintf("#\n");
sb.safePrintf("# Format:\n");
sb.safePrintf("#\n");
sb.safePrintf("# first column: hostID (starts at 0 and increments from there)\n");
sb.safePrintf("# second column: the port used by the client DNS algorithms\n");
sb.safePrintf("# third column: port that HTTPS listens on\n");
sb.safePrintf("# fourth column: port that HTTP listens on\n");
sb.safePrintf("# fifth column: port that udp server listens on\n");
sb.safePrintf("# sixth column: IP address or hostname that has an IP address in /etc/hosts\n");
sb.safePrintf("# seventh column: like sixth column but for secondary ethernet port. (optional)\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("# This file consists of a list of lines like this:\n");
sb.safePrintf("#\n");
sb.safePrintf("# <ClientDnsPort> <HttpsPort> <HttpPort> <UdpPort> <IP1> <IP2> <Path>\n");
sb.safePrintf("#\n");
sb.safePrintf("# By default just use the local host as the single host as listed below.\n");
sb.safePrintf("#\n");
sb.safePrintf("# The client DNS uses port 5998, https listens on 7000, http listens on port\n");
sb.safePrintf("# 8000 and the udp server listens on port 9000. We used to use port 6000 for\n");
sb.safePrintf("# DNS listening but it seemed to have some issues. If your DNS keeps timing\n");
sb.safePrintf("# out try a different port from 5998.\n");
sb.safePrintf("#\n");
sb.safePrintf("# If your server only has one IP then just repeat it as IP1 and IP2. You\n");
sb.safePrintf("# can also use an alphanumeric name from /etc/hosts in place of a direct\n");
sb.safePrintf("# IP address. (see example below)\n");
sb.safePrintf("#\n");
sb.safePrintf("# Use './gb N' to run the gb process as host #N where N is 0 to run as\n");
sb.safePrintf("# the first host in the list below. \n");
sb.safePrintf("#\n");
sb.safePrintf("# Use './gb start N' to use passwordless ssh to ssh to that Nth machine\n");
sb.safePrintf("# listed below and start the process. Use must have private/public keys\n");
sb.safePrintf("# for the required passwordless ssh.\n");
sb.safePrintf("#\n");
sb.safePrintf("# Use './gb kstart N' to run the Nth host in a bash keep-alive loop. So if it\n");
sb.safePrintf("# cores it will restart. It will send out an email alert if it restarts.\n");
sb.safePrintf("#\n");
sb.safePrintf("# The working directory is the last string on each line. That is where the\n");
sb.safePrintf("# 'gb' binary resides.\n");
sb.safePrintf("#\n");
// put our cwd here
sb.safePrintf("0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 %s\n",cwd);
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("#\n");
sb.safePrintf("# Example of a four-node distributed search index running on a single\n");
sb.safePrintf("# server with four cores. The working directories are /home/mwells/hostN/.\n");
sb.safePrintf("# The 'gb' binary resides in the working directories. We have to use\n");
sb.safePrintf("# different ports for each gb instance since they are all on the same\n");
sb.safePrintf("# server.\n");
sb.safePrintf("#\n");
sb.safePrintf("# Use './gb 2' to run as the host on IP 1.2.3.8 for example.\n");
sb.safePrintf("#\n");
sb.safePrintf("#0 5998 7000 8000 9000 1.2.3.4 1.2.3.5 /home/mwells/host0/\n");
sb.safePrintf("#1 5997 7001 8001 9001 1.2.3.4 1.2.3.5 /home/mwells/host1/\n");
sb.safePrintf("#2 5996 7002 8002 9002 1.2.3.4 1.2.3.5 /home/mwells/host2/\n");
sb.safePrintf("#3 5995 7003 8003 9003 1.2.3.4 1.2.3.5 /home/mwells/host3/\n");
sb.safePrintf("\n");
sb.safePrintf("# A four-node cluster on four different servers:\n");
sb.safePrintf("#0 5998 7000 8000 9000 1.2.3.4 1.2.3.5 /home/mwells/gigablast/\n");
sb.safePrintf("#1 5998 7000 8000 9000 1.2.3.6 1.2.3.7 /home/mwells/gigablast/\n");
sb.safePrintf("#2 5998 7000 8000 9000 1.2.3.8 1.2.3.9 /home/mwells/gigablast/\n");
sb.safePrintf("#3 5998 7000 8000 9000 1.2.3.10 1.2.3.11 /home/mwells/gigablast/\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("#\n");
sb.safePrintf("# Example of an eight-node cluster.\n");
sb.safePrintf("# Each line represents a single gb process with dual ethernet ports\n");
sb.safePrintf("# whose IP addresses are in /etc/hosts under se0, se0b, se1, se1b, ...\n");
sb.safePrintf("#\n");
sb.safePrintf("#0 5998 7000 8000 9000 se0 se0b /home/mwells/gigablast/\n");
sb.safePrintf("#1 5998 7000 8000 9000 se1 se1b /home/mwells/gigablast/\n");
sb.safePrintf("#2 5998 7000 8000 9000 se2 se2b /home/mwells/gigablast/\n");
sb.safePrintf("#3 5998 7000 8000 9000 se3 se3b /home/mwells/gigablast/\n");
sb.safePrintf("#4 5998 7000 8000 9000 se4 se4b /home/mwells/gigablast/\n");
sb.safePrintf("#5 5998 7000 8000 9000 se5 se5b /home/mwells/gigablast/\n");
sb.safePrintf("#6 5998 7000 8000 9000 se6 se6b /home/mwells/gigablast/\n");
sb.safePrintf("#7 5998 7000 8000 9000 se7 se7b /home/mwells/gigablast/\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
sb.safePrintf("# Proxies\n");
sb.safePrintf("# Proxies handle the incoming search request and load balance it to \n");
sb.safePrintf("# one of the hosts listed above. If you only have one host in your search\n");
sb.safePrintf("# engine then you probably do not really need the proxy. You need to make\n");
sb.safePrintf("# sure all shard hosts and all proxies have the same hosts.conf because\n");
sb.safePrintf("# they ping each other to ensure they are up.\n");
sb.safePrintf("#\n");
sb.safePrintf("# To start a proxy you can run './gb proxy load 0' to start the first\n");
sb.safePrintf("# proxy in your list. Use './gb proxy load 1' to start the second proxy, etc.\n");
sb.safePrintf("#\n");
sb.safePrintf("# Use './gb proxy start N' to start the Nth proxy, where N starts at 0,\n");
sb.safePrintf("# mentioned in the proxy list below. You need to enable passwordless ssh\n");
sb.safePrintf("# using private/public keys for that to work. \n");
sb.safePrintf("#\n");
sb.safePrintf("# Use './gb proxy kstart N' to start the Nth proxy in a keep-alive loop using\n");
sb.safePrintf("# the bash shell. So if it cores it will restart and send you an email alert.\n");
sb.safePrintf("#\n");
sb.safePrintf("# Format:\n");
sb.safePrintf("# First column is \"proxy\" and followed by the standard columns described above\n");
sb.safePrintf("#\n");
sb.safePrintf("# Example:\n");
sb.safePrintf("# A proxy will be running on 10.5.66.18:\n");
sb.safePrintf("#proxy 6001 7001 8001 9001 10.5.66.18\n");
log("%shosts.conf does not exist, creating.",cwd);
sb.save ( cwd , "hosts.conf" );
return true;
}

Hostdb.h

@ -313,8 +313,9 @@ class Hostdb {
// . sets itself from g_conf (our configuration class)
// . returns false on fatal error
// . gets filename from Conf.h class
bool init ( char *filename , long hostId , char *netname = NULL,
bool proxyHost = false , char useTempCluster = 0 );
bool init ( long hostId , char *netname = NULL,
bool proxyHost = false , char useTempCluster = 0 ,
char *cwd = NULL );
// for dealing with pings
bool registerHandler ( );
@ -627,6 +628,9 @@ class Hostdb {
bool m_initialized;
bool createHostsConf( char *cwd );
bool m_created;
long m_crc;
long m_crcValid;

HttpRequest.h

@ -28,11 +28,13 @@
#include "TcpSocket.h"
// values for HttpRequest::m_replyFormat
#define FORMAT_HTML 0
#define FORMAT_XML 1
#define FORMAT_JSON 2
#define FORMAT_CSV 3
#define FORMAT_WIDGET 4
#define FORMAT_HTML 1
#define FORMAT_XML 2
#define FORMAT_JSON 3
#define FORMAT_CSV 4
#define FORMAT_TXT 5
#define FORMAT_PROCOG 6
#define FORMAT_WIDGET 7
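// NOTE: the new values line up with the old FMT_* constants retired from
// PageCrawlBot.cpp below (HTML=1, XML=2, JSON=3, CSV=4, TXT=5), so the
// FMT_ -> FORMAT_ substitutions in this commit preserve each format's
// numeric value; FORMAT_PROCOG and FORMAT_WIDGET take the next free slots.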
class HttpRequest {

Log.cpp

@ -38,6 +38,7 @@ Log::Log () {
m_port = 777;
m_needsPrinting = false;
m_disabled = false;
m_logTimestamps = false;
}
Log::~Log () { reset(); }
@ -73,7 +74,34 @@ bool Log::init ( char *filename ) {
if ( ! m_filename ) return true;
// skip this for now
return true;
//return true;
//
// RENAME log000 to log000-2013_11_04-18:19:32
//
File f;
char tmp[16];
sprintf(tmp,"log%03li",g_hostdb.m_hostId);
f.set ( g_hostdb.m_dir , tmp );
// make new filename like log000-2013_11_04-18:19:32
time_t now = getTimeLocal();
tm *tm1 = gmtime((const time_t *)&now);
char tmp2[64];
strftime(tmp2,64,"%Y_%m_%d-%T",tm1);
SafeBuf newName;
if ( ! newName.safePrintf ( "%slog%03li-%s",
g_hostdb.m_dir,
g_hostdb.m_hostId,
tmp2 ) ) {
fprintf(stderr,"log rename failed\n");
return false;
}
// rename log000 to log000-2013_11_04-18:19:32
if ( f.doesExist() ) {
//fprintf(stdout,"renaming file\n");
f.rename ( newName.getBufStart() );
}
// open it for appending.
// create with -rw-rw-r-- permissions if it's not there.
@ -82,10 +110,11 @@ bool Log::init ( char *filename ) {
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH );
if ( m_fd >= 0 ) return true;
// bitch to stderr and return false on error
fprintf(stderr,"could not open log file %s for appending\n",m_filename);
fprintf(stderr,"could not open log file %s for appending\n",
m_filename);
return false;
}
/*
static const char *getTypeString ( long type ) ;
const char *getTypeString ( long type ) {
@ -101,7 +130,7 @@ const char *getTypeString ( long type ) {
default: return " ";
}
}
*/
#define MAX_LINE_LEN 20048
bool Log::shouldLog ( long type , char *msg ) {
@ -210,6 +239,7 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
char tt [ MAX_LINE_LEN ];
char *p = tt;
char *pend = tt + MAX_LINE_LEN;
/*
// print timestamp, hostid, type
if ( g_hostdb.m_numHosts <= 999 )
sprintf ( p , "%llu %03li %s ",
@ -220,14 +250,33 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
else if ( g_hostdb.m_numHosts <= 99999 )
sprintf ( p , "%llu %05li %s ",
now , g_hostdb.m_hostId , getTypeString(type) );
p += gbstrlen ( p );
*/
// print timestamp, hostid, type
if ( m_logTimestamps ) {
if ( g_hostdb.m_numHosts <= 999 )
sprintf ( p , "%llu %03li ",
now , g_hostdb.m_hostId );
else if ( g_hostdb.m_numHosts <= 9999 )
sprintf ( p , "%llu %04li ",
now , g_hostdb.m_hostId );
else if ( g_hostdb.m_numHosts <= 99999 )
sprintf ( p , "%llu %05li ",
now , g_hostdb.m_hostId );
p += gbstrlen ( p );
}
// msg resource
char *x = msg;
long cc = 7;
// the first 7 bytes or up to the : must be ascii
while ( p < pend && *x && is_alnum_a(*x) ) { *p++ = *x++; cc--; }
//while ( p < pend && *x && is_alnum_a(*x) ) { *p++ = *x++; cc--; }
// space pad
while ( cc-- > 0 ) *p++ = ' ';
//while ( cc-- > 0 ) *p++ = ' ';
// ignore the label for now...
while ( p < pend && *x && is_alnum_a(*x) ) { x++; cc--; }
// thread id if in "thread"
if ( pid != s_pid && s_pid != -1 ) {
//sprintf ( p , "[%li] " , (long)getpid() );
@ -238,8 +287,16 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
long avail = (MAX_LINE_LEN) - (p - tt) - 1;
if ( msgLen > avail ) msgLen = avail;
if ( *x == ':' ) x++;
if ( *x == ' ' ) x++;
strncpy ( p , x , avail );
// capitalize for consistency
if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
p += gbstrlen(p);
// back up over spaces
while ( p[-1] == ' ' ) p--;
// end in period or ? or !
if ( p[-1] != '?' && p[-1] != '.' && p[-1] != '!' )
*p++ = '.';
*p ='\0';
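// Example: the log("admin: adding main collection.") call from
// Collectiondb.cpp above is emitted as "Adding main collection." --
// the alphanumeric label up to the ':' is skipped, the first letter is
// capitalized, trailing spaces are trimmed, and a '.' is appended only
// if the message does not already end in '.', '?' or '!'.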
// the total length, not including the \0
long tlen = p - tt;
@ -276,8 +333,14 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
if ( *ttp == '\t' ) *ttp = ' ';
}
// print it out for now
fprintf ( stderr, "%s\n", tt );
if ( m_fd >= 0 ) {
write ( m_fd , tt , tlen );
write ( m_fd , "\n", 1 );
}
else {
// print it out for now
fprintf ( stderr, "%s\n", tt );
}
// set the stuff in the array
m_errorMsg [m_numErrors] = msg;

Log.h

@ -139,6 +139,8 @@ class Log {
bool m_disabled;
bool m_logTimestamps;
private:
bool dumpLog ( ); // make room for the new ones

Mem.cpp

@ -429,6 +429,7 @@ pid_t s_pid = (pid_t) -1;
void Mem::setPid() {
s_pid = getpid();
//log("mem: pid is %li",(long)s_pid);
if(s_pid == -1 ) { log("monitor: bad s_pid"); char *xx=NULL;*xx=0; }
}
@ -452,7 +453,7 @@ bool Mem::init ( long long maxMem ) {
//lim.rlim_max = maxMem;
//setrlimit ( RLIMIT_AS , &lim ); // ulimit -v
// note
log(LOG_INIT,"mem: Max memory usage set to %lli bytes.", maxMem);
//log(LOG_INIT,"mem: Max memory usage set to %lli bytes.", maxMem);
// warning msg
if ( g_conf.m_detectMemLeaks )
log(LOG_INIT,"mem: Memory leak checking is enabled.");

PageBasic.cpp

@ -73,7 +73,9 @@ public:
// . uses msg4 to add seeds to spiderdb if necessary
// . only adds seeds for the shard we are on iff we are responsible for
// the fake firstip!!!
bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
bool updateSiteListTables ( collnum_t collnum ,
bool addSeeds ,
char *siteListArg ) {
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) return true;
@ -113,6 +115,8 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
}
// get the old sitelist Domain Hash to PatternData mapping table
// which tells us what domains, subdomains or paths we can or
// can not spider...
HashTableX *dt = &sc->m_siteListDomTable;
// reset it
@ -142,10 +146,10 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
// use this so it will be free automatically when msg4 completes!
SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;
char *siteList = cr->m_siteListBuf.getBufStart();
//char *siteList = cr->m_siteListBuf.getBufStart();
// scan the list
char *pn = siteList;
char *pn = siteListArg;
// completely empty?
if ( ! pn ) return true;
@ -290,7 +294,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
if ( ! isFilter ) continue;
// make the data node
// make the data node used for filtering urls during spidering
PatternData pd;
// hash of the subdomain or domain for this line in sitelist
pd.m_thingHash32 = u.getHostHash32();
@ -391,10 +395,15 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// check domain specific tables
HashTableX *dt = &sc->m_siteListDomTable;
// get this
CollectionRec *cr = sc->m_cr;
// need to build dom table for pattern matching?
if ( dt->getNumSlotsUsed() == 0 ) {
if ( dt->getNumSlotsUsed() == 0 && cr ) {
// do not add seeds, just make siteListDomTable, etc.
updateSiteList ( sc->m_collnum , false );
updateSiteListTables ( sc->m_collnum ,
false , // add seeds?
cr->m_siteListBuf.getBufStart() );
}
if ( dt->getNumSlotsUsed() == 0 ) {
@ -731,6 +740,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
char buf [ 128000 ];
SafeBuf sb(buf,128000);
sb.reset();
char *fs = hr->getString("format",NULL,NULL);
char fmt = FORMAT_HTML;
@ -764,7 +774,7 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//
// show stats
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
@ -776,45 +786,23 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
sb.safePrintf(
//sb.safePrintf(
// "<form method=get action=/crawlbot>"
// "%s"
// , sb.getBufStart() // hidden input token/name/..
// );
char *hurts = "No";
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
hurts = "Yes";
"<form method=get action=/crawlbot>"
"%s"
, sb.getBufStart() // hidden input token/name/..
);
sb.safePrintf("<TABLE border=0>"
"<TR><TD valign=top>"
"<table border=0 cellpadding=5>"
//
"<tr>"
"<td><b>Crawl Name:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Type:</td>"
"<td>%li</td>"
"</tr>"
//"<tr>"
//"<td><b>Collection Alias:</td>"
//"<td>%s%s</td>"
//"</tr>"
"<tr>"
"<td><b>Token:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Seeds:</td>"
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Status:</td>"
"<td><b>Crawl Status Code:</td>"
"<td>%li</td>"
"</tr>"
@ -823,14 +811,14 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"<td>%s</td>"
"</tr>"
"<tr>"
"<td><b>Rounds Completed:</td>"
"<td>%li</td>"
"</tr>"
//"<tr>"
//"<td><b>Rounds Completed:</td>"
//"<td>%li</td>"
//"</tr>"
"<tr>"
"<td><b>Has Urls Ready to Spider:</td>"
"<td>%li</td>"
"<td>%s</td>"
"</tr>"
@ -841,12 +829,8 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
//"</tr>"
"<tr>"
"<td><b>Objects Found</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>URLs Harvested</b> (inc. dups)</td>"
"<td><b>URLs Harvested</b> "
"(may include dups)</td>"
"<td>%lli</td>"
"</tr>"
@ -865,60 +849,24 @@ bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
"<td><b>Page Crawl Successes</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Crawl Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Attempts</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Process Successes This Round</b></td>"
"<td>%lli</td>"
"</tr>"
, cr->m_diffbotCrawlName.getBufStart()
, (long)cr->m_isCustomCrawl
, cr->m_diffbotToken.getBufStart()
, seedStr
, crawlStatus
, tmp.getBufStart()
, cr->m_spiderRoundNum
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
//, cr->m_spiderRoundNum
//, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
, hurts
, cr->m_globalCrawlInfo.m_objectsAdded -
cr->m_globalCrawlInfo.m_objectsDeleted
, cr->m_globalCrawlInfo.m_urlsHarvested
//, cr->m_globalCrawlInfo.m_urlsConsidered
, cr->m_globalCrawlInfo.m_pageDownloadAttempts
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
, cr->m_globalCrawlInfo.m_pageProcessAttempts
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
, cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
if ( fmt != FORMAT_JSON )
// wrap up the form, print a submit button
g_pages.printAdminBottom ( &sb );
//if ( fmt != FORMAT_JSON )
// // wrap up the form, print a submit button
// g_pages.printAdminBottom ( &sb );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),

PageCrawlBot.cpp

@ -25,11 +25,11 @@
#include "Parms.h"
// so user can specify the format of the reply/output
#define FMT_HTML 1
#define FMT_XML 2
#define FMT_JSON 3
#define FMT_CSV 4
#define FMT_TXT 5
//#define FMT_HTML 1
//#define FMT_XML 2
//#define FMT_JSON 3
//#define FMT_CSV 4
//#define FMT_TXT 5
void doneSendingWrapper ( void *state , TcpSocket *sock ) ;
bool sendBackDump ( TcpSocket *s,HttpRequest *hr );
@ -158,25 +158,25 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
if ( ( xx = strstr ( path , "_data.json" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_JSON;
fmt = FORMAT_JSON;
downloadJSON = true;
}
else if ( ( xx = strstr ( path , "_data.csv" ) ) ) {
rdbId = RDB_TITLEDB;
downloadJSON = true;
fmt = FMT_CSV;
fmt = FORMAT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.csv" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_CSV;
fmt = FORMAT_CSV;
}
else if ( ( xx = strstr ( path , "_urls.txt" ) ) ) {
rdbId = RDB_SPIDERDB;
fmt = FMT_TXT;
fmt = FORMAT_TXT;
}
else if ( ( xx = strstr ( path , "_pages.txt" ) ) ) {
rdbId = RDB_TITLEDB;
fmt = FMT_TXT;
fmt = FORMAT_TXT;
}
// sanity, must be one of 3 download calls
@ -213,7 +213,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// . if doing download of csv, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) {
if ( fmt == FORMAT_CSV && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
long dr = 1;
@ -247,7 +247,7 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
// . if doing download of json, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_JSON && rdbId == RDB_TITLEDB ) {
if ( fmt == FORMAT_JSON && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
long dr = 1;
@ -514,13 +514,13 @@ bool StateCD::sendList ( ) {
//sb.setLabel("dbotdmp");
char *ct = "text/csv";
if ( m_fmt == FMT_JSON )
if ( m_fmt == FORMAT_JSON )
ct = "application/json";
if ( m_fmt == FMT_XML )
if ( m_fmt == FORMAT_XML )
ct = "text/xml";
if ( m_fmt == FMT_TXT )
if ( m_fmt == FORMAT_TXT )
ct = "text/plain";
if ( m_fmt == FMT_CSV )
if ( m_fmt == FORMAT_CSV )
ct = "text/csv";
// . if we haven't yet sent an http mime back to the user
@ -545,13 +545,13 @@ bool StateCD::sendList ( ) {
//CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! m_printedFirstBracket && m_fmt == FMT_JSON ) {
if ( ! m_printedFirstBracket && m_fmt == FORMAT_JSON ) {
sb.safePrintf("[\n");
m_printedFirstBracket = true;
}
// these are csv files not xls
//if ( ! m_printedFirstBracket && m_fmt == FMT_CSV ) {
//if ( ! m_printedFirstBracket && m_fmt == FORMAT_CSV ) {
// sb.safePrintf("sep=,\n");
// m_printedFirstBracket = true;
//}
@ -638,7 +638,7 @@ bool StateCD::sendList ( ) {
// use this for printing out urls.csv as well...
m_printedEndingBracket = true;
// end array of json objects. might be empty!
if ( m_rdbId == RDB_TITLEDB && m_fmt == FMT_JSON )
if ( m_rdbId == RDB_TITLEDB && m_fmt == FORMAT_JSON )
sb.safePrintf("\n]\n");
//log("adding ]. len=%li",sb.length());
// i'd like to exit streaming mode here. i fixed tcpserver.cpp
@ -853,7 +853,7 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
}
// "csv" is default if json not specified
if ( m_fmt == FMT_JSON )
if ( m_fmt == FORMAT_JSON )
sb->safePrintf("[{"
"{\"url\":"
"\"%s\"},"
@ -997,7 +997,7 @@ void StateCD::printTitledbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// if not json, just print the json item out in csv
// moved into PageResults.cpp...
//if ( m_fmt == FMT_CSV ) {
//if ( m_fmt == FORMAT_CSV ) {
// printJsonItemInCsv ( json , sb );
// continue;
//}
@ -1337,7 +1337,7 @@ bool sendReply2 (TcpSocket *socket , long fmt , char *msg ) {
// send this back to browser
SafeBuf sb;
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
sb.safePrintf("{\n\"response\":\"success\",\n"
"\"message\":\"%s\"\n}\n"
, msg );
@ -1368,7 +1368,7 @@ bool sendErrorReply2 ( TcpSocket *socket , long fmt , char *msg ) {
// send this back to browser
SafeBuf sb;
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
sb.safePrintf("{\"error\":\"%s\"}\n"
, msg );
ct = "application/json";
@ -1476,7 +1476,7 @@ void injectedUrlWrapper ( void *state ) {
// send back the html or json response?
SafeBuf *response = &sb;
if ( st->m_fmt == FMT_JSON ) response = &js;
if ( st->m_fmt == FORMAT_JSON ) response = &js;
// . this will call g_httpServer.sendReply()
// . pass it in the injection response, "sb"
@ -1673,7 +1673,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// . now show stats for the current crawl
// . put in xml or json if format=xml or format=json or
// xml=1 or json=1 ...
char fmt = FMT_JSON;
char fmt = FORMAT_JSON;
// token is always required. get from json or html form input
//char *token = getInputString ( "token" );
@ -1693,21 +1693,21 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
name++;
}
// change default formatting to html
fmt = FMT_HTML;
fmt = FORMAT_HTML;
}
char *fs = hr->getString("format",NULL,NULL);
// give john a json api
if ( fs && strcmp(fs,"html") == 0 ) fmt = FMT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FMT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FMT_XML;
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
// if we got json as input, give it as output
//if ( JS.getFirstItem() ) fmt = FMT_JSON;
//if ( JS.getFirstItem() ) fmt = FORMAT_JSON;
if ( ! token && fmt == FMT_JSON ) { // (cast==0|| fmt == FMT_JSON ) ) {
if ( ! token && fmt == FORMAT_JSON ) { // (cast==0|| fmt == FORMAT_JSON ) ) {
char *msg = "invalid token";
return sendErrorReply2 (socket,fmt,msg);
}
@ -1772,7 +1772,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
//}
// just send back a list of all the collections after the delete
//if ( delColl && cast && fmt == FMT_JSON ) {
//if ( delColl && cast && fmt == FORMAT_JSON ) {
// char *msg = "Collection deleted.";
// return sendReply2 (socket,fmt,msg);
//}
@ -2263,7 +2263,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
/*
bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("\"urlFilters\":[");
// skip first filters that are:
@ -2303,7 +2303,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
// urls higher spider priority, so skip it
if ( strncmp(expression,"ismanualadd && ",15) == 0 )
continue;
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<tr>"
"<td>Expression "
"<input type=text "
@ -2328,7 +2328,7 @@ bool printUrlFilters ( SafeBuf &sb , CollectionRec *cr , long fmt ) {
sb.pushChar('\n');
}
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
// remove trailing comma
sb.removeLastChar('\n');
sb.removeLastChar(',');
@ -2519,7 +2519,7 @@ bool printCrawlDetailsInJson ( SafeBuf *sb , CollectionRec *cx ) {
true // isJSON?
);
*/
//printUrlFilters ( sb , cx , FMT_JSON );
//printUrlFilters ( sb , cx , FORMAT_JSON );
// end that collection rec
sb->safePrintf("}\n");
@ -2537,7 +2537,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// store output into here
SafeBuf sb;
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"<html>"
"<title>Crawlbot - "
@ -2573,7 +2573,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
lb.urlEncode(name);
lb.safePrintf ("&token=");
lb.urlEncode(token);
if ( fmt == FMT_HTML ) lb.safePrintf("&format=html");
if ( fmt == FORMAT_HTML ) lb.safePrintf("&format=html");
lb.nullTerm();
@ -2590,7 +2590,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//}
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<table border=0>"
"<tr><td>"
"<b><font size=+2>"
@ -2645,7 +2645,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// print list of collections controlled by this token
//
for ( long i = 0 ; fmt == FMT_HTML && i<g_collectiondb.m_numRecs;i++ ){
for ( long i = 0 ; fmt == FORMAT_HTML && i<g_collectiondb.m_numRecs;i++ ){
CollectionRec *cx = g_collectiondb.m_recs[i];
if ( ! cx ) continue;
// get its token if any
@ -2677,19 +2677,19 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
sb.safePrintf("</font></b>");
}
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf ( "</center><br/>" );
// the ROOT JSON [
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("{\n");
// injection is currently not in use, so this is an artifact:
if ( fmt == FMT_JSON && injectionResponse )
if ( fmt == FORMAT_JSON && injectionResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, injectionResponse->getBufStart() );
if ( fmt == FMT_JSON && urlUploadResponse )
if ( fmt == FORMAT_JSON && urlUploadResponse )
sb.safePrintf("\"response\":\"%s\",\n\n"
, urlUploadResponse->getBufStart() );
@ -2702,14 +2702,14 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// the items in the array now have type:bulk or type:crawl
// so call them 'jobs'
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("\"jobs\":[");//\"collections\":");
long summary = hr->getLong("summary",0);
// enter summary mode for json
if ( fmt != FMT_HTML ) summary = 1;
if ( fmt != FORMAT_HTML ) summary = 1;
// start the table
if ( summary && fmt == FMT_HTML ) {
if ( summary && fmt == FORMAT_HTML ) {
sb.safePrintf("<table border=1 cellpadding=5>"
"<tr>"
"<td><b>Collection</b></td>"
@ -2740,11 +2740,11 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// just print out single crawl info for json
if ( fmt != FMT_HTML && cx != cr && name3 )
if ( fmt != FORMAT_HTML && cx != cr && name3 )
continue;
// if json, print each collectionrec
if ( fmt == FMT_JSON ) {
if ( fmt == FORMAT_JSON ) {
if ( ! firstOne )
sb.safePrintf(",\n\t");
firstOne = false;
@ -2786,7 +2786,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, cx->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
);
}
if ( summary && fmt == FMT_HTML ) {
if ( summary && fmt == FORMAT_HTML ) {
sb.safePrintf("</table></html>" );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
@ -2794,7 +2794,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
0); // cachetime
}
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
// end the array of collection objects
sb.safePrintf("\n]\n");
@ -2808,7 +2808,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show urls being crawled (ajax) (from Spider.cpp)
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf ( "<table width=100%% cellpadding=5 "
"style=border-width:1px;border-style:solid;"
"border-color:black;>"
@ -2879,7 +2879,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
rand64 |= r2;
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<br>"
"<table border=0 cellpadding=5>"
@ -2952,12 +2952,12 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
);
}
if ( injectionResponse && fmt == FMT_HTML )
if ( injectionResponse && fmt == FORMAT_HTML )
sb.safePrintf("<br><font size=-1>%s</font>\n"
,injectionResponse->getBufStart()
);
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(//"<input type=hidden name=c value=\"%s\">"
//"<input type=hidden name=crawlbotapi value=1>"
"</td>"
@ -2996,7 +2996,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show stats
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
@ -3654,7 +3654,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// xml or json does not show the input boxes
//if ( format != FMT_HTML )
//if ( format != FORMAT_HTML )
// return g_httpServer.sendDynamicPage ( s,
// sb.getBufStart(),
// sb.length(),
@ -3677,7 +3677,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
s2 = "";
}
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"<a onclick="
@ -3721,7 +3721,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// print url filters. HACKy...
//
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
g_parms.sendPageGeneric ( socket ,
hr ,
PAGE_FILTERS ,
@ -3732,7 +3732,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// end HACKy hack
//
if ( fmt == FMT_HTML )
if ( fmt == FORMAT_HTML )
sb.safePrintf(
"</form>"
"</div>"
@ -3760,7 +3760,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show simpler url filters table
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
/*
sb.safePrintf ( "<table>"
"<tr><td colspan=2>"
@ -3796,7 +3796,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
//
// show reset and delete crawl buttons
//
if ( fmt == FMT_HTML ) {
if ( fmt == FORMAT_HTML ) {
sb.safePrintf(
"<table cellpadding=5>"
"<tr>"
@ -3859,13 +3859,13 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// the ROOT JSON }
if ( fmt == FMT_JSON )
if ( fmt == FORMAT_JSON )
sb.safePrintf("}\n");
char *ct = "text/html";
if ( fmt == FMT_JSON ) ct = "application/json";
if ( fmt == FMT_XML ) ct = "text/xml";
if ( fmt == FMT_CSV ) ct = "text/csv";
if ( fmt == FORMAT_JSON ) ct = "application/json";
if ( fmt == FORMAT_XML ) ct = "text/xml";
if ( fmt == FORMAT_CSV ) ct = "text/csv";
// this could be in html json or xml
return g_httpServer.sendDynamicPage ( socket,
@ -4142,7 +4142,7 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
char *json = hr->getString("json");
if ( ! json )
return sendReply2 ( socket,
FMT_JSON,
FORMAT_JSON,
"No &json= provided in request.");
@ -4151,12 +4151,12 @@ bool setSpiderParmsFromJSONPost ( TcpSocket *socket ,
// wtf?
if ( ! status )
return sendReply2 ( socket, FMT_JSON,
return sendReply2 ( socket, FORMAT_JSON,
"Error with JSON parser.");
// error adding it?
if ( ! cr )
return sendReply2 ( socket,FMT_JSON,
return sendReply2 ( socket,FORMAT_JSON,
"Failed to create new collection.");
ji = JP.getFirstItem();

PageRoot.cpp

@ -169,6 +169,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("</form>\n");
sb.safePrintf("<br>\n");
sb.safePrintf("\n");
// print any red boxes we might need to
if ( printRedBox2 ( &sb , true ) )
sb.safePrintf("<br>\n");
sb.safePrintf("<table cellpadding=3>\n");
sb.safePrintf("\n");

Pages.cpp

@ -50,6 +50,9 @@ static WebPage s_pages[] = {
"dummy page - if set in the users row then user will have master=0 and "
" collection links will be highlighted in red",
NULL, 0 },
//{ PAGE_QUALITY , "quality", 0, "quality", 0, 0,
// "dummy page - if set in the users row then \"Quality Control\""
// " will be printed besides the logo for certain pages",
@ -102,12 +105,66 @@ static WebPage s_pages[] = {
// "Basic diffbot page.", sendPageBasicDiffbot , 0 } ,
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 ,
"Basic security page.", sendPageGeneric , 0 } ,
{ PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
"Basic search page.", sendPageRoot , 0 } ,
{ PAGE_MASTER , "admin/master" , 0 , "master controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"master controls page",
sendPageGeneric , 0 } ,
{ PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 } ,
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
//USER_ADMIN | USER_MASTER | USER_PROXY ,
"spider controls page",
sendPageGeneric , 0 } ,
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY,
"log page",
sendPageGeneric , 0 } ,
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"advanced security page",
sendPageGeneric , 0 } ,
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
//USER_MASTER ,
"add a new collection using this page",
sendPageAddColl , 0 } ,
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
//USER_MASTER ,
"delete a collection using this page",
sendPageDelColl , 0 } ,
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
//USER_MASTER ,
"repair page",
sendPageGeneric , 0 },
{ PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
"what sites can be spidered",
sendPageGeneric , 0 } , // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 } ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
sendPageInject , 2 } ,
// this is the addurl page for the admin!
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
"add url page for admin",
sendPageAddUrl2 , 0 } ,
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
//USER_ADMIN | USER_MASTER,
"reindex url page",
sendPageReindex , 0 } ,
{ PAGE_HOSTS , "admin/hosts" , 0 , "hosts" , 0 , 0 ,
//USER_MASTER | USER_PROXY,
@ -134,10 +191,7 @@ static WebPage s_pages[] = {
//USER_MASTER | USER_PROXY,
"sockets page",
sendPageSockets , 0 } ,
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY,
"log page",
sendPageGeneric , 0 } ,
{ PAGE_LOGVIEW , "admin/logview" , 0 , "log view" , 0 , 0 ,
//USER_MASTER ,
"logview page",
@ -147,18 +201,6 @@ static WebPage s_pages[] = {
// "sync page",
// sendPageGeneric , 0 } ,
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
//USER_MASTER | USER_PROXY ,
"advanced security page",
sendPageGeneric , 0 } ,
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
//USER_MASTER ,
"add a new collection using this page",
sendPageAddColl , 0 } ,
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
//USER_MASTER ,
"delete a collection using this page",
sendPageDelColl , 0 } ,
{ PAGE_AUTOBAN ,"admin/autoban" , 0 , "autoban" , 1 , 1 ,
//USER_MASTER | USER_PROXY ,
"autobanned ips",
@ -175,10 +217,6 @@ static WebPage s_pages[] = {
//USER_MASTER ,
"threads page",
sendPageThreads , 0 },
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
//USER_MASTER ,
"repair page",
sendPageGeneric , 0 },
//{ PAGE_THESAURUS, "admin/thesaurus", 0 , "thesaurus", 0 , 0 ,
// //USER_MASTER ,
// "thesaurus page",
@ -207,14 +245,6 @@ static WebPage s_pages[] = {
"titledb page",
sendPageTitledb , 2 } ,
// 1 = usePost
{ PAGE_SEARCH , "admin" , 0 , "search controls" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 } ,
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" , 1 , 0,
//USER_ADMIN | USER_MASTER | USER_PROXY ,
"spider controls page",
sendPageGeneric , 0 } ,
{ PAGE_CRAWLBOT , "crawlbot" , 0 , "crawlbot" , 1 , 0,
"simplified spider controls page",
@ -229,30 +259,6 @@ static WebPage s_pages[] = {
// "spider priorities page",
// sendPageGeneric , 0 } ,
{ PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
"what sites can be spidered",
sendPageGeneric , 0 } , // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 } ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
sendPageInject , 2 } ,
// this is the addurl page for the admin!
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
"add url page for admin",
sendPageAddUrl2 , 0 } ,
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
//USER_ADMIN | USER_MASTER,
"reindex url page",
sendPageReindex , 0 } ,
//{ PAGE_KEYWORDS, "admin/queries",0,"queries" , 0 , 1 ,
// "get queries a url matches",
// sendPageMatchingQueries , 2 } ,
@ -893,8 +899,6 @@ bool Pages::getNiceness ( long page ) {
return s_pages[page].m_niceness;
}
bool printRedBox ( SafeBuf *mb ) ;
///////////////////////////////////////////////////////////
//
// Convenient html printing routines
@ -1056,6 +1060,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//if ( page == PAGE_BASIC_DIFFBOT ) isBasic = true;
//if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
if ( page == PAGE_BASIC_SECURITY ) isBasic = true;
if ( page == PAGE_BASIC_SEARCH ) isBasic = true;
//
// print breadcrumb. main > Basic > Settings
@ -1791,7 +1796,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
// is this page basic?
bool pageBasic = false;
if ( i >= PAGE_BASIC_SETTINGS &&
i <= PAGE_BASIC_SECURITY )
i <= PAGE_BASIC_SEARCH )
pageBasic = true;
// print basic pages under the basic menu, advanced pages
@ -2627,9 +2632,18 @@ bool sendPageLogin ( TcpSocket *socket , HttpRequest *hr ) {
NULL);// cookie
}
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage ) {
SafeBuf mb;
// return false if no red box
if ( ! printRedBox ( &mb , isRootWebPage ) ) return false;
// otherwise, print it
sb->safeStrcpy ( mb.getBufStart() );
// return true since we printed one
return true;
}
// emergency message box
bool printRedBox ( SafeBuf *mb ) {
bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) {
PingServer *ps = &g_pingServer;
@ -2649,11 +2663,33 @@ bool printRedBox ( SafeBuf *mb ) {
char *boxEnd =
"</td></tr></table>";
bool adds = false;
long adds = 0;
mb->safePrintf("<div style=max-width:500px;>");
// are we just starting off? give them a little help.
CollectionRec *cr = g_collectiondb.getRec("main");
if ( g_collectiondb.m_numRecs == 1 &&
cr &&
isRootWebPage &&
cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
mb->safePrintf("%s",box);
mb->safePrintf("Welcome to Gigablast. The most powerful "
"search engine you can legally download. "
"Please add the websites you want to spider "
"<a href=/admin/settings?c=main>here</a>."
);
mb->safePrintf("%s",boxEnd);
}
if ( isRootWebPage ) {
mb->safePrintf("</div>");
return (bool)adds;
}
if ( g_conf.m_numConnectIps == 0 && g_conf.m_numMasterPwds == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
@ -2738,5 +2774,5 @@ bool printRedBox ( SafeBuf *mb ) {
mb->safePrintf("</div>");
return adds;
return (bool)adds;
}

Pages.h

@ -5,6 +5,9 @@
#ifndef _PAGES_H_
#define _PAGES_H_
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage = false ) ;
bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ;
// for PageEvents.cpp and Accessdb.cpp
//#define RESULTSWIDTHSTR "550px"
@ -306,25 +309,36 @@ enum {
//PAGE_BASIC_SEARCH , // TODO
//PAGE_BASIC_DIFFBOT , // TODO
PAGE_BASIC_SECURITY ,
PAGE_BASIC_SEARCH ,
// master admin pages
PAGE_MASTER ,
PAGE_SEARCH ,
PAGE_SPIDER ,
PAGE_LOG ,
PAGE_SECURITY ,
PAGE_ADDCOLL ,
PAGE_DELCOLL ,
PAGE_REPAIR ,
PAGE_SITES , // site filters
PAGE_FILTERS ,
PAGE_INJECT ,
PAGE_ADDURL2 ,
PAGE_REINDEX ,
PAGE_HOSTS ,
PAGE_STATS , // 10
PAGE_STATSDB ,
PAGE_PERF ,
PAGE_SOCKETS ,
PAGE_LOG ,
PAGE_LOGVIEW ,
// PAGE_SYNC ,
PAGE_SECURITY ,
PAGE_ADDCOLL ,
PAGE_DELCOLL ,
PAGE_AUTOBAN , // 20
//PAGE_SPIDERLOCKS ,
PAGE_PROFILER ,
PAGE_THREADS ,
PAGE_REPAIR ,
// PAGE_THESAURUS ,
// . non master-admin pages (collection controls)
@ -337,16 +351,9 @@ enum {
PAGE_TITLEDB ,
//PAGE_STATSDB ,
PAGE_SEARCH ,
PAGE_SPIDER ,
PAGE_CRAWLBOT , // 35
PAGE_SPIDERDB ,
//PAGE_PRIORITIES , // priority queue controls
PAGE_SITES , // site filters
PAGE_FILTERS ,
PAGE_INJECT ,
PAGE_ADDURL2 ,
PAGE_REINDEX ,
//PAGE_KEYWORDS ,
PAGE_SEO ,
PAGE_ACCESS , //40

Parms.cpp

@ -122,6 +122,40 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) ;
//
////////
// from PageBasic.cpp:
bool updateSiteListTables(collnum_t collnum,bool addSeeds,char *siteListArg);
bool CommandUpdateSiteList ( char *rec ) {
// caller must specify collnum
collnum_t collnum = getCollnumFromParmRec ( rec );
if ( collnum < 0 ) {
log("parms: bad collnum for update site list");
g_errno = ENOCOLLREC;
return true;
}
// sanity
long dataSize = getDataSizeFromParmRec ( rec );
if ( dataSize < 0 ) {
log("parms: bad site list size = %li bad!",dataSize);
g_errno = EBADENGINEER;
return true;
}
// need this
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) return true;
// get the sitelist
char *data = getDataFromParmRec ( rec );
// update it
updateSiteListTables ( collnum ,
true , // add NEW seeds?
data // entire sitelist
);
// now that we deduped the old site list with the new one for
// purposes of adding NEW seeds, we can do the final copy
cr->m_siteListBuf.set ( data );
return true;
}
// . require the user to manually execute this to prevent us from fucking
// up the data initially because of a bad hosts.conf file!!!
// . maybe put a red 'A' in the hosts table on the web page to indicate
@ -1888,7 +1922,7 @@ bool Parms::printParm ( SafeBuf* sb,
"value=\"%f\" "
// 3 was ok on firefox but need 6
// on chrome
"size=6>",cgi,*(float *)s);
"size=7>",cgi,*(float *)s);
}
else if ( t == TYPE_IP ) {
if ( m->m_max > 0 && j == jend )
@ -1896,7 +1930,7 @@ bool Parms::printParm ( SafeBuf* sb,
"size=12>",cgi);
else
sb->safePrintf ("<input type=text name=%s value=\"%s\" "
"size=6>",cgi,iptoa(*(long *)s));
"size=12>",cgi,iptoa(*(long *)s));
}
else if ( t == TYPE_LONG ) {
// just show the parm name and value if printing in json
@ -2942,7 +2976,7 @@ bool Parms::setFromFile ( void *THIS ,
// make sure we're init'd
init();
// let em know
if ( THIS == &g_conf ) log (LOG_INIT,"conf: Reading %s." , filename );
//if ( THIS == &g_conf) log (LOG_INIT,"conf: Reading %s." , filename );
// . let the log know what we are doing
// . filename is NULL if a call from CollectionRec::setToDefaults()
Xml xml;
@ -3121,10 +3155,12 @@ bool Parms::setFromFile ( void *THIS ,
// log("conf: %s does not have <%s> tag. "
// "Ommitting.",filename,m->m_xml);
//else
/*
if ( ! m->m_def ) //m->m_def[0] )
log("conf: %s does not have <%s> tag. Using "
"default value of \"%s\".", filename,
m->m_xml,m->m_def);
*/
continue;
}
// . next node is the value of this tag
@ -7534,6 +7570,7 @@ void Parms::init ( ) {
m->m_flags = PF_TEXTAREA;
m++;
/*
// the new upload post submit button
m->m_title = "upload urls";
m->m_desc = "Upload your file of urls.";
@ -7542,6 +7579,7 @@ void Parms::init ( ) {
m->m_obj = OBJ_NONE;
m->m_type = TYPE_FILEUPLOADBUTTON;
m++;
*/
m->m_title = "strip sessionids";
m->m_desc = "Strip added urls of their session ids.";
@ -7591,6 +7629,7 @@ void Parms::init ( ) {
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"See <a href=#examples>example site list</a> below. "
"Gigablast uses the "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
@ -7599,8 +7638,7 @@ void Parms::init ( ) {
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"See <a href=#examples>example site list</a> below. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
m->m_cgi = "sitelist";
@ -7608,6 +7646,7 @@ void Parms::init ( ) {
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_SAFEBUF;
m->m_func = CommandUpdateSiteList;
m->m_def = "";
// rebuild urlfilters now will nuke doledb and call updateSiteList()
m->m_flags = PF_TEXTAREA | PF_DUP | PF_REBUILDURLFILTERS;
@ -7629,6 +7668,7 @@ void Parms::init ( ) {
m++;
*/
/*
// the new upload post submit button
m->m_title = "upload site list";
m->m_desc = "Upload your file of site patterns. Completely replaces "
@ -7640,12 +7680,13 @@ void Parms::init ( ) {
m->m_type = TYPE_FILEUPLOADBUTTON;
m->m_flags = PF_NOSAVE | PF_DUP;
m++;
*/
m->m_title = "restart collection";
m->m_desc = "Remove all documents from this collection and starts "
"spidering over again. If you do this accidentally there "
"is a <a href=/admin.html#recover>recovery procedure</a> to "
"get back the trashed data.";
m->m_desc = "Remove all documents from this collection and restart "
"spidering.";// If you do this accidentally there "
//"is a <a href=/admin.html#recover>recovery procedure</a> to "
// "get back the trashed data.";
m->m_cgi = "restart";
m->m_page = PAGE_BASIC_SETTINGS;
m->m_obj = OBJ_COLL;
@ -7659,6 +7700,7 @@ void Parms::init ( ) {
m->m_title = "site list";
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"See <a href=#examples>example site list</a> below. "
"Gigablast uses the "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
@ -7667,8 +7709,7 @@ void Parms::init ( ) {
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"See <a href=#examples>example site list</a> below. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
"Limit list to 300MB. If you have a lot of INDIVIDUAL urls "
"to add then consider using the <a href=/admin/addurl>addurl"
"</a> interface.";
m->m_cgi = "sitelist";
@ -7676,6 +7717,7 @@ void Parms::init ( ) {
m->m_page = PAGE_SITES;
m->m_obj = OBJ_COLL;
m->m_type = TYPE_SAFEBUF;
m->m_func = CommandUpdateSiteList;
m->m_def = "";
// rebuild urlfilters now will nuke doledb and call updateSiteList()
m->m_flags = PF_TEXTAREA | PF_REBUILDURLFILTERS;
@ -8762,11 +8804,11 @@ void Parms::init ( ) {
m++;
m->m_title = "max robots.txt cache age";
m->m_desc = "How many second to cache a robots.txt file for. "
m->m_desc = "How many seconds to cache a robots.txt file for. "
"86400 is 1 day. 0 means Gigablast will not read from the "
"cache at all and will download the robots.txt before every "
"page if robots.txt use is enabled above. However, if this is "
"0 then Gigablast will still store robots.txt files into the "
"0 then Gigablast will still store robots.txt files in the "
"cache.";
m->m_cgi = "mrca";
m->m_off = (char *)&cr.m_maxRobotsCacheAge - x;
@ -10639,8 +10681,9 @@ void Parms::init ( ) {
m++;
m->m_title = "do query expansion";
m->m_desc = "Query expansion will include word stems and synonyms in "
"its search results.";
m->m_desc = "If enabled, query expansion will expand your query "
"to include word stems and "
"synonyms of the query terms.";
m->m_def = "1";
m->m_off = (char *)&cr.m_queryExpansion - x;
m->m_soff = (char *)&si.m_queryExpansion - y;
@ -10653,7 +10696,7 @@ void Parms::init ( ) {
// more general parameters
m->m_title = "max search results";
m->m_desc = "What is the limit to the total number "
m->m_desc = "What is the maximum total number "
"of returned search results.";
m->m_cgi = "msr";
m->m_off = (char *)&cr.m_maxSearchResults - x;
@ -12457,7 +12500,7 @@ void Parms::init ( ) {
m++;
m->m_title = "max summary line width";
m->m_desc = "<br> tags are inserted to keep the number "
m->m_desc = "&lt;br&gt; tags are inserted to keep the number "
"of chars in the summary per line at or below this width. "
"Strings without spaces that exceed this "
"width are not split.";
@ -18068,7 +18111,11 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
}
// cmd to execute?
if ( parm->m_type == TYPE_CMD ) {
if ( parm->m_type == TYPE_CMD ||
// sitelist is a safebuf but it requires special deduping
// logic to update it so it uses CommandUpdateSiteList() to
// do the updating
parm->m_func ) {
// all parm rec data for TYPE_CMD should be ascii/utf8 chars
// and should be \0 terminated
char *data = getDataFromParmRec ( rec );

Profiler.cpp

@ -1392,7 +1392,7 @@ Profiler::getStackFrame(int sig) {
void
Profiler::startRealTimeProfiler() {
log(LOG_INIT, "admin: MLT starting real time profiler");
log(LOG_INIT, "admin: starting real time profiler");
if(!m_frameTraces) {
m_frameTraces = (FrameTrace *)mmalloc(
sizeof(FrameTrace) * MAX_FRAME_TRACES, "FrameTraces");
@ -1414,7 +1414,7 @@ Profiler::startRealTimeProfiler() {
void
Profiler::stopRealTimeProfiler(const bool keepData) {
log(LOG_INIT, "admin: MLT stopping real time profiler");
log(LOG_INIT, "admin: stopping real time profiler");
struct itimerval value;
int which = ITIMER_REAL;
getitimer( which, &value );

Sections.cpp

@ -15163,7 +15163,7 @@ bool Sections::printVotingInfoInJSON ( SafeBuf *sb ) {
// breathe
QUICKPOLL ( m_niceness );
// print this section
printSectionDiv ( sk , FMT_JSON ); // forProCog );
printSectionDiv ( sk , FORMAT_JSON ); // forProCog );
// advance
long b = sk->m_b;
// stop if last
@ -15190,7 +15190,8 @@ bool Sections::print2 ( SafeBuf *sbuf ,
HashTableX *st2 ,
HashTableX *tt ,
Addresses *aa ,
char format ) { // bool forProCog ){//FMT_PROCOG FMT_JSON HTML
char format ) { // bool forProCog ){
//FORMAT_PROCOG FORMAT_JSON HTML
//sbuf->safePrintf("<b>Sections in Document</b>\n");
@ -15244,7 +15245,7 @@ bool Sections::print2 ( SafeBuf *sbuf ,
sk = m_sectionPtrs[b];
}
if ( format != FMT_HTML ) return true; // forProCog
if ( format != FORMAT_HTML ) return true; // forProCog
// print header
char *hdr =
@ -15553,7 +15554,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
// m_sbuf->safePrintf("A=%li ",sk->m_a);
if ( format == FMT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
if ( format == FORMAT_PROCOG && sk->m_stats.m_numUniqueSites >= 2 ) {
// do not count our own site!
m_sbuf->safePrintf("<i>"
"<font size=-1>"
@ -15573,7 +15574,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
m_sbuf->safePrintf("<i>");
if ( format == FMT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
if ( format == FORMAT_PROCOG && (sk->m_flags & SEC_SENTENCE) ) {
sec_t f = sk->m_flags;
//if ( f & SEC_SENTENCE )
// m_sbuf->safePrintf("sentence " );
@ -15598,7 +15599,7 @@ bool Sections::printSectionDiv ( Section *sk , char format ) { // bool forProCog
// m_sbuf->safePrintf("notdupvotes=%li ",
// sk->m_votesForNotDup);
if ( format != FMT_PROCOG ) {
if ( format != FORMAT_PROCOG ) {
// print the flags
m_sbuf->safePrintf("A=%li ",sk->m_a);

View File

@ -12414,7 +12414,7 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
if ( cx->m_isCustomCrawl )
return msg->safePrintf("Job is in progress.");
else
return true;
return msg->safePrintf("Spider is in progress.");
}
// pattern is a ||-separated list of substrings

View File

@ -324,8 +324,11 @@ retry19:
g_errno = errno;
//if ( g_errno == EINVAL ) { port++; goto again; }
close ( m_sock );
return log("tcp: Failed to bind socket: %s.",
mstrerror(g_errno));
fprintf(stderr,"Failed to bind socket on port %li: %s."
"\n"
"Are you already running gb?\n",
(long)port,mstrerror(g_errno));
return false;
}
close ( m_sock );
return true;
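
A bind failure here almost always means another gb instance already owns the port, hence the friendlier stderr message; main.cpp below calls testBind() to probe for exactly this before doing any expensive startup work. A minimal sketch of such a probe, assuming plain POSIX sockets (portIsFree is a hypothetical name):

	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <netinet/in.h>

	// returns true if we could bind the TCP port, false otherwise
	static bool portIsFree ( long port ) {
		int fd = socket ( AF_INET , SOCK_STREAM , 0 );
		if ( fd < 0 ) return false;
		struct sockaddr_in sa;
		memset ( &sa , 0 , sizeof(sa) );
		sa.sin_family      = AF_INET;
		sa.sin_addr.s_addr = htonl ( INADDR_ANY );
		sa.sin_port        = htons ( (unsigned short)port );
		bool ok = ( bind ( fd , (struct sockaddr *)&sa , sizeof(sa) ) == 0 );
		close ( fd );
		return ok;
	}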

View File

@ -173,7 +173,16 @@ void Threads::reset ( ) {
m_threadQueues[i].reset();
}
Threads::Threads ( ) {
m_numQueues = 0;
m_initialized = false;
}
bool Threads::init ( ) {
if ( m_initialized ) return true;
m_initialized = true;
m_needsCleanup = false;
//m_needBottom = false;
@ -183,18 +192,18 @@ bool Threads::init ( ) {
// set s_pid to the main process id
#ifdef PTHREADS
s_pid = pthread_self();
log(LOG_INFO,
"threads: main process THREAD id = %lu",(long unsigned)s_pid);
//log(LOG_INFO,
// "threads: main process THREAD id = %lu",(long unsigned)s_pid);
pthread_t tid = pthread_self();
sched_param param;
int policy;
// scheduling parameters of target thread
pthread_getschedparam ( tid, &policy, &param);
log(LOG_INFO,
"threads: min/max thread priority settings = %li/%li (policy=%li)",
(long)sched_get_priority_min(policy),
(long)sched_get_priority_max(policy),
(long)policy);
//log(LOG_INFO,
// "threads: min/max thread priority settings = %li/%li (policy=%li)",
// (long)sched_get_priority_min(policy),
// (long)sched_get_priority_max(policy),
// (long)policy);
#else
s_pid = getpid();
#endif
@ -422,6 +431,10 @@ bool Threads::call ( char type ,
// don't spawn any if disabled
if ( m_disabled ) return false;
if ( ! g_conf.m_useThreads ) return false;
if ( ! m_initialized && ! init() )
return log("db: Threads init failed." );
// . sanity check
// . a thread can NOT call this
//if ( getpid() != s_pid ) {
@ -447,7 +460,7 @@ bool Threads::call ( char type ,
if ( i == m_numQueues ) {
g_errno = EBADENGINEER;
return log(LOG_LOGIC,"thread: addtoqueue: Unregistered "
"thread type");
"thread type %li",(long)type);
}
// debug msg
//log("thread: call: adding entry for thread");

View File

@ -165,7 +165,7 @@ class Threads {
public:
Threads() { m_numQueues = 0; };
Threads();
// returns false and sets errno on error, true otherwise
bool init();
@ -257,6 +257,7 @@ class Threads {
bool m_needsCleanup;
//bool m_needBottom;
bool m_initialized;
// private:

View File

@ -1,11 +1,11 @@
# List of sites to spider, one per line. Gigablast uses the <a
# List of sites to spider, one per line. See <a href=#examples>example site
# list</a> below. Gigablast uses the <a
# href=/admin/filters#insitelist>insitelist</a> directive on the <a
# href=/admin/filters>url filters</a> page to make sure that the spider only
# indexes urls that match the site patterns you specify here, other than urls
# you add individually via the add urls or inject url tools. See <a
# href=#examples>example site list</a> below. Limit list to 300MB. If you have
# a lot of INDIVIDUAL URLS to add then consider using the <a
# href=/admin/addurl>addurl</a> interface.
# you add individually via the add urls or inject url tools. Limit list to
# 300MB. If you have a lot of INDIVIDUAL urls to add then consider using the
# <a href=/admin/addurl>addurl</a> interface.
<siteList><![CDATA[]]></>
# All <, >, " and # characters that are values for a field contained herein
@ -25,10 +25,10 @@
# If this is true Gigablast will respect the robots.txt convention.
<useRobotstxt>1</>
# How many second to cache a robots.txt file for. 86400 is 1 day. 0 means
# How many seconds to cache a robots.txt file for. 86400 is 1 day. 0 means
# Gigablast will not read from the cache at all and will download the
# robots.txt before every page if robots.txt use is enabled above. However, if
# this is 0 then Gigablast will still store robots.txt files into the cache.
# this is 0 then Gigablast will still store robots.txt files in the cache.
<maxRobotstxtCacheAge>86400</>
# Do a tight merge on posdb and titledb at this time every day. This is
@ -146,10 +146,11 @@
# because it is not included by default.
<getDocidScoringInfo>1</>
# Query expansion will include word stems and synonyms in its search results.
# If enabled, query expansion will expand your query to include word stems and
# synonyms of the query terms.
<doQueryExpansion>1</>
# What is the limit to the total number of returned search results.
# What is the maximum total number of returned search results?
<maxSearchResults>1000</>
# What is the limit to the total number of returned search results per query?
@ -205,9 +206,9 @@
# What is the default number of summary excerpts displayed per search result?
<defaultNumberOfSummaryExcerpts>3</>
# <br> tags are inserted to keep the number of chars in the summary per line
# at or below this width. Strings without spaces that exceed this width are
# not split.
# &lt;br&gt; tags are inserted to keep the number of chars in the summary per
# line at or below this width. Strings without spaces that exceed this width
# are not split.
<maxSummaryLineWidth>80</>
# Truncating this will miss out on good summaries, but performance will

View File

@ -1,119 +0,0 @@
# The Gigablast host configuration file.
# Tells us what hosts are participating in the distributed search engine.
# How many mirrors do you want? If this is 0 then your data
# will NOT be replicated. If it is 1 then each host listed
# below will have one host that mirrors it, thereby decreasing
# total index capacity, but increasing redundancy. If this is
# 1 then the first half of hosts will be replicated by the
# second half of the hosts listed below.
num-mirrors: 0
# List of hosts. Limited to 512 from MAX_HOSTS in Hostdb.h. Increase that
# if you want more.
#
# Format:
#
# first column: hostID (starts at 0 and increments from there)
# second column: the port used by the client DNS algorithms
# third column: port that HTTPS listens on
# fourth column: port that HTTP listens on
# fifth column: port that udp server listens on
# sixth column: IP address or hostname that has an IP address in /etc/hosts
# seventh column: like sixth column but for secondary ethernet port. (optional)
# This file consists of a list of lines like this:
#
# <ClientDnsPort> <HttpsPort> <HttpPort> <UdpPort> <IP1> <IP2> <Path>
#
# By default just use the local host as the single host as listed below.
#
# The client DNS uses port 5998, https listens on 7000, http listens on port
# 8000 and the udp server listens on port 9000. We used to use port 6000 for
# DNS listening but it seemed to have some issues. If your DNS keeps timing
# out try a different port from 5998.
#
# If your server only has one IP then just repeat it as IP1 and IP2. You
# can also use an alphanumeric name from /etc/hosts in place of a direct
# IP address. (see example below)
#
# Use './gb N' to run the gb process as host #N where N is 0 to run as
# the first host in the list below.
#
# Use './gb start N' to use passwordless ssh to ssh to that Nth machine
# listed below and start the process. You must have private/public keys
# for the required passwordless ssh.
#
# Use './gb kstart N' to run the Nth host in a bash keep-alive loop. So if it
# cores it will restart. It will send out an email alert if it restarts.
#
# The working directory is the last string on each line. That is where the
# 'gb' binary resides.
#
0 5998 7000 8000 9000 127.0.0.1 127.0.0.1 /home/mwells/github/
#
# Example of a four-node distributed search index running on a single
# server with four cores. The working directories are /home/mwells/hostN/.
# The 'gb' binary resides in the working directories. We have to use
# different ports for each gb instance since they are all on the same
# server.
#
# Use './gb 2' to run as the host on IP 1.2.3.8 for example.
#
#0 5998 7000 8000 9000 1.2.3.4 1.2.3.5 /home/mwells/host0/
#1 5997 7001 8001 9001 1.2.3.4 1.2.3.5 /home/mwells/host1/
#2 5996 7002 8002 9002 1.2.3.4 1.2.3.5 /home/mwells/host2/
#3 5995 7003 8003 9003 1.2.3.4 1.2.3.5 /home/mwells/host3/
# A four-node cluster on four different servers:
#0 5998 7000 8000 9000 1.2.3.4 1.2.3.5 /home/mwells/gigablast/
#1 5998 7000 8000 9000 1.2.3.6 1.2.3.7 /home/mwells/gigablast/
#2 5998 7000 8000 9000 1.2.3.8 1.2.3.9 /home/mwells/gigablast/
#3 5998 7000 8000 9000 1.2.3.10 1.2.3.11 /home/mwells/gigablast/
#
# Example of an eight-node cluster.
# Each line represents a single gb process with dual ethernet ports
# whose IP addresses are in /etc/hosts under se0, se0b, se1, se1b, ...
#
#0 5998 7000 8000 9000 se0 se0b /home/mwells/gigablast/
#1 5998 7000 8000 9000 se1 se1b /home/mwells/gigablast/
#2 5998 7000 8000 9000 se2 se2b /home/mwells/gigablast/
#3 5998 7000 8000 9000 se3 se3b /home/mwells/gigablast/
#4 5998 7000 8000 9000 se4 se4b /home/mwells/gigablast/
#5 5998 7000 8000 9000 se5 se5b /home/mwells/gigablast/
#6 5998 7000 8000 9000 se6 se6b /home/mwells/gigablast/
#7 5998 7000 8000 9000 se7 se7b /home/mwells/gigablast/
# Proxies
# Proxies handle the incoming search request and load balance it to
# one of the hosts listed above. If you only have one host in your search
# engine then you probably do not really need the proxy. You need to make
# sure all shard hosts and all proxies have the same hosts.conf because
# they ping each other to ensure they are up.
#
# To start a proxy you can run './gb proxy load 0' to start the first
# proxy in your list. Use './gb proxy load 1' to start the second proxy, etc.
#
# Use './gb proxy start N' to start the Nth proxy, where N starts at 0,
# mentioned in the proxy list below. You need to enable passwordless ssh
# using private/public keys for that to work.
#
# Use './gb proxy kstart N' to start the Nth proxy in a keep-alive loop using
# the bash shell. So if it cores it will restart and send you an email alert.
#
# Format:
# First column is "proxy" and followed by the standard columns described above
#
# Example:
# A proxy will be running on 10.5.66.18:
#proxy 6001 7001 8001 9001 10.5.66.18

View File

@ -56,7 +56,22 @@ A work-in-progress <a href=/compare.html>comparison to SOLR</a>.
<br><br><a name=quickstart></a>
<h1>Quick Start</h1>
Requirements: You will need an Intel or AMD system running Linux.<br><br>
Requirements: You will need an Intel or AMD system running Linux and at least 4GB of RAM.<br><br>
Install the <a href=http://www.gigablast.com/gigablast-1.0-1.deb>Gigablast package for Ubuntu or Debian</a> or install the <a href=http://www.gigablast.com/gigaablast-1.0-1.rpm>Gigablast package for RedHat</a>.
<br><br>
If you run into any bugs, let me know so I can fix them right away: mattdwells@hotmail.com.
<br>
<br>
<h1>Build From Source</h1>
Requirements: You will need an Intel or AMD system running Linux and at least 4GB of RAM.<br><br>
If you run into any bugs, let me know so I can fix them right away: mattdwells@hotmail.com.
<br><br>
You will need the following packages installed:<br>
<ul>
@ -84,13 +99,8 @@ rather your current working directory, where the 'gb' binary resides.
<br><br>
5. Re-run it after it builds those binaries.
<br><br>
6. Check out the <a href=http://127.0.0.1:8000/master>Master Controls</a>. You need to connect to port 8000 from a local IP address or from an IP address on the same C-Class as part of Gigablast's security. Consider using an ssh tunnel if your browser's IP is not on the same C-Class as the server's. i.e. From your browser machine, ssh to the machine running the gb server: <i>ssh someservername.com -L 8000:127.0.0.1:8000</i> . Then on your browser go to the <a href=http://127.0.0.1:8000/master>Master Controls</a>.
6. Check out the <a href=http://127.0.0.1:8000/>home page</a>.
<br><br>
7. Click on the <a href=http://127.0.0.1:8000/admin/inject?c=main>inject</a> menu and inject a URL into the index. It might be slow because it uses Google's public DNSes as specified in the Master Controls as 8.8.8.8 and 8.8.4.4. You should change those to your own local bind9 server for speed.
<br><br>
8. When the injection completes, try a <a href=http://127.0.0.1:8000/>search</a> for the document you injected.
<br><br>
9. <a href=http://127.0.0.1:8000/master?se=1>Turn on spiders</a> on the <a href=http://127.0.0.1:8000/master>Master Controls</a> page so that it will begin spidering the outlinks of the page you injected.
<br>
@ -137,14 +147,15 @@ rather your current working directory, where the 'gb' binary resides.
<li> Plug-ins. For indexing any file format by calling Plug-ins to convert that format to HTML. Provided binary plug-ins: pdftohtml (PDF), ppthtml (PowerPoint), antiword (MS Word), pstotext (PostScript).
<li> Indexes JSON and XML natively. Provides ability to search individual structured fields.
<li> Sorting. Sort the search results by meta tags or JSON fields that contain numbers, simply by adding something like gbsortby:price or gbrevsortby:price as a query term, assuming you have meta price tags.
<li>Easy Scaling. Add new servers to the hosts.conf file then click 'rebalance shards' to automatically rebalance the sharded data.
<li>Using &stream=1 can stream back millions of search results for a query without running out of memory.
<li> Easy Scaling. Add new servers to the hosts.conf file then click 'rebalance shards' to automatically rebalance the sharded data.
<li> Using &stream=1 can stream back millions of search results for a query without running out of memory.
<li> Nested boolean queries using AND, OR, NOT operators.
<li> Built-in support for <a href=http://www.diffbot.com/>diffbot.com's api</a>, which extracts various entities from web sites, like products, articles, etc.
</ul>
<br>
<h2>Features available but currently disabled because of code overhaul. Will be re-enabled soon.</h2>
<ul>
<li> Boolean operator support in query
<li> Spellchecker
</ul>

View File

@ -127,11 +127,14 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
<td style="padding-bottom:12px">&nbsp;</td>
<td style="padding-bottom:12px">&nbsp;</td>
</tr>
<!--
<tr bgcolor="#006699">
<th><a name="boolean" id="boolean"></a><font color="#FFFFFF">Boolean Search</font></th>
<th><font color="#FFFFFF">Description</font></th>
<tr bgcolor="#0340fd">
<th><font color=33dcff>Boolean Search</font></th>
<th><font color=33dcff>Description</font></th>
</tr>
<tr>
<td colspan="2" bgcolor="#FFFFCC"><center>
Note: boolean operators must be in UPPER CASE.
@ -214,16 +217,17 @@ a{cursor:hand;cursor:pointer;text-decoration:none;color:blue;}
expressions and can be optionally enclosed in parentheses. A NOT
operator can optionally precede the left or the right operand.</td>
</tr>
-->
</table>
</td></tr>
</table>
<br>
<center>
Copyright &copy; 2013. All rights reserved.
Copyright &copy; 2014. All rights reserved.
</center>
</body>
</html>

main.cpp
View File

@ -202,6 +202,7 @@ void handleRequest8e(UdpSlot *, long netnice ) {return; }
void handleRequest4f(UdpSlot *, long netnice ) {return; }
void handleRequest95(UdpSlot *, long netnice ) {return; }
char *getcwd2 ( char *arg ) ;
// for cleaning up indexdb
void dumpMissing ( char *coll );
@ -361,8 +362,21 @@ void stack_test(){
}
#endif
int main2 ( int argc , char *argv[] ) ;
int main ( int argc , char *argv[] ) {
	//fprintf(stderr,"Starting gb.\n");
	int ret = main2 ( argc , argv );
	if ( ret ) fprintf(stderr,"Failed to start gb. Exiting.\n");
	// propagate main2's status so shell scripts can detect failure
	return ret;
}
int main2 ( int argc , char *argv[] ) {
g_conf.m_runAsDaemon = true;
// appears that linux 2.4.17 kernel would crash with this?
// let's try again on gk127 to make sure
// YES! gk0 cluster has run for months with this just fine!!
@ -392,7 +406,7 @@ int main ( int argc , char *argv[] ) {
printHelp:
SafeBuf sb;
sb.safePrintf(
"Usage: gb [-c hostsConf] <CMD>\n");
"Usage: gb [-d workingDir] <CMD>\n");
sb.safePrintf(
"\tItems in []'s are optional, and items "
"in <>'s are "
@ -411,6 +425,8 @@ int main ( int argc , char *argv[] ) {
"-h\tprint this help.\n\n"
"-v\tprint version and exit.\n\n"
"-d\tdebug mode. do not run as daemon. "
"log to stderr.\n\n"
//"-o\tprint the overview documentation in HTML. "
//"Contains the format of hosts.conf.\n\n"
"-r\tindicates recovery mode, "
@ -422,7 +438,13 @@ int main ( int argc , char *argv[] ) {
"start [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified.\n\n"
"[hostId] if specified using an ssh command.\n\n"
"kstart [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified using an ssh command and "
"if the gb process cores then restart it. k stands "
"for keepalive.\n\n"
"stop [hostId]\n"
"\tsaves and exits for all gb hosts or "
@ -915,11 +937,13 @@ int main ( int argc , char *argv[] ) {
}
// get hosts.conf file
char *hostsConf = "./hosts.conf";
//char *hostsConf = "./hosts.conf";
long hostId = 0;
long cmdarg = 1;
if ( argc >= 3 && argv[1][0]=='-'&&argv[1][1]=='c'&&argv[1][2]=='\0') {
hostsConf = argv[2];
char *workingDir = NULL;
if ( argc >= 3 && argv[1][0]=='-'&&argv[1][1]=='d'&&argv[1][2]=='\0') {
//hostsConf = argv[2];
workingDir = argv[2];
cmdarg = 3;
}
@ -979,7 +1003,11 @@ int main ( int argc , char *argv[] ) {
//send an email on startup for -r, like if we are recovering from an
//unclean shutdown.
g_recoveryMode = false;
if ( strcmp ( cmd , "-r" ) == 0 ) g_recoveryMode = true;
if ( strcmp ( cmd , "-r" ) == 0 ) g_recoveryMode = true;
// debug on gdb? then do not fork
if ( strcmp ( cmd , "-d" ) == 0 ) g_conf.m_runAsDaemon = false;
bool testMandrill = false;
if ( strcmp ( cmd , "emailmandrill" ) == 0 ) {
testMandrill = true;
@ -1072,7 +1100,7 @@ int main ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "parsetest" ) == 0 ) {
if ( cmdarg+1 >= argc ) goto printHelp;
// load up hosts.conf
if ( ! g_hostdb.init(hostsConf, hostId) ) {
if ( ! g_hostdb.init(hostId) ) {
log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
@ -1118,7 +1146,7 @@ int main ( int argc , char *argv[] ) {
*/
if ( strcmp ( cmd , "booltest" ) == 0 ){
if ( ! g_hostdb.init(hostsConf, hostId) ) {
if ( ! g_hostdb.init(hostId) ) {
log("db: hostdb init failed." ); return 1; }
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
@ -1248,14 +1276,14 @@ int main ( int argc , char *argv[] ) {
// this is just like starting up a gb process, but we add one to
// each port, we are a dummy machine in the dummy cluster.
// gb -c hosts.conf tmpstart [hostId]
// gb -d <workingdir> tmpstart [hostId]
char useTmpCluster = 0;
if ( strcmp ( cmd , "tmpstart" ) == 0 )
useTmpCluster = 1;
// gb -c hosts.conf tmpstop [hostId]
// gb -d <workingdir> tmpstop [hostId]
if ( strcmp ( cmd , "tmpstop" ) == 0 )
useTmpCluster = 1;
// gb -c hosts.conf tmpstarthost <hostId>
// gb -d <workingdir> tmpstarthost <hostId>
if ( strcmp ( cmd , "tmpstarthost" ) == 0 ) {
useTmpCluster = 1;
// we need to parse out the hostid too!
@ -1282,8 +1310,18 @@ int main ( int argc , char *argv[] ) {
return 0;
}
//
// get current working dir that the gb binary is in. all the data
// files should be in there too!!
//
if ( ! workingDir ) workingDir = getcwd2 ( argv[0] );
// load up hosts.conf
if ( ! g_hostdb.init(hostsConf, hostId, NULL, isProxy,useTmpCluster)){
if ( ! g_hostdb.init(hostId,
NULL,
isProxy,
useTmpCluster,
workingDir)){
log("db: hostdb init failed." ); return 1; }
// set clock file name so gettimeofdayInMillisecondsGlobal()
@ -1315,9 +1353,6 @@ int main ( int argc , char *argv[] ) {
g_conf.m_save = false;
// start up log file
if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
fprintf (stderr,"db: Log file init failed.\n" ); return 1; }
// log the version
//log(LOG_INIT,"conf: Gigablast Server %s",GBVersion);
@ -1388,8 +1423,8 @@ int main ( int argc , char *argv[] ) {
if ( ! g_loop.init() ) {
log("db: Loop init failed." ); return 1; }
if ( ! g_threads.init() ) {
log("db: Threads init failed." ); return 1; }
//if ( ! g_threads.init() ) {
// log("db: Threads init failed." ); return 1; }
g_process.init();
@ -2273,7 +2308,8 @@ int main ( int argc , char *argv[] ) {
g_conf.m_sendEmailAlerts = false;
// log how much mem we can use
log(LOG_INIT,"conf: Max mem allowed to use is %lli\n",g_conf.m_maxMem);
//log(LOG_INIT,"conf: Max mem allowed to use is %lli\n",
//g_conf.m_maxMem);
// load the language specific pages
g_languagePages.reloadPages();
@ -2294,8 +2330,11 @@ int main ( int argc , char *argv[] ) {
// set up the threads, might need g_conf
if ( ! g_threads.init() ) {
log("db: Threads init failed." ); return 1; }
// avoid logging threads msgs to stderr if not actually starting up
// a gb daemon...
//if(cmd && cmd[0] && ! is_digit(cmd[0]) && ! g_threads.init() ) {
//if ( ! g_threads.init() ) {
// log("db: Threads init failed." ); return 1; }
// gb gendict
if ( strcmp ( cmd , "gendict" ) == 0 ) {
@ -2666,6 +2705,12 @@ int main ( int argc , char *argv[] ) {
// fixTfndb ( coll ); // coll
//}
// make sure port is available, no use loading everything up then
// failing because another process is already running using this port
//if ( ! g_udpServer.testBind ( g_hostdb.getMyPort() ) )
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort))
return 1;
//if ( strcmp ( cmd , "gendbs" ) == 0 ) goto jump;
//if ( strcmp ( cmd , "gentfndb" ) == 0 ) goto jump;
if ( strcmp ( cmd , "gencatdb" ) == 0 ) goto jump;
@ -2673,6 +2718,8 @@ int main ( int argc , char *argv[] ) {
// if ( cmd && ! is_digit(cmd[0]) ) goto printHelp;
log("db: Logging to file %s.",g_hostdb.m_logFilename );
/*
// tmp stuff to generate new query log
if ( ! ucInit(g_hostdb.m_dir, true)) return 1;
@ -2685,14 +2732,36 @@ int main ( int argc , char *argv[] ) {
return 0;
*/
// make sure port is available, no use loading everything up then
// failing because another process is already running using this port
//if ( ! g_udpServer.testBind ( g_hostdb.getMyPort() ) )
if ( ! g_httpServer.m_tcp.testBind(g_hostdb.getMyHost()->m_httpPort))
return 1;
// start up log file
if ( ! g_log.init( g_hostdb.m_logFilename ) ) {
fprintf (stderr,"db: Log file init failed.\n" ); return 1; }
g_errno = 0;
//
// run as daemon now
//
//fprintf(stderr,"running as daemon\n");
if ( g_conf.m_runAsDaemon ) {
pid_t pid, sid;
pid = fork();
if ( pid < 0 ) exit(EXIT_FAILURE);
if ( pid > 0 ) exit(EXIT_SUCCESS);
// change file mode mask
umask(0);
sid = setsid();
if ( sid < 0 ) exit(EXIT_FAILURE);
//fprintf(stderr,"done\n");
// set our new pid
g_mem.setPid();
}
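
One conventional daemonize step this block leaves out is detaching the standard descriptors; gb logs to its own log file instead. For comparison, a sketch of that usual extra step (detachStdio is a hypothetical helper):

	#include <fcntl.h>
	#include <unistd.h>

	// point fds 0-2 at /dev/null so a daemon cannot touch the tty
	static void detachStdio ( ) {
		int fd = open ( "/dev/null" , O_RDWR );
		if ( fd < 0 ) return;
		dup2 ( fd , 0 );  // stdin
		dup2 ( fd , 1 );  // stdout
		dup2 ( fd , 2 );  // stderr
		if ( fd > 2 ) close ( fd );
	}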
// initialize threads down here now so it logs to the logfile and
// not stderr
//if ( ( ! cmd || !cmd[0]) && ! g_threads.init() ) {
// log("db: Threads init failed." ); return 1; }
g_log.m_logTimestamps = true;
if (!ucInit(g_hostdb.m_dir, true)) {
log("Unicode initialization failed!");
@ -3675,7 +3744,8 @@ void doCmdAll ( int fd, void *state ) {
// copy a collection from one network to another (defined by 2 hosts.conf's)
int collcopy ( char *newHostsConf , char *coll , long collnum ) {
Hostdb hdb;
if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
if ( ! hdb.init( 0/*assume we're zero*/) ) {
log("clusterCopy failed. Could not init hostdb with %s",
newHostsConf);
return -1;
@ -3723,7 +3793,8 @@ int scale ( char *newHostsConf , bool useShotgunIp) {
g_hostdb.resetPortTables();
Hostdb hdb;
if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
if ( ! hdb.init( 0/*assume we're zero*/) ) {
log("Scale failed. Could not init hostdb with %s",
newHostsConf);
return -1;
@ -4148,13 +4219,13 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
if ( hostId >= 0 && h2->m_hostId != hostId ) continue;
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
//char tmp2[1024];
//tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
sprintf(tmp2,
"mv ./proxylog ./proxylog-`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'` ; " );
//sprintf(tmp2,
// "mv ./proxylog ./proxylog-`date '+"
// "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
//to test add: ulimit -t 10; to the ssh cmd
@ -4167,8 +4238,8 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
"EXITSTATUS=1 ; "
"while [ \\$EXITSTATUS != 0 ]; do "
"{ "
"mv ./proxylog ./proxylog-\\`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; "
//"mv ./proxylog ./proxylog-\\`date '+"
//"%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; "
"./gb proxy load %li " // mdw
"\\$ADDARGS "
" >& ./proxylog ;"
@ -4550,31 +4621,33 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
}
else if ( installFlag == ifk_start ) {
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
//char tmp2[1024];
//tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
sprintf(tmp2,
"mv ./log%03li ./log%03li-`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
h2->m_hostId ,
h2->m_hostId );
//sprintf(tmp2,
// "mv ./log%03li ./log%03li-`date '+"
// "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
// h2->m_hostId ,
// h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
sprintf(tmp,
"ssh %s \"cd %s ; ulimit -c unlimited; "
"cp -f gb gb.oldsave ; "
"mv -f gb.installed gb ; %s"
"./gb %li >& ./log%03li &\" %s",
"mv -f gb.installed gb ; " // %s"
//"./gb %li >& ./log%03li &\" %s",
"./gb %li &\" %s",
iptoa(h2->m_ip),
h2->m_dir ,
tmp2 ,
//tmp2 ,
//h2->m_dir ,
h2->m_hostId ,
h2->m_hostId ,
//h2->m_hostId ,
amp);
// log it
log(LOG_INIT,"admin: %s", tmp);
//log(LOG_INIT,"admin: %s", tmp);
fprintf(stdout,"admin: %s\n", tmp);
// execute it
system ( tmp );
}
@ -4621,7 +4694,7 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
"ssh %s \"cd %s ; "
"cp -f tmpgb tmpgb.oldsave ; "
"mv -f tmpgb.installed tmpgb ; "
"./tmpgb -c %shosts.conf tmpstarthost "
"./tmpgb -d %s tmpstarthost "
"%li >& ./tmplog%03li &\" &",
iptoa(h2->m_ip),
h2->m_dir ,
@ -4636,15 +4709,15 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
else if ( installFlag == ifk_kstart ) {
//keepalive
// . save old log now, too
char tmp2[1024];
tmp2[0]='\0';
//char tmp2[1024];
//tmp2[0]='\0';
// let's do this for everyone now
//if ( h2->m_hostId == 0 )
sprintf(tmp2,
"mv ./log%03li ./log%03li-`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
h2->m_hostId ,
h2->m_hostId );
//sprintf(tmp2,
// "mv ./log%03li ./log%03li-`date '+"
// "%%Y_%%m_%%d-%%H:%%M:%%S'` ; " ,
// h2->m_hostId ,
// h2->m_hostId );
// . assume conf file name gbHID.conf
// . assume working dir ends in a '/'
//to test add: ulimit -t 10; to the ssh cmd
@ -4657,26 +4730,28 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
"EXITSTATUS=1 ; "
"while [ \\$EXITSTATUS != 0 ]; do "
"{ "
"mv ./log%03li ./log%03li-\\`date '+"
"%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; "
//"mv ./log%03li ./log%03li-\\`date '+"
//"%%Y_%%m_%%d-%%H:%%M:%%S'\\` ; "
"./gb %li "
"\\$ADDARGS "
" >& ./log%03li ;"
" ;"
//" >& ./log%03li ;"
"EXITSTATUS=\\$? ; "
"ADDARGS='-r' ; "
"} "
"done >& /dev/null & \" %s",
iptoa(h2->m_ip),
h2->m_dir ,
h2->m_hostId ,
h2->m_hostId ,
//h2->m_hostId ,
//h2->m_hostId ,
//h2->m_dir ,
h2->m_hostId ,
//h2->m_hostId ,
h2->m_hostId ,
amp );
// log it
log(LOG_INIT,"admin: %s", tmp);
//log(LOG_INIT,"admin: %s", tmp);
fprintf(stdout,"admin: %s\n", tmp);
// execute it
system ( tmp );
}
@ -13721,7 +13796,7 @@ int injectFile ( char *filename , char *ips ,
long ip = 0;
// is ip field a hosts.conf instead?
if ( strstr(ips,".conf") ) {
if ( ! s_hosts2.init ( ips , 0 ) ) {
if ( ! s_hosts2.init ( 0 ) ) { // ips , 0 ) ) {
fprintf(stderr,"failed to load %s",ips);
exit(0);
}
@ -16660,7 +16735,8 @@ int collinject ( char *newHostsConf ) {
g_hostdb.resetPortTables();
Hostdb hdb;
if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
//if ( ! hdb.init(newHostsConf, 0/*assume we're zero*/) ) {
if ( ! hdb.init( 0/*assume we're zero*/) ) {
log("collinject failed. Could not init hostdb with %s",
newHostsConf);
return -1;
@ -16688,7 +16764,7 @@ int collinject ( char *newHostsConf ) {
Host *h1 = hdb1->getShard ( shardNum );
Host *h2 = hdb2->getShard ( shardNum );
printf("ssh %s 'nohup /w/gbi -c /w/hosts.conf inject titledb "
printf("ssh %s 'nohup /w/gbi -d /w/ inject titledb "
"%s:%li >& /w/ilog' &\n"
, h1->m_hostname
, iptoa(h2->m_ip)
@ -16773,3 +16849,45 @@ bool isRecoveryFutile ( ) {
// otherwise, give up!
return true;
}
char *getcwd2 ( char *arg ) {
	// skip the leading '.' of a "./gb"-style invocation
	if ( arg[0] == '.' && arg[1] == '/' ) arg += 1;
	char *a = arg;
	// length of the path part, up to and including the last '/',
	// i.e. everything before the "gb" binary name itself
	long alen = 0;
	for ( ; *a ; a++ ) {
		if ( *a != '/' ) continue;
		alen = a - arg + 1;
	}
	if ( alen > 512 ) {
		log("db: path is too long");
		g_errno = EBADENGINEER;
		return NULL;
	}
	// append the relative path of gb to the current working dir
	static char s_cwdBuf[1025];
	if ( ! getcwd ( s_cwdBuf , 1024 ) ) {
		log("db: getcwd failed");
		g_errno = EBADENGINEER;
		return NULL;
	}
	long clen = gbstrlen(s_cwdBuf);
	// guard against overflowing s_cwdBuf when appending
	if ( clen + alen + 1 > 1025 ) {
		log("db: path is too long");
		g_errno = EBADENGINEER;
		return NULL;
	}
	memcpy ( s_cwdBuf + clen , arg , alen );
	s_cwdBuf[clen+alen] = '\0';
	// note: no terminating '/' is appended when gb was invoked
	// without a path component
	return s_cwdBuf;
}
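
For illustration, the resolution this is meant to produce (assumed behavior, given the code above):

	// with cwd "/home/user":
	//   getcwd2("./subdir/gb") -> "/home/user/subdir/"
	//   getcwd2("gb")          -> "/home/user"  (no trailing '/')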