open-source-search-engine/CollectionRec.cpp
2013-08-02 13:12:24 -07:00

396 lines
12 KiB
C++

#include "gb-include.h"
#include "CollectionRec.h"
#include "Collectiondb.h"
#include "HttpServer.h" // printColors2()
#include "Msg5.h"
#include "Threads.h"
#include "Datedb.h"
#include "Timedb.h"
// compiled-in fallback record (defaults source of last resort)
static CollectionRec g_default;

// Zero every field that Parms::calcChecksum() inspects so a freshly
// constructed rec is in a deterministic state before any defaults or
// conf files are applied.
CollectionRec::CollectionRec() {
	// no access-control lists yet
	m_numSearchPwds = 0;
	m_numBanIps     = 0;
	m_numSearchIps  = 0;
	m_numSpamIps    = 0;
	m_numAdminPwds  = 0;
	m_numAdminIps   = 0;
	// for Url::getSite(): force the site rules table to be rebuilt
	m_updateSiteRulesTable = 1;
	m_lastUpdateTime       = 0LL;
	m_clickNScrollEnabled  = false;
	// inits for sortbydatetable
	m_inProgress = false;
	m_msg5       = NULL;
	// wipe the per-filter arrays so Parms::calcChecksum can work
	memset ( m_spiderFreqs , 0 , MAX_FILTERS*sizeof(*m_spiderFreqs) );
	memset ( m_spiderPriorities , 0 ,
		 MAX_FILTERS*sizeof(*m_spiderPriorities) );
	// empty out every password slot
	for ( int j = 0 ; j < MAX_SEARCH_PASSWORDS ; j++ )
		m_searchPwds[j][0] = '\0';
	for ( int j = 0 ; j < MAX_ADMIN_PASSWORDS ; j++ )
		m_adminPwds[j][0] = '\0';
	// and every ip list
	memset ( m_banIps    , 0 , MAX_BANNED_IPS*sizeof(*m_banIps)    );
	memset ( m_searchIps , 0 , MAX_SEARCH_IPS*sizeof(*m_searchIps) );
	memset ( m_spamIps   , 0 , MAX_SPAM_IPS  *sizeof(*m_spamIps)   );
	memset ( m_adminIps  , 0 , MAX_ADMIN_IPS *sizeof(*m_adminIps)  );
	// no url-filter regexes yet
	for ( int j = 0 ; j < MAX_FILTERS ; j++ )
		m_regExs[j][0] = '\0';
	m_numRegExs = 0;
	// install the catch-all "default" url filter rule since we have none
	fixRec();
}
// Nothing to release: members visible in this file are inline arrays and
// counters; m_msg5 is only ever assigned NULL here, never allocated.
CollectionRec::~CollectionRec() {
//invalidateRegEx ();
}
// new collection recs get this called on them
void CollectionRec::setToDefaults ( ) {
// NULL filenames: take every parm's compiled-in default, no conf files
g_parms.setFromFile ( this , NULL , NULL );
// add default reg ex
fixRec ();
}
// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use
// the value we received from call to setToDefaults()
// . returns false and sets g_errno on load error
// . load this collection's config from coll.<name>.<collnum>/coll.conf
// . "coll" is the collection name, "i" its collnum
// . parms missing from the file fall back to default.conf, then to the
//   compiled-in defaults set by setToDefault() below
// . returns false and sets g_errno on load error
bool CollectionRec::load ( char *coll , long i ) {
	// before we load, set to defaults in case some are not in xml file
	g_parms.setToDefault ( (char *)this );
	// get the filename with that id. use snprintf so an oversized
	// collection name or base dir cannot overflow the stack buffer
	// (the original sprintf was unbounded)
	File f;
	char tmp2[1024];
	snprintf ( tmp2 , sizeof(tmp2) , "%scoll.%s.%li/coll.conf",
		   g_hostdb.m_dir , coll , i );
	f.set ( tmp2 );
	if ( ! f.doesExist () ) return log("admin: %s does not exist.",tmp2);
	// set our collection number
	m_collnum = i;
	// set our collection name
	m_collLen = gbstrlen ( coll );
	// NOTE(review): assumes "coll" fits in m_coll -- the array size is
	// not visible in this file; confirm callers bound the name length
	strcpy ( m_coll , coll );
	// the cluster-wide default conf file, bounded for the same reason
	char tmp1[1024];
	snprintf ( tmp1 , sizeof(tmp1) , "%sdefault.conf" , g_hostdb.m_dir );
	// . set our parms from the file.
	// . accepts OBJ_COLLECTIONREC or OBJ_CONF
	g_parms.setFromFile ( this , tmp2 , tmp1 );
	// ensure at least the "default" url filter rule exists
	fixRec ();
	// distributed spider locking is always on now (the old
	// m_useSpiderLocks / m_distributeSpiderGet parms are deprecated)
	// so multiple hosts never hammer the same root url at once
	return true;
}
/*
bool CollectionRec::countEvents ( ) {
// set our m_numEventsOnHost value
log("coll: loading event count termlist gbeventcount");
// temporarily turn off threads
bool enabled = g_threads.areThreadsEnabled();
g_threads.disableThreads();
// count them
m_numEventsOnHost = 0;
// 1MB at a time
long minRecSizes = 1000000;
// look up this termlist, gbeventcount which we index in XmlDoc.cpp
long long termId = hash64n("gbeventcount") & TERMID_MASK;
// make datedb key from it
key128_t startKey = g_datedb.makeStartKey ( termId , 0xffffffff );
key128_t endKey = g_datedb.makeEndKey ( termId , 0 );
Msg5 msg5;
RdbList list;
// . init m_numEventsOnHost by getting the exact length of that
// termlist on this host
// . send in the ping request packet so all hosts can total up
// . Rdb.cpp should be added to incrementally so we should have no
// double positives.
// . Rdb.cpp should inspect each datedb rec for this termid in
// a fast and efficient manner
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_DATEDB ,
m_coll ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
NULL )){// msg5b
// not allowed to block!
char *xx=NULL;*xx=0; }
// scan the list, score is how many valid events from that docid
unsigned long total = 0;
for ( ; ! list.isExhausted() ; list.skipCurrentRec() ) {
unsigned char *rec = (unsigned char *)list.getCurrentRec();
// in datedb score is byte #5
total += (255-rec[5]);
}
// declare
char *lastKeyPtr;
key128_t newStartKey;
// add to count. datedb uses half keys so subtract 6 bytes
// since the termids will be the same...
//m_numEventsOnHost += list.getListSize() / (sizeof(key128_t)-6);
m_numEventsOnHost += total;
// bail if under limit
if ( list.getListSize() < minRecSizes ) goto done;
// update key
lastKeyPtr = list.m_listEnd - 10;
// we make a new start key
list.getKey ( lastKeyPtr , (char *)&newStartKey );
// maxxed out?
if ( newStartKey.n0==0xffffffffffffffffLL &&
newStartKey.n1==0xffffffffffffffffLL )
goto done;
// sanity check
if ( newStartKey < startKey ) { char *xx=NULL;*xx=0; }
if ( newStartKey > endKey ) { char *xx=NULL;*xx=0; }
// inc it
newStartKey.n0++;
// carry into the top half if the bottom half wrapped
if ( newStartKey.n0 == 0LL ) newStartKey.n1++;
// assign
startKey = newStartKey;
// and loop back up for more now
goto loop;
done:
// update all colls count
g_collectiondb.m_numEventsAllColls += m_numEventsOnHost;
if ( enabled ) g_threads.enableThreads();
log("coll: got %li local events in termlist",m_numEventsOnHost);
// set "m_hasDocQualityFiler"
//updateFilters();
return true;
}
*/
// Ensure the url-filter table has at least one rule. When no regexes
// are configured, install a single catch-all "default" rule in slot 0
// and bump each parallel-array counter so their lengths stay in sync.
void CollectionRec::fixRec ( ) {
	// nothing to do if at least one rule already exists
	if ( m_numRegExs != 0 ) return;
	// the catch-all rule lives in slot 0
	long slot = 0;
	strcpy ( m_regExs[slot] , "default" );
	m_numRegExs++;
	// respider every 30 days
	m_spiderFreqs[slot] = 30;
	m_numRegExs2++;
	// lowest spider priority
	m_spiderPriorities[slot] = 0;
	m_numRegExs3++;
	// allow up to 99 concurrent spiders for this rule
	m_maxSpidersPerRule[slot] = 99;
	m_numRegExs10++;
	// wait 1000ms between hits to the same ip
	m_spiderIpWaits[slot] = 1000;
	m_numRegExs5++;
	// one spider per ip at a time
	m_spiderIpMaxSpiders[slot] = 1;
	m_numRegExs6++;
}
// returns false on failure and sets g_errno, true otherwise
// Write this collection's parms to coll.<name>.<collnum>/coll.conf as xml.
// No-op (returns true) in read-only mode.
// Returns false on failure and sets g_errno, true otherwise.
bool CollectionRec::save ( ) {
	if ( g_conf.m_readOnlyMode ) return true;
	char tmp[1024];
	// snprintf so a long collection name or base dir cannot overflow
	// the stack buffer (the original sprintf was unbounded)
	snprintf ( tmp , sizeof(tmp) , "%scoll.%s.%li/coll.conf",
		   g_hostdb.m_dir , m_coll , (long)m_collnum );
	if ( ! g_parms.saveToXml ( (char *)this , tmp ) ) return false;
	// log msg
	log (LOG_INFO,"db: Saved %s.",tmp);
	// do not need a save now
	m_needsSave = false;
	return true;
}
// calls hasPermission() below
// Pull the "pwd" cgi parm and the client ip out of the request/socket
// and defer to the (pwd, len, ip) overload below.
bool CollectionRec::hasPermission ( HttpRequest *r , TcpSocket *s ) {
	long  pwdLen;
	char *pwd = r->getString ( "pwd" , &pwdLen );
	return hasPermission ( pwd , pwdLen , s->m_ip );
}
// . does this password work for this collection?
// An "assassin" is a client whose ip appears in this collection's
// spam-ip list; such clients get elevated spam-control permission.
bool CollectionRec::isAssassin ( long ip ) {
	long k = 0;
	// linear scan of the configured spam ips
	while ( k < m_numSpamIps ) {
		if ( m_spamIps[k] == ip ) return true;
		k++;
	}
	// not on the list
	return false;
}
// . does this password work for this collection?
// . does this password work for this collection?
// . NOTE(review): always returns true now -- per the comment below,
//   collection permission is enforced by Users::verifyColl /
//   User::getUserType on every request, so everything after the first
//   return (and the parms p/plen/ip) is dead code kept for reference
bool CollectionRec::hasPermission ( char *p, long plen , long ip ) {
// just return true
// collection permission is checked from Users::verifyColl
// in User::getUserType for every request
return true;
// ---- everything below here is unreachable ----
// scan the passwords
// MDW: no longer, this is too vulnerable!!!
/*
for ( long i = 0 ; i < m_numAdminPwds ; i++ ) {
long len = gbstrlen ( m_adminPwds[i] );
if ( len != plen ) continue;
if ( strncmp ( m_adminPwds[i] , p , plen ) != 0 ) continue;
// otherwise it's a match!
//goto checkIp;
// . matching one password is good enough now, default OR
// . because just matching an IP is good enough security,
// there is really no need for both IP AND passwd match
return true;
}
*/
// . if had passwords but the provided one didn't match, return false
// . matching one password is good enough now, default OR
//if ( m_numPasswords > 0 ) return false;
// checkIp:
// ok, make sure they came from an acceptable IP
for ( long i = 0 ; i < m_numAdminIps ; i++ )
// they also have a matching IP, so they now have permission
if ( m_adminIps[i] == ip ) return true;
// if no security, allow all NONONONONONONONONO!!!!!!!!!!!!!!
//if ( m_numAdminPwds == 0 && m_numAdminIps == 0 ) return true;
// if they did not match an ip or password, even if both lists
// are empty, do not allow access... this prevents security breeches
// by accident
return false;
// if there were IPs then they failed to get in
//if ( m_numAdminIps > 0 ) return false;
// otherwise, they made it
//return true;
}
// can this ip perform a search or add url on this collection?
bool CollectionRec::hasSearchPermission ( TcpSocket *s , long encapIp ) {
// get the ip
long ip = 0; if ( s ) ip = s->m_ip;
// and the ip domain
long ipd = 0; if ( s ) ipd = ipdom ( s->m_ip );
// and top 2 bytes for the israel isp that has this huge block
long ipt = 0; if ( s ) ipt = iptop ( s->m_ip );
// is it in the ban list?
for ( long i = 0 ; i < m_numBanIps ; i++ ) {
if ( isIpTop ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipt ) return false;
continue;
}
// check for ip domain match if this banned ip is an ip domain
if ( isIpDom ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipd ) return false;
continue;
}
// otherwise it's just a single banned ip
if ( m_banIps[i] == ip ) return false;
}
// check the encapsulate ip if any
// 1091771468731 0 Aug 05 23:51:08 63.236.25.77 GET
// /search?code=mammaXbG&uip=65.87.190.39&n=15&raw=8&q=farm+insurance
// +nj+state HTTP/1.0
if ( encapIp ) {
ipd = ipdom ( encapIp );
ip = encapIp;
for ( long i = 0 ; i < m_numBanIps ; i++ ) {
if ( isIpDom ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipd ) return false;
continue;
}
if ( isIpTop ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipt ) return false;
continue;
}
if ( m_banIps[i] == ip ) return false;
}
}
// do we have an "only" list?
if ( m_numSearchIps == 0 ) return true;
// it must be in that list if we do
for ( long i = 0 ; i < m_numSearchIps ; i++ ) {
// check for ip domain match if this banned ip is an ip domain
if ( isIpDom ( m_searchIps[i] ) ) {
if ( m_searchIps[i] == ipd ) return true;
continue;
}
// otherwise it's just a single ip
if ( m_searchIps[i] == ip ) return true;
}
// otherwise no permission
return false;
}