mirror of https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00

fix regex logic

This commit is contained in:
parent aad12f9fe3
commit 7f70e4e887
@@ -1,764 +0,0 @@
#include "gb-include.h"
|
||||
|
||||
#include "CollectionRec.h"
|
||||
#include "Collectiondb.h"
|
||||
#include "HttpServer.h" // printColors2()
|
||||
#include "Msg5.h"
|
||||
#include "Threads.h"
|
||||
#include "Datedb.h"
|
||||
#include "Timedb.h"
|
||||
#include "Spider.h"
|
||||
#include "Process.h"
|
||||
|
||||
static CollectionRec g_default;
|
||||
|
||||
|
||||
CollectionRec::CollectionRec() {
|
||||
//m_numSearchPwds = 0;
|
||||
//m_numBanIps = 0;
|
||||
//m_numSearchIps = 0;
|
||||
//m_numSpamIps = 0;
|
||||
//m_numAdminPwds = 0;
|
||||
//m_numAdminIps = 0;
|
||||
memset ( m_bases , 0 , 4*RDB_END );
|
||||
// how many keys in the tree of each rdb? we now store this stuff
|
||||
// here and not in RdbTree.cpp because we no longer have a maximum
|
||||
// # of collection recs... MAX_COLLS. each is a 32-bit "long" so
|
||||
// it is 4 * RDB_END...
|
||||
memset ( m_numNegKeysInTree , 0 , 4*RDB_END );
|
||||
memset ( m_numPosKeysInTree , 0 , 4*RDB_END );
|
||||
m_spiderColl = NULL;
|
||||
m_overflow = 0x12345678;
|
||||
m_overflow2 = 0x12345678;
|
||||
// the spiders are currently uninhibited i guess
|
||||
m_spiderStatus = SP_INITIALIZING; // this is 0
|
||||
//m_spiderStatusMsg = NULL;
|
||||
// for Url::getSite()
|
||||
m_updateSiteRulesTable = 1;
|
||||
m_lastUpdateTime = 0LL;
|
||||
m_clickNScrollEnabled = false;
|
||||
// inits for sortbydatetable
|
||||
m_inProgress = false;
|
||||
m_msg5 = NULL;
|
||||
// JAB - track which regex parsers have been initialized
|
||||
//log(LOG_DEBUG,"regex: %p initalizing empty parsers", m_pRegExParser);
|
||||
|
||||
// clear these out so Parms::calcChecksum can work:
|
||||
memset( m_spiderFreqs, 0, MAX_FILTERS*sizeof(*m_spiderFreqs) );
|
||||
//for ( int i = 0; i < MAX_FILTERS ; i++ )
|
||||
// m_spiderQuotas[i] = -1;
|
||||
memset( m_spiderPriorities, 0,
|
||||
MAX_FILTERS*sizeof(*m_spiderPriorities) );
|
||||
//memset( m_rulesets, 0, MAX_FILTERS*sizeof(*m_rulesets) );
|
||||
//for ( int i = 0; i < MAX_SEARCH_PASSWORDS; i++ ) {
|
||||
// *(m_searchPwds[i]) = '\0';
|
||||
//}
|
||||
//for ( int i = 0; i < MAX_ADMIN_PASSWORDS; i++ ) {
|
||||
// *(m_adminPwds[i]) = '\0';
|
||||
//}
|
||||
//memset( m_banIps, 0, MAX_BANNED_IPS*sizeof(*m_banIps) );
|
||||
//memset( m_searchIps, 0, MAX_SEARCH_IPS*sizeof(*m_searchIps) );
|
||||
//memset( m_spamIps, 0, MAX_SPAM_IPS*sizeof(*m_spamIps) );
|
||||
//memset( m_adminIps, 0, MAX_ADMIN_IPS*sizeof(*m_adminIps) );
|
||||
|
||||
//for ( int i = 0; i < MAX_FILTERS; i++ ) {
|
||||
// //m_pRegExParser[i] = NULL;
|
||||
// *(m_regExs[i]) = '\0';
|
||||
//}
|
||||
m_numRegExs = 0;
|
||||
|
||||
//m_requests = 0;
|
||||
//m_replies = 0;
|
||||
//m_doingCallbacks = false;
|
||||
|
||||
m_lastResetCount = 0;
|
||||
|
||||
// regex_t types
|
||||
m_hasucr = false;
|
||||
m_hasupr = false;
|
||||
|
||||
// for diffbot caching the global spider stats
|
||||
reset();
|
||||
|
||||
// add default reg ex if we do not have one
|
||||
setUrlFiltersToDefaults();
|
||||
}
|
||||
|
||||
CollectionRec::~CollectionRec() {
|
||||
//invalidateRegEx ();
|
||||
reset();
|
||||
}
|
||||
|
||||
// new collection recs get this called on them
|
||||
void CollectionRec::setToDefaults ( ) {
|
||||
g_parms.setFromFile ( this , NULL , NULL );
|
||||
// add default reg ex
|
||||
setUrlFiltersToDefaults();
|
||||
}
|
||||
|
||||
void CollectionRec::reset() {
|
||||
|
||||
// regex_t types
|
||||
if ( m_hasucr ) regfree ( &m_ucr );
|
||||
if ( m_hasupr ) regfree ( &m_upr );
|
||||
|
||||
// make sure we do not leave spiders "hanging" waiting for their
|
||||
// callback to be called... and it never gets called
|
||||
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
|
||||
//if ( m_doingCallbacks ) { char *xx=NULL;*xx=0; }
|
||||
//if ( m_replies != m_requests ) { char *xx=NULL;*xx=0; }
|
||||
m_localCrawlInfo.reset();
|
||||
m_globalCrawlInfo.reset();
|
||||
//m_requests = 0;
|
||||
//m_replies = 0;
|
||||
// free all RdbBases in each rdb
|
||||
for ( long i = 0 ; i < g_process.m_numRdbs ; i++ ) {
|
||||
Rdb *rdb = g_process.m_rdbs[i];
|
||||
rdb->resetBase ( m_collnum );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
CollectionRec *g_cr = NULL;

// . load this data from a conf file
// . values we do not explicitly have will be taken from "default",
// collection config file. if it does not have them then we use
// the value we received from call to setToDefaults()
// . returns false and sets g_errno on load error
bool CollectionRec::load ( char *coll , long i ) {
// also reset some counts not included in parms list
reset();
// before we load, set to defaults in case some are not in xml file
g_parms.setToDefault ( (char *)this );
// get the filename with that id
File f;
char tmp2[1024];
sprintf ( tmp2 , "%scoll.%s.%li/coll.conf", g_hostdb.m_dir , coll,i);
f.set ( tmp2 );
if ( ! f.doesExist () ) return log("admin: %s does not exist.",tmp2);
// set our collection number
m_collnum = i;
// set our collection name
m_collLen = gbstrlen ( coll );
strcpy ( m_coll , coll );

// collection name HACK for backwards compatibility
//if ( strcmp ( coll , "main" ) == 0 ) {
// m_coll[0] = '\0';
// m_collLen = 0;
//}

// the default conf file
char tmp1[1024];
sprintf ( tmp1 , "%sdefault.conf" , g_hostdb.m_dir );

// . set our parms from the file.
// . accepts OBJ_COLLECTIONREC or OBJ_CONF
g_parms.setFromFile ( this , tmp2 , tmp1 );

// add default reg ex IFF there are no url filters there now
if ( m_numRegExs == 0 ) setUrlFiltersToDefaults();

// compile regexs here
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasucr = true;
if ( rx && regcomp ( &m_ucr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}

rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx && regcomp ( &m_upr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}


//
// LOAD the crawlinfo class in the collectionrec for diffbot
//
// LOAD LOCAL
sprintf ( tmp1 , "%scoll.%s.%li/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: loading %s",tmp1);
m_localCrawlInfo.reset();
SafeBuf sb;
// fillfromfile returns 0 if does not exist, -1 on read error
if ( sb.fillFromFile ( tmp1 ) > 0 )
//m_localCrawlInfo.setFromSafeBuf(&sb);
// it is binary now
memcpy ( &m_localCrawlInfo , sb.getBufStart(),sb.length() );
// LOAD GLOBAL
sprintf ( tmp1 , "%scoll.%s.%li/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: loading %s",tmp1);
m_globalCrawlInfo.reset();
sb.reset();
if ( sb.fillFromFile ( tmp1 ) > 0 )
//m_globalCrawlInfo.setFromSafeBuf(&sb);
// it is binary now
memcpy ( &m_globalCrawlInfo , sb.getBufStart(),sb.length() );

// ignore errors i guess
g_errno = 0;


// fix for diffbot
if ( m_isCustomCrawl ) m_dedupingEnabled = true;

// always turn on distributed spider locking because otherwise
// we end up calling Msg50 which calls Msg25 for the same root url
// at the same time, thereby wasting massive resources. it is also
// dangerous to run without this because webmaster get pissed when
// we slam their servers.
// This is now deprecated...
//m_useSpiderLocks = false;
// and all pages downloaded from a particular ip should be done
// by the same host in our cluster to prevent webmaster rage
//m_distributeSpiderGet = true;

//initSortByDateTable(m_coll);

return true;
}
/*
bool CollectionRec::countEvents ( ) {
// set our m_numEventsOnHost value
log("coll: loading event count termlist gbeventcount");
// temporarily turn off threads
bool enabled = g_threads.areThreadsEnabled();
g_threads.disableThreads();
// count them
m_numEventsOnHost = 0;
// 1MB at a time
long minRecSizes = 1000000;
// look up this termlist, gbeventcount which we index in XmlDoc.cpp
long long termId = hash64n("gbeventcount") & TERMID_MASK;
// make datedb key from it
key128_t startKey = g_datedb.makeStartKey ( termId , 0xffffffff );
key128_t endKey = g_datedb.makeEndKey ( termId , 0 );

Msg5 msg5;
RdbList list;

// . init m_numEventsOnHost by getting the exact length of that
// termlist on this host
// . send in the ping request packet so all hosts can total up
// . Rdb.cpp should be added to incrementally so we should have no
// double positives.
// . Rdb.cpp should inspect each datedb rec for this termid in
// a fast an efficient manner
loop:
// use msg5 to get the list, should ALWAYS block since no threads
if ( ! msg5.getList ( RDB_DATEDB ,
m_coll ,
&list ,
(char *)&startKey ,
(char *)&endKey ,
minRecSizes ,
true , // includeTree ,
false , // add to cache?
0 , // max cache age
0 , // startFileNum ,
-1 , // numFiles ,
NULL , // state
NULL , // callback
0 , // niceness
false , // err correction?
NULL , // cache key ptr
0 , // retry num
-1 , // maxRetries
true , // compensate for merge
-1LL , // sync point
NULL )){// msg5b
// not allowed to block!
char *xx=NULL;*xx=0; }
// scan the list, score is how many valid events from that docid
unsigned long total = 0;
for ( ; ! list.isExhausted() ; list.skipCurrentRec() ) {
unsigned char *rec = (unsigned char *)list.getCurrentRec();
// in datedb score is byte #5
total += (255-rec[5]);
}
// declare
char *lastKeyPtr;
key128_t newStartKey;
// add to count. datedb uses half keys so subtract 6 bytes
// since the termids will be the same...
//m_numEventsOnHost += list.getListSize() / (sizeof(key128_t)-6);
m_numEventsOnHost += total;
// bail if under limit
if ( list.getListSize() < minRecSizes ) goto done;
// update key
lastKeyPtr = list.m_listEnd - 10;
// we make a new start key
list.getKey ( lastKeyPtr , (char *)&newStartKey );
// maxxed out?
if ( newStartKey.n0==0xffffffffffffffffLL &&
newStartKey.n1==0xffffffffffffffffLL )
goto done;
// sanity check
if ( newStartKey < startKey ) { char *xx=NULL;*xx=0; }
if ( newStartKey > endKey ) { char *xx=NULL;*xx=0; }
// inc it
newStartKey.n0++;
// in the top if the bottom wrapped
if ( newStartKey.n0 == 0LL ) newStartKey.n1++;
// assign
startKey = newStartKey;
// and loop back up for more now
goto loop;

done:

// update all colls count
g_collectiondb.m_numEventsAllColls += m_numEventsOnHost;

if ( enabled ) g_threads.enableThreads();
log("coll: got %li local events in termlist",m_numEventsOnHost);

// set "m_hasDocQualityFiler"
//updateFilters();

return true;
}
*/
void CollectionRec::setUrlFiltersToDefaults ( ) {
bool addDefault = false;
if ( m_numRegExs == 0 )
addDefault = true;
//if ( m_numRegExs > 0 && strcmp(m_regExs[m_numRegExs-1],"default") )
// addDefault = true;
if ( ! addDefault ) return;

long n = 0;

//strcpy(m_regExs [n],"default");
m_regExs[n].set("default");
m_regExs[n].nullTerm();
m_numRegExs++;

m_spiderFreqs [n] = 30; // 30 days default
m_numRegExs2++;

m_spiderPriorities[n] = 0;
m_numRegExs3++;

m_maxSpidersPerRule[n] = 99;
m_numRegExs10++;

m_spiderIpWaits[n] = 1000;
m_numRegExs5++;

m_spiderIpMaxSpiders[n] = 1;
m_numRegExs6++;

m_spidersEnabled[n] = 1;
m_numRegExs7++;

//m_spiderDiffbotApiNum[n] = 1;
//m_numRegExs11++;
m_spiderDiffbotApiUrl[n].set("");
m_spiderDiffbotApiUrl[n].nullTerm();
m_numRegExs11++;
}
/*
bool CrawlInfo::print (SafeBuf *sb ) {
return sb->safePrintf("objectsAdded:%lli\n"
"objectsDeleted:%lli\n"
"urlsConsidered:%lli\n"
"downloadAttempts:%lli\n"
"downloadSuccesses:%lli\n"
"processAttempts:%lli\n"
"processSuccesses:%lli\n"
"lastupdate:%lu\n"
, m_objectsAdded
, m_objectsDeleted
, m_urlsConsidered
, m_pageDownloadAttempts
, m_pageDownloadSuccesses
, m_pageProcessAttempts
, m_pageProcessSuccesses
, m_lastUpdateTime
);
}

bool CrawlInfo::setFromSafeBuf (SafeBuf *sb ) {
return sscanf(sb->getBufStart(),
"objectsAdded:%lli\n"
"objectsDeleted:%lli\n"
"urlsConsidered:%lli\n"
"downloadAttempts:%lli\n"
"downloadSuccesses:%lli\n"
"processAttempts:%lli\n"
"processSuccesses:%lli\n"
"lastupdate:%lu\n"
, &m_objectsAdded
, &m_objectsDeleted
, &m_urlsConsidered
, &m_pageDownloadAttempts
, &m_pageDownloadSuccesses
, &m_pageProcessAttempts
, &m_pageProcessSuccesses
, &m_lastUpdateTime
);
}
*/
// returns false on failure and sets g_errno, true otherwise
bool CollectionRec::save ( ) {
if ( g_conf.m_readOnlyMode ) return true;
//File f;
char tmp[1024];
//sprintf ( tmp , "%scollections/%li.%s/c.conf",
// g_hostdb.m_dir,m_id,m_coll);
// collection name HACK for backwards compatibility
//if ( m_collLen == 0 )
// sprintf ( tmp , "%scoll.main/coll.conf", g_hostdb.m_dir);
//else
sprintf ( tmp , "%scoll.%s.%li/coll.conf",
g_hostdb.m_dir , m_coll , (long)m_collnum );
if ( ! g_parms.saveToXml ( (char *)this , tmp ) ) return false;
// log msg
//log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());

//
// save the crawlinfo class in the collectionrec for diffbot
//
// SAVE LOCAL
sprintf ( tmp , "%scoll.%s.%li/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
//log("coll: saving %s",tmp);
SafeBuf sb;
//m_localCrawlInfo.print ( &sb );
// binary now
sb.safeMemcpy ( &m_localCrawlInfo , sizeof(CrawlInfo) );
if ( sb.dumpToFile ( tmp ) == -1 ) {
log("coll: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}
// SAVE GLOBAL
sprintf ( tmp , "%scoll.%s.%li/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
//log("coll: saving %s",tmp);
sb.reset();
//m_globalCrawlInfo.print ( &sb );
// binary now
sb.safeMemcpy ( &m_globalCrawlInfo , sizeof(CrawlInfo) );
if ( sb.dumpToFile ( tmp ) == -1 ) {
log("coll: failed to save file %s : %s",
tmp,mstrerror(g_errno));
g_errno = 0;
}

// do not need a save now
m_needsSave = false;
return true;
}
// calls hasPermissin() below
bool CollectionRec::hasPermission ( HttpRequest *r , TcpSocket *s ) {
long plen;
char *p = r->getString ( "pwd" , &plen );
long ip = s->m_ip;
return hasPermission ( p , plen , ip );
}


// . does this password work for this collection?
bool CollectionRec::isAssassin ( long ip ) {
// ok, make sure they came from an acceptable IP
//for ( long i = 0 ; i < m_numSpamIps ; i++ )
// // they also have a matching IP, so they now have permission
// if ( m_spamIps[i] == ip ) return true;
return false;
}

// . does this password work for this collection?
bool CollectionRec::hasPermission ( char *p, long plen , long ip ) {
// just return true
// collection permission is checked from Users::verifyColl
// in User::getUserType for every request
return true;

// scan the passwords
// MDW: no longer, this is too vulnerable!!!
/*
for ( long i = 0 ; i < m_numAdminPwds ; i++ ) {
long len = gbstrlen ( m_adminPwds[i] );
if ( len != plen ) continue;
if ( strncmp ( m_adminPwds[i] , p , plen ) != 0 ) continue;
// otherwise it's a match!
//goto checkIp;
// . matching one password is good enough now, default OR
// . because just matching an IP is good enough security,
// there is really no need for both IP AND passwd match
return true;
}
*/
// . if had passwords but the provided one didn't match, return false
// . matching one password is good enough now, default OR
//if ( m_numPasswords > 0 ) return false;
// checkIp:
// ok, make sure they came from an acceptable IP
//for ( long i = 0 ; i < m_numAdminIps ; i++ )
// // they also have a matching IP, so they now have permission
// if ( m_adminIps[i] == ip ) return true;
// if no security, allow all NONONONONONONONONO!!!!!!!!!!!!!!
//if ( m_numAdminPwds == 0 && m_numAdminIps == 0 ) return true;
// if they did not match an ip or password, even if both lists
// are empty, do not allow access... this prevents security breeches
// by accident
return false;
// if there were IPs then they failed to get in
//if ( m_numAdminIps > 0 ) return false;
// otherwise, they made it
//return true;
}

// can this ip perform a search or add url on this collection?
bool CollectionRec::hasSearchPermission ( TcpSocket *s , long encapIp ) {
// get the ip
long ip = 0; if ( s ) ip = s->m_ip;
// and the ip domain
long ipd = 0; if ( s ) ipd = ipdom ( s->m_ip );
// and top 2 bytes for the israel isp that has this huge block
long ipt = 0; if ( s ) ipt = iptop ( s->m_ip );
// is it in the ban list?
/*
for ( long i = 0 ; i < m_numBanIps ; i++ ) {
if ( isIpTop ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipt ) return false;
continue;
}
// check for ip domain match if this banned ip is an ip domain
if ( isIpDom ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipd ) return false;
continue;
}
// otherwise it's just a single banned ip
if ( m_banIps[i] == ip ) return false;
}
*/
// check the encapsulate ip if any
// 1091771468731 0 Aug 05 23:51:08 63.236.25.77 GET
// /search?code=mammaXbG&uip=65.87.190.39&n=15&raw=8&q=farm+insurance
// +nj+state HTTP/1.0
/*
if ( encapIp ) {
ipd = ipdom ( encapIp );
ip = encapIp;
for ( long i = 0 ; i < m_numBanIps ; i++ ) {
if ( isIpDom ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipd ) return false;
continue;
}
if ( isIpTop ( m_banIps[i] ) ) {
if ( m_banIps[i] == ipt ) return false;
continue;
}
if ( m_banIps[i] == ip ) return false;
}
}
*/

return true;
/*
// do we have an "only" list?
if ( m_numSearchIps == 0 ) return true;
// it must be in that list if we do
for ( long i = 0 ; i < m_numSearchIps ; i++ ) {
// check for ip domain match if this banned ip is an ip domain
if ( isIpDom ( m_searchIps[i] ) ) {
if ( m_searchIps[i] == ipd ) return true;
continue;
}
// otherwise it's just a single ip
if ( m_searchIps[i] == ip ) return true;
}
*/

// otherwise no permission
return false;
}
bool CollectionRec::rebuildUrlFilters ( ) {

char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;

// if we had a regex, that works for this purpose as well
if ( ! ucp ) ucp = m_diffbotUrlCrawlRegEx.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;


char *upp = m_diffbotUrlProcessPattern.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;

// if we had a regex, that works for this purpose as well
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;


// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
if ( api && ! api[0] ) api = NULL;

// convert from seconds to milliseconds. default is 250ms?
long wait = (long)(m_collectiveCrawlDelay * 1000.0);
// default to 250ms i guess. -1 means unset i think.
if ( m_collectiveCrawlDelay < 0.0 ) wait = 250;

// make the gigablast regex table just "default" so it does not
// filtering, but accepts all urls. we will add code to pass the urls
// through m_diffbotUrlCrawlPattern alternatively. if that itself
// is empty, we will just restrict to the seed urls subdomain.
for ( long i = 0 ; i < MAX_FILTERS ; i++ ) {
m_regExs[i].purge();
m_spiderPriorities[i] = 0;
m_maxSpidersPerRule [i] = 10;
m_spiderIpWaits [i] = wait;
m_spiderIpMaxSpiders[i] = 7; // keep it respectful
m_spidersEnabled [i] = 1;
m_spiderFreqs [i] =m_collectiveRespiderFrequency;
m_spiderDiffbotApiUrl[i].purge();
m_harvestLinks[i] = true;
}

long i = 0;


// 1st default url filter
m_regExs[i].set("ismedia && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;

// 2nd default filter
if ( m_restrictDomain ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}

// 3rd rule for respidering
if ( m_collectiveRespiderFrequency > 0.0 ) {
m_regExs[i].set("lastspidertime>={roundstart}");
// do not "remove" from index
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
m_spidersEnabled [i] = 0;
i++;
}
// if collectiverespiderfreq is 0 or less then do not RE-spider
// documents already indexed.
else {
// this does NOT work! error docs continuosly respider
// because they are never indexed!!! like EDOCSIMPLIFIEDREDIR
//m_regExs[i].set("isindexed");
m_regExs[i].set("hasreply");
m_spiderPriorities [i] = 10;
// just turn off spidering. if we were to set priority to
// filtered it would be removed from index!
m_spidersEnabled [i] = 0;
i++;
}

// and for docs that have errors respider once every 5 hours
m_regExs[i].set("errorcount>0 && errcount<3");
m_spiderPriorities [i] = 40;
m_spiderFreqs [i] = 0.2; // half a day
i++;

// excessive errors? (tcp/dns timed out, etc.) retry once per month?
m_regExs[i].set("errorcount>=3");
m_spiderPriorities [i] = 30;
m_spiderFreqs [i] = 30; // 30 days
i++;

// url crawl and process pattern
if ( ucp && upp ) {
m_regExs[i].set("matchesucp && matchesupp");
m_spiderPriorities [i] = 55;
m_spiderDiffbotApiUrl[i].set ( api );
i++;
// if just matches ucp, just crawl it, do not process
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 54;
i++;
// just process, do not spider links if does not match ucp
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 53;
m_harvestLinks [i] = false;
m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
m_regExs[i].set("default");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}

// harvest links if we should crawl it
if ( ucp && ! upp ) {
m_regExs[i].set("matchesucp");
m_spiderPriorities [i] = 54;
// process everything since upp is empty
m_spiderDiffbotApiUrl[i].set ( api );
i++;
// do not crawl anything else
m_regExs[i].set("default");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}

// just process
if ( upp && ! ucp ) {
m_regExs[i].set("matchesupp");
m_spiderPriorities [i] = 53;
//m_harvestLinks [i] = false;
m_spiderDiffbotApiUrl[i].set ( api );
i++;
// crawl everything by default, no processing
m_regExs[i].set("default");
m_spiderPriorities [i] = 50;
i++;
}

// no restraints
if ( ! upp && ! ucp ) {
// crawl everything by default, no processing
m_regExs[i].set("default");
m_spiderPriorities [i] = 50;
m_spiderDiffbotApiUrl[i].set ( api );
i++;
}

m_numRegExs = i;
m_numRegExs2 = i;
m_numRegExs3 = i;
m_numRegExs10 = i;
m_numRegExs5 = i;
m_numRegExs6 = i;
m_numRegExs7 = i;
m_numRegExs8 = i;
m_numRegExs11 = i;

return true;
}
106 Collectiondb.cpp
@@ -24,6 +24,8 @@
#include "Users.h"
#include "Parms.h"

void testRegex ( ) ;

HashTableX g_collTable;

// a global class extern'd in .h file
@@ -1402,31 +1404,8 @@ bool CollectionRec::load ( char *coll , long i ) {
// add default reg ex IFF there are no url filters there now
if ( m_numRegExs == 0 ) setUrlFiltersToDefaults();

// compile regexs here
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasucr = true;
if ( rx && regcomp ( &m_ucr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}

rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx && regcomp ( &m_upr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}

// temp check
//testRegex();

//
// LOAD the crawlinfo class in the collectionrec for diffbot
@@ -1847,7 +1826,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_isCustomCrawl != 2 ) // bulk api
return true;

log(LOG_DEBUG,"db: rebuilding url filters");
logf(LOG_DEBUG,"db: rebuilding url filters");

char *ucp = m_diffbotUrlCrawlPattern.getBufStart();
if ( ucp && ! ucp[0] ) ucp = NULL;
@@ -2009,5 +1988,80 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_numRegExs8 = i;
m_numRegExs11 = i;

///////
//
// recompile regular expressions
//
///////


char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasucr = true;
if ( rx && regcomp ( &m_ucr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}


rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx && regcomp ( &m_upr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}

return true;
}
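The hunk above carries the substance of the fix: a failed regcomp() in rebuildUrlFilters() now logs the error, frees the pattern buffer, and clears m_hasucr/m_hasupr, instead of returning early from load() with the "has regex" flag left set and no usable regex behind it. A minimal standalone sketch of that compile-or-disable pattern (the function and variable names here are illustrative, not part of the commit; the commit itself reports the error through mstrerror(errno) and the collection's own log()):

#include <regex.h>
#include <stdbool.h>
#include <stdio.h>

// Compile "pattern" into "*re" with the flags the collection code uses.
// On failure: report the error, free the regex_t and leave *hasRe false,
// so callers simply skip the filter instead of using a half-built regex.
static bool compileUrlRegex ( regex_t *re , bool *hasRe , const char *pattern ) {
	*hasRe = false;
	if ( ! pattern || ! pattern[0] ) return true; // nothing to compile
	int err = regcomp ( re , pattern ,
	                    REG_EXTENDED|REG_ICASE|REG_NEWLINE|REG_NOSUB );
	if ( err ) {
		char msg[256];
		regerror ( err , re , msg , sizeof(msg) );
		fprintf ( stderr , "regcomp %s failed: %s. Ignoring.\n" ,
		          pattern , msg );
		regfree ( re );
		return false;
	}
	*hasRe = true;
	return true;
}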
void testRegex ( ) {

//
// TEST
//

char *rx;

rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=\\d";

rx = "(http://)?(www.)?vault.com/rankings-reviews/company-rankings/law/vault-law-100/\\.aspx\\?pg=[0-9]";

regex_t ucr;

if ( regcomp ( &ucr , rx ,
REG_ICASE
|REG_EXTENDED
//|REG_NEWLINE
//|REG_NOSUB
) ) {
// error!
log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}

logf(LOG_DEBUG,"db: compiled '%s' for crawl pattern",rx);

char *url = "http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2";

if ( regexec(&ucr,url,0,NULL,0) )
logf(LOG_DEBUG,"db: failed to match %s on %s",
url,rx);
else
logf(LOG_DEBUG,"db: MATCHED %s on %s",
url,rx);
exit(0);
}
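testRegex() above documents the reason for the pattern change: regcomp() with REG_EXTENDED implements POSIX extended regular expressions, which have no \d escape, so the first pattern never matches the paging URLs while the rewritten one with an explicit [0-9] class does. A self-contained illustration of the difference (an assumption: built against the system <regex.h>, where glibc treats the unsupported \d as a literal 'd'):

#include <regex.h>
#include <stdio.h>

int main ( void ) {
	const char *url = "http://www.vault.com/rankings-reviews/"
	                  "company-rankings/law/vault-law-100/.aspx?pg=2";
	// \d is not a POSIX ERE escape; [0-9] is the portable digit class.
	const char *pats[2] = { "pg=\\d" , "pg=[0-9]" };
	for ( int i = 0 ; i < 2 ; i++ ) {
		regex_t re;
		if ( regcomp ( &re , pats[i] , REG_EXTENDED|REG_ICASE ) ) {
			printf ( "regcomp failed for %s\n" , pats[i] );
			continue;
		}
		printf ( "%-10s %s\n" , pats[i] ,
		         regexec ( &re , url , 0 , NULL , 0 ) == 0 ?
		         "MATCHED" : "no match" );
		regfree ( &re );
	}
	return 0;
}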
@@ -8781,7 +8781,7 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
long urlLen = sreq->getUrlLen();
char *url = sreq->m_url;

//if ( strstr(url,"login.yahoo.com/") )
//if ( strstr(url,"http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2" ))
// log("hey");

//initAggregatorTable();
@@ -15,13 +15,13 @@
<isCustomCrawl>0</>
<maxToCrawl>100001</>
<maxToProcess>100001</>
<maxCrawlRounds>3</>
<maxCrawlRounds>-1</>

# All <, >, " and # characters that are values for a field contained herein
# must be represented as <, >, " and # respectively.

# When enabled the spider adds pages to your index.
<spideringEnabled>0</>
<spideringEnabled>1</>

# make each spider wait this many milliseconds before getting the ip and
# downloading the page.
@@ -308,12 +308,12 @@
<maxRobotstxtCacheAge>86400</>

# Only spider URLs scheduled to be spidered at this time or after. In UTC.
<spiderStartTime>24 Jan 1970 20:00 UTC</>
<spiderStartTime>15 Jan 1970 21:00 UTC</>

# Only spider URLs scheduled to be spidered at this time or before. If "use
# current time" is true then the current local time is used for this value
# instead. in UTC.
<spiderEndTime>08 Jan 1970 08:00 UTC</>
<spiderEndTime>15 Jan 2010 21:00 UTC</>

# Use the current time as the spider end time?
<useCurrentTime>1</>
@@ -728,22 +728,22 @@
<numberOfLinksToScanForRelatedPages>1024</>

# related pages with a quality lower than this will be ignored.
<minRelatedPageQuality>0</>
<minRelatedPageQuality>30</>

# related pages with an adjusted score lower than this will be ignored.
<minRelatedPageScore>0</>
<minRelatedPageScore>1</>

# related pages with less than this number of links will be ignored.
<minRelatedPageLinks>2</>

# A in A * numLinks + B * avgLnkrQlty + C * PgQlty + D * numSRPLinks.
<coefficientForNumberOfLinksInRelatedPagesScoreCalculation>0</>
<coefficientForNumberOfLinksInRelatedPagesScoreCalculation>10</>

# B in A * numLinks + B * avgLnkrQlty + C * PgQlty + D * numSRPLinks.
<coefficientForAverageLinkerQualityInRelatedPagesScoreCalculation>0</>
<coefficientForAverageLinkerQualityInRelatedPagesScoreCalculation>1</>

# C in A * numLinks + B * avgLnkrQlty + C * PgQlty + D * numSRPLinks
<coefficientForPageQualityInRelatedPagesScoreCalculation>0</>
<coefficientForPageQualityInRelatedPagesScoreCalculation>1</>

# D in A * numLinks + B * avgLnkrQlty + C * PgQlty + D * numSRPLinks.
<coefficientForSearchResultLinksInRelatedPagesScoreCalculation>1</>
@@ -756,7 +756,7 @@
<highlightQueryTermsInRelatedPagesSummary>0</>

# Truncates a related page title after this many charaters and adds ...
<numberOfCharactersToDisplayInTitleBeforeTruncating>0</>
<numberOfCharactersToDisplayInTitleBeforeTruncating>50</>

# Use the search results' links in order to generate related pages.
<useResultsPagesAsReferences>0</>
@@ -859,7 +859,7 @@
# <br> tags are inserted to keep the number of chars in the summary per line
# at or below this width. Strings without spaces that exceed this width are
# not split.
<maxSummaryLineWidth>0</>
<maxSummaryLineWidth>80</>

# Maximum number of characters to allow in between search terms.
<ProxSummaryCarverRadius>256</>
@@ -935,17 +935,7 @@
# expressions. Use the <i>&&</i> operator to string multiple expressions
# together in the same text box. <br><br>
<filterExpression><![CDATA[isdocidbased]]></>
<filterExpression><![CDATA[$.css]]></>
<filterExpression><![CDATA[$.mpeg]]></>
<filterExpression><![CDATA[$.mpg]]></>
<filterExpression><![CDATA[$.mp3]]></>
<filterExpression><![CDATA[$.wmv]]></>
<filterExpression><![CDATA[.css?]]></>
<filterExpression><![CDATA[$.jpg]]></>
<filterExpression><![CDATA[$.JPG]]></>
<filterExpression><![CDATA[$.gif]]></>
<filterExpression><![CDATA[$.ico]]></>
<filterExpression><![CDATA[/print/]]></>
<filterExpression><![CDATA[ismedia]]></>
<filterExpression><![CDATA[errorcount>=3 && hastmperror]]></>
<filterExpression><![CDATA[errorcount>=1 && hastmperror]]></>
<filterExpression><![CDATA[isaddurl]]></>
@@ -961,23 +951,131 @@
<filterExpression><![CDATA[hopcount>=3]]></>
<filterExpression><![CDATA[isnew]]></>
<filterExpression><![CDATA[default]]></>

# Use <harvestLinks> tag.

# Use <spidersEnabled> tag.

# Use <filterFrequency> tag.
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<harvestLinks>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>7.000000</>
<filterFrequency>7.000000</>
<filterFrequency>7.000000</>
<filterFrequency>10.000000</>
<filterFrequency>20.000000</>
<filterFrequency>20.000000</>
<filterFrequency>40.000000</>
<filterFrequency>40.000000</>
<filterFrequency>60.000000</>
<filterFrequency>60.000000</>
<filterFrequency>30.000000</>
<filterFrequency>30.000000</>

# Do not allow more than this many outstanding spiders for all urls in this
# priority.
# Use <maxSpidersPerRule> tag.
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>4</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>

# Allow this many spiders per IP.
# Use <maxSpidersPerIp> tag.
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>

# Wait at least this long before downloading urls from the same IP address.
# Use <spiderIpWait> tag.

# Use <filterPriority> tag.
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<filterPriority>80</>
<filterPriority>-3</>
<filterPriority>3</>
<filterPriority>45</>
<filterPriority>85</>
<filterPriority>50</>
<filterPriority>48</>
<filterPriority>49</>
<filterPriority>47</>
<filterPriority>40</>
<filterPriority>39</>
<filterPriority>30</>
<filterPriority>29</>
<filterPriority>20</>
<filterPriority>19</>
<filterPriority>1</>
<filterPriority>0</>

# Use <diffbotAPI> tag.