mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
seems like we can spider multiple urls
from same ip at same time now.
This commit is contained in:
parent
8461e33b53
commit
e594af898a
@ -620,6 +620,10 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
m_isLocal = true;
|
||||
#endif
|
||||
|
||||
// comcast
|
||||
//if ( sock && strncmp(iptoa(sock->m_ip),"75.160.49.8",11) == 0)
|
||||
// m_isLocal = true;
|
||||
|
||||
// roadrunner ip
|
||||
// if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)
|
||||
// m_isLocal = true;
|
||||
|
21
Rdb.cpp
21
Rdb.cpp
@ -1951,7 +1951,9 @@ bool Rdb::addRecord ( collnum_t collnum,
|
||||
if ( KEYNEG(key) ) {
|
||||
// log debug
|
||||
logf(LOG_DEBUG,"spflow: removed doledb key "
|
||||
"for uh48=%llu",
|
||||
"for pri=%li time=%lu uh48=%llu",
|
||||
(long)g_doledb.getPriority(&doleKey),
|
||||
(long)g_doledb.getSpiderTime(&doleKey),
|
||||
g_doledb.getUrlHash48(&doleKey));
|
||||
}
|
||||
else {
|
||||
@ -1959,9 +1961,14 @@ bool Rdb::addRecord ( collnum_t collnum,
|
||||
//SpiderColl *sc = g_spiderCache.getSpiderColl(collnum)
|
||||
// do not overflow!
|
||||
// log debug
|
||||
logf(LOG_DEBUG,"spflow: added doledb key "
|
||||
"for uh48=%llu",
|
||||
g_doledb.getUrlHash48(&doleKey));
|
||||
SpiderRequest *sreq = (SpiderRequest *)data;
|
||||
logf(LOG_DEBUG,"spflow: added doledb key "
|
||||
"for pri=%li time=%lu uh48=%llu docid=%lli u=%s",
|
||||
(long)g_doledb.getPriority(&doleKey),
|
||||
(long)g_doledb.getSpiderTime(&doleKey),
|
||||
g_doledb.getUrlHash48(&doleKey),
|
||||
sreq->m_probDocId,
|
||||
sreq->m_url);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2198,8 +2205,10 @@ bool Rdb::addRecord ( collnum_t collnum,
|
||||
(char *)key,
|
||||
sizeof(key_t) );
|
||||
// debug log
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("rdb: cursor reset pri=%li to %s",
|
||||
if ( g_conf.m_logDebugSpider ||
|
||||
g_conf.m_logDebugSpiderFlow )
|
||||
log("spflow: cursor reset pri=%li to "
|
||||
"%s",
|
||||
pri,KEYSTR(key,12));
|
||||
}
|
||||
// that's it for doledb mods
|
||||
|
34
Spider.cpp
34
Spider.cpp
@ -2659,7 +2659,7 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
|
||||
// that scanSpiderdb() repopulates doledb again with that
|
||||
// "firstIp". this way we can spider multiple urls from the
|
||||
// same ip at the same time.
|
||||
if ( g_spiderLoop.m_lockTable.isInTable(&sreq->m_probDocId) )
|
||||
if ( g_spiderLoop.isInLockTable(sreq->m_probDocId) )
|
||||
continue;
|
||||
|
||||
// ok, we got a new winner
|
||||
@ -2839,7 +2839,7 @@ bool SpiderColl::scanSpiderdb ( bool needList ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( g_spiderLoop.m_lockTable.isInTable(&m_bestRequest->m_probDocId)){
|
||||
if ( g_spiderLoop.isInLockTable ( m_bestRequest->m_probDocId ) ) {
|
||||
char *xx=NULL;*xx=0; }
|
||||
|
||||
// make the doledb key first for this so we can add it
|
||||
@ -3444,15 +3444,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
|
||||
m_gettingDoledbList = true;
|
||||
|
||||
// log this now
|
||||
if ( g_conf.m_logDebugSpider ) {
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
m_doleStart = gettimeofdayInMillisecondsLocal();
|
||||
// 12 byte doledb keys
|
||||
//long pri = g_doledb.getPriority(&m_sc->m_nextDoledbKey);
|
||||
//logf(LOG_DEBUG,"spider: loading list from doledb startkey=%s"
|
||||
// " pri=%li",
|
||||
// KEYSTR(&m_sc->m_nextDoledbKey,12),
|
||||
// pri);
|
||||
}
|
||||
|
||||
// get a spider rec for us to spider from doledb
|
||||
if ( ! m_msg5.getList ( RDB_DOLEDB ,
|
||||
@ -3570,6 +3563,21 @@ bool SpiderLoop::gotDoledbList2 ( ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// if debugging the spider flow show the start key if list non-empty
|
||||
if ( g_conf.m_logDebugSpiderFlow ) {
|
||||
// 12 byte doledb keys
|
||||
long pri = g_doledb.getPriority(&m_sc->m_nextDoledbKey);
|
||||
long stm = g_doledb.getSpiderTime(&m_sc->m_nextDoledbKey);
|
||||
long long uh48 = g_doledb.getUrlHash48(&m_sc->m_nextDoledbKey);
|
||||
logf(LOG_DEBUG,"spider: loading list from doledb startkey=%s"
|
||||
" pri=%li time=%lu uh48=%llu",
|
||||
KEYSTR(&m_sc->m_nextDoledbKey,12),
|
||||
pri,
|
||||
stm,
|
||||
uh48);
|
||||
}
|
||||
|
||||
|
||||
//time_t nowGlobal = getTimeGlobal();
|
||||
|
||||
// double check
|
||||
@ -4427,6 +4435,12 @@ void gotLockReplyWrapper ( void *state , UdpSlot *slot ) {
|
||||
else g_spiderLoop.spiderDoledUrls();
|
||||
}
|
||||
|
||||
// . returns whatever m_lockTable.isInTable() reports for "probDocId"
// . the table is not probed with the raw probDocId: it is first mapped
//   through g_titledb.getFirstProbableDocId() and that value is the key
// . NOTE(review): presumably lock entries are stored under that same
//   mapped key elsewhere in Spider.cpp -- confirm against the code that
//   adds to m_lockTable
bool SpiderLoop::isInLockTable ( long long probDocId ) {
|
||||
unsigned long long lockKey=g_titledb.getFirstProbableDocId(probDocId);
|
||||
// goes through the g_spiderLoop global rather than "this"
HashTableX *ht = &g_spiderLoop.m_lockTable;
|
||||
return ht->isInTable ( &lockKey );
|
||||
}
|
||||
|
||||
// . returns false if blocked, true otherwise.
|
||||
// . returns true and sets g_errno on error
|
||||
// . before we can spider for a SpiderRequest we must be granted the lock
|
||||
|
Loading…
Reference in New Issue
Block a user