mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
new site list updates
This commit is contained in:
parent
ed626b162a
commit
11e8c16878
104
PageBasic.cpp
104
PageBasic.cpp
@ -168,7 +168,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
// make hash of the line
|
||||
long h32 = hash32 ( s , pe - s );
|
||||
|
||||
bool exact = false;
|
||||
bool seedMe = true;
|
||||
bool isUrl = true;
|
||||
bool isNeg = false;
|
||||
|
||||
innerLoop:
|
||||
// skip spaces at start of line
|
||||
@ -181,44 +183,89 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
if ( *s == '\n' ) continue;
|
||||
|
||||
// all?
|
||||
if ( *s == '*' ) {
|
||||
sc->m_siteListAsteriskLine = start;
|
||||
continue;
|
||||
}
|
||||
//if ( *s == '*' ) {
|
||||
// sc->m_siteListAsteriskLine = start;
|
||||
// continue;
|
||||
//}
|
||||
|
||||
if ( *s == '-' ) {
|
||||
sc->m_siteListHasNegatives = true;
|
||||
isNeg = true;
|
||||
s++;
|
||||
}
|
||||
|
||||
// exact:?
|
||||
if ( strncmp(s,"exact:",6) == 0 ) {
|
||||
exact = true;
|
||||
s += 6;
|
||||
goto innerLoop;
|
||||
}
|
||||
|
||||
u.set ( s , pe - s );
|
||||
if ( strncmp(s,"seed:",5) == 0 ) {
|
||||
s += 5;
|
||||
goto innerLoop;
|
||||
}
|
||||
|
||||
if ( strncmp(s,"site:",5) == 0 ) {
|
||||
s += 5;
|
||||
seedMe = false;
|
||||
goto innerLoop;
|
||||
}
|
||||
|
||||
if ( strncmp(s,"contains:",9) == 0 ) {
|
||||
s += 9;
|
||||
seedMe = false;
|
||||
isUrl = false;
|
||||
goto innerLoop;
|
||||
}
|
||||
|
||||
long slen = pe - s;
|
||||
|
||||
// empty line?
|
||||
if ( pe - s <= 0 )
|
||||
if ( slen <= 0 )
|
||||
continue;
|
||||
|
||||
if ( ! isUrl ) {
|
||||
// add to string buffers
|
||||
if ( isNeg ) {
|
||||
if ( !sc->m_negSubstringBuf.safeMemcpy(s,slen))
|
||||
return true;
|
||||
if ( !sc->m_negSubstringBuf.pushChar('\0') )
|
||||
return true;
|
||||
continue;
|
||||
}
|
||||
// add to string buffers
|
||||
if ( ! sc->m_posSubstringBuf.safeMemcpy(s,slen) )
|
||||
return true;
|
||||
if ( ! sc->m_posSubstringBuf.pushChar('\0') )
|
||||
return true;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
u.set ( s , slen );
|
||||
|
||||
// error? skip it then...
|
||||
if ( u.getHostLen() <= 0 ) {
|
||||
log("basic: error on line #%li in sitelist",lineNum);
|
||||
continue;
|
||||
}
|
||||
|
||||
// is fake ip assigned to us?
|
||||
long firstIp = getFakeIpForUrl2 ( &u );
|
||||
|
||||
if ( ! isAssignedToUs( firstIp ) ) continue;
|
||||
|
||||
// see if in existing table for existing site list
|
||||
if ( addSeeds &&
|
||||
// a "site:" directive means no seeding
|
||||
// a "contains:" directive means no seeding
|
||||
seedMe &&
|
||||
! dedup.isInTable ( &h32 ) ) {
|
||||
// make spider request
|
||||
SpiderRequest sreq;
|
||||
sreq.setFromAddUrl ( u.getUrl() );
|
||||
// is fake ip assigned to us?
|
||||
if ( isAssignedToUs( sreq.m_firstIp ) &&
|
||||
// . add this url to spiderdb as a spiderrequest
|
||||
if (
|
||||
// . add this url to spiderdb as a spiderrequest
|
||||
// . calling msg4 will be the last thing we do
|
||||
!spiderReqBuf->safeMemcpy(&sreq,sreq.getRecSize()))
|
||||
return true;
|
||||
@ -226,7 +273,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
added++;
|
||||
|
||||
}
|
||||
|
||||
|
||||
// make the data node
|
||||
PatternData pd;
|
||||
// hash of the subdomain or domain for this line in sitelist
|
||||
@ -304,13 +351,25 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
|
||||
|
||||
// if it has * and no negatives, we are in!
|
||||
if ( sc->m_siteListAsteriskLine && ! sc->m_siteListHasNegatives )
|
||||
return sc->m_siteListAsteriskLine;
|
||||
//if ( sc->m_siteListAsteriskLine && ! sc->m_siteListHasNegatives )
|
||||
// return sc->m_siteListAsteriskLine;
|
||||
|
||||
// if it is just a bunch of comments or blank lines, it is empty
|
||||
if ( sc->m_siteListIsEmpty )
|
||||
return NULL;
|
||||
|
||||
// if we had a list of contains: or regex: directives in the sitelist
|
||||
// we have to linear scan those
|
||||
char *nb = sc->m_negSubstringBuf.getBufStart();
|
||||
char *nbend = nb + sc->m_negSubstringBuf.getLength();
|
||||
for ( ; nb && nb < nbend ; ) {
|
||||
// return NULL if matches a negative substring
|
||||
if ( strstr ( sreq->m_url , nb ) ) return NULL;
|
||||
// skip it
|
||||
nb += strlen(nb) + 1;
|
||||
}
|
||||
|
||||
|
||||
char *myPath = NULL;
|
||||
|
||||
// check domain specific tables
|
||||
@ -359,8 +418,21 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
|
||||
return pd->m_patternStr;
|
||||
}
|
||||
|
||||
|
||||
// if we had a list of contains: or regex: directives in the sitelist
|
||||
// we have to linear scan those
|
||||
char *pb = sc->m_posSubstringBuf.getBufStart();
|
||||
char *pend = pb + sc->m_posSubstringBuf.length();
|
||||
for ( ; pb && pb < pend ; ) {
|
||||
// return the pattern itself if the url contains this positive substring
|
||||
if ( strstr ( sreq->m_url , pb ) ) return pb;
|
||||
// skip it
|
||||
pb += strlen(pb) + 1;
|
||||
}
|
||||
|
||||
|
||||
// is there an '*' in the patterns?
|
||||
if ( sc->m_siteListAsteriskLine ) return sc->m_siteListAsteriskLine;
|
||||
//if ( sc->m_siteListAsteriskLine ) return sc->m_siteListAsteriskLine;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@ -541,7 +613,7 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
"</tr>"
|
||||
|
||||
"<tr>"
|
||||
"<td>-contain:badstuff</td>"
|
||||
"<td>-contains:badstuff</td>"
|
||||
"<td>Matches if does NOT contain badstuff."
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
17
Spider.cpp
17
Spider.cpp
@ -12487,6 +12487,22 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long getFakeIpForUrl1 ( char *url1 ) {
|
||||
// make the probable docid
|
||||
long long probDocId = g_titledb.getProbableDocId ( url1 );
|
||||
// make one up, like we do in PageReindex.cpp
|
||||
long firstIp = (probDocId & 0xffffffff);
|
||||
return firstIp;
|
||||
}
|
||||
|
||||
// Makes up a "fake" first IP for an already-parsed Url object: the low 32
// bits of the probable docid that g_titledb computes for that url (the same
// trick the original comment attributes to PageReindex.cpp).
// NOTE: the value is returned as-is; no clamping of 0 or -1 is done here.
long getFakeIpForUrl2 ( Url *url2 ) {
	// probable docid for this url
	const long long docId = g_titledb.getProbableDocId ( url2 );
	// keep only the low 32 bits as the made-up ip
	return (long)( docId & 0xffffffff );
}
|
||||
|
||||
// returns false and sets g_errno on error
|
||||
bool SpiderRequest::setFromAddUrl ( char *url ) {
|
||||
// reset it
|
||||
@ -12496,6 +12512,7 @@ bool SpiderRequest::setFromAddUrl ( char *url ) {
|
||||
|
||||
// make one up, like we do in PageReindex.cpp
|
||||
long firstIp = (probDocId & 0xffffffff);
|
||||
//long firstIp = getFakeIpForUrl1 ( url );
|
||||
|
||||
// ensure not crazy
|
||||
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
|
||||
|
7
Spider.h
7
Spider.h
@ -49,6 +49,9 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
|
||||
class SafeBuf *msg ,
|
||||
long *status ) ;
|
||||
|
||||
// made-up first ip for a url string: low 32 bits of its probable docid
long getFakeIpForUrl1 ( char *url1 ) ;
|
||||
// made-up first ip for a parsed Url: low 32 bits of its probable docid
long getFakeIpForUrl2 ( Url *url2 ) ;
|
||||
|
||||
// Overview of Spider
|
||||
//
|
||||
// this new spider algorithm ensures that urls get spidered even if a host
|
||||
@ -1105,6 +1108,10 @@ class SpiderColl {
|
||||
bool m_siteListIsEmpty;
|
||||
// data buckets in this table are of type
|
||||
HashTableX m_siteListDomTable;
|
||||
// substring matches like "contains:goodstuff" or
|
||||
// later "regex:.*"
|
||||
SafeBuf m_negSubstringBuf;
|
||||
SafeBuf m_posSubstringBuf;
|
||||
|
||||
RdbCache m_dupCache;
|
||||
RdbTree m_winnerTree;
|
||||
|
Loading…
Reference in New Issue
Block a user