new site list updates

Matt Wells 2014-03-09 17:53:24 -07:00
parent ed626b162a
commit 11e8c16878
3 changed files with 112 additions and 16 deletions

[file 1 of 3]

@@ -168,7 +168,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
// make hash of the line
long h32 = hash32 ( s , pe - s );
bool exact = false;
bool seedMe = true;
bool isUrl = true;
bool isNeg = false;
innerLoop:
// skip spaces at start of line
@@ -181,43 +183,88 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
if ( *s == '\n' ) continue;
// all?
if ( *s == '*' ) {
sc->m_siteListAsteriskLine = start;
continue;
}
//if ( *s == '*' ) {
// sc->m_siteListAsteriskLine = start;
// continue;
//}
if ( *s == '-' ) {
sc->m_siteListHasNegatives = true;
isNeg = true;
s++;
}
// exact:?
if ( strncmp(s,"exact:",6) == 0 ) {
exact = true;
s += 6;
goto innerLoop;
}
u.set ( s , pe - s );
if ( strncmp(s,"seed:",5) == 0 ) {
s += 5;
goto innerLoop;
}
if ( strncmp(s,"site:",5) == 0 ) {
s += 5;
seedMe = false;
goto innerLoop;
}
if ( strncmp(s,"contains:",9) == 0 ) {
s += 9;
seedMe = false;
isUrl = false;
goto innerLoop;
}
long slen = pe - s;
// empty line?
if ( pe - s <= 0 )
if ( slen <= 0 )
continue;
if ( ! isUrl ) {
// add to string buffers
if ( isNeg ) {
if ( !sc->m_negSubstringBuf.safeMemcpy(s,slen))
return true;
if ( !sc->m_negSubstringBuf.pushChar('\0') )
return true;
continue;
}
// add to string buffers
if ( ! sc->m_posSubstringBuf.safeMemcpy(s,slen) )
return true;
if ( ! sc->m_posSubstringBuf.pushChar('\0') )
return true;
continue;
}
u.set ( s , slen );
// error? skip it then...
if ( u.getHostLen() <= 0 ) {
log("basic: error on line #%li in sitelist",lineNum);
continue;
}
// is fake ip assigned to us?
long firstIp = getFakeIpForUrl2 ( &u );
if ( ! isAssignedToUs( firstIp ) ) continue;
// see if in existing table for existing site list
if ( addSeeds &&
// a "site:" directive mean no seeding
// a "contains:" directive mean no seeding
seedMe &&
! dedup.isInTable ( &h32 ) ) {
// make spider request
SpiderRequest sreq;
sreq.setFromAddUrl ( u.getUrl() );
// is fake ip assigned to us?
if ( isAssignedToUs( sreq.m_firstIp ) &&
if (
// . add this url to spiderdb as a spiderrequest
// . calling msg4 will be the last thing we do
!spiderReqBuf->safeMemcpy(&sreq,sreq.getRecSize()))
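The innerLoop above peels an arbitrary stack of directive prefixes off a line before the payload is used. A minimal sketch of that control flow, with the goto rewritten as a loop (parseDirectives and LineFlags are made-up names, not part of this commit):

	#include <cstring>

	// Per-line flags, mirroring the locals declared at the top of the loop.
	struct LineFlags {
		bool exact  = false; // "exact:"   -> match the full url exactly
		bool seedMe = true;  // cleared by "site:" and "contains:" (no seeding)
		bool isUrl  = true;  // cleared by "contains:" (payload is a substring)
		bool isNeg  = false; // leading '-' marks an exclusion pattern
	};

	// s points at the start of a line, pe one past its end.
	// Returns a pointer to the payload (the url or substring).
	static const char *parseDirectives ( const char *s , const char *pe ,
	                                     LineFlags *f ) {
		while ( s < pe ) {
			if ( *s == ' ' ) { s++; continue; }
			if ( *s == '-' ) { f->isNeg = true; s++; continue; }
			if ( strncmp ( s , "exact:"    , 6 ) == 0 ) { f->exact = true; s += 6; continue; }
			if ( strncmp ( s , "seed:"     , 5 ) == 0 ) { s += 5; continue; }
			if ( strncmp ( s , "site:"     , 5 ) == 0 ) { f->seedMe = false; s += 5; continue; }
			if ( strncmp ( s , "contains:" , 9 ) == 0 ) { f->seedMe = false; f->isUrl = false; s += 9; continue; }
			break; // no more directives
		}
		return s;
	}

The payload is then routed on those flags: contains: substrings land in m_negSubstringBuf or m_posSubstringBuf, urls are host-checked, and only lines that kept seedMe set become SpiderRequests.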
@@ -304,13 +351,25 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// if it has * and no negatives, we are in!
if ( sc->m_siteListAsteriskLine && ! sc->m_siteListHasNegatives )
return sc->m_siteListAsteriskLine;
//if ( sc->m_siteListAsteriskLine && ! sc->m_siteListHasNegatives )
// return sc->m_siteListAsteriskLine;
// if it is just a bunch of comments or blank lines, it is empty
if ( sc->m_siteListIsEmpty )
return NULL;
// if we had a list of contains: or regex: directives in the sitelist
// we have to scan them linearly
char *nb = sc->m_negSubstringBuf.getBufStart();
char *nbend = nb + sc->m_negSubstringBuf.getLength();
for ( ; nb && nb < nbend ; ) {
// return NULL if matches a negative substring
if ( strstr ( sreq->m_url , nb ) ) return NULL;
// skip it
nb += strlen(nb) + 1;
}
char *myPath = NULL;
// check domain specific tables
@@ -359,8 +418,21 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
return pd->m_patternStr;
}
// if we had a list of contains: or regex: directives in the sitelist
// we have to scan them linearly
char *pb = sc->m_posSubstringBuf.getBufStart();
char *pend = pb + sc->m_posSubstringBuf.length();
for ( ; pb && pb < pend ; ) {
// return the pattern if the url contains a positive substring
if ( strstr ( sreq->m_url , pb ) ) return pb;
// skip it
pb += strlen(pb) + 1;
}
// is there an '*' in the patterns?
if ( sc->m_siteListAsteriskLine ) return sc->m_siteListAsteriskLine;
//if ( sc->m_siteListAsteriskLine ) return sc->m_siteListAsteriskLine;
return NULL;
}
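The two substring loops above (negative first, then positive) walk the same layout: patterns stored back to back as NUL-terminated strings. A sketch of that walk factored into one helper (findSubstringMatch is a hypothetical name):

	#include <cstring>

	// buf..bufEnd delimits a region of '\0'-separated patterns, as built
	// by safeMemcpy() followed by pushChar('\0') in updateSiteList.
	static const char *findSubstringMatch ( const char *url ,
	                                        const char *buf ,
	                                        const char *bufEnd ) {
		for ( const char *p = buf ; p && p < bufEnd ; p += strlen(p) + 1 )
			if ( strstr ( url , p ) )
				return p; // first pattern contained in the url
		return NULL;
	}

A hit in the negative buffer vetoes the url outright; a hit in the positive buffer is returned as the matching pattern.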
@@ -541,7 +613,7 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
"</tr>"
"<tr>"
"<td>-contain:badstuff</td>"
"<td>-contains:badstuff</td>"
"<td>Matches if the url does NOT contain badstuff."
"</td>"
"</tr>"

[file 2 of 3]

@@ -12487,6 +12487,22 @@ bool doesStringContainPattern ( char *content , char *pattern ) {
return false;
}
long getFakeIpForUrl1 ( char *url1 ) {
// make the probable docid
long long probDocId = g_titledb.getProbableDocId ( url1 );
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
return firstIp;
}
long getFakeIpForUrl2 ( Url *url2 ) {
// make the probable docid
long long probDocId = g_titledb.getProbableDocId ( url2 );
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
return firstIp;
}
// returns false and sets g_errno on error
bool SpiderRequest::setFromAddUrl ( char *url ) {
// reset it
@@ -12496,6 +12512,7 @@ bool SpiderRequest::setFromAddUrl ( char *url ) {
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
//long firstIp = getFakeIpForUrl1 ( url );
// ensure not crazy
if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
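The fake ip is just the low 32 bits of the url's probable docid, which gives every url a stable pseudo-address before any DNS lookup happens; updateSiteList feeds it to isAssignedToUs() so hosts can split the seeding work deterministically. A sketch of the derivation, with computeProbableDocId standing in for g_titledb.getProbableDocId() (the stand-in is an assumption, not this codebase's API):

	// Hypothetical stand-in for g_titledb.getProbableDocId().
	long long computeProbableDocId ( const char *url );

	long makeFakeIp ( const char *url ) {
		long long probDocId = computeProbableDocId ( url );
		// keep the low 32 bits, like getFakeIpForUrl1/2 above
		long firstIp = (long)( probDocId & 0xffffffff );
		// 0 and -1 read as "no ip" / error elsewhere, so dodge them,
		// mirroring the guard in setFromAddUrl
		if ( firstIp == -1 || firstIp == 0 ) firstIp = 1;
		return firstIp;
	}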

[file 3 of 3]

@@ -49,6 +49,9 @@ bool getSpiderStatusMsg ( class CollectionRec *cx ,
class SafeBuf *msg ,
long *status ) ;
long getFakeIpForUrl1 ( char *url1 ) ;
long getFakeIpForUrl2 ( Url *url2 ) ;
// Overview of Spider
//
// this new spider algorithm ensures that urls get spidered even if a host
@@ -1105,6 +1108,10 @@ class SpiderColl {
bool m_siteListIsEmpty;
// data buckets in this table are of type
HashTableX m_siteListDomTable;
// substring matches like "contains:goodstuff" or
// later "regex:.*"
SafeBuf m_negSubstringBuf;
SafeBuf m_posSubstringBuf;
RdbCache m_dupCache;
RdbTree m_winnerTree;
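These two SafeBufs are the storage half of the contains: feature: updateSiteList appends each payload NUL-terminated, and getMatchingUrlPattern later walks them with a strlen+1 stride. The write side condensed into a few lines (a sketch of the pattern, not verbatim code from this commit):

	// Append one "contains:" payload (s, slen) to the proper buffer;
	// the trailing '\0' is what delimits patterns for the later scan.
	SafeBuf *buf = isNeg ? &sc->m_negSubstringBuf : &sc->m_posSubstringBuf;
	if ( ! buf->safeMemcpy ( s , slen ) ) return true;
	if ( ! buf->pushChar ( '\0' ) ) return true;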