more site-list-based spider fixes to be more like GSA

Matt Wells 2014-03-08 20:52:31 -07:00
parent 4cb66c31bf
commit ed626b162a
2 changed files with 17 additions and 6 deletions


@@ -200,6 +200,10 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
u.set ( s , pe - s );
// empty line?
if ( pe - s <= 0 )
continue;
// error? skip it then...
if ( u.getHostLen() <= 0 ) {
log("basic: error on line #%li in sitelist",lineNum);
@@ -249,9 +253,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
// ok, we got something here i think
if ( u.getPathLen() <= 1 ) { char *xx=NULL;*xx=0; }
// calc length from "start" of line so we can
// jump to the path quickly for compares
pd.m_pathOff = x - start;
pd.m_pathLen = pe - x;
// jump to the path quickly for compares. inc "/"
pd.m_pathOff = (x-1) - start;
pd.m_pathLen = pe - (x-1);
break;
}
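Note: x points one character past the "/", so backing up to (x-1) makes the stored offset and length include the leading slash of the pattern path. A rough sketch of the kind of prefix compare this enables, with hypothetical names (the real compare in getMatchingUrlPattern() may differ):

#include <cstring>
#include <cstdio>

// prefix-match a url path against a stored site-list path pattern,
// both including the leading '/'
static bool pathMatches ( const char *urlPath , long urlPathLen ,
                          const char *patternPath , long patternPathLen ) {
	// pattern cannot be longer than the url path it should prefix
	if ( patternPathLen > urlPathLen ) return false;
	// byte-wise prefix compare starting at the '/'
	return strncmp ( urlPath , patternPath , patternPathLen ) == 0;
}

int main () {
	// "/docs/" matches "/docs/page.html" but not "/doc"
	printf ( "%d\n" , pathMatches ( "/docs/page.html" , 15 , "/docs/" , 6 ) ); // 1
	printf ( "%d\n" , pathMatches ( "/doc" , 4 , "/docs/" , 6 ) );             // 0
	return 0;
}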
@@ -318,7 +322,11 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
updateSiteList ( sc->m_collnum , false );
}
if ( dt->getNumSlotsUsed() == 0 ) { char *xx=NULL;*xx=0; }
if ( dt->getNumSlotsUsed() == 0 ) {
// empty site list -- no matches
return NULL;
//char *xx=NULL;*xx=0; }
}
// this table maps a 32-bit domain hash of a domain to a
// patternData class. only for those urls that have firstIps that
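Note: the deliberate null-pointer abort on an empty table is replaced by an early NULL return, treating an empty site list as "no matching pattern" rather than a fatal condition. A sketch of the same guard, with std::unordered_map standing in for the gb hash table and hypothetical names throughout:

#include <unordered_map>
#include <cstdint>
#include <cstddef>

struct PatternData { long m_pathOff; long m_pathLen; };

static PatternData *getMatchingPattern ( std::unordered_map<uint32_t,PatternData> &dt ,
                                         uint32_t domHash32 ) {
	// empty site list -- no matches; mirror the early NULL return added above
	if ( dt.empty() ) return NULL;
	std::unordered_map<uint32_t,PatternData>::iterator it = dt.find ( domHash32 );
	if ( it == dt.end() ) return NULL;
	return &it->second;
}

int main () {
	std::unordered_map<uint32_t,PatternData> dt;                 // empty site list
	return getMatchingPattern ( dt , 12345u ) == NULL ? 0 : 1;   // exits 0: no match
}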


@@ -735,8 +735,11 @@ class SpiderRequest {
char *p = m_url;
for ( ; *p ; p++ ) {
if ( *p != ':' ) continue;
p++; if ( *p != '/' ) continue;
p++; if ( *p != '/' ) continue;
p++;
if ( *p != '/' ) continue;
p++;
if ( *p != '/' ) continue;
p++;
break;
}
if ( ! *p ) return NULL;
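Note: the rewritten loop splits each combined "p++; if (...)" pair onto its own line while still advancing p past the scheme's "://" before the break. A standalone sketch of the same scan as a hypothetical helper (not the SpiderRequest member itself):

#include <cstdio>

// return a pointer just past "://" in a url, or NULL if no scheme separator
static const char *skipScheme ( const char *url ) {
	const char *p = url;
	for ( ; *p ; p++ ) {
		if ( *p != ':' ) continue;
		p++;
		if ( *p != '/' ) continue;
		p++;
		if ( *p != '/' ) continue;
		p++;
		break;
	}
	if ( ! *p ) return NULL;
	return p; // now at the hostname; the path begins at the next '/'
}

int main () {
	printf ( "%s\n" , skipScheme ( "http://example.com/a/b" ) ); // "example.com/a/b"
	return 0;
}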