more site-list-based spider fixes to be more like GSA

Matt Wells 2014-03-08 20:52:31 -07:00
parent 4cb66c31bf
commit ed626b162a
2 changed files with 17 additions and 6 deletions


@@ -200,6 +200,10 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
u.set ( s , pe - s );
// empty line?
if ( pe - s <= 0 )
continue;
// error? skip it then...
if ( u.getHostLen() <= 0 ) {
log("basic: error on line #%li in sitelist",lineNum);
@@ -249,9 +253,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
// ok, we got something here i think
if ( u.getPathLen() <= 1 ) { char *xx=NULL;*xx=0; }
// calc length from "start" of line so we can
// jump to the path quickly for compares
pd.m_pathOff = x - start;
pd.m_pathLen = pe - x;
// jump to the path quickly for compares. inc "/"
pd.m_pathOff = (x-1) - start;
pd.m_pathLen = pe - (x-1);
break;
}
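Note: x points one character past the "/", so backing up to (x-1) makes the stored offset and length include the leading slash of the pattern path. A rough sketch of the kind of prefix compare this enables, with hypothetical names (the real compare in getMatchingUrlPattern() may differ):

#include <cstring>
#include <cstdio>

// prefix-match a url path against a stored site-list path pattern,
// both including the leading '/'
static bool pathMatches ( const char *urlPath , long urlPathLen ,
                          const char *patternPath , long patternPathLen ) {
	// pattern cannot be longer than the url path it should prefix
	if ( patternPathLen > urlPathLen ) return false;
	// byte-wise prefix compare starting at the '/'
	return strncmp ( urlPath , patternPath , patternPathLen ) == 0;
}

int main () {
	// "/docs/" matches "/docs/page.html" but not "/doc"
	printf ( "%d\n" , pathMatches ( "/docs/page.html" , 15 , "/docs/" , 6 ) ); // 1
	printf ( "%d\n" , pathMatches ( "/doc" , 4 , "/docs/" , 6 ) );             // 0
	return 0;
}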
@@ -318,7 +322,11 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
updateSiteList ( sc->m_collnum , false );
}
if ( dt->getNumSlotsUsed() == 0 ) { char *xx=NULL;*xx=0; }
if ( dt->getNumSlotsUsed() == 0 ) {
// empty site list -- no matches
return NULL;
//char *xx=NULL;*xx=0; }
}
// this table maps a 32-bit domain hash of a domain to a
// patternData class. only for those urls that have firstIps that
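Note: the deliberate null-pointer abort on an empty table is replaced by an early NULL return, treating an empty site list as "no matching pattern" rather than a fatal condition. A sketch of the same guard, with std::unordered_map standing in for the gb hash table and hypothetical names throughout:

#include <unordered_map>
#include <cstdint>
#include <cstddef>

struct PatternData { long m_pathOff; long m_pathLen; };

static PatternData *getMatchingPattern ( std::unordered_map<uint32_t,PatternData> &dt ,
                                         uint32_t domHash32 ) {
	// empty site list -- no matches; mirror the early NULL return added above
	if ( dt.empty() ) return NULL;
	std::unordered_map<uint32_t,PatternData>::iterator it = dt.find ( domHash32 );
	if ( it == dt.end() ) return NULL;
	return &it->second;
}

int main () {
	std::unordered_map<uint32_t,PatternData> dt;                 // empty site list
	return getMatchingPattern ( dt , 12345u ) == NULL ? 0 : 1;   // exits 0: no match
}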


@@ -735,8 +735,11 @@ class SpiderRequest {
char *p = m_url;
for ( ; *p ; p++ ) {
if ( *p != ':' ) continue;
p++; if ( *p != '/' ) continue;
p++; if ( *p != '/' ) continue;
p++;
if ( *p != '/' ) continue;
p++;
if ( *p != '/' ) continue;
p++;
break;
}
if ( ! *p ) return NULL;
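Note: the rewritten loop splits each combined "p++; if (...)" pair onto its own line while still advancing p past the scheme's "://" before the break. A standalone sketch of the same scan as a hypothetical helper (not the SpiderRequest member itself):

#include <cstdio>

// return a pointer just past "://" in a url, or NULL if no scheme separator
static const char *skipScheme ( const char *url ) {
	const char *p = url;
	for ( ; *p ; p++ ) {
		if ( *p != ':' ) continue;
		p++;
		if ( *p != '/' ) continue;
		p++;
		if ( *p != '/' ) continue;
		p++;
		break;
	}
	if ( ! *p ) return NULL;
	return p; // now at the hostname; the path begins at the next '/'
}

int main () {
	printf ( "%s\n" , skipScheme ( "http://example.com/a/b" ) ); // "example.com/a/b"
	return 0;
}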