open-source-search-engine/Url.cpp
2013-08-02 13:12:24 -07:00

2209 lines
69 KiB
C++

#include "gb-include.h"
#include "Url.h"
#include "Domains.h"
#include "Errno.h"
#include "HashTable.h"
#include "Speller.h"
static void print_string ( char *s , long len );
void Url::reset() {
m_scheme = NULL;
m_host = NULL;
m_path = NULL;
m_filename = NULL;
m_extension = NULL;
m_query = NULL;
m_domain = NULL;
m_tld = NULL;
m_anchor = NULL;
//m_site = NULL;
m_url[0] = '\0';
m_ulen = 0;
m_dlen = 0;
m_slen = 0;
m_qlen = 0;
m_hlen = 0;
m_elen = 0;
m_mdlen = 0;
m_anchorLen = 0;
//m_siteLen = 0;
// ip related stuff
m_ip = 0;
}
// . copy-set from another Url by re-parsing its url string
// . a NULL "url" just leaves us reset/empty
void Url::set ( Url *url , bool addWWW ) {
	if ( url ) set ( url->getUrl() , url->getUrlLen() , addWWW );
	else       reset();
}
// . set this Url from "s" (len bytes), resolving it against "baseUrl" when
//   "s" is a relative url
// . "s" is treated as absolute if it looks like "scheme:/", "scheme:\",
//   or starts with "//" or "\\"; in that case (or if the base has nothing
//   usable) the base is ignored and stripCommonFile is forced to false
// . otherwise the base's directory (or just scheme+host[:port] if "s"
//   starts with '/') is prepended to "s" and the result is parsed
void Url::set (Url *baseUrl,char *s,long len,bool addWWW,bool stripSessionId,
	       bool stripPound , bool stripCommonFile, long titleRecVersion ) {
	reset();
	// no base at all? then just parse "s" on its own
	if ( ! baseUrl ) {
		set ( s , len , addWWW );
		return;
	}
	char *base    = (char *) baseUrl->m_url;
	long  baseLen = baseUrl->m_ulen;
	// the base's cgi query (and its '?') is never inherited
	if ( baseUrl->m_query ) baseLen -= (baseUrl->m_qlen + 1);
	// . back up over the base's filename, if any, so that the base
	//   ends on a '/'
	while ( baseLen > 0 && base[baseLen-1] != '/' ) baseLen--;
	// nothing to work with
	if ( baseLen == 0 && len == 0 ) return;
	// trim leading whitespace off of "s"
	char *stop = s + len;
	for ( ; s < stop && is_wspace_a ( *s ) ; s++ ) len--;
	// . measure the leading run of scheme-legal chars (alnum or hyphen)
	// . k ends up on the char that would have to be the ':'
	long k = 0;
	while ( k < len && ( is_alnum_a(s[k]) || s[k]=='-' ) ) k++;
	// "scheme:/..." or "scheme:\..." (some urls are missing a slash)
	bool hasSchemeSep = ( k + 2 < len &&
			      s[k]==':' &&
			      ( s[k+1]=='/' || s[k+1]=='\\' ) );
	// "//host/..." and the backslash flavor of it count as absolute too
	bool hasDoubleLead = ( len > 1 &&
			       ( ( s[0]=='/'  && s[1]=='/'  ) ||
				 ( s[0]=='\\' && s[1]=='\\' ) ) );
	bool absolute = ( hasSchemeSep || hasDoubleLead );
	// the base is of no use if it is empty or "s" stands on its own
	if ( baseLen == 0 || absolute ) {
		set ( s , len , addWWW , stripSessionId , stripPound ,
		      false , // stripCommonFile?
		      titleRecVersion );
		return;
	}
	// . a leading '/' means "relative to the root", so keep only what
	//   precedes the base's path
	// . measuring up to m_path keeps any port that is in the base
	if ( len > 0 && s[0]=='/' )
		baseLen = baseUrl->m_path - baseUrl->m_url ;
	// glue the pieces together in a scratch buffer
	char joined[MAX_URL_LEN*2+1];
	strncpy ( joined , base , baseLen );
	if ( len > MAX_URL_LEN ) len = MAX_URL_LEN-2;
	// make sure exactly one '/' separates the base from "s"
	if ( len > 0 && s[0] != '/' && joined[baseLen-1] != '/' )
		joined[baseLen++] = '/';
	strncpy ( joined + baseLen , s , len );
	// and parse the combined url
	set ( joined , baseLen + len , addWWW , stripSessionId , stripPound ,
	      stripCommonFile ,
	      titleRecVersion );
}
// . url rfc = http://www.blooberry.com/indexdot/html/topics/urlencoding.htm
// . "...Only alphanumerics [0-9a-zA-Z], the special characters "$-_.+!*'(),"
// [not including the quotes - ed], and reserved characters used for their
// reserved purposes may be used unencoded within a URL."
// . i know sun.com has urls like "http://sun.com/;$sessionid=123ABC$"
// . url should be ENCODED PROPERLY for this to work properly
//
// . parse the url in "t" (tlen bytes; only the first tlen bytes of t are
//   read) and store a normalized copy in m_url
// . sets the component members (m_scheme, m_host, m_path, m_filename,
//   m_extension, m_query, m_anchor, ports, ip, ...) and their lengths
// . normalization done here, in order:
//   - truncate to MAX_URL_LEN-10 bytes and to the first space/non-ascii
//   - skip a doubled "http://http://"
//   - cut at '#' if "stripPound" (the "#anchor" is still saved, see end)
//   - remove a recognized session id parameter if "stripSessionId"
//   - remove a trailing index/default/main file if "stripCommonFile"
//   - turn '\' into '/', lowercase scheme and host, default to "http://"
//   - prepend "www." if "addWWW" and the host is just the domain
//   - drop an explicit port that equals the scheme's default port
//   - collapse "//", "/./" and "/../" in the path
//   - collapse runs of '&' in the query
// . "titleRecVersion" gates behaviors that were added over time so that
//   older versions keep normalizing the way they used to
// . returns with everything reset() if t is NULL or tlen is 0
void Url::set ( char *t , long tlen , bool addWWW , bool stripSessionId ,
bool stripPound , bool stripCommonFile ,
long titleRecVersion ) {
reset();
// debug
//t = "http://www.ac.uk/../../news/.asp";
//tlen = gbstrlen(t);
if ( ! t || tlen == 0 ) return ;
// we may add a "www." a trailing backslash and \0, ...
if ( tlen > MAX_URL_LEN - 10 ) {
log( LOG_LIMIT,"db: Encountered url of length %li. "
"Truncating to %i" , tlen , MAX_URL_LEN - 10 );
tlen = MAX_URL_LEN - 10;
}
// . skip over non-alnum chars (except - or /) in the beginning
// . if url begins with // then it's just missing the http: (slashdot)
// . watch out for hostname like: -dark-.deviantart.com(yes, it's real)
// . so all protocols are hostnames MUST start with alnum OR hyphen
while ( tlen > 0 && !is_alnum_a(*t) && *t!='-' && *t!='/'){t++;tlen--;}
// . stop t at first space or binary char
// . url should be in encoded form!
long i ;
for ( i = 0 ; i < tlen ; i++ ) {
if ( ! is_ascii(t[i]) ) break; // no non-ascii chars allowed
if ( is_wspace_a(t[i]) ) break; // no spaces allowed
}
// truncate length to the first occurrence of an unacceptable char
tlen = i;
// . decode characters that should not have been encoded
// . also NULL terminates
//char tmp[MAX_URL_LEN];
//long tmpLen;
//tmpLen = safeDecode ( t , tlen , tmp );
// . jump over http:// if it starts with http://http://
// . a common mistake...
while ( tlen > 14 && ! strncasecmp ( t , "http://http://" , 14 ) ) {
t += 7; tlen -= 7; }
//if ( tlen > 26 )
//while ( ! strncasecmp ( t , "http%3A%2F%2Fhttp%3A%2F%2F",26)){
//t += 13; tlen -= 13; }
// strip the "#anchor" from http://www.xyz.com/somepage.html#anchor"
// . anchorPos/anchorLen are recorded even when we do not strip, so the
//   anchor can be copied after m_url at the bottom of this function
long anchorPos = 0;
long anchorLen = 0;
for ( long i = 0 ; i < tlen ; i++ ) {
if ( t[i] != '#' ) continue;
anchorPos = i;
anchorLen = tlen - i;
if ( stripPound )
tlen = i;
break;
}
// copy to "s" so we can NULL terminate it
// . tlen was capped at MAX_URL_LEN-10 above, so this fits
char s [ MAX_URL_LEN ];
long len = tlen;
// store filtered url into s
memcpy ( s , t , tlen );
s[len]='\0';
// . remove session ids from s
// . ';' most likely precedes a session id
// . http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?pp=1
// . http://www.b.com/generic.html;$sessionid$QVBMODQAAAGNA?pid=7
// . http://www.b.com/?PHPSESSID=737aec14eb7b360983d4fe39395&p=1
// . http://www.b.com/cat.cgi/process?mv_session_id=xrf2EY3q&p=1
// . http://www.b.com/default?SID=f320a739cdecb4c3edef67e&p=1
if ( stripSessionId ) {
// CHECK FOR A SESSION ID USING SEMICOLONS
// or don't...bad for dmoz urls and apparently has legit use
// long i = 0;
// while ( s[i] && s[i]!=';' ) i++;
// // did we get a semi colon?
// if ( s[i] == ';' ) {
// // i is start of it
// long a = i;
// // find the end of the session id
// long b = i + 1;
// while ( s[b] && s[b] != '?' ) b++;
// // remove the session id by covering it up
// memmove ( &s[a] , &s[b] , len - b );
// // reduce length
// len -= (b-a);
// // NULL terminate
// s[len] = '\0';
// }
// CHECK FOR A SESSION ID USING QUERY STRINGS
// . p is advanced to the first '?' or ';' in s; all the searches
//   below start from there
char *p = s;
while ( *p && *p != '?' && *p != ';' ) p++;
// bail if no ?
if ( ! *p ) goto skip;
// now search for several strings in the cgi query string
// . tt will point to the first match, x is the length of the matched
//   parameter name (including its '=' or '/')
// . the first pattern that matches wins, so order matters here
char *tt = NULL;
long x;
if ( ! tt ) { tt = gb_strcasestr ( p , "PHPSESSID=" ); x = 10;}
if ( ! tt ) { tt = strstr ( p , "SID=" ); x = 4;}
// . osCsid and XTCsid are new session ids
// . keep this up here so "sid=" doesn't override it
if ( ! tt && titleRecVersion >= 59 ) {
tt = strstr ( p , "osCsid=" );
x = 7;
if ( ! tt ) tt = strstr ( p , "XTCsid=" );
// a hex sequence of at least 10 digits must follow
if ( tt && ! isSessionId ( tt + x, titleRecVersion ) )
tt = NULL;
}
if ( ! tt && titleRecVersion >= 59 ) {
tt = strstr ( p , "osCsid/" );
x = 7;
// a hex sequence of at least 10 digits must follow
if ( tt && ! isSessionId ( tt + x, titleRecVersion ) )
tt = NULL;
}
// this is a new session id thing
if ( ! tt && titleRecVersion >= 54 ) {
tt = strstr ( p , "sid=" ); x = 4;
// a hex sequence of at least 10 digits must follow
if ( tt && ! isSessionId ( tt + x, titleRecVersion ) )
tt = NULL;
}
// osCsid and XTCsid are new session ids
if ( ! tt && titleRecVersion >= 57 ) {
tt = strstr ( p , "osCsid=" );
x = 7;
if ( ! tt ) tt = strstr ( p , "XTCsid=" );
// a hex sequence of at least 10 digits must follow
if ( tt && ! isSessionId ( tt + x, titleRecVersion ) )
tt = NULL;
}
// fixes for bug of matching plain &sessionid= first and
// then realizing char before is an alnum...
if ( ! tt && titleRecVersion >= 59 ) {
tt = gb_strcasestr ( p, "jsessionid="); x = 11; }
if ( ! tt && titleRecVersion >= 59 ) {
tt = gb_strcasestr ( p , "vbsessid=" ); x = 9;}
if ( ! tt && titleRecVersion >= 59 ) {
tt = gb_strcasestr ( p, "asesessid=" ); x = 10; }
if ( ! tt && titleRecVersion >= 59 ) {
tt = gb_strcasestr ( p, "nlsessid=" ); x = 9; }
if ( ! tt && titleRecVersion >= 59 ) {
tt = gb_strcasestr ( p, "psession=" ); x = 9; }
if ( ! tt ) { tt = gb_strcasestr ( p , "session_id="); x = 11;}
if ( ! tt ) { tt = gb_strcasestr ( p , "sessionid=" ); x = 10;}
if ( ! tt ) { tt = gb_strcasestr ( p , "sessid=" ); x = 7;}
if ( ! tt ) { tt = gb_strcasestr ( p , "vbsessid=" ); x = 9;}
if ( ! tt ) { tt = gb_strcasestr ( p , "session=" ); x = 8;}
if ( ! tt ) { tt = gb_strcasestr ( p , "session/" ); x = 8;}
if ( ! tt ) { tt = gb_strcasestr ( p , "POSTNUKESID=");x = 12;}
// some new session ids as of Feb 2005
if ( ! tt ) { tt = gb_strcasestr ( p, "auth_sess=" ); x = 10; }
if ( ! tt ) { tt = gb_strcasestr ( p, "mysid=" ); x = 6; }
if ( ! tt ) { tt = gb_strcasestr ( p, "oscsid=" ); x = 7; }
if ( ! tt ) { tt = gb_strcasestr ( p, "cg_sess=" ); x = 8; }
if ( ! tt ) { tt = gb_strcasestr ( p, "galileoSession");x=14; }
if ( ! tt ) { tt = gb_strcasestr ( p, "asesessid=" ); x = 10; }
if ( ! tt ) { tt = gb_strcasestr ( p, "nlsessid=" ); x = 9; }
if ( ! tt ) { tt = gb_strcasestr ( p, "jsessionid="); x = 11; }
if ( ! tt ) { tt = gb_strcasestr ( p, "psession=" ); x = 9; }
// new as of Jan 2006. is hurting news5 collection on gb6
if ( ! tt ) { tt = gb_strcasestr ( p, "sess=" ); x = 5; }
// .php?s=8af9d6d0d59e8a3108f3bf3f64166f5a&
// .php?s=eae5808588c0708d428784a483083734&
// .php?s=6256dbb2912e517e5952caccdbc534f3&
// . p sits on the '?', so the search starts 4 chars earlier to
//   cover the ".php" in front of it
// . NOTE(review): if the '?' or ';' is within the first 4 chars of s
//   then p-4 points before the start of the s[] buffer
if ( ! tt && (tt = strstr ( p-4 , ".php?s=" )) ) {
// point to the value of the s=
char *pp = tt + 7;
long i = 0;
// ensure we got 32 hexadecimal chars
while ( pp[i] &&
( is_digit(pp[i]) ||
( pp[i]>='a' && pp[i]<='f' ) ) ) i++;
// if not, do not consider it a session id
if ( i < 32 ) tt = NULL;
// point to s= for removal
else { tt += 5; x = 2; }
}
// bail if none were found
if ( ! tt ) goto skip;
// . must not have an alpha char before it!
// . prevent "DAVESID=" from being labeled as session id
if ( is_alnum_a ( *(tt-1) ) ) goto skip;
// offset in s of the matched session id parameter name
long a = tt - s;
// offset just past the parameter name, where the id value starts
long b = a + x;
// back up until we hit a ? or & or / or ;
while ( a > 0 && s[a-1] != '?' && s[a-1] != '&' &&
s[a-1] != '/' && s[a-1] != ';' ) a--;
// keep the '?'
if ( s[a]=='?' ) a++;
// back up over any semicolon
if ( s[a-1] == ';' && titleRecVersion >= 59 ) a--;
// advance b until we hit & or end or ? or a ';'
while ( s[b] && s[b] != '&' && s[b] != '?' && s[b] != ';') b++;
// if we don't have 5+ chars in session id itself, skip it
if ( b - (a + x) < 5 ) goto skip;
// go over a & or a ;
if ( s[b] == '&' || s[b] == ';' ) b++;
// remove the session id by covering it up
// . the trailing NUL is not moved; s is re-terminated below
memmove ( &s[a] , &s[b] , len - b );
// reduce length
len -= (b-a);
// if s ends in ? or & or ;, backup
while ( len > 0 &&
(s[len-1]=='?'||s[len-1]=='&'||s[len-1]==';')) len--;
// NULL terminate
s[len] = '\0';
}
skip:
// remove common filenames like index.html
// . each branch compares the tail of s against a "/name.ext" pattern
//   and, on a match, shortens len by one less than the pattern length
//   so that the leading '/' is kept
// . longest patterns are tried first
if ( stripCommonFile ) {
if ( len - 14 > 0 &&
strncasecmp(&s[len-14], "/default.xhtml", 14) == 0 )
len -= 13;
else if ( len - 13 > 0 &&
( strncasecmp(&s[len-13], "/default.html", 13) == 0 ||
strncasecmp(&s[len-13], "/default.ascx", 13) == 0 ||
strncasecmp(&s[len-13], "/default.ashx", 13) == 0 ||
strncasecmp(&s[len-13], "/default.asmx", 13) == 0 ||
strncasecmp(&s[len-13], "/default.xhtm", 13) == 0 ||
strncasecmp(&s[len-13], "/default.aspx", 13) == 0 ) )
len -= 12;
else if ( len - 12 > 0 &&
( strncasecmp(&s[len-12], "/default.htm", 12) == 0 ||
strncasecmp(&s[len-12], "/default.php", 12) == 0 ||
strncasecmp(&s[len-12], "/default.asp", 12) == 0 ||
strncasecmp(&s[len-12], "/index.xhtml", 12) == 0 ) )
len -= 11;
else if ( len - 11 > 0 &&
( strncasecmp(&s[len-11], "/index.html", 11) == 0 ||
strncasecmp(&s[len-11], "/index.aspx", 11) == 0 ||
strncasecmp(&s[len-11], "/index.xhtm", 11) == 0 ||
strncasecmp(&s[len-11], "/default.pl", 11) == 0 ||
strncasecmp(&s[len-11], "/default.cs", 11) == 0 ) )
len -= 10;
else if ( len - 10 > 0 &&
( strncasecmp(&s[len-10], "/index.htm", 10) == 0 ||
strncasecmp(&s[len-10], "/index.php", 10) == 0 ||
strncasecmp(&s[len-10], "/index.asp", 10) == 0 ||
strncasecmp(&s[len-10], "/main.html", 10) == 0 ||
strncasecmp(&s[len-10], "/main.aspx", 10) == 0 ) )
len -= 9;
else if ( len - 9 > 0 &&
( strncasecmp(&s[len-9], "/index.pl", 9) == 0 ||
strncasecmp(&s[len-9], "/main.htm", 9) == 0 ||
strncasecmp(&s[len-9], "/main.php", 9) == 0 ) )
len -= 8;
else if ( len - 8 > 0 &&
( strncasecmp(&s[len-8], "/main.pl", 8) == 0 ) )
len -= 7;
s[len] = '\0';
}
// replace the "\" with "/" -- a common mistake
long j;
for ( j = 0 ; s[j] ; j++) if (s[j]=='\\') s[j]='/';
// . dig out the protocol/scheme for this s (check for ://)
// . protocol may only have alnums and hyphens in it
for ( i = 0 ; s[i] && (is_alnum_a(s[i]) || s[i]=='-') ; i++ );
// if we have a legal protocol, then set "m_scheme", "slen" and "sch"
// and advance i to the m_host
if ( i + 2 < len && s[i]==':' && s[i+1]=='/' && s[i+2]=='/') {
// copy lowercase protocol to "m_url"
to_lower3_a ( s , i + 3 , m_url );
m_scheme = m_url;
m_slen = i;
m_ulen = i + 3;
i += 3;
}
// "scheme:/host" with one slash missing: supply the second slash
else if (i + 2 < len && s[i]==':' && s[i+1]=='/'&& is_alnum_a(s[i+2])){
// copy lowercase protocol to "m_url"
to_lower3_a ( s , i + 2 , m_url );
// add in needed /
m_url[i+2]='/';
m_scheme = m_url;
m_slen = i;
m_ulen = i + 3;
i += 2;
}
// callto:+441202300007 (skype links)
// mailto:blah@blah.com
/*
else if ( i+1 < len && s[i]==':' && version >= 62 ) {
// copy lowercase protocol to "m_url"
to_lower3_a ( s , i + 1 , m_url );
// add in needed /
m_url[i+2]='/';
m_scheme = m_url;
m_slen = i;
m_ulen = i + 3;
i += 2;
}
*/
// otherwise we had no syntactically correct protocol
else {
memcpy ( m_url,"http://" , 7 );
m_scheme = m_url;
m_slen = 4;
m_ulen = 7;
i = 0;
// if s started with // then skip that (slashdot)
if ( s[0]=='/' && s[1]=='/' ) i = 2;
}
// . now &s[i] should point to the m_host name
// . chars allowed in hostname = period,alnum,hyphen,underscore
// . stops at '/' or ':' or any other disallowed character
j = i;
while (s[j] && (is_alnum_a(s[j]) || s[j]=='.' || s[j]=='-'||s[j]=='_'))
j++;
// copy the host from "s" into m_url (make it lower case, too)
to_lower3_a ( s + i, j - i, m_url + m_ulen );
m_host = m_url + m_ulen;
m_hlen = j - i;
// common mistake: if hostname ends in a . then back up
while ( m_hlen > 0 && m_host[m_hlen-1]=='.' ) m_hlen--;
// NULL terminate for strchr()
m_host [ m_hlen ] = '\0';
// . common mistake: if hostname has no '.' in it append a ".com"
// . now that we use hosts in /etc/hosts we no longer do this
//if ( m_hlen > 0 && strchr ( m_host ,'.' ) == NULL ) {
// memcpy ( &m_host[m_hlen] , ".com" , 4 );
// m_hlen += 4;
//}
// advance m_ulen to end of hostname
m_ulen += m_hlen;
// . set our m_ip if hostname is in a.b.c.d format
// . this returns 0 if not a valid ip string
m_ip = atoip ( m_host , m_hlen );
// advance i to the : for the port, if it exists
i = j;
// NULL terminate m_host for getTLD(), getDomain() and strchr() below
m_host [ m_hlen ] = '\0';
// use ip as domain if we're just an ip address like 1.2.3.4
if ( m_ip ) {
// ip address has no tld, or mid domain
m_tld = NULL;
m_tldLen = 0;
// but it does have a domain (1.2.3)
m_domain = getDomainOfIp ( m_host , m_hlen , &m_dlen );
// just use the domain as the mid domain for ip-based urls
m_mdlen = m_dlen;
}
// . otherwise, get the tld
// . uses thorough list of tlds in Domains.cpp
else if ( ( m_tld = ::getTLD ( m_host, m_hlen ) ) && m_tld > m_host ) {
// set m_domain if we had a tld that's not equal to our host
m_tldLen = gbstrlen ( m_tld );
m_domain = ::getDomain ( m_host , m_hlen , m_tld , &m_dlen );
// set the mid domain length (-1 for the '.')
m_mdlen = m_dlen - m_tldLen - 1;
}
// otherwise, we're no ip and we have no valid domain
else {
m_domain = NULL;
m_dlen = 0;
m_tldLen = 0;
m_mdlen = 0;
}
// . if domain same as host then we might insert a "www." server name
// . however, must have a period in domain name
// . otherwise a domain name of "xxx" would become "www.xxx" and if
// Url::set() is called on that it would be "www.www.xxx" (bad bad)
// . let's only add "www." if there's only 1 period, ok?
if ( ! m_ip && addWWW && m_host == m_domain && strchr(m_host,'.') ) {
// shift the host right by 4 and shift the pointers into it as well
memmove ( m_host + 4 , m_host , m_hlen );
memcpy ( m_host , "www." , 4 );
if ( m_domain ) m_domain += 4;
if ( m_tld ) m_tld += 4;
m_ulen += 4;
m_hlen += 4;
}
// set the default port based on the protocol
m_defPort = 80;
if ( m_slen==5 && strncmp(m_scheme, "https",5)==0 ) m_defPort = 443;
if ( m_slen==3 && strncmp(m_scheme, "ftp" ,3)==0 ) m_defPort = 21;
// assume we're using the default port for this scheme/protocol
m_port = m_defPort;
m_portStr = NULL;
// see if a port was provided in the hostname after a colon
if ( s[i] == ':' ) {
// remember the ptr so far
long savedLen = m_ulen;
// add a colon to our m_url
m_url [ m_ulen++ ] = ':';
// scan for a '/'
j = i + 1;
while ( s[j] && s[j]!='/') m_url[m_ulen++] = s[j++];
// now read our port
// . NOTE(review): "s" is a local array of this function, so
//   m_portStr points into storage that is gone once we return
m_portStr = s + i; // str includes ':'
m_port = atol2 ( s + (i + 1) , j - (i + 1) );
// if it's the default port, then remove what we copied
if ( m_port == m_defPort ) m_ulen = savedLen;
// make i point to the root / in the m_path, if any
i = j;
}
// how many chars is taken up by a specified port?
// . this is derived from the numeric value of m_port (':' plus its
//   decimal digits, up to 5), not from what was copied above
m_portLen = 0;
if ( m_port != m_defPort ) {
m_portLen += 2; // :3
if ( m_port >= 10 ) m_portLen += 1;
if ( m_port >= 100 ) m_portLen += 1;
if ( m_port >= 1000 ) m_portLen += 1;
if ( m_port >= 10000 ) m_portLen += 1;
}
//m_site = m_url;
//m_siteLen = m_ulen+1;
// append a '/' to m_url then bail if there is no m_path after the port
// . NOTE: this exit leaves m_flen, m_filename, m_extension and the
//   anchor fields as reset() left them
if ( s[i]=='\0' || s[i] != '/') {
m_path = m_url + m_ulen;
m_path[0] = '/';
m_plen = 1;
m_url[ ++m_ulen ]='\0';
// debug change
goto done;
// not reached because of the goto above
return;
}
// . get the m_path and m_path length
// . j,i should point to start of path slash '/'
// . scan so it points to end or a ? or #
j = i;
while ( s[j] && s[j]!='?' && s[j]!='#' ) j++;
// point the path inside m_url even though we haven't written it yet
// . m_plen temporarily holds the offset where the path starts; it is
//   turned into the real path length after the loop
m_path = m_url + m_ulen;
m_plen = m_ulen;
// . deal with weird things in the path
// . i points to start of path (should be /)
for (; i < j ; i++ ) {
// dedup double slashes
// ensure m_ulen >= m_plen so we don't hurt "http:///" ...
// but people sometimes put http:// in the *path*
if ( s[i] == '/' && m_url[m_ulen-1] == '/' &&
m_ulen-1 >= m_plen &&
m_ulen >= 2 && m_url[m_ulen-2] != ':' ) continue;
// deal with current directories in the m_path
if ( s[i] == '.' && m_url[m_ulen-1] == '/' &&
(i+1 == j || s[i+1]=='/')) continue;
// . deal with damned ..'s in the m_path
// . if next 2 chars are .'s and last char we wrote was '/'
if ( s[i] == '.' && s[i+1]=='.' && m_url[m_ulen-1] == '/' ) {
// dont back up over first / in path
if ( m_url + m_ulen - 1 > m_path ) m_ulen--;
// drop the previous path component, back to its '/'
while ( m_url[m_ulen-1] != '/' ) m_ulen--;
// skip i to next / after these 2 dots
while ( s[i] && s[i]!='/' ) i++;
continue;
}
// don't allow ; before the ?...probably because of stripped
// sessionId...
// I was going to add other possible dup separators, but now
// it seems as though it might cause problems
if (titleRecVersion >= 78){
if (s[i] == ';' && s[i+1] == '?') continue;
}
// store char and advance to next
m_url[m_ulen++] = s[i];
}
// reset the path length in case we had to remove some weird stuff
m_plen = m_ulen - m_plen;
// . remove trailing /'s from path, but not first one!
// . NO! this is WRONG! we need it so we know it's a dir name
//while ( m_plen > 1 && m_path[m_plen-1]=='/' ) { m_plen--; m_ulen--; }
// . get the m_query
// . the query is anything after the path that starts with ?
// . NOTE: we ignore strings beginning with '#' (page relative anchors)
if ( i < len && s[i] != '#' ) {
//memcpy ( m_url + m_ulen , s + i , len - i );
//remove back to back &'s in the cgi query
//http://www.nyasatimes.com/national/politics/160.html?print&&&
char *kstart = s + i;
char *kend = s + i + (len - i);
char *dst = m_url + m_ulen;
for ( char *k = kstart ; k < kend ; k++ ) {
// skip & if we just did one
if ( *k == '&' && k > kstart && *(k-1)=='&' ) continue;
// copy over one char at a time
*dst++ = *k;
}
// point after the '?' i guess
m_query = m_url + m_ulen + 1;
//m_qlen = len - i - 1;
m_qlen = dst - m_query;
m_ulen += m_qlen + 1;
}
// get the m_filename from the m_path (m_flen might be 0)
m_flen = 0;
while (m_path[m_plen-1-m_flen]!='/' && m_flen<m_plen) m_flen++;
m_filename = m_path + m_plen - m_flen;
// get the m_extension from the m_path
m_elen = 0;
while (is_alnum_a(m_path[m_plen-1-m_elen]) && m_elen < m_plen)m_elen++;
if ( m_path[ m_plen-1-m_elen] != '.' ) m_elen = 0; // no m_extension
m_extension = m_path + m_plen - m_elen;
// null terminate our s
m_url[ m_ulen ]='\0';
// add the anchor after
// . the "#anchor" is stored just past m_url's terminating NUL and is
//   not counted in m_ulen
// . m_anchorLen is set even if the anchor did not fit, in which case
//   m_anchor stays NULL
m_anchor = NULL;
m_anchorLen = anchorLen;
if ( anchorLen > 0 &&
m_ulen + anchorLen + 2 < MAX_URL_LEN ) {
m_anchor = &m_url[m_ulen+1];
memcpy(&m_url[m_ulen+1], &t[anchorPos], anchorLen);
m_url[m_ulen+1+anchorLen] = '\0';
}
done:
// debug msg
//log("--------------%s has domain \"",s);
//for (long k=0;k <m_dlen; k++ ) log("%c",m_domain[k]);
//log("\"\n");
// check for iterative stabilization
// . re-parse our own output with the same options and log if the
//   second pass produces a different url
// . "flag" keeps the nested u2.set() call from recursing again
// . NOTE(review): flag is a function-level static shared by every Url,
//   so this guard is only sound if set() is never run concurrently
static long flag = 0;
if ( flag == 1 ) return;
Url u2;
flag = 1;
// Must not use defaults!
u2.set ( m_url, m_ulen , addWWW, stripSessionId ,
stripPound , stripCommonFile , titleRecVersion );
if ( strcmp(u2.getUrl(),m_url) != 0 ) {
log(LOG_REMIND,"db: *********url %s-->%s\n",m_url,u2.getUrl());
//sleep(5000);
}
flag = 0;
}
// . does the NUL-terminated string "hh" look like a session id value?
// . titleRecVersion >= 69: every char up to the NUL must be a digit or
//   a lowercase hex letter (a-f), there must be at least 12 of them, and
//   at least one must be a letter (pure decimal runs are usually product
//   ids or dates)
// . titleRecVersion < 69: only the first char is looked at, see below
// . returns true (non-zero) if it is a session id
char Url::isSessionId ( char *hh, long titleRecVersion ) {
	if ( ! hh ) return false;
	// . older versions had a bug where the scan never stepped to the
	//   next character. with a step of zero the loop would spin forever
	//   on a leading hex char, so answer directly from the first char
	//   instead of looping
	// . NOTE(review): this assumes the legacy scan examined the same
	//   char repeatedly until it counted 12 of them, so a leading hex
	//   char meant "yes" -- confirm against an older revision if
	//   bit-exact legacy results matter
	if ( titleRecVersion < 69 ) {
		char c = *hh;
		return ( ( c >= '0' && c <= '9' ) ||
			 ( c >= 'a' && c <= 'f' ) );
	}
	long count       = 0;
	long nonNumCount = 0;
	// do not limit count to 12, the hex numbers may only be
	// after the 12th character! we were not identifying these
	// as sessionids when we should have been because of that.
	for ( ; *hh ; count++ , hh++ ) {
		if ( *hh >= '0' && *hh <= '9' ) continue;
		nonNumCount++;
		if ( *hh >= 'a' && *hh <= 'f' ) continue;
		// we got an illegal session id character
		return false;
	}
	// . need at least 12 chars to consider it a valid id
	// . make sure it's a hexadecimal number...lots of product
	//   ids and dates use only decimal numbers
	return ( nonNumCount > 0 && count >= 12 );
}
// . a root url has a path of exactly "/" and no cgi query
// . the hostname is NOT examined: sub-hosts like espn.go.com count as
//   roots too (and so do all the *.deviantart.com names)
bool Url::isRoot() {
	const bool pathIsJustSlash =
		( m_plen == 1 && m_path && m_path[0] == '/' );
	return ( pathIsJustSlash && ! m_query );
}
// a super root url is a root url where the hostname is NULL or "www"
bool Url::isSuperRoot () {
if ( ! isRoot() ) return false;
// if hostname is same as domain, it's a super root
if ( m_host == m_domain && m_hlen == m_dlen ) return true;
// if host is not "www." followed by domain, it's NOT a super root
if ( m_hlen != m_dlen + 4 ) return false;
if ( strncmp ( m_host , "www." , 4 ) == 0 ) return true;
return false;
}
// . true if the host is the domain itself, or if the host is exactly
//   4 chars longer than the domain and begins with "www."
bool Url::isSimpleSubdomain ( ) {
	// host and domain are one and the same
	const bool hostIsDomain = ( m_host == m_domain && m_hlen == m_dlen );
	if ( hostIsDomain ) return true;
	// otherwise only a 4-char "www." prefix is allowed
	const bool roomForWWW = ( m_hlen == m_dlen + 4 );
	return ( roomForWWW && strncmp ( m_host , "www." , 4 ) == 0 );
}
// . return the length of the j-th "parent" of this url
// . j = 0 is the url itself without its cgi query
// . each increment of j strips one more trailing path component, like
//   appending another "/.." would
// . a "/~" prefix counts as a path boundary of its own
// . returns 0 if there is no j-th sub-url
long Url::getSubUrlLen ( long j ) {
	// the whole url, minus the "?query" at the end
	long end = m_ulen;
	if ( m_query ) end -= m_qlen + 1;
	// sub-url #0 is just that
	if ( j == 0 ) return end;
	// never shrink into "scheme://host[:port]/"
	const long minLen = m_slen + 3 + m_hlen + 1 + m_portLen ;
	// walk backwards, counting off one boundary at a time
	for ( ; end > minLen ; end-- ) {
		const char last = m_url [ end - 1 ];
		// a '/' ends a component ...
		if ( last == '/' ) j--;
		// ... and so does the '~' of a "/~"
		if ( m_url [ end - 2 ] == '/' && last == '~' ) j--;
		// the boundary char itself is part of the sub-url
		if ( j == 0 ) return end;
	}
	// ran out of path before finding the j-th boundary
	return 0;
}
// . similar to getSubUrlLen() above but only works on the path
// . if j is 0 that's the whole url path!
long Url::getSubPathLen ( long j ) {
long subUrlLen = getSubUrlLen ( j );
if ( subUrlLen <= 0 ) return 0;
// . the subPath length includes the root backslash
// . portLen includes the whole :8080 thing (for non default ports)
return subUrlLen - m_slen - 3 - m_hlen - m_portLen;
}
// . dump the parsed components of this url to stdout, one per line
// . for debugging
void Url::print() {
	printf("############ url ############\n");
	printf("url: %s\n",m_url);
	printf("host: ");
	print_string( m_host, m_hlen );
	printf("\n");
	printf("scheme: ");
	print_string( m_scheme , m_slen );
	printf("\n");
	printf("path: ");
	print_string(m_path , m_plen );
	printf("\n");
	// m_query is NULL when there is no query, and handing a NULL to
	// "%s" is undefined, so substitute a placeholder
	printf("query: %s\n", m_query ? m_query : "(null)" );
	printf("port: %li\n", m_port );
	printf("domain: ");
	print_string(m_domain, m_dlen );
	printf("\n");
	printf("tld: ");
	print_string(m_tld, m_tldLen );
	printf("\n");
	// the mid domain is the leading m_mdlen chars of the domain
	printf("mid domain: ");
	print_string(m_domain, m_mdlen );
	printf("\n");
	printf("is root %i\n",isRoot());
}
// . write the first "len" bytes of "s" to stdout
// . "s" need not be NUL terminated
// . prints nothing if "s" is NULL or "len" is not positive
void print_string ( char *s , long len ) {
	if ( ! s || len <= 0 ) return;
	fwrite ( s , 1 , (size_t)len , stdout );
}
// . is the file extension of this url one that we index?
// . returns true if there is no extension (assumed to be html) or if
//   the url has a cgi query, whatever the extension
// . otherwise the extension must be on one of the lists below
// . comparisons are case-insensitive except for the 1-char extensions
bool Url::isExtensionIndexable () {
	// no extension, or cgi parms present: let it through
	if ( m_elen == 0 || ! m_extension || m_query ) return true;
	// accepted extensions, grouped by length, each list NULL terminated
	static const char * const s_len2[] = {
		"ps" ,
		"pl" ,   // perl is used like cgi
		NULL };
	static const char * const s_len3[] = {
		"htm" , "asp" , "xml" ,
		"rss" ,  // rss has a Content-Type of xml usually
		"cgi" , "dll" , "jsp" , "php" , "txt" , "pdf" ,
		"doc" , "xls" , "ppt" ,
		// source code
		"cpp" , "hpp" , "vbs" , "frm" , "bas" ,
		"cfm" ,  // probably cold fusion template. seems to be text/html
		"stm" ,  // http://news.bbc.co.uk/2/hi/science/nature/default.stm
		"rdf" ,  // rdf for rss feeds
		NULL };
	static const char * const s_len4[] = {
		"html" , "phtm" , "shtm" , "fcgi" , "aspx" ,
		"java" , // source code
		NULL };
	static const char * const s_len5[] = {
		"shtml" , "jhtml" , "phtml" , "story" ,
		NULL };
	// pick the list that goes with our extension length
	const char * const *list = NULL;
	switch ( m_elen ) {
	case 1:
		// c source and headers
		return ( m_extension[0] == 'c' || m_extension[0] == 'h' );
	case 2: list = s_len2; break;
	case 3: list = s_len3; break;
	case 4:
		// php3 and php4 etc: only the first 3 chars must match
		if ( strncasecmp ( m_extension , "php" , 3 ) == 0 )
			return true;
		list = s_len4;
		break;
	case 5: list = s_len5; break;
	default:
		return false;
	}
	// is our extension on the list?
	for ( ; *list ; list++ )
		if ( strncasecmp ( m_extension , *list , m_elen ) == 0 )
			return true;
	return false;
}
// . return url w/o http://
// . without trailing / if path is just "/"
// . without "www." if in hostname and "rmWWW" is true
// . sets *len to it's length
char *Url::getShorthandUrl ( bool rmWWW , long *len ) {
char *u = m_url;
long ulen = m_ulen;
if ( ulen > 7 && strncasecmp ( u , "http://" , 7 ) == 0) {
u += 7 ;
ulen -= 7 ;
// if hostname is just "www" then skip iff rmWWW is true
if ( rmWWW && m_hlen >= 4 && strncmp(m_host,"www.",4) == 0) {
u += 4;
ulen -= 4;
}
}
// skip trailing /
if ( m_plen == 1 && m_path[0]=='/' && m_query == NULL ) ulen--;
// set the length
*len = ulen;
// return the url shorthand
return u;
}
// . count the '/' separators that follow the root '/' of the path
// . the scan runs to the end of m_url, so any '/' inside the cgi query
//   is counted as well
// . if "countFilename" is true and the url does not end in a '/' then
//   the trailing name is counted as one more level
// . returns 0 for a Url that has not been set
long Url::getPathDepth ( bool countFilename ) {
	// m_path is NULL and m_ulen is 0 until set() has parsed a url;
	// there is nothing to scan in that case
	if ( ! m_path || m_ulen <= 0 ) return 0;
	char *send  = m_url + m_ulen;
	long  count = 0;
	// skip the root '/' and count the rest
	for ( char *s = m_path + 1 ; s < send ; s++ )
		if ( *s == '/' ) count++;
	// if we're counting the filename as a path component...
	if ( countFilename && *(send-1) != '/' ) count++;
	return count;
}
// . return a pointer to path component #num, where #0 is what follows
//   the root '/'
// . the component is NOT NUL terminated; if "clen" is not NULL it gets
//   the component's length
// . returns NULL if the path has no such component; in that case *clen
//   is left holding the length of the last component scanned
char *Url::getPathComponent ( long num , long *clen ) {
	char *pathEnd = m_path + m_plen;
	// "comp" sits on the '/' that precedes the next component
	char *comp = m_path;
	for ( long idx = 0 ; ; idx++ ) {
		// step over the '/'
		comp++;
		// find where this component stops
		char *stop = comp;
		while ( stop < pathEnd && *stop != '/' ) stop++;
		// report its length
		if ( clen ) *clen = stop - comp;
		// is this the one that was asked for?
		if ( idx == num ) return comp;
		// out of path, so no such component
		if ( stop >= pathEnd ) return NULL;
		// go on to the next one, starting at its '/'
		comp = stop;
	}
}
//char *Url::getPathEnd ( long num ) {
// // get component
// long pclen = 0;
// char *pc = getPathComponent ( num , &pclen );
// // return the end of it
// return pc + pclen;
//}
// . does the hostname begin with "www." ?
// . the comparison is case-sensitive
bool Url::isHostWWW ( ) {
	return ( m_hlen >= 4 && strncmp ( m_host , "www." , 4 ) == 0 );
}
// . is the url a porn/spam url?
// . i use /usr/share/dict/words to check for legit words
// . if it's long and has 4+ hyphens, consider it spam
// . if you add a word here, add it to PageResults.cpp:isQueryDirty()
// . only the hostname, minus its tld, is examined
// . returns false if the url has no domain or if its tld starts with
//   "edu" or "gov"
bool Url::isSpam() {
	// nothing to judge without a domain
	if ( ! m_domain ) return false;
	if ( ! m_dlen   ) return false;
	// if tld is gov or edu, not porn
	if ( m_tldLen >= 3 && strncmp ( m_tld , "edu" , 3 )==0 ) return false;
	if ( m_tldLen >= 3 && strncmp ( m_tld , "gov" , 3 )==0 ) return false;
	// don't store the .com or .org while searching for isSpam
	long slen = m_hlen - m_tldLen - 1;
	// . these checks must come BEFORE the copy below: slen is negative
	//   for an empty host and would turn into a huge memcpy() size
	// . an empty hostname has nothing spammy in it
	if ( ! m_host || slen <= 0 ) return false;
	// store the hostname in a buf since we NUL terminate pieces of it
	char s [ MAX_URL_LEN ];
	// leave room for the terminating NUL
	if ( slen >= (long)sizeof(s) ) return false;
	memcpy ( s , m_host , slen );
	// NULL terminate for the scans below
	s[slen]='\0';
	// . if there is 4 or more hyphens, and hostLen > 30 consider it spam
	// . actually there seems to be a lot of legit sites with many hyphens
	if ( slen > 30 ) {
		long count = 0;
		for ( char *q = s ; *q ; q++ ) if ( *q == '-' ) count++;
		if ( count >= 4 ) return true;
	}
	//
	// TODO: use getMatch()!!!! +pts -pts system
	//
	// check each thing separated by periods or hyphens for porn
	char *send = s + slen;
	for ( char *p = s ; p < send ; ) {
		// find the next period or hyphen
		char *pend = p;
		while ( pend < send && *pend != '.' && *pend !='-' ) pend++;
		// ok NULL terminate it (overwrites the separator)
		*pend = '\0';
		// check that
		if ( isSpam ( p , pend - p ) ) return true;
		// point to next
		p = pend + 1;
	}
	// no piece looked spammy
	return false;
}
// . decide if one hostname piece "s" of length "slen" is porn
// . returns false right away if isAdult(s,slen) says it is not adult
// . otherwise each '.'/'-' delimited run in "s" is handed to
//   g_speller.canSplitWords(); if a run splits and is flagged as porn we
//   return true, if at least one run splits cleanly we return false, and if
//   no run could be split at all we return true since isAdult() was true
// . NOTE(review): canSplitWords() is assumed to write a \0-terminated string
//   at "splitp" on success -- confirm against Speller.h. nothing here bounds
//   how much it writes into the 1024 byte splitWords[] buffer.
bool Url::isSpam ( char *s , long slen ) {
	// no need to indent below, keep it clearer
	if ( ! isAdult ( s, slen ) ) return false;
	// check for naughty words. Split words to deep check if we're surely
	// adult. Required because montanalinux.org is showing up as porn
	// because it has 'anal' in the hostname.
	// send each phrase separately to be tested.
	// hotjobs.yahoo.com
	char *a = s;
	char *p = s;
	bool foundCleanSequence = false;
	char splitWords[1024];
	// . start out as the empty string. this buffer used to be left
	//   uninitialized, so gbstrlen(splitp) below read garbage whenever
	//   canSplitWords() failed before anything had been written.
	splitWords[0] = '\0';
	char *splitp = splitWords;
	while ( p < s + slen ){
		// advance p to the end of this run
		while ( p < s + slen && *p != '.' && *p != '-' )
			p++;
		bool isPorn = false;
		// make sure the output slot is a valid (empty) string even
		// if the speller writes nothing for this run
		*splitp = '\0';
		// TODO: do not include "ult" in the dictionary, it is
		// always splitting "adult" as "ad ult". i'd say do not
		// allow it to split a dirty word into two words like that.
		if ( g_speller.canSplitWords( a, p - a, &isPorn,
					      splitp,
					      langEnglish,
					      csUTF8 ) ){
			if ( isPorn ){
				log(LOG_DEBUG,"build: identified %s as "
				    "porn after splitting words as "
				    "%s", s, splitp);
				return true;
			}
			foundCleanSequence = true;
			// keep searching for some porn sequence
		}
		// skip the separator and start the next run there
		p++;
		a = p;
		// append the next split right after this one
		splitp += gbstrlen(splitp);
	}
	// if we found a clean sequence, its not porn
	if ( foundCleanSequence ) {
		log(LOG_INFO,"build: did not identify url %s "
		    "as porn after splitting words as %s", s, splitWords);
		return false;
	}
	// we tried to get some seq of words but failed. Still report
	// this as porn, since isAdult() was true
	logf ( LOG_DEBUG,"build: failed to find sequence of words to "
	       "prove %s was not porn.", s );
	return true;
	/* if ( strstr ( s , "upskirt" ) ) return true;
	if ( strstr ( s , "downblouse") ) return true;
	if ( strstr ( s , "adult" ) ) return true;
	if ( strstr ( s , "shemale" ) ) return true;
	if ( strstr ( s , "spank" ) ) return true;
	if ( strstr ( s , "dildo" ) ) return true;
	if ( strstr ( s , "shaved" ) ) return true;
	if ( strstr ( s , "bdsm" ) ) return true;
	if ( strstr ( s , "voyeur" ) ) return true;
	if ( strstr ( s , "shemale" ) ) return true;
	if ( strstr ( s , "fisting" ) ) return true;
	if ( strstr ( s , "escorts" ) ) return true;
	if ( strstr ( s , "vibrator" ) ) return true;
	if ( strstr ( s , "rgasm" ) ) return true; // 0rgasm
	if ( strstr ( s , "orgy" ) ) return true;
	if ( strstr ( s , "orgies" ) ) return true;
	if ( strstr ( s , "masturbat" ) ) return true;
	if ( strstr ( s , "stripper" ) ) return true;
	if ( strstr ( s , "lolita" ) ) return true;
	//if ( strstr ( s , "hardcore" ) ) return true; ps2hardcore.co.uk
	if ( strstr ( s , "softcore" ) ) return true;
	if ( strstr ( s , "whore" ) ) return true;
	if ( strstr ( s , "slut" ) ) return true;
	if ( strstr ( s , "smut" ) ) return true;
	if ( strstr ( s , "tits" ) ) return true;
	if ( strstr ( s , "lesbian" ) ) return true;
	if ( strstr ( s , "swinger" ) ) return true;
	if ( strstr ( s , "fetish" ) ) return true;
	if ( strstr ( s , "housewife" ) ) return true;
	if ( strstr ( s , "housewive" ) ) return true;
	if ( strstr ( s , "nude" ) ) return true;
	if ( strstr ( s , "bondage" ) ) return true;
	if ( strstr ( s , "centerfold") ) return true;
	if ( strstr ( s , "incest" ) ) return true;
	if ( strstr ( s , "pedophil" ) ) return true;
	if ( strstr ( s , "pedofil" ) ) return true;
	// hornyear.com
	if ( strstr ( s , "horny" ) ) return true;
	if ( strstr ( s , "pussy" ) ) return true;
	if ( strstr ( s , "pussies" ) ) return true;
	if ( strstr ( s , "penis" ) ) return true;
	if ( strstr ( s , "vagina" ) ) return true;
	if ( strstr ( s , "phuck" ) ) return true;
	if ( strstr ( s , "blowjob" ) ) return true;
	if ( strstr ( s , "gangbang" ) ) return true;
	if ( strstr ( s , "xxx" ) ) return true;
	if ( strstr ( s , "porn" ) ) return true;
	if ( strstr ( s , "felch" ) ) return true;
	if ( strstr ( s , "cunt" ) ) return true;
	if ( strstr ( s , "bestial" ) ) return true;
	if ( strstr ( s , "beastial" ) ) return true;
	//if ( strstr ( s , "oral" ) ) return true; // moral, doctorial, ...
	// these below may have legit meanings
	if ( strstr ( s , "kink" ) ) {
	if ( strstr ( s , "kinko" ) ) return false; // the store
	return true;
	}
	if ( strstr ( s , "sex" ) ) {
	// sexton, sextant, sextuplet, sextet
	if ( strstr ( s , "sext" ) ) return false;
	if ( strstr ( s , "middlesex" ) ) return false;
	if ( strstr ( s , "sussex" ) ) return false;
	if ( strstr ( s , "essex" ) ) return false;
	if ( strstr ( s , "deusex" ) ) return false; // video game
	if ( strstr ( s , "sexchange" ) ) return false; // businessexh
	if ( strstr ( s , "sexpress" ) ) return false; // *express
	if ( strstr ( s , "sexpert" ) ) return false; // *expert
	if ( strstr ( s , "sexcel" ) ) return false; // *excellence
	if ( strstr ( s , "sexist" ) ) return false; // existence
	if ( strstr ( s , "sexile" ) ) return false; // existence
	if ( strstr ( s , "harassm" ) ) return false; // harassment
	if ( strstr ( s , "sexperi" ) ) return false; // experience
	if ( strstr ( s , "transex" ) ) return false; // transexual
	if ( strstr ( s , "sexual" ) ) return false; // abuse,health
	if ( strstr ( s , "sexpo" ) ) return false; // expo,expose
	if ( strstr ( s , "exoti" ) ) return false; // exotic(que)
	if ( strstr ( s , "sexclu" ) ) return false; // exclusive/de
	return true;
	}
	// www.losAnaLos.de
	// sanalcafe.net
	if ( strstr ( s , "anal") ) {
	if ( strstr ( s , "analog" ) ) return false; // analogy
	if ( strstr ( s , "analy" ) ) return false; // analysis
	if ( strstr ( s , "canal" ) ) return false;
	if ( strstr ( s , "kanal" ) ) return false; // german
	if ( strstr ( s , "banal" ) ) return false;
	return true;
	}
	if ( strstr ( s , "cum") ) {
	if ( strstr ( s , "circum" ) ) return false; // circumvent
	if ( strstr ( s , "magn" ) ) return false; // magna cum
	if ( strstr ( s , "succu" ) ) return false; // succumb
	if ( strstr ( s , "cumber" ) ) return false; // encumber
	if ( strstr ( s , "docum" ) ) return false; // document
	if ( strstr ( s , "cumul" ) ) return false; // accumulate
	if ( strstr ( s , "acumen" ) ) return false; // acumen
	if ( strstr ( s , "cucum" ) ) return false; // cucumber
	if ( strstr ( s , "incum" ) ) return false; // incumbent
	if ( strstr ( s , "capsicum" ) ) return false;
	if ( strstr ( s , "modicum" ) ) return false;
	if ( strstr ( s , "locum" ) ) return false; // slocum
	if ( strstr ( s , "scum" ) ) return false;
	if ( strstr ( s , "accu" ) ) return false; // compounds!
	// arcum.de
	// cummingscove.com
	// cumchristo.org
	return true;
	}
	//if ( strstr ( s , "lust" ) ) {
	// if ( strstr ( s , "illust" ) ) return false; // illustrated
	// if ( strstr ( s , "clust" ) ) return false; // cluster
	// if ( strstr ( s , "blust" ) ) return false; // bluster
	// if ( strstr ( s , "lustrad" ) ) return false; // balustrade
	// // TODO: plusthemes.com wanderlust
	// return true;
	//}
	// brettwatt.com
	//if ( strstr ( s , "twat" ) ) {
	// if ( strstr ( s , "watch" ) ) return false; // wristwatch
	// if ( strstr ( s , "atwater" ) ) return false;
	// if ( strstr ( s , "water" ) ) return false; // sweetwater
	// return true;
	//}
	if ( strstr ( s, "clit" ) && ! strstr ( s, "heraclitus") ) return true;
	// fuckedcompany.com is ok
	if ( strstr ( s, "fuck" ) && ! strstr ( s, "fuckedcomp") ) return true;
	if ( strstr ( s, "boob" ) && ! strstr ( s, "booboo" ) ) return true;
	if ( strstr ( s, "wank" ) && ! strstr ( s, "swank" ) ) return true;
	// fick is german for fuck (fornication under consent of the king)
	if ( strstr ( s, "fick" ) && ! strstr ( s, "fickle") &&
	! strstr ( s , "traffick" ) ) return true;
	// sclerotic
	// buerotipp.de
	if ( strstr ( s, "eroti") && ! strstr ( s, "sclero" ) ) return true;
	// albaberlin.com
	// babelfish.altavista.com
	if ( strstr ( s, "babe" ) && ! strstr ( s, "toyland" ) &&
	! strstr ( s , "babel") ) return true;
	// what is gaya.dk?
	if ( strstr ( s , "gay" ) && ! strstr ( s, "gaylord" ) ) return true;
	// url appears to be ok
	return false;*/
}
// . remove any session id
// . i'm sick of these things causing dup problems
// . types:
// http://www.b.com/?PHPSESSID=737aec14eb7b360983d4fe39395
// http://www.b.com/cat.cgi/process?mv_session_id=xrf2EY3q&
// http://www.b.com/default?SID=f320a739cdecb4c3edef67e
// http://www.b.com/generic.html;$sessionid$QVBMODQAAAGNA?pid=7
// http://www.b.com/p.jhtml;jsessionid=J4QMFWBG1SPRVWCKUUXCJ0W?stuff=1
// look for ';'
// look for PHPSESSID, session_id, SID, jsessionid
// followed by string of at least 4 letters/numbers
// . file extensions we do NOT parse, in alphabetical order
// . "zip" MUST stay the last entry: Url::isBadExtension() uses it as the
//   marker for the end of this list
// . extensions that are deliberately NOT in the list (they were commented out
//   of it): c csh doc h htm html js pdf ppt ps txt xls xml
// . isBadExtension() adds 3 more types for TR version 73
static char *s_badExtensions[] = {
	"ai", "aif", "aifc", "aiff", "asc", "au", "avi",
	"bcpio", "bin", "bmp", "bz2",
	"cc", "ccad", "cdf", "class", "cpio", "cpt", "css",
	"dcr", "dir", "dms", "drw", "dvi", "dwg", "dxf", "dxr",
	"eps", "etx", "exe", "ez",
	"f", "f90", "fli",
	"gif", "gtar", "gz",
	"hdf", "hh", "hqx",
	"ice", "ief", "iges", "igs", "ips", "ipx",
	"jpe", "jpeg", "jpg",
	"kar",
	"latex", "lha", "lsp", "lzh",
	"m", "man", "me", "mesh", "mid", "midi", "mif", "mime", "mov",
	"movie", "mp2", "mp3", "mpe", "mpeg", "mpg", "mpga", "ms", "msh",
	"nc",
	"oda",
	"pbm", "pdb", "pgm", "pgn", "png", "pnm", "pot", "ppm", "pps",
	"ppz", "pre", "prt",
	"qt",
	"ra", "ram", "ras", "rgb", "rm", "roff", "rpm", "rtf", "rtx",
	"scm", "set", "sgm", "sgml", "sh", "shar", "silo", "sit", "skd",
	"skm", "skp", "skt", "smi", "smil", "snd", "sol", "spl", "src",
	"step", "stl", "stp", "sv4cpio", "sv4crc", "swf",
	"t", "tar", "tcl", "tex", "texi", "texinfo", "tif", "tiff", "tr",
	"tsi", "tsp", "tsv",
	"unv", "ustar",
	"vcd", "vda", "viv", "vivo", "vrml",
	"wav", "wrl",
	"xbm", "xlc", "xll", "xlm", "xlw", "xpm", "xwd", "xyz",
	"zip"
};
// maps hash of a lower-cased bad extension -> TR version that banned it
static HashTable s_badExtTable;
// set once s_badExtTable has been completely filled in
static bool s_badExtInitialized;
// . returns true if this url's extension is listed as bad for title rec
//   version "version", i.e. it was banned in that version or an earlier one
// . returns false if the url has no extension, the extension is not listed,
//   or adding to the hash table failed while building it
// . the table is built lazily on the first call
bool Url::isBadExtension ( long version ) {
	// return !isExtensionIndexable();
	if ( ! m_extension || m_elen == 0 ) return false;
	//if hash has not been created-create one
	if ( ! s_badExtInitialized ) {
		// . version 72 and before.
		// . walk the WHOLE list. the old do/while stopped as soon as
		//   it reached "zip", so "zip" itself was never added.
		const long numExts = (long)( sizeof(s_badExtensions) /
					     sizeof(s_badExtensions[0]) );
		for ( long i = 0 ; i < numExts ; i++ ) {
			int tlen = gbstrlen(s_badExtensions[i]);
			// narrow the 64-bit hash the same way the lookup
			// below does so both always agree
			long key = (long)hash64Lower_a(s_badExtensions[i],
						       tlen);
			if ( ! s_badExtTable.addKey ( key , (long)50 ) )
				return false;
		}
		//version 73 and after.
		if(!s_badExtTable.addKey((long)hash64Lower_a("wmv", 3),
					 (long)73) ||
		   !s_badExtTable.addKey((long)hash64Lower_a("wma", 3),
					 (long)73) ||
		   !s_badExtTable.addKey((long)hash64Lower_a("ogg", 3),
					 (long)73))
			return false;
		s_badExtInitialized = true;
	}
	// . this used to be an "int", which differs from the "long" the
	//   inserts are narrowed to wherever long is wider than int
	long myKey = (long)hash64Lower_a(m_extension,m_elen);
	//zero unless we have a bad extension, otherwise
	//we get the TR version in which it was banned
	long badVersion = s_badExtTable.getValue(myKey);
	if ( badVersion == 0 ) return false;
	// only bad if it was already banned as of "version"
	return ( badVersion <= version );
}
// see Url.h for a description of this.
bool Url::isLinkLoop ( ) {
char *s = m_path ;
char *send = m_url + m_ulen;
long count = 0;
long components = 0;
bool prevWasDouble = false;
char *last = NULL;
if (!s) return false;
// use this hash table to hash each path component in the url
char buf [ 5000 ];
HashTable t; t.set ( 100 , buf , 5000 );
// grab each path component
for ( ; s < send ; s++ ) {
if ( *s != '/' ) continue;
// ok, add this guy to the hash table, if we had one
if ( ! last ) { last = s; continue; }
// give up after 50 components
if ( components++ >= 50 ) return false;
// hash him
unsigned long h = hash32 ( last , s - last );
// is he in there?
long slot = t.getSlot ( h );
// get his val (count)
long val = 0;
if ( slot >= 0 ) val = t.getValueFromSlot ( slot );
// if not in there put him in a slot
if ( slot < 0 ) {
last = s;
t.addKey ( h , 1 );
continue;
}
// increment it
val++;
// does it occur 3 or more times? if so, we have a link loop
if ( val >= 3 ) return true;
// is it 2 or more?
if ( val == 2 ) count++;
// if we have two such components, then we are a link loop.
// BUT, we must be a pair!
if ( count >= 2 && prevWasDouble ) return true;
// set this so in case next guy is a double
if ( val == 2 ) prevWasDouble = true;
else prevWasDouble = false;
// add it back after incrementing
t.setValue ( slot , val );
// update "last"
last = s;
}
return false;
}
//
// here are some examples of link loops in urls:
//
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/letters/send/archive/bish/archive/bish/letters/bish/archive/lette\rs/send/archive/letters/send/bish/letters/archive/bish/letters/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/bish/letters/archive/bish/archive/letters/send/archive/letters/send/archive/le\tters/send/archive/letters/send/bish/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/letters/send/archive/bish/archive/bish/\archive/bish/letters/send/archive/letters/archive/letters/send/archive/bish/let\ters/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/letters/send/archive/letters/archive/bish/archive/bish/archive/bi\sh/letters/send/archive/bish/archive/letters/send/bish/archive/bish/letters/sen\d/archive/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/bish/letters/bish/letters/send/archive/\bish/archive/letters/bish/letters/send/archive/bish/letters/send/bish/archive/l\etters/bish/letters/archive/letters/send/
//http://www.pittsburghlive.com:8000/x/tribune-review/opinion/steigerwald/letters\/send/archive/bish/letters/send/archive/bish/letters/send/bish/archive/letters/\send/bish/archive/letters/send/archive/letters/bish/archive/bish/archive/letter\s/
// . is the hostname an ip address?
// . false if there is no host or it does not start with a digit, otherwise
//   whatever atoip() makes of the m_hlen host bytes, converted to bool
bool Url::isIp() {
	if ( ! m_host || ! is_digit(*m_host) ) return false;
	return atoip ( m_host , m_hlen );
}
/*
bool Url::isSiteRoot ( char *coll , TagRec *tagRec ,
char **retSite , long *retSiteLen ) {
long siteLen;
// use the DOMAIN as the default site
char *site = getSite ( &siteLen , coll , false , tagRec );
// check end of site
char *send = site + siteLen;
// our end
char *uend = m_url + m_ulen;
// backup over an ending '/'
if ( uend[-1] == '/' ) uend--;
// set it
if ( retSite ) *retSite = site;
if ( retSiteLen ) *retSiteLen = siteLen;
// if before our end, we are not a site root
return (uend <= send);
}
// . a "site" is a set of urls controlled/regulated primarily by the same
// entity.
// . this returns the smallest site containing the url, m_url
// . so fred.blogspot.com is considered a site regulated by "fred"
// . BUT blogspot.com is a larger site regulated by blogspot
// . the default site is the domain
// . returns NULL and sets g_errno on error
// . if "defaultToHostname" is true we default to the hostname
// as opposed to the domain name.
char *Url::getSite ( long *siteLen , char *coll , bool defaultToHostname ,
TagRec *tagRec ,
bool *isDefault ) {
// clear just in case
g_errno = 0;
// convenience vars
char *p;
long len = 0;
// assume we return the default
if ( isDefault ) *isDefault = true;
long sitepathdepth = -1;
// we may have a defined path depth
Tag *tag = NULL;
// see if we do
if ( tagRec ) tag = tagRec->getTag("sitepathdepth");
// sanity check
if ( tag && tag->m_dataSize != 1 ) { char *xx=NULL;*xx=0; }
// if there, get the sitepathdepth value it contains
if ( tag ) sitepathdepth = (long)tag->m_data[0];
// . deal with site indicators
// . these are applied to all domains uniformly
// . if it is xyz.com/users/ use xyz.com/users/fred/ as the site
p = m_path;
// a lot of times these were not indivual blogs, but the blog subsite
// of a site... http://dccc.org/blog/P4575/
//if ( strncasecmp(p,"/blogs/" , 7) == 0 ) len = 7;
//if ( strncasecmp(p,"/blog/" , 6) == 0 ) len = 6;
// commented out a bunch cuz they were profiles mostly, not blogs...
if ( strncasecmp(p,"/~" , 2) == 0 ) len = 2;
// assume this is a username. skip the first /
if ( sitepathdepth == 1 ) len = 1;
//if ( strncasecmp(p,"/users/" , 7) == 0 ) len = 7;
//if ( strncasecmp(p,"/user/" , 6) == 0 ) len = 6;
//if ( strncasecmp(p,"/members/" , 9) == 0 ) len = 9;
//if ( strncasecmp(p,"/membres/" , 9) == 0 ) len = 9;
//if ( strncasecmp(p,"/member/" , 8) == 0 ) len = 8;
//if ( strncasecmp(p,"/membre/" , 8) == 0 ) len = 8;
//if ( strncasecmp(p,"/member.php?u=",14) == 0 ) len = 14;
// point to after the /users/, /blogs/, /user/, /blog/ or /~xxx/
p += len;
// assume there is NOT an alpha char after this
char username = false;
// . skip to next / OR ?
// . stop at . or -, because we do not allow those in usernames and
// they are often indicative of filenames without file extensions
while ( len && *p && *p!= '/'&&*p!='?'&&*p!='.'&&*p!='-'&&*p!='_') {
if ( is_alpha_a(*p) ) username = true;
p++;
}
// if we hit this, not a username
if ( *p=='.' || *p == '-' || *p == '_' ) username = false;
// did we get a match?
// . www.cits.ucsb.edu/users/michael-osborne
// . www.cits.ucsb.edu/users/michael-osborne/
// . after /blog/ or /~ should be another / or \0, not a period,
// because that indicates probably a filename, which is not right,
// because we are expecting a username!
if ( username ) {
// include the '/'
if ( *p == '/' ) p++;
// get length
*siteLen = p - m_host;
// not the default
if ( isDefault ) *isDefault = false;
// return the site
return m_host;
}
// assume none
*siteLen = 0;
// . the default site is the domain
// . if domain is invalid, site/siteLen will be NULL/0
if ( ! getDomain() ) return NULL;
// check the Site Filters table
CollectionRec *cr = NULL; if ( coll ) cr = g_collectiondb.getRec(coll);
// the default site is the domain
char *site;
if ( defaultToHostname ) { site = m_host ; *siteLen = m_hlen; }
else { site = m_domain; *siteLen = m_dlen; }
// g_errno should be set
if ( ! cr ) return site;
// initialize the hash table if it needs to be
if ( cr->m_updateSiteRulesTable ) {
logf ( LOG_INFO, "db: Updating Site Filters Table" );
// fill in the hash tables with domain hashes
cr->m_siteRulesTable.reset();
cr->m_siteRulesTable.set(cr->m_numSiteExpressions*2);
for ( long i = 0; i < cr->m_numSiteExpressions ; i++ ) {
Url f;
char *u = cr->m_siteExpressions[i];
long ulen = gbstrlen ( u );
// do not add "www."
f.set(u,ulen,false);
// hash the whole hostname (might be just domain)
long h = hash32 ( f.getHost(), f.getHostLen() );
// also hash scheme and port
h = hash32 ( f.getScheme() , f.getSchemeLen() , h );
h = hash32 ( h , f.getPort() );
// add to the table
cr->m_siteRulesTable.addKey(h, i+1);
}
// unset the update flag
cr->m_updateSiteRulesTable = 0;
}
// . you can only have on entry per domain or subdomain in the table!
// . that entry will be a domain or a subdomain
// . so check for both in the hash table
long t = 2;
loop:
t--;
// return the DEFAULT SITE if no matches
if ( t < 0 ) return site;
// check hash table for this domain or subdomain
long h ;
if ( t == 1 ) h = hash32 ( getHost () , getHostLen () );
else h = hash32 ( getDomain() , getDomainLen() );
// also hash scheme and port
h = hash32 ( getScheme(), getSchemeLen(), h);
h = hash32 ( h, getPort());
// is it in the table?
long s = cr->m_siteRulesTable.getSlot(h);
// if not found, try the domain next
if ( s < 0 ) goto loop;
// found, grab the index #
long i = cr->m_siteRulesTable.getValueFromSlot(s) - 1;
// . see if the url properly matches a filter
// . do NOT add "www." to the domain/subdomain of the filter url
char *e = cr->m_siteExpressions[i];
Url f; f.set (e,gbstrlen(e),false);
// what is the rule #? if rule is 0, that means the "hostname" rule
// otherwise this specifies a path depth that defines the site...
long r = cr->m_siteRules[i];
// get the full hostname of it
//char *h = f.getHost();
//long hlen = f.getHostLen();
// get its hostname (might just be a domain name)
char *sub = f.getHost();
long subLen = f.getUrlLen() - f.getSchemeLen() - 3;
// assume we did not match it
char matched = 0;
// is the filtered url "f" a "substring" of us?
if ( strncmp ( getHost () , sub , subLen ) == 0 ) matched = 1;
if ( strncmp ( getDomain() , sub , subLen ) == 0 ) matched = 2;
// must also match scheme and port
if ( getPort() != f.getPort() ) matched = false;
if ( getSchemeLen() != f.getSchemeLen() ) matched = false;
if ( strncmp(getScheme(), f.getScheme(),m_slen) ) matched = false;
// if really not a match try again
if ( ! matched ) goto loop;
// . we got a match
// . if r == 0 then the hostname is the site for this rule
// . so return the hostname as the site
if ( r == 0 ) {
// not the default
if ( isDefault ) *isDefault = false;
*siteLen = m_hlen;
return m_host;
}
// . otherwise, it is path depth based
// . m_path starts off point to "/"
p = m_path;
// if empty, no good, no site. return the DEFAULT SITE
if ( ! p ) return site;
// do not count the first "/"
p++;
// how many /'s to count to?
long count ;
// count them
for ( count = r ; count > 0 ; count-- ) {
// inc p
while ( *p && *p !='/' ) p++;
// done?
if ( ! *p ) break;
// skip passed the '/'
p++;
}
// if count not accomplished, no site. return the DEFAULT SITE.
if ( count > 0 ) return site;
// not the default
if ( isDefault ) *isDefault = false;
// otherwise we got the site
if ( matched == 1 ) {
// use the host, since we matched that
*siteLen = p - m_host;
// return it
return m_host;
}
// otherwise, use domain
*siteLen = p - m_domain;
return m_domain;
}
long Url::getSiteHash32 ( char *coll ) {
long siteLen;
// prefer domain as default, not hostname
char *site = getSite ( &siteLen , coll , false );
return hash32 ( site , siteLen );
}
*/
// hash32() of the m_hlen bytes of the hostname at m_host
long Url::getHostHash32 ( ) {
	const long hostHash = hash32 ( m_host , m_hlen );
	return hostHash;
}
// hash64() of the m_hlen bytes of the hostname at m_host
long long Url::getHostHash64 ( ) {
	const long long hostHash = hash64 ( m_host , m_hlen );
	return hostHash;
}
// hash32() of the m_dlen bytes of the domain at m_domain
long Url::getDomainHash32 ( ) {
	const long domHash = hash32 ( m_domain , m_dlen );
	return domHash;
}
// hash64() of the m_dlen bytes of the domain at m_domain
long long Url::getDomainHash64 ( ) {
	const long long domHash = hash64 ( m_domain , m_dlen );
	return domHash;
}
// hash32() of the whole url, the m_ulen bytes at m_url
long Url::getUrlHash32 ( ) {
	const long urlHash = hash32 ( m_url , m_ulen );
	return urlHash;
}
// hash64() of the whole url, the m_ulen bytes at m_url
long long Url::getUrlHash64 ( ) {
	const long long urlHash = hash64 ( m_url , m_ulen );
	return urlHash;
}
// . return a ptr to the hostname inside the \0-terminated "url"
// . "url" is expected to start with a scheme like http://, https:// or
//   ftp://. everything up to the first ':' plus the next 3 chars is skipped.
// . *hostLen (if hostLen is not NULL) is set to the length of the hostname,
//   which ends at the first '/' or ':' or at the end of the string
// . *port (if port is not NULL) is set to atoi() of what follows the ':'
//   ONLY if the url has an explicit port, otherwise it is left untouched
// . if there is no "://" the returned ptr is at or before the terminating
//   \0, never beyond it
char *getHostFast ( char *url , long *hostLen , long *port ) {
	// point to the url
	char *pp = url;
	// skip http(s):// or ftp:// (always there?)
	while ( *pp && *pp != ':' ) pp++;
	// . skip ://
	// . one char at a time so a url that is missing the "://" can not
	//   push us past the terminating \0 like a blind "pp += 3" did
	for ( int k = 0 ; k < 3 && *pp ; k++ ) pp++;
	// point "uhost" to hostname right away
	char *uhost = pp;
	// advance "pp" till we hit a / or :<port>
	while ( *pp && *pp !='/' && *pp !=':' ) pp++;
	// . if port ptr given, hand back the port
	// . the :<port> is never treated as part of the hostname
	if ( *pp == ':' && port ) *port = atoi(pp+1);
	// set length
	if ( hostLen ) *hostLen = pp - uhost;
	return uhost;
}
// . return a ptr to the start of the path inside the \0-terminated "url",
//   i.e. to the first '/' after the hostname and optional :<port>
// . "url" is expected to start with a scheme like http://
// . if the url has no path the returned ptr points at the terminating \0
char *getPathFast ( char *url ) {
	// point to the url
	char *pp = url;
	// skip http(s):// or ftp:// (always there?)
	while ( *pp && *pp != ':' ) pp++;
	// . skip ://
	// . one char at a time so a url that is missing the "://" can not
	//   push us past the terminating \0 like a blind "pp += 3" did
	for ( int k = 0 ; k < 3 && *pp ; k++ ) pp++;
	// advance "pp" over the hostname, till we hit a / or :<port>
	while ( *pp && *pp !='/' && *pp !=':' ) pp++;
	// advance over the port, if any
	if ( *pp == ':' )
		while ( *pp && *pp != '/' ) pp++;
	// we are now on the path's first '/' or on the \0
	return pp;
}
// . return a ptr to the TLD of the hostname in the \0-terminated "url"
// . if "hasHttp" is true the url starts with a scheme (http://, ...) which
//   is skipped first, otherwise the url starts right at the hostname
// . *tldLen (if tldLen is not NULL) is set to the # of bytes from the
//   returned ptr to the end of the hostname
// . returns NULL if the hostname has no alpha chars in it (an ip address or
//   an empty host) or if ::getTLD() finds no tld
char *getTLDFast ( char *url , long *tldLen , bool hasHttp ) {
	// point to the url
	char *pp = url;
	// only do this for some
	if ( hasHttp ) {
		// skip http(s):// or ftp:// (always there?)
		while ( *pp && *pp != ':' ) pp++;
		// . skip ://
		// . one char at a time so we never step past the \0 when
		//   the "://" is missing
		for ( int k = 0 ; k < 3 && *pp ; k++ ) pp++;
	}
	// point "uhost" to hostname right away
	char *uhost = pp;
	// advance "pp" till we hit a / or :<port> or \0
	while ( *pp && *pp !='/' && *pp !=':' ) pp++;
	// set length of host
	long uhostLen = pp - uhost;
	// . is the hostname just an IP address?
	// . we call it one if it has no alpha chars at all
	bool isIp = true;
	for ( char *ss = uhost ; *ss && ss<pp ; ss++ )
		if ( is_alpha_a(*ss) ) { isIp = false; break; }
	// if ip, no tld
	if ( isIp ) return NULL;
	// get the tld
	char *tld = ::getTLD ( uhost , uhostLen );
	// if none, done
	if ( ! tld ) return NULL;
	// set length, the tld runs to the end of the hostname
	if ( tldLen ) *tldLen = pp - tld;
	// return it
	return tld;
}
// . does the hostname in "url" have anything in front of its domain name?
// . "url" may or may not start with "http://" or "https://"
// . returns true for hostnames without any alpha chars (ip addresses, and
//   also the empty hostname)
// . returns false if ::getTLD() finds no tld or no domain can be located
bool hasSubdomain ( char *url ) {
	char *cursor = url;
	// skip http:// or https:// if there
	if      ( strncmp ( cursor , "http://"  , 7 ) == 0 ) cursor += 7;
	else if ( strncmp ( cursor , "https://" , 8 ) == 0 ) cursor += 8;
	// the hostname starts here
	char *host = cursor;
	// and ends on a / or :<port> or \0
	while ( *cursor && *cursor != '/' && *cursor != ':' ) cursor++;
	char *hostEnd = cursor;
	long hostLen = hostEnd - host;
	// . is the hostname just an IP address?
	// . look for the first alpha char in it
	char *scan = host;
	while ( *scan && ! is_alpha_a(*scan) && scan < hostEnd ) scan++;
	// no alpha found, we are an ip, say yes
	if ( scan == hostEnd ) return true;
	// get the tld
	char *tld = ::getTLD ( host , hostLen );
	// no tld, then no domain
	if ( ! tld ) return false;
	// . the domain can only be gotten once we know the TLD
	// . back up a couple chars, over the '.' in front of the tld
	char *dom = tld - 2;
	// backup until we hit a '.' or hit the beginning
	while ( dom > host && *dom != '.' ) dom--;
	// fix http://ok/
	if ( dom < host || *dom == '/' ) return false;
	// if we hit '.' the domain starts right after it
	if ( *dom == '.' ) dom++;
	// if the domain is not the entire hostname we have a subdomain
	return ( dom != host );
}
// returns NULL if url was in bad format and could not get domain. this
// was happening when a host gave us a bad redir url and xmldoc tried
// to set extra doc's robot.txt url to it "http://2010/robots.txt" where
// the host said "Location: 2010 ...".
// . if "hasHttp" is true the url starts with a scheme (http://, ...) which
//   is skipped first, otherwise the url starts right at the hostname
// . on success returns a ptr to the domain inside "url" and sets *domLen
//   (if domLen is not NULL) to its length
// . for a hostname without alpha chars (an ip address) the "domain" is the
//   hostname minus its last ".<digits>" group
char *getDomFast ( char *url , long *domLen , bool hasHttp ) {
	// point to the url
	char *pp = url;
	// skip http if there
	if ( hasHttp ) {
		// skip http(s):// or ftp:// (always there?)
		while ( *pp && *pp != ':' ) pp++;
		// . skip ://
		// . one char at a time so we never step past the \0 when
		//   the "://" is missing
		for ( int k = 0 ; k < 3 && *pp ; k++ ) pp++;
	}
	// point "uhost" to hostname right away
	char *uhost = pp;
	// advance "pp" till we hit a / or :<port>
	while ( *pp && *pp !='/' && *pp !=':' ) pp++;
	// set length
	long uhostLen = pp - uhost;
	// get end
	char *hostEnd = uhost + uhostLen;
	// . is the hostname just an IP address?
	// . if it is an ip based url make domain the hostname
	char *ss = uhost;
	while ( *ss && !is_alpha_a(*ss) && ss<pp ) ss++;
	// if we are an ip, treat special
	if ( ss == pp ) {
		// . might just be empty! like "\0"
		// . fixes core dump from
		//   http://www.marcom1.unimelb.edu.au/public/contact.html
		//   parsing host email address
		if ( uhostLen == 0 ) return NULL;
		// to be consistent with how Url::m_domain/m_dlen is set we
		// need to remove the last .X from the ip address
		// . skip back over digits
		// . but never back up past the start of the hostname. a
		//   hostname of only digits, like "2010" with no scheme in
		//   front of it, used to walk off the front of the buffer.
		for ( hostEnd-- ;
		      hostEnd >= uhost && is_digit(*hostEnd) ;
		      hostEnd-- );
		// must be a period
		if ( hostEnd < uhost || *hostEnd != '.' ) {
			log("url: getDomFast() could not find period for "
			    "hostname in url");
			return NULL;
		}
		// set length
		if ( domLen ) *domLen = hostEnd - uhost;
		// that's it
		return uhost;
	}
	// get the tld
	char *utld = ::getTLD ( uhost , uhostLen );
	// no tld, then no domain
	if ( ! utld ) return NULL;
	// . the domain, can only be gotten once we know the TLD
	// . back up a couple chars, over the '.' in front of the tld
	char *udom = utld - 2;
	// backup until we hit a '.' or hit the beginning
	while ( udom > uhost && *udom != '.' ) udom--;
	// fix http://ok/
	if ( udom < uhost || *udom =='/' ) return NULL;
	// if we hit '.' advance 1
	if ( *udom == '.' ) udom++;
	// set domain length, the domain runs to the end of the hostname
	if ( domLen ) *domLen = hostEnd - udom;
	return udom;
}
// . is it of a permalink form?
// . only the path of "u", up to but not including any '?', is examined
// . true if the path has a run of 3 or more digits in a row, OR has 2 or
//   more hyphens/underscores inside a single path component
bool isPermalinky ( char *u ) {
	// get its path
	char *path = getPathFast(u);
	// look for a sequence of 3 digits
	long digitRun = 0;
	for ( char *c = path ; *c && *c != '?' ; c++ ) {
		// any non-digit breaks the run
		if ( ! is_digit(*c) ) digitRun = 0;
		// stop on the third digit in a row
		else if ( ++digitRun == 3 ) break;
	}
	// look for 2 hyphens or underscores in one path component
	long sepCount = 0;
	for ( char *c = path ; *c && *c != '?' ; c++ ) {
		// a new path component starts the count over
		if ( *c == '/' ) sepCount = 0;
		// stop on the second separator of this component
		else if ( ( *c == '_' || *c == '-' ) && ++sepCount == 2 )
			break;
	}
	// either test makes it "permalinky"
	return ( sepCount >= 2 || digitRun >= 3 );
}
/*
bool Url::isRSSFormat ( ) {
// if it ends in .rss, .xml or .rdf ASSUME rss
bool isRSS = false;
char *e = getExtension();
long elen = getExtensionLen();
if ( elen == 3 && strcmp(e,"rss")==0 ) isRSS = true;
if ( elen == 3 && strcmp(e,"xml")==0 ) isRSS = true;
if ( elen == 3 && strcmp(e,"rdf")==0 ) isRSS = true;
// . but if it has wlwmanifest, then not!
// . i don't know what those are, but they are not rss feeds
if ( strstr ( getPath(), "wlwmanifest" ) ) isRSS = false;
// same goes for foaf
if ( strstr ( getPath(), "foaf" ) ) isRSS = false;
return isRSS;
}
*/
// . is it http://rpc.weblogs.com/shortChanges.xml, etc.?
// . m_url must match one of the known ping server urls exactly
bool Url::isPingServer ( ) {
	static const char * const s_pingUrls[] = {
		"http://rpc.weblogs.com/shortChanges.xml",
		// testing page
		"http://127.0.0.1:8000/shortChanges.xml"
	};
	const int numPingUrls = sizeof(s_pingUrls) / sizeof(s_pingUrls[0]);
	for ( int i = 0 ; i < numPingUrls ; i++ )
		if ( strcmp ( m_url , s_pingUrls[i] ) == 0 ) return true;
	// default
	return false;
}
// . does the \0-terminated string "s" contain a known ping server url?
// . unlike Url::isPingServer() this is a substring match, so the scheme is
//   not required and extra text around the match is ok
bool isPingServer ( char *s ) {
	const bool isWeblogs =
		( strstr ( s , "rpc.weblogs.com/shortChanges.xml") != NULL );
	if ( isWeblogs ) return true;
	// testing page, otherwise default to false
	return ( strstr ( s , "127.0.0.1:8000/shortChanges.xml") != NULL );
}
// . "s" points to the start of a normalized url (includes http://, etc.)
// . returns a ptr to the hostname inside "s" and sets *hostLen to the # of
//   bytes from there to the next '/' or to the end of the string
// . NOTE: only '/' ends the hostname here, so a ":<port>" is counted as
//   part of it
// . if "s" has no "://" the returned ptr is at or before the terminating \0,
//   never beyond it
char *getHost ( char *s , long *hostLen ) {
	// . skip proto
	// . stop at the \0 too, this scan used to run off the end of a
	//   string that had no ':' in it
	while ( *s && *s != ':' ) s++;
	// skip ://, one char at a time so we never step past the \0
	for ( int k = 0 ; k < 3 && *s ; k++ ) s++;
	// that is the host
	char *host = s;
	// . get length of hostname
	// . the first host char is skipped without looking at it, like
	//   before, but only if there is one
	if ( *s ) s++;
	while ( *s && *s != '/' ) s++;
	// that is it
	*hostLen = s - host;
	// return it
	return host;
}
// . "s" points to the start of a normalized url (includes http://, etc.)
//   that has a '/' after its hostname. we core on purpose if it does not.
// . returns a ptr to the filename, the part of the path after its last '/'
//   and before any '?' or '#', and sets *filenameLen to its length
// . returns NULL, with *filenameLen set to 0, if the filename is empty
char *getFilenameFast ( char *s , long *filenameLen ) {
	// . skip proto
	// . stop at the \0 too, this scan used to run off the end of a
	//   string that had no ':' in it
	while ( *s && *s != ':' ) s++;
	// skip ://, one char at a time so we never step past the \0
	for ( int k = 0 ; k < 3 && *s ; k++ ) s++;
	// skip over the hostname. first char is skipped w/o a look, if there
	if ( *s ) s++;
	while ( *s && *s != '/' ) s++;
	// should always have a /
	if ( *s != '/' ) { char *xx=NULL;*xx=0;}
	// skip that
	s++;
	// assume the filename starts right here
	char *filename = s;
	// scan the path, stopping at the cgi or anchor
	for ( ; *s && *s != '?' && *s != '#' ; s++ )
		// every '/' we pass restarts the filename right after it
		if ( *s == '/' ) filename = s + 1;
	// "s" is now the end of the filename, set length
	*filenameLen = s - filename;
	// if none, return null
	if ( *filenameLen == 0 ) return NULL;
	// return it
	return filename;
}
// . return ptrs to the end
// . the character it points to SHOULD NOT BE part of the site
// . "s" points to the start of a normalized url (includes http://, etc.)
//   that has a '/' after its hostname. we core on purpose if it does not.
// . starting right after that '/', we scan forward until we have passed
//   desiredDepth+1 more '/' chars or hit the end of the string, and return
//   a ptr to where the scan stopped
char *getPathEnd ( char *s , long desiredDepth ) {
	// . skip proto
	// . stop at the \0 too, this scan used to run off the end of a
	//   string that had no ':' in it
	while ( *s && *s != ':' ) s++;
	// skip ://, one char at a time so we never step past the \0
	for ( int k = 0 ; k < 3 && *s ; k++ ) s++;
	// skip over the hostname. first char is skipped w/o a look, if there
	if ( *s ) s++;
	while ( *s && *s != '/' ) s++;
	// should always have a /
	if ( *s != '/' ) { char *xx=NULL;*xx=0;}
	// skip that
	s++;
	// init depth
	long depth = 0;
	// do a character loop
	for ( ; depth <= desiredDepth && *s ; s++ )
		// count the '/'
		if ( *s == '/' ) depth++;
	// return the end
	return s;
	/*
	// save for below
	long saved = depth;
	// keep going
	while ( depth-- > 0 ) {
	for ( s++; *s && *s != '/' && *s != '?' ; s++ );
	// if not enough path components (or cgi), return NULL
	if ( *s != '/' ) return NULL;
	}
	// include the last '/' if we have path components
	if ( saved > 0 ) s++;
	// . we got it
	// . if depth==0 just use "www.xyz.com" as site
	// . if depth==1 just use "www.xyz.com/foo/" as site
	return s;
	*/
}
// . pathDepth==0 for "www.xyz.com"
// . pathDepth==0 for "www.xyz.com/"
// . pathDepth==0 for "www.xyz.com/foo"
// . pathDepth==1 for "www.xyz.com/foo/"
// . pathDepth==1 for "www.xyz.com/foo/x"
// . pathDepth==2 for "www.xyz.com/foo/x/"
// . pathDepth==2 for "www.xyz.com/foo/x/y"
// . i.e. the # of '/' chars in the path, not counting the one right after
//   the hostname and not counting any that come after a '?' or '#'
// . if "hasHttp" is true then "s" must start with a scheme (http://, ...)
//   and we core on purpose if it has no ':' in it
long getPathDepth ( char *s , bool hasHttp ) {
	// skip http:// if we got it
	if ( hasHttp ) {
		// . skip proto
		// . stop at the \0 too. without that the sanity check
		//   below could never fire, we would have scanned right
		//   past the end of the string instead.
		while ( *s && *s != ':' ) s++;
		// must have it!
		if ( ! *s ) { char *xx=NULL;*xx=0; }
		// skip ://, one char at a time so we never step past \0
		for ( int k = 0 ; k < 3 && *s ; k++ ) s++;
	}
	// skip over hostname. first char is skipped w/o a look, if there
	if ( *s ) s++;
	while ( *s && *s != '/' ) s++;
	// no, might be a site like "xyz.com"
	if ( ! *s ) return 0;
	// should always have a /
	if ( *s != '/' ) { char *xx=NULL;*xx=0;}
	// skip that
	s++;
	// init depth
	long depth = 0;
	// do a character loop
	for ( ; *s ; s++ ) {
		// stop if we hit ? or #
		if ( *s == '?' ) break;
		if ( *s == '#' ) break;
		// count the '/'
		if ( *s == '/' ) depth++;
	}
	return depth;
}
// . must be like xyz.com/xxxx/yyyy/[a-z]*.htm format
// . also true if the path has "/docs.php?id=", "/mods.php?id=" or
//   "/show.php?p=" anywhere in it
// . otherwise the path must start with a run of [a-z0-9/_-] chars that
//   holds 3 or 4 '/' chars and is followed by exactly ".htm" and then the
//   end of the string
bool isHijackerFormat ( char *url ) {
	if ( ! url ) return false;
	// get the path
	char *p = getPathFast ( url );
	// these php scripts are a giveaway
	static const char * const s_phpMarkers[] = {
		"/docs.php?id=",
		"/mods.php?id=",
		"/show.php?p="
	};
	const int numMarkers = sizeof(s_phpMarkers) / sizeof(s_phpMarkers[0]);
	for ( int i = 0 ; i < numMarkers ; i++ )
		if ( strstr ( p , s_phpMarkers[i] ) ) return true;
	// count the /'s in the leading run of allowed path chars
	long numSlashes = 0;
	for ( ; ; p++ ) {
		const bool allowed = ( *p=='-' || *p=='_' || *p=='/' ||
				       (*p>='a'&&*p<='z') || is_digit(*p) );
		if ( ! allowed ) break;
		if ( *p == '/' ) numSlashes++;
	}
	// need 3 or 4 of them. 5+ is too many, 2 or less is too few.
	if ( numSlashes >= 5 || numSlashes <= 2 ) return false;
	// need a .htm
	if ( *p != '.' ) return false;
	// . need "htm\0" right after the '.'
	// . the && chain stops at the first mismatch so we never read
	//   beyond the terminating \0
	return ( p[1] == 'h' && p[2] == 't' && p[3] == 'm' && p[4] == 0 );
}