open-source-search-engine/Url.h
Zak Betz 5622ca47ee Work on non-ascii domain names. It works on correct inputs, but
will crash on some non correct inputs, so it is forced to be disabled.
2015-09-14 00:34:44 -06:00

309 lines
9.5 KiB
C++

// Matt Wells, copyright Mar 2001
// . a class for parsing urls
// . used by many other classes
#ifndef _URL_H_
#define _URL_H_
#define MAX_URL_LEN 1024
// where should i put this #define? for now i'll keep it here
#define MAX_COLL_LEN 64
#include "ip.h" // atoip ( s,len)
// . free-standing helpers that work directly on url strings (char *)
//   rather than on a Url object
// . only the prototypes are visible in this header
// . NOTE(review): the "*Len" out-parameters presumably receive the length of
//   the returned substring, and the returned pointers presumably point into
//   the "url" argument -- confirm against the definitions
char *getPathFast ( char *url );
char *getFilenameFast ( char *url , int32_t *filenameLen ) ;
// . "hasHttp" defaults to true for the next two
// . NOTE(review): presumably "hasHttp" says whether "url" starts with the
//   protocol -- confirm
char *getTLDFast ( char *url , int32_t *tldLen , bool hasHttp = true ) ;
char *getDomFast ( char *url , int32_t *domLen , bool hasHttp = true ) ;
bool hasSubdomain ( char *url );
// . "port" is optional and defaults to NULL
char *getHostFast ( char *url , int32_t *hostLen , int32_t *port = NULL ) ;
bool isPermalinky ( char *url );
bool isHijackerFormat ( char *url );
// . string flavor of Url::isPingServer()
// . NOTE(review): presumably applies the same test as the member -- confirm
bool isPingServer ( char *s ) ;
// . returns the host of a normalized url pointed to by "s"
// . i.e. "s" must start with the protocol (i.e. http:// or https:// etc.)
// . used by Links.cpp for fast parsing and SiteGetter.cpp too
char *getHost ( char *s , int32_t *hostLen ) ;
// . get the path end of a normalized url
// . used by SiteGetter.cpp
// . if num==0 just use "www.xyz.com" as site (the hostname)
// . if num==1 just use "www.xyz.com/foo/" as site
char *getPathEnd ( char *s , int32_t num );
// . string flavor of Url::getPathDepth(); unlike the member it takes a
//   "hasHttp" flag instead of "countFilename"
int32_t getPathDepth ( char *s , bool hasHttp );
// . a class for parsing urls
// . holds the normalized url in m_url; the component pointers declared at
//   the bottom point into that buffer
// . all data members are public (the "private:" label is commented out)
class Url {

public:

    void print ();
    void reset ();

    // set from another Url, does a copy
    void set ( Url *url , bool addWWW );

    // . set from a NUL-terminated string
    // . a NULL "s" deliberately writes through a NULL pointer to crash
    void set ( char *s ) {
        if ( s == NULL ) { char *xx = NULL; *xx = 0; }
        set ( s , strlen(s) );
    }

    // . same as above but takes a base url as well
    // . a NULL "s" deliberately crashes here too
    void set ( Url *baseUrl , char *s ) {
        if ( s == NULL ) { char *xx = NULL; *xx = 0; }
        set ( baseUrl , s , strlen(s) );
    }

    // . "s" must be an ENCODED url
    void set ( char    *s                              ,
               int32_t  len                            ,
               bool     addWWW          = false        ,
               bool     stripSessionIds = false        ,
               bool     stripPound      = false        ,
               bool     stripCommonFile = false        ,
               int32_t  titleRecVersion = 0x7fffffff   );

    void set ( Url     *baseUrl                        ,
               char    *s                              ,
               int32_t  len                            ,
               bool     addWWW          = false        ,
               bool     stripSessionIds = false        ,
               bool     stripPound      = false        ,
               bool     stripCommonFile = false        ,
               int32_t  titleRecVersion = 0x7fffffff   );

    // overwrite the stored ip
    void setIp ( int32_t ip ) { m_ip = ip; }

    char isSessionId ( char *hh , int32_t titleRecVersion );

    // . compare another url to us
    // . equal only if the lengths match and the url strings compare equal
    bool equals ( Url *u ) {
        return m_ulen == u->m_ulen && strcmp ( m_url , u->m_url ) == 0;
    }

    // is the url's hostname actually an ip in disguise ("a.b.c.d")
    bool isIp ();

    // is the hostname an ip #? an m_ip of 0 means none
    bool hasIp () { return m_ip != 0; }

    bool isRoot ();

    // a super root url is a root url where the hostname is NULL or "www"
    bool isSuperRoot ();

    // true if a query string pointer is set
    bool isCgi () { return m_query != NULL; }

    // html, htm, cgi, asp, shtml, ...
    bool isExtensionIndexable ();

    // . returns true if the extension is in the list of badExtensions
    // . those are extensions not to be parsed
    bool isBadExtension ( int32_t xxx );

    // true once the url length is non-zero
    bool isSet () { return m_ulen != 0; }

    // . is this url a warc or arc url?
    // . i.e. ends in .warc or .arc or .warc.gz or .arc.gz?
    bool isWarc ();
    bool isArc  ();

    // does it end in .xml, .rdb or .rss, etc. kinda thing
    //bool isRSSFormat ( ) ;

    // is it http://rpc.weblogs.com/shortChanges.xml, etc.?
    bool isPingServer ();

    // overwrite the stored port
    void setPort ( uint16_t port ) { m_port = port; }

    int32_t getSubUrlLen  ( int32_t i );
    int32_t getSubPathLen ( int32_t i );

    //
    // plain accessors for the parsed components
    //
    int32_t  getPort      () { return m_port; }
    int32_t  getIp        () { return m_ip; }
    // m_ip passed through ipdom()
    int32_t  getIpDomain  () { return ipdom ( m_ip ); }
    char    *getUrl       () { return m_url; }
    // points just past the last byte of the url
    char    *getUrlEnd    () { return m_url + m_ulen; }
    char    *getScheme    () { return m_scheme; }
    char    *getHost      () { return m_host; }
    char    *getDomain    () { return m_domain; }
    char    *getTLD       () { return m_tld; }
    // w/o the tld; same start as the domain, use getMidDomainLen()
    char    *getMidDomain () { return m_domain; }
    char    *getPath      () { return m_path; }
    char    *getFilename  () { return m_filename; }
    char    *getExtension () { return m_extension; }
    char    *getQuery     () { return m_query; }
    // m_ip passed through iptoa()
    char    *getIpString  () { return iptoa ( m_ip ); }
    char    *getAnchor    () { return m_anchor; }
    //char *getSite () {return m_site;};
    char    *getPortStr   () { return m_portStr; }

    //
    // lengths of the components above
    //
    int32_t  getUrlLen       () { return m_ulen; }
    int32_t  getSchemeLen    () { return m_slen; }
    int32_t  getHostLen      () { return m_hlen; }
    int32_t  getDomainLen    () { return m_dlen; }
    int32_t  getPathLen      () { return m_plen; }
    // points just past the last byte of the path
    char    *getPathEnd      () { return m_path + m_plen; }
    int32_t  getFilenameLen  () { return m_flen; }
    int32_t  getExtensionLen () { return m_elen; }
    int32_t  getQueryLen     () { return m_qlen; }
    int32_t  getTLDLen       () { return m_tldLen; }
    int32_t  getMidDomainLen () { return m_mdlen; }
    int32_t  getPortLen      () { return m_portLen; }
    int32_t  getAnchorLen    () { return m_anchorLen; }
    int32_t  getDefaultPort  () { return m_defPort; }
    //int32_t getSiteLen () {return m_siteLen;};

    // . path length, plus one separator byte and the query length when
    //   there is a query
    int32_t getPathLenWithCgi () {
        return m_query ? m_plen + 1 + m_qlen : m_plen;
    }

    // true only if the scheme is exactly the 4 bytes "http"
    bool isHttp () {
        return m_ulen >= 4         &&
               m_slen == 4         &&
               m_scheme[0] == 'h'  &&
               m_scheme[1] == 't'  &&
               m_scheme[2] == 't'  &&
               m_scheme[3] == 'p';
    }

    // true only if the scheme is exactly the 5 bytes "https"
    bool isHttps () {
        return m_ulen >= 5         &&
               m_slen == 5         &&
               m_scheme[0] == 'h'  &&
               m_scheme[1] == 't'  &&
               m_scheme[2] == 't'  &&
               m_scheme[3] == 'p'  &&
               m_scheme[4] == 's';
    }

    // . are we a site root?
    // . i.e. does this url == hometown.com/users/fred/ , etc.
    // . does not take into account whether we have a subdomain or domain
    //bool isSiteRoot(char *coll,
    // class TagRec *tagRec = NULL ,
    // char **retSite=NULL,
    // int32_t *retSiteLen=NULL);

    // . returns the site and sets *siteLen
    // . returns NULL and sets g_errno on error
    // . returns NULL without g_errno set if our domain is invalid
    // . sets "*isDefault" to true if we just returned the default site,
    //   otherwise false
    //char *getSite ( int32_t *siteLen , char *coll ,
    // bool defaultToHostname ,
    // class TagRec *tagRec = NULL ,
    // bool *isDefault = NULL );

    // used by buzz i guess
    //int32_t getSiteHash32 ( char *coll );

    int32_t getUrlHash32    ();
    int32_t getHostHash32   ();
    int32_t getDomainHash32 ();

    // if url is xyz.com then get hash of www.xyz.com
    int32_t getHash32WithWWW ();

    int64_t getUrlHash64    ();
    int64_t getHostHash64   ();
    int64_t getDomainHash64 ();

    // the low 48 bits of getUrlHash64()
    int64_t getUrlHash48 () {
        return getUrlHash64() & 0x0000ffffffffffffLL;
    }

    bool hasMediaExtension ();

    // . store url w/o http://
    // . without trailing / if path is just "/"
    // . without "www." if in hostname and "rmWWW" is true
    // . returns length
    // . if "buf" is NULL just returns the shorthand-form length
    // . NOTE(review): the notes above mention a "buf" argument and a
    //   length return value that this signature does not have -- confirm
    //   against the definition
    char *getShorthandUrl ( bool rmWWW , int32_t *len );

    // count the path components (root url as 0 path components)
    int32_t getPathDepth ( bool countFilename ); // = false );

    // get path component #num. starts at 0.
    char *getPathComponent ( int32_t num , int32_t *clen );
    //char *getPathEnd ( int32_t num );

    // is our hostname "www" ?
    bool isHostWWW ();

    // true when the host and domain lengths differ
    bool hasSubdomain () { return m_hlen != m_dlen; }

    // is it xxx.com/* or www.xxx.com/* (CAUTION: www.xxx.yyy.com)
    bool isSimpleSubdomain ();

    // spam means dirty/porn, so this just forwards to isSpam()
    bool isDirty () { return isSpam(); }

    // is the url a porn/spam url?
    bool isSpam ();

    // this is private
    bool isSpam ( char *s , int32_t slen );

    // . detects crazy repetitive urls like this:
    //   http://www.pittsburghlive.com:8000/x/tribune-review/opinion/
    //   steigerwald/letters/send/archive/letters/send/archive/bish/
    //   archive/bish/letters/bish/archive/lettes/send/archive/letters/...
    // . The problem is they use a relative href link on the page when they
    //   should use an absolute and the microsoft web server will still
    //   give the content they meant to give!
    // . this is called by Msg14.cpp to not even spider such urls, and we
    //   also have some even better detection logic in Links.cpp which
    //   is probably more accurate than this function.
    bool isLinkLoop ();

    static bool  unitTests ();
    static char *getDisplayUrl ( char *url );

    // private:

    // the normalized url
    char     m_url[MAX_URL_LEN];
    int32_t  m_ulen;

    // points into "url" (http, ftp, mailto, ...)(all lowercase)
    char    *m_scheme;
    int32_t  m_slen;

    // points into "url" (a.com, www.yahoo.com, 1.2.3.4, ...)(allLowercase)
    char    *m_host;
    int32_t  m_hlen;

    // it's 0 if we don't have one
    int32_t  m_ip;

    // points into "url" (/ /~mwells/ /a/b/ ...) (always ends in /)
    char    *m_path;
    int32_t  m_plen;

    // points into "url" (a=hi+there, ...)
    char    *m_query;
    int32_t  m_qlen;

    // points into "url" (html, mpg, wav, doc, ...)
    char    *m_extension;
    int32_t  m_elen;

    // (a.html NULL index.html) (can be NULL)
    char    *m_filename;
    int32_t  m_flen;

    char    *m_domain;
    int32_t  m_dlen;

    char    *m_tld;
    int32_t  m_tldLen;

    // char *m_midDomain equals m_domain
    int32_t  m_mdlen;

    // (80, 8080, 8000, ...)
    int32_t  m_port;
    int32_t  m_defPort;
    int32_t  m_portLen;
    char    *m_portStr;

    // anchor
    char    *m_anchor;
    int32_t  m_anchorLen;

    // Base site url
    //char *m_site;
    //int32_t m_siteLen;
};
#endif