// Matt Wells, copyright Mar 2001

// . a class for parsing urls
// . used by many other classes

#ifndef  _URL_H_
#define  _URL_H_

#define MAX_URL_LEN 1024

// where should i put this #define? for now i'll keep it here
#define MAX_COLL_LEN  64

#include "ip.h"      // atoip ( s,len)

// . free-standing url helpers that operate directly on a raw "char *" url
// . only the prototypes live in this header; the definitions are elsewhere
// . NOTE(review): the "long *xxxLen" arguments look like out-parameters that
//   receive the length of the returned substring -- confirm in the definitions
// . "hasHttp" defaults to true and "port" defaults to NULL when omitted
char *getPathFast  ( char *url );
char *getFilenameFast ( char *url , long *filenameLen ) ;
char *getTLDFast   ( char *url , long *tldLen  , bool hasHttp = true ) ;
char *getDomFast   ( char *url , long *domLen  , bool hasHttp = true ) ;
bool  hasSubdomain ( char *url );
char *getHostFast  ( char *url , long *hostLen , long *port = NULL ) ;
bool  isPermalinky ( char *url );

bool isHijackerFormat ( char *url );

// NOTE(review): presumably the raw-string counterpart of Url::isPingServer()
// -- confirm in the definition
bool  isPingServer ( char *s ) ;

// . returns the host of a normalized url pointed to by "s"
// . i.e. "s" must start with the protocol (i.e. http:// or https:// etc.)
// . used by Links.cpp for fast parsing and SiteGetter.cpp too
char *getHost ( char *s , long *hostLen ) ;

// . get the path end of a normalized url
// . used by SiteGetter.cpp
// . if num==0 just use "www.xyz.com" as site (the hostname)
// . if num==1 just use "www.xyz.com/foo/" as site
char *getPathEnd ( char *s , long num );

// . unlike getTLDFast()/getDomFast(), "hasHttp" has no default value here
long getPathDepth ( char *s , bool hasHttp );

class Url {

public:

	// . print() and reset() are defined outside this header
	void print  ();
	void reset  ();

	// . copy-set from another Url
	void set ( Url *url , bool addWWW );

	// . set from a NUL-terminated string, length is taken with strlen()
	// . a NULL "s" deliberately faults by writing through a NULL pointer
	void set ( char *s ) {
		if ( s == NULL ) { char *xx=NULL;*xx=0; }
		set ( s , strlen ( s ) );
	}

	// . same as above, but also hands "baseUrl" to the long form of set()
	// . a NULL "s" deliberately faults here as well
	void set ( Url *baseUrl , char *s ) {
		if ( s == NULL ) { char *xx=NULL;*xx=0; }
		set ( baseUrl , s , strlen ( s ) );
	}

	// . the long forms of set(), defined outside this header
	// . "s" must be an ENCODED url
	void set ( char *s ,
		   long  len ,
		   bool  addWWW          = false ,
		   bool  stripSessionIds = false ,
		   bool  stripPound      = false ,
		   bool  stripCommonFile = false ,
		   long  titleRecVersion = 0x7fffffff );
	void set ( Url  *baseUrl ,
		   char *s ,
		   long  len ,
		   bool  addWWW          = false ,
		   bool  stripSessionIds = false ,
		   bool  stripPound      = false ,
		   bool  stripCommonFile = false ,
		   long  titleRecVersion = 0x7fffffff );

	// . overwrite the stored ip
	void setIp ( long ip ) { m_ip = ip; }

	char isSessionId ( char *hh , long titleRecVersion ) ;

	// . compare another url to us
	// . the lengths must agree before the strings are even compared
	bool equals ( Url *u ) {
		return m_ulen == u->m_ulen && strcmp ( m_url , u->m_url ) == 0;
	}

	// . is the url's hostname actually an ip in disguise ("a.b.c.d")
	bool isIp ();

	// . do we have an ip? an ip of 0 means none
	bool hasIp  () { return m_ip != 0; }
	bool isRoot ();
	// . a super root url is a root url where the hostname is NULL or "www"
	bool isSuperRoot ();
	// . true when a query string pointer is set
	bool isCgi  () { return m_query != NULL; }
	// . html, htm, cgi, asp, shtml, ...
	bool isExtensionIndexable ();

	// . returns true if the extension is in the list of badExtensions,
	//   i.e. extensions not to be parsed
	bool isBadExtension ( long int );
	// . a url length of 0 means nothing has been set
	bool isSet () { return m_ulen != 0; }

	// does it end in .xml, .rdb or .rss, etc. kinda thing
	//bool isRSSFormat ( ) ;

	// . is it http://rpc.weblogs.com/shortChanges.xml, etc.?
	bool isPingServer ( ) ;

	void setPort ( unsigned short port ) { m_port = port; }

	long getSubUrlLen  ( long i );
	long getSubPathLen ( long i );

	long getPort     () { return m_port; }
	long getIp       () { return m_ip; }
	long getIpDomain () { return ipdom ( m_ip ); }

	// . string accessors
	// . apart from getIpString() these hand back m_url itself or one
	//   of the stored pointers
	char *getUrl       () { return m_url; }
	char *getUrlEnd    () { return m_url + m_ulen; }
	char *getScheme    () { return m_scheme; }
	char *getHost      () { return m_host; }
	char *getDomain    () { return m_domain; }
	char *getTLD       () { return m_tld; }
	// . the mid domain is the domain w/o the tld, so it starts at m_domain
	char *getMidDomain () { return m_domain; }
	char *getPath      () { return m_path; }
	char *getFilename  () { return m_filename; }
	char *getExtension () { return m_extension; }
	char *getQuery     () { return m_query; }
	char *getIpString  () { return iptoa ( m_ip ); }
	char *getAnchor    () { return m_anchor; }
	//char *getSite         () {return m_site;};
	char *getPortStr   () { return m_portStr; }

	// . length accessors for the pieces above
	long  getUrlLen       () { return m_ulen; }
	long  getSchemeLen    () { return m_slen; }
	long  getHostLen      () { return m_hlen; }
	long  getDomainLen    () { return m_dlen; }
	long  getPathLen      () { return m_plen; }
	// . points just past the last char of the path
	char *getPathEnd      () { return m_path + m_plen; }
	long  getFilenameLen  () { return m_flen; }
	long  getExtensionLen () { return m_elen; }
	long  getQueryLen     () { return m_qlen; }
	long  getTLDLen       () { return m_tldLen; }
	long  getMidDomainLen () { return m_mdlen; }
	long  getPortLen      () { return m_portLen; }
	long  getAnchorLen    () { return m_anchorLen; }
	long  getDefaultPort  () { return m_defPort; }
	//long  getSiteLen         () {return m_siteLen;};

	// . path length, plus 1 and the query length when there is a query
	long getPathLenWithCgi () {
		return m_query ? m_plen + 1 + m_qlen : m_plen;
	}

	// . is the scheme exactly "http"?
	bool isHttp () {
		return m_ulen >= 4 &&
		       m_slen == 4 &&
		       m_scheme[0] == 'h' &&
		       m_scheme[1] == 't' &&
		       m_scheme[2] == 't' &&
		       m_scheme[3] == 'p';
	}

	// . is the scheme exactly "https"?
	bool isHttps () {
		return m_ulen >= 5 &&
		       m_slen == 5 &&
		       m_scheme[0] == 'h' &&
		       m_scheme[1] == 't' &&
		       m_scheme[2] == 't' &&
		       m_scheme[3] == 'p' &&
		       m_scheme[4] == 's';
	}


	// . are we a site root?
	// . i.e. does this url == hometown.com/users/fred/ , etc.
	// . does not take into account whether we have a subdomain or domain
	//bool isSiteRoot(char *coll,
	//		class TagRec *tagRec = NULL ,
	//		char **retSite=NULL,
	//		long *retSiteLen=NULL);

	// . returns the site and sets *siteLen
	// . returns NULL and sets g_errno on error
	// . returns NULL without g_errno set if our domain is invalid
	// . sets "*isDefault" to true if we just returned the default site,
	//   otherwise false
	//char *getSite ( long *siteLen , char *coll ,
	//		bool defaultToHostname ,
	//		class TagRec *tagRec = NULL ,
	//		bool *isDefault = NULL );

	// used by buzz i guess
	//long  getSiteHash32   ( char *coll );

	// . 32-bit hashes
	long getUrlHash32    ( ) ;
	long getHostHash32   ( ) ;
	long getDomainHash32 ( ) ;

	// . 64-bit hashes
	long long getUrlHash64    ( ) ;
	long long getHostHash64   ( ) ;
	long long getDomainHash64 ( ) ;

	// . the low 48 bits of the 64-bit url hash
	long long getUrlHash48 ( ) {
		return getUrlHash64 () & 0x0000ffffffffffffLL;
	}

	// . the shorthand form is the url w/o http://
	// . without trailing / if path is just "/"
	// . without "www." if in hostname and "rmWWW" is true
	// . NOTE(review): an older comment here spoke of returning the length
	//   and of a "buf" argument; the current signature returns a char *
	//   and has a "len" pointer instead -- confirm against the definition
	char *getShorthandUrl ( bool rmWWW , long *len );

	// . count the path components (root url as 0 path components)
	long getPathDepth ( bool countFilename = false );

	// . get path component #num, numbering starts at 0
	char *getPathComponent ( long num , long *clen );
	//char *getPathEnd       ( long num );

	// . is our hostname "www" ?
	bool isHostWWW ( ) ;

	// . true when the domain length and the host length differ
	bool hasSubdomain () { return m_dlen != m_hlen; }

	// . is it xxx.com/* or www.xxx.com/* (CAUTION: www.xxx.yyy.com)
	bool isSimpleSubdomain ();

	// . "spam" here means dirty/porn, so this just forwards to isSpam()
	bool isDirty () { return isSpam (); }

	// . is the url a porn/spam url?
	bool isSpam ();

	// . originally commented as "this is private", but it is declared in
	//   the public section like everything else
	bool isSpam ( char *s , long slen ) ;

	// . detects crazy repetitive urls like this:
	//   http://www.pittsburghlive.com:8000/x/tribune-review/opinion/
	//   steigerwald/letters/send/archive/letters/send/archive/bish/
	//   archive/bish/letters/bish/archive/lettes/send/archive/letters/...
	// . the problem is they use a relative href link on the page when
	//   they should use an absolute one, and the microsoft web server
	//   will still give the content they meant to give!
	// . this is called by Msg14.cpp to not even spider such urls, and we
	//   also have some even better detection logic in Links.cpp which
	//   is probably more accurate than this function.
	bool isLinkLoop ();

	// private:
	// . the access specifier above is commented out, so every member
	//   below is public

	// . the normalized url and its length
	char  m_url [ MAX_URL_LEN ];
	long  m_ulen;

	// . scheme, points into "url" (http, ftp, mailto, ...), all lowercase
	char *m_scheme;
	long  m_slen;

	// . host, points into "url" (a.com, www.yahoo.com, 1.2.3.4, ...),
	//   all lowercase
	char *m_host;
	long  m_hlen;

	// . it's 0 if we don't have one
	long  m_ip;

	// . path, points into "url" (/  /~mwells/  /a/b/ ...),
	//   always ends in /
	char *m_path;
	long  m_plen;

	// . query, points into "url" (a=hi+there, ...)
	char *m_query;
	long  m_qlen;

	// . extension, points into "url" (html, mpg, wav, doc, ...)
	char *m_extension;
	long  m_elen;

	// . filename (a.html NULL index.html), can be NULL
	char *m_filename;
	long  m_flen;

	char *m_domain;
	long  m_dlen;

	char *m_tld;
	long  m_tldLen;

	// . there is no m_midDomain pointer since it equals m_domain,
	//   only its length is kept
	long  m_mdlen;

	// . port (80, 8080, 8000, ...)
	long  m_port;
	long  m_defPort;
	long  m_portLen;
	char *m_portStr;

	// . anchor
	char *m_anchor;
	long  m_anchorLen;

	// Base site url
	//char *m_site;
	//long m_siteLen;
};

#endif