2020-01-18 11:38:21 +03:00
/*
 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
2021-06-01 22:18:08 +03:00
# include <AK/CharacterTypes.h>
2021-05-27 22:05:07 +03:00
# include <AK/Debug.h>
2020-05-26 14:52:44 +03:00
# include <AK/LexicalPath.h>
2019-08-10 18:27:56 +03:00
# include <AK/StringBuilder.h>
# include <AK/URL.h>
2021-05-27 22:05:07 +03:00
# include <AK/URLParser.h>
2021-05-25 14:50:03 +03:00
# include <AK/Utf8View.h>
2019-08-10 18:27:56 +03:00
namespace AK {
2021-05-27 22:05:07 +03:00
// FIXME: It could make sense to force users of URL to use URLParser::parse() explicitly instead of using a constructor.
2021-11-11 02:55:02 +03:00
URL : : URL ( StringView string )
2021-09-13 22:34:14 +03:00
: URL ( URLParser : : parse ( string ) )
2019-08-10 18:27:56 +03:00
{
2021-05-27 22:05:07 +03:00
if constexpr ( URL_PARSER_DEBUG ) {
if ( m_valid )
dbgln ( " URL constructor: Parsed URL to be '{}'. " , serialize ( ) ) ;
else
dbgln ( " URL constructor: Parsed URL to be invalid. " ) ;
}
2019-08-10 18:27:56 +03:00
}
2021-05-25 22:32:20 +03:00
// Returns the path of this URL as a single string.
// For a cannot-be-a-base URL the first path segment is returned unchanged; otherwise
// every segment is emitted with a leading '/'. No percent-encoding is applied here.
String URL::path() const
{
    if (cannot_be_a_base_url())
        return paths()[0];

    StringBuilder joined_path;
    for (auto& segment : m_paths) {
        joined_path.append('/');
        joined_path.append(segment);
    }
    return joined_path.to_string();
}
2021-06-01 11:58:27 +03:00
// Parses `string` with this URL passed to URLParser::parse() as the base URL.
// Returns a default-constructed URL if this URL is not valid.
URL URL::complete_url(String const& string) const
{
    if (is_valid())
        return URLParser::parse(string, this);
    return {};
}
2021-06-01 11:58:27 +03:00
void URL : : set_scheme ( String scheme )
2020-04-12 00:07:23 +03:00
{
2021-06-01 11:58:27 +03:00
m_scheme = move ( scheme ) ;
2020-04-12 00:07:23 +03:00
m_valid = compute_validity ( ) ;
}
2021-06-01 11:58:27 +03:00
void URL : : set_username ( String username )
2021-05-25 22:32:20 +03:00
{
2021-06-01 11:58:27 +03:00
m_username = move ( username ) ;
2021-05-25 22:32:20 +03:00
m_valid = compute_validity ( ) ;
}
2021-06-01 11:58:27 +03:00
void URL : : set_password ( String password )
2021-05-25 22:32:20 +03:00
{
2021-06-01 11:58:27 +03:00
m_password = move ( password ) ;
2021-05-25 22:32:20 +03:00
m_valid = compute_validity ( ) ;
}
2021-06-01 11:58:27 +03:00
void URL : : set_host ( String host )
2020-04-12 00:07:23 +03:00
{
2021-06-01 11:58:27 +03:00
m_host = move ( host ) ;
2020-04-12 00:07:23 +03:00
m_valid = compute_validity ( ) ;
}
2021-09-13 23:12:16 +03:00
void URL : : set_port ( Optional < u16 > port )
2020-11-04 09:20:20 +03:00
{
2021-05-25 22:32:20 +03:00
if ( port = = default_port_for_scheme ( m_scheme ) ) {
2021-09-13 23:12:16 +03:00
m_port = { } ;
2021-05-25 22:32:20 +03:00
return ;
}
2021-09-13 23:12:16 +03:00
m_port = move ( port ) ;
2020-11-04 09:20:20 +03:00
m_valid = compute_validity ( ) ;
}
2021-06-01 11:58:27 +03:00
void URL : : set_paths ( Vector < String > paths )
2021-05-25 22:32:20 +03:00
{
2021-06-01 11:58:27 +03:00
m_paths = move ( paths ) ;
2021-05-25 22:32:20 +03:00
m_valid = compute_validity ( ) ;
}
2021-06-01 11:58:27 +03:00
void URL : : set_query ( String query )
2020-04-12 00:07:23 +03:00
{
2021-06-01 11:58:27 +03:00
m_query = move ( query ) ;
2020-04-12 00:07:23 +03:00
}
2021-06-01 11:58:27 +03:00
void URL : : set_fragment ( String fragment )
2020-04-12 01:38:13 +03:00
{
2021-06-01 11:58:27 +03:00
m_fragment = move ( fragment ) ;
2020-04-12 01:38:13 +03:00
}
2021-05-29 21:46:49 +03:00
// FIXME: This is by no means complete.
// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
2020-04-12 00:07:23 +03:00
bool URL : : compute_validity ( ) const
{
2021-05-24 00:31:16 +03:00
if ( m_scheme . is_empty ( ) )
2020-04-12 00:07:23 +03:00
return false ;
2020-11-04 09:20:20 +03:00
2021-05-24 00:31:16 +03:00
if ( m_scheme = = " data " ) {
2020-11-04 09:20:20 +03:00
if ( m_data_mime_type . is_empty ( ) )
2020-04-19 11:36:56 +03:00
return false ;
2021-05-29 21:46:49 +03:00
if ( m_data_payload_is_base64 ) {
if ( m_data_payload . length ( ) % 4 ! = 0 )
return false ;
for ( auto character : m_data_payload ) {
if ( ! is_ascii_alphanumeric ( character ) | | character = = ' + ' | | character = = ' / ' | | character = = ' = ' )
return false ;
}
}
} else if ( m_cannot_be_a_base_url ) {
if ( m_paths . size ( ) ! = 1 )
return false ;
if ( m_paths [ 0 ] . is_empty ( ) )
return false ;
} else {
if ( m_scheme . is_one_of ( " about " , " mailto " ) )
return false ;
// NOTE: Maybe it is allowed to have a zero-segment path.
if ( m_paths . size ( ) = = 0 )
return false ;
2020-04-12 00:07:23 +03:00
}
2020-11-04 09:20:20 +03:00
2021-05-29 21:46:49 +03:00
// NOTE: A file URL's host should be the empty string for localhost, not null.
if ( m_scheme = = " file " & & m_host . is_null ( ) )
2020-11-04 09:20:20 +03:00
return false ;
2020-04-12 00:07:23 +03:00
return true ;
}
2021-11-11 02:55:02 +03:00
bool URL : : scheme_requires_port ( StringView scheme )
2020-11-04 09:20:20 +03:00
{
2021-05-24 00:31:16 +03:00
return ( default_port_for_scheme ( scheme ) ! = 0 ) ;
2020-11-04 09:20:20 +03:00
}
2021-11-11 02:55:02 +03:00
// Returns the default port for `scheme`, or 0 if the scheme is not listed here.
u16 URL::default_port_for_scheme(StringView scheme)
{
    // Schemes sharing a port are grouped together.
    if (scheme.is_one_of("http", "ws"))
        return 80;
    if (scheme.is_one_of("https", "wss"))
        return 443;
    if (scheme == "gemini")
        return 1965;
    if (scheme == "irc")
        return 6667;
    if (scheme == "ircs")
        return 6697;
    return 0;
}
2021-06-01 11:58:27 +03:00
// Builds a file: URL from an absolute filesystem path.
// Returns a default-constructed URL if `path` is not absolute.
URL URL::create_with_file_scheme(String const& path, String const& fragment, String const& hostname)
{
    LexicalPath absolute_path(path);
    if (!absolute_path.is_absolute())
        return {};

    URL file_url;
    file_url.set_scheme("file");

    // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string.
    //       This is because a file URL always needs a non-null hostname.
    bool const is_localhost = hostname.is_null() || hostname == "localhost";
    if (is_localhost)
        file_url.set_host(String::empty());
    else
        file_url.set_host(hostname);

    file_url.set_paths(absolute_path.parts());
    // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
    if (path.ends_with('/'))
        file_url.append_path("");

    file_url.set_fragment(fragment);
    return file_url;
}
2021-06-01 11:58:27 +03:00
// Interprets `url_or_path` as a URL first. If that does not yield a valid URL, the input
// is canonicalized as a filesystem path and turned into a file: URL instead.
URL URL::create_with_url_or_path(String const& url_or_path)
{
    URL url = url_or_path;
    if (!url.is_valid()) {
        String canonical_path = LexicalPath::canonicalized_path(url_or_path);
        return URL::create_with_file_scheme(canonical_path);
    }
    return url;
}
2021-05-25 23:05:01 +03:00
// https://url.spec.whatwg.org/#special-scheme
2021-11-11 02:55:02 +03:00
bool URL : : is_special_scheme ( StringView scheme )
2021-05-25 23:05:01 +03:00
{
return scheme . is_one_of ( " ftp " , " file " , " http " , " https " , " ws " , " wss " ) ;
}
2021-05-25 23:32:39 +03:00
// Serializes a data: URL as "<scheme>:<mime type>[;base64],<payload>".
// Must only be called on data: URLs with a non-null MIME type and payload (enforced via VERIFY).
String URL::serialize_data_url() const
{
    VERIFY(m_scheme == "data");
    VERIFY(!m_data_mime_type.is_null());
    VERIFY(!m_data_payload.is_null());

    StringBuilder output;
    output.append(m_scheme);
    output.append(':');
    output.append(m_data_mime_type);
    if (m_data_payload_is_base64)
        output.append(";base64");
    output.append(',');

    // NOTE: The specification does not say anything about encoding this, but we should encode at least control and non-ASCII
    //       characters (since this is also a valid representation of the same data URL).
    auto const encoded_payload = URL::percent_encode(m_data_payload, PercentEncodeSet::C0Control);
    output.append(encoded_payload);
    return output.to_string();
}
// https://url.spec.whatwg.org/#concept-url-serializer
// Serializes this URL to a string. The fragment is omitted unless `exclude_fragment` is
// ExcludeFragment::No. data: URLs are handled by serialize_data_url().
String URL::serialize(ExcludeFragment exclude_fragment) const
{
    if (m_scheme == "data")
        return serialize_data_url();

    StringBuilder output;
    output.append(m_scheme);
    output.append(':');

    // Authority: "//" [ username [ ":" password ] "@" ] host [ ":" port ]
    bool const has_host = !m_host.is_null();
    if (has_host) {
        output.append("//");
        if (includes_credentials()) {
            output.append(percent_encode(m_username, PercentEncodeSet::Userinfo));
            if (!m_password.is_empty()) {
                output.append(':');
                output.append(percent_encode(m_password, PercentEncodeSet::Userinfo));
            }
            output.append('@');
        }
        output.append(m_host);
        if (m_port.has_value())
            output.appendff(":{}", *m_port);
    }

    // Path
    if (cannot_be_a_base_url()) {
        output.append(percent_encode(m_paths[0], PercentEncodeSet::Path));
    } else {
        // Without a host, a path starting with an empty segment is prefixed with "/.".
        if (!has_host && m_paths.size() > 1 && m_paths[0].is_empty())
            output.append("/.");
        for (auto& segment : m_paths) {
            output.append('/');
            output.append(percent_encode(segment, PercentEncodeSet::Path));
        }
    }

    // Query
    if (!m_query.is_null()) {
        auto const query_encode_set = is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
        output.append('?');
        output.append(percent_encode(m_query, query_encode_set));
    }

    // Fragment
    bool const include_fragment = exclude_fragment == ExcludeFragment::No && !m_fragment.is_null();
    if (include_fragment) {
        output.append('#');
        output.append(percent_encode(m_fragment, PercentEncodeSet::Fragment));
    }

    return output.to_string();
}
// https://url.spec.whatwg.org/#url-rendering
// NOTE: This does e.g. not display credentials.
// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
//        resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
// Serializes this URL for presentation. The URL must be valid (enforced via VERIFY).
// The fragment, if any, is always included.
String URL::serialize_for_display() const
{
    VERIFY(m_valid);

    if (m_scheme == "data")
        return serialize_data_url();

    StringBuilder output;
    output.append(m_scheme);
    output.append(':');

    // Authority without credentials: "//" host [ ":" port ]
    bool const has_host = !m_host.is_null();
    if (has_host) {
        output.append("//");
        output.append(m_host);
        if (m_port.has_value())
            output.appendff(":{}", *m_port);
    }

    // Path
    if (cannot_be_a_base_url()) {
        output.append(percent_encode(m_paths[0], PercentEncodeSet::Path));
    } else {
        // Without a host, a path starting with an empty segment is prefixed with "/.".
        if (!has_host && m_paths.size() > 1 && m_paths[0].is_empty())
            output.append("/.");
        for (auto& segment : m_paths) {
            output.append('/');
            output.append(percent_encode(segment, PercentEncodeSet::Path));
        }
    }

    // Query
    if (!m_query.is_null()) {
        auto const query_encode_set = is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
        output.append('?');
        output.append(percent_encode(m_query, query_encode_set));
    }

    // Fragment
    if (!m_fragment.is_null()) {
        output.append('#');
        output.append(percent_encode(m_fragment, PercentEncodeSet::Fragment));
    }

    return output.to_string();
}
2021-09-13 22:18:14 +03:00
// https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin
// https://url.spec.whatwg.org/#concept-url-origin
// Returns the serialized origin of this URL: "<scheme>://<host>[:<port>]" for ftp, http, https,
// ws and wss URLs, the origin of the embedded URL for blob URLs, and "null" for everything else.
// The URL must be valid (enforced via VERIFY).
String URL::serialize_origin() const
{
    VERIFY(m_valid);

    if (m_scheme == "blob"sv) {
        // TODO: 1. If URL's blob URL entry is non-null, then return URL's blob URL entry's environment's origin.

        // 2. Let url be the result of parsing URL's path[0].
        VERIFY(!m_paths.is_empty());
        URL url = m_paths[0];

        // 3. Return a new opaque origin, if url is failure, and url's origin otherwise.
        if (!url.is_valid())
            return "null";
        return url.serialize_origin();
    } else if (!m_scheme.is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin."
        return "null";
    }

    StringBuilder builder;
    builder.append(m_scheme);
    builder.append("://"sv);
    builder.append(m_host);
    // NOTE: The port has to be written with appendff(), the formatting variant that substitutes "{}",
    //       exactly like serialize() and serialize_for_display() do.
    if (m_port.has_value())
        builder.appendff(":{}", *m_port);
    return builder.build();
}
2021-06-01 11:58:27 +03:00
bool URL : : equals ( URL const & other , ExcludeFragment exclude_fragments ) const
2021-05-27 22:38:16 +03:00
{
2021-06-01 12:14:30 +03:00
if ( this = = & other )
return true ;
2021-05-27 22:38:16 +03:00
if ( ! m_valid | | ! other . m_valid )
return false ;
return serialize ( exclude_fragments ) = = other . serialize ( exclude_fragments ) ;
}
2020-05-06 00:56:35 +03:00
// Returns the last path segment, or a default-constructed String if the URL is
// invalid or has no path segments.
String URL::basename() const
{
    if (m_valid && !m_paths.is_empty())
        return m_paths.last();
    return {};
}
2021-05-25 14:50:03 +03:00
void URL : : append_percent_encoded ( StringBuilder & builder , u32 code_point )
{
if ( code_point < = 0x7f )
builder . appendff ( " %{:02X} " , code_point ) ;
else if ( code_point < = 0x07ff )
builder . appendff ( " %{:02X}%{:02X} " , ( ( code_point > > 6 ) & 0x1f ) | 0xc0 , ( code_point & 0x3f ) | 0x80 ) ;
else if ( code_point < = 0xffff )
builder . appendff ( " %{:02X}%{:02X}%{:02X} " , ( ( code_point > > 12 ) & 0x0f ) | 0xe0 , ( ( code_point > > 6 ) & 0x3f ) | 0x80 , ( code_point & 0x3f ) | 0x80 ) ;
else if ( code_point < = 0x10ffff )
builder . appendff ( " %{:02X}%{:02X}%{:02X}%{:02X} " , ( ( code_point > > 18 ) & 0x07 ) | 0xf0 , ( ( code_point > > 12 ) & 0x3f ) | 0x80 , ( ( code_point > > 6 ) & 0x3f ) | 0x80 , ( code_point & 0x3f ) | 0x80 ) ;
else
VERIFY_NOT_REACHED ( ) ;
}
// https://url.spec.whatwg.org/#c0-control-percent-encode-set
constexpr bool code_point_is_in_percent_encode_set ( u32 code_point , URL : : PercentEncodeSet set )
{
switch ( set ) {
case URL : : PercentEncodeSet : : C0Control :
return code_point < 0x20 | | code_point > 0x7E ;
case URL : : PercentEncodeSet : : Fragment :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : C0Control ) | | " \" <>` " sv . contains ( code_point ) ;
case URL : : PercentEncodeSet : : Query :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : C0Control ) | | " \" #<> " sv . contains ( code_point ) ;
case URL : : PercentEncodeSet : : SpecialQuery :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Query ) | | code_point = = ' \' ' ;
case URL : : PercentEncodeSet : : Path :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Query ) | | " ?` { } " sv.contains(code_point);
case URL : : PercentEncodeSet : : Userinfo :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Path ) | | " /: ; = @ [ \ \ ] ^ | " sv.contains(code_point);
case URL : : PercentEncodeSet : : Component :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Userinfo ) | | " $%&+, " sv . contains ( code_point ) ;
case URL : : PercentEncodeSet : : ApplicationXWWWFormUrlencoded :
return code_point > = 0x7E | | ! ( is_ascii_alphanumeric ( code_point ) | | " !'()~ " sv . contains ( code_point ) ) ;
case URL : : PercentEncodeSet : : EncodeURI :
// NOTE: This is the same percent encode set that JS encodeURI() uses.
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
return code_point > = 0x7E | | ( ! is_ascii_alphanumeric ( code_point ) & & ! " ;,/?:@&=+$-_.!~*'()# " sv . contains ( code_point ) ) ;
default :
VERIFY_NOT_REACHED ( ) ;
}
}
void URL : : append_percent_encoded_if_necessary ( StringBuilder & builder , u32 code_point , URL : : PercentEncodeSet set )
{
if ( code_point_is_in_percent_encode_set ( code_point , set ) )
append_percent_encoded ( builder , code_point ) ;
else
builder . append_code_point ( code_point ) ;
}
2021-11-11 02:55:02 +03:00
// Returns a copy of `input` in which every code point belonging to `set` is percent-encoded.
// The input is iterated code point by code point through Utf8View.
String URL::percent_encode(StringView input, URL::PercentEncodeSet set)
{
    StringBuilder encoded;
    for (auto code_point : Utf8View(input))
        append_percent_encoded_if_necessary(encoded, code_point, set);
    return encoded.to_string();
}
2021-11-11 02:55:02 +03:00
// Returns a copy of `input` in which every "%XX" sequence (XX being two ASCII hex digits)
// is replaced by the byte it encodes. A '%' that is not followed by two hex digits is kept as is.
String URL::percent_decode(StringView input)
{
    // Nothing to decode if there is no '%' at all.
    if (!input.contains('%'))
        return input;

    StringBuilder decoded;
    Utf8View code_points(input);
    for (auto it = code_points.begin(); !it.done(); ++it) {
        bool const starts_escape_sequence = *it == '%'
            && is_ascii_hex_digit(it.peek(1).value_or(0))
            && is_ascii_hex_digit(it.peek(2).value_or(0));
        if (!starts_escape_sequence) {
            decoded.append_code_point(*it);
            continue;
        }

        // Consume the two hex digits after the '%': high nibble first, then low nibble.
        ++it;
        u8 byte = parse_ascii_hex_digit(*it) << 4;
        ++it;
        byte += parse_ascii_hex_digit(*it);
        decoded.append(byte);
    }
    return decoded.to_string();
}
2019-08-10 18:27:56 +03:00
}