2020-01-18 11:38:21 +03:00
/*
* Copyright ( c ) 2018 - 2020 , Andreas Kling < kling @ serenityos . org >
2021-05-24 00:31:16 +03:00
* Copyright ( c ) 2021 , Max Wipfli < mail @ maxwipfli . ch >
2020-01-18 11:38:21 +03:00
*
2021-04-22 11:24:48 +03:00
* SPDX - License - Identifier : BSD - 2 - Clause
2020-01-18 11:38:21 +03:00
*/
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took its own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URL.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 20:11:58 +03:00
# include <AK/Base64.h>
2021-06-01 22:18:08 +03:00
# include <AK/CharacterTypes.h>
2021-05-27 22:05:07 +03:00
# include <AK/Debug.h>
2020-05-26 14:52:44 +03:00
# include <AK/LexicalPath.h>
2019-08-10 18:27:56 +03:00
# include <AK/StringBuilder.h>
# include <AK/URL.h>
2021-05-27 22:05:07 +03:00
# include <AK/URLParser.h>
2021-05-25 14:50:03 +03:00
# include <AK/Utf8View.h>
2019-08-10 18:27:56 +03:00
namespace AK {
2023-07-15 05:29:20 +03:00
// FIXME: It could make sense to force users of URL to use URLParser::basic_parse() explicitly instead of using a constructor.
2021-11-11 02:55:02 +03:00
URL : : URL ( StringView string )
2023-07-15 05:29:20 +03:00
: URL ( URLParser : : basic_parse ( string ) )
2019-08-10 18:27:56 +03:00
{
2021-05-27 22:05:07 +03:00
if constexpr ( URL_PARSER_DEBUG ) {
if ( m_valid )
dbgln ( " URL constructor: Parsed URL to be '{}'. " , serialize ( ) ) ;
else
dbgln ( " URL constructor: Parsed URL to be invalid. " ) ;
}
2019-08-10 18:27:56 +03:00
}
2023-02-13 20:42:27 +03:00
// Parses `relative_url` with this URL passed as the second argument to URLParser::basic_parse().
// Returns a default-constructed URL when this URL is not valid.
URL URL::complete_url(StringView relative_url) const
{
    if (is_valid())
        return URLParser::basic_parse(relative_url, *this);

    return {};
}
2023-04-14 01:06:58 +03:00
// Returns the stored username, percent-decoded when requested.
DeprecatedString URL::username(ApplyPercentDecoding apply_percent_decoding) const
{
    if (apply_percent_decoding == ApplyPercentDecoding::Yes)
        return percent_decode(m_username);
    return m_username;
}
// Returns the stored password, percent-decoded when requested.
DeprecatedString URL::password(ApplyPercentDecoding apply_percent_decoding) const
{
    if (apply_percent_decoding == ApplyPercentDecoding::Yes)
        return percent_decode(m_password);
    return m_password;
}
2023-04-14 01:29:51 +03:00
// Returns the path segment at `index`, percent-decoded when requested.
// `index` must be smaller than path_segment_count(); this is enforced with VERIFY.
DeprecatedString URL::path_segment_at_index(size_t index, ApplyPercentDecoding apply_percent_decoding) const
{
    VERIFY(index < path_segment_count());

    auto const& requested_segment = m_paths[index];
    if (apply_percent_decoding == ApplyPercentDecoding::Yes)
        return percent_decode(requested_segment);
    return requested_segment;
}
2023-04-14 01:06:58 +03:00
// Returns the last path segment, percent-decoded when requested.
// Yields a default-constructed string for an invalid URL or one without path segments.
DeprecatedString URL::basename(ApplyPercentDecoding apply_percent_decoding) const
{
    if (!m_valid || m_paths.is_empty())
        return {};

    auto& final_segment = m_paths.last();
    if (apply_percent_decoding == ApplyPercentDecoding::Yes)
        return percent_decode(final_segment);
    return final_segment;
}
// Returns the stored query, percent-decoded when requested.
DeprecatedString URL::query(ApplyPercentDecoding apply_percent_decoding) const
{
    if (apply_percent_decoding == ApplyPercentDecoding::Yes)
        return percent_decode(m_query);
    return m_query;
}
// Returns the stored fragment, percent-decoded when requested.
DeprecatedString URL::fragment(ApplyPercentDecoding apply_percent_decoding) const
{
    if (apply_percent_decoding == ApplyPercentDecoding::Yes)
        return percent_decode(m_fragment);
    return m_fragment;
}
2023-04-09 16:21:00 +03:00
// NOTE: This only exists for compatibility with the existing URL tests which check for both .is_null() and .is_empty().
// Null and empty inputs are handed back untouched; anything else goes through URL::percent_encode().
static DeprecatedString deprecated_string_percent_encode(DeprecatedString const& input, URL::PercentEncodeSet set = URL::PercentEncodeSet::Userinfo, URL::SpaceAsPlus space_as_plus = URL::SpaceAsPlus::No)
{
    bool const has_content = !input.is_null() && !input.is_empty();
    if (!has_content)
        return input;

    return URL::percent_encode(input.view(), set, space_as_plus);
}
2022-12-04 21:02:33 +03:00
void URL : : set_scheme ( DeprecatedString scheme )
2020-04-12 00:07:23 +03:00
{
2021-06-01 11:58:27 +03:00
m_scheme = move ( scheme ) ;
2020-04-12 00:07:23 +03:00
m_valid = compute_validity ( ) ;
}
2023-04-09 16:21:00 +03:00
void URL : : set_username ( DeprecatedString username , ApplyPercentEncoding apply_percent_encoding )
2021-05-25 22:32:20 +03:00
{
2023-04-09 16:21:00 +03:00
if ( apply_percent_encoding = = ApplyPercentEncoding : : Yes )
username = deprecated_string_percent_encode ( username , PercentEncodeSet : : Userinfo ) ;
2021-06-01 11:58:27 +03:00
m_username = move ( username ) ;
2021-05-25 22:32:20 +03:00
m_valid = compute_validity ( ) ;
}
2023-04-09 16:21:00 +03:00
void URL : : set_password ( DeprecatedString password , ApplyPercentEncoding apply_percent_encoding )
2021-05-25 22:32:20 +03:00
{
2023-04-09 16:21:00 +03:00
if ( apply_percent_encoding = = ApplyPercentEncoding : : Yes )
password = deprecated_string_percent_encode ( password , PercentEncodeSet : : Userinfo ) ;
2021-06-01 11:58:27 +03:00
m_password = move ( password ) ;
2021-05-25 22:32:20 +03:00
m_valid = compute_validity ( ) ;
}
2023-07-27 12:40:41 +03:00
void URL : : set_host ( Host host )
2020-04-12 00:07:23 +03:00
{
2021-06-01 11:58:27 +03:00
m_host = move ( host ) ;
2020-04-12 00:07:23 +03:00
m_valid = compute_validity ( ) ;
}
2023-07-27 12:40:41 +03:00
// https://url.spec.whatwg.org/#concept-host-serializer
// Forwards the stored host to URLParser::serialize_host() and returns its result (or error).
ErrorOr<String> URL::serialized_host() const
{
    return URLParser::serialize_host(m_host);
}
2021-09-13 23:12:16 +03:00
void URL : : set_port ( Optional < u16 > port )
2020-11-04 09:20:20 +03:00
{
2021-05-25 22:32:20 +03:00
if ( port = = default_port_for_scheme ( m_scheme ) ) {
2021-09-13 23:12:16 +03:00
m_port = { } ;
2021-05-25 22:32:20 +03:00
return ;
}
2021-09-13 23:12:16 +03:00
m_port = move ( port ) ;
2020-11-04 09:20:20 +03:00
m_valid = compute_validity ( ) ;
}
2023-04-09 16:21:00 +03:00
void URL : : set_paths ( Vector < DeprecatedString > paths , ApplyPercentEncoding apply_percent_encoding )
2021-05-25 22:32:20 +03:00
{
2023-04-09 16:21:00 +03:00
if ( apply_percent_encoding = = ApplyPercentEncoding : : Yes ) {
Vector < DeprecatedString > encoded_paths ;
encoded_paths . ensure_capacity ( paths . size ( ) ) ;
for ( auto & segment : paths )
encoded_paths . unchecked_append ( deprecated_string_percent_encode ( segment , PercentEncodeSet : : Path ) ) ;
m_paths = move ( encoded_paths ) ;
} else {
m_paths = move ( paths ) ;
}
2021-05-25 22:32:20 +03:00
m_valid = compute_validity ( ) ;
}
2023-04-09 16:21:00 +03:00
void URL : : append_path ( DeprecatedString path , ApplyPercentEncoding apply_percent_encoding )
{
if ( apply_percent_encoding = = ApplyPercentEncoding : : Yes )
path = deprecated_string_percent_encode ( path , PercentEncodeSet : : Path ) ;
m_paths . append ( path ) ;
}
void URL : : set_query ( DeprecatedString query , ApplyPercentEncoding apply_percent_encoding )
2020-04-12 00:07:23 +03:00
{
2023-04-09 16:21:00 +03:00
if ( apply_percent_encoding = = ApplyPercentEncoding : : Yes )
query = deprecated_string_percent_encode ( query , is_special ( ) ? PercentEncodeSet : : SpecialQuery : PercentEncodeSet : : Query ) ;
2021-06-01 11:58:27 +03:00
m_query = move ( query ) ;
2020-04-12 00:07:23 +03:00
}
2023-04-09 16:21:00 +03:00
void URL : : set_fragment ( DeprecatedString fragment , ApplyPercentEncoding apply_percent_encoding )
2020-04-12 01:38:13 +03:00
{
2023-04-09 16:21:00 +03:00
if ( apply_percent_encoding = = ApplyPercentEncoding : : Yes )
fragment = deprecated_string_percent_encode ( fragment , PercentEncodeSet : : Fragment ) ;
2021-06-01 11:58:27 +03:00
m_fragment = move ( fragment ) ;
2020-04-12 01:38:13 +03:00
}
2023-07-26 11:54:36 +03:00
// https://url.spec.whatwg.org/#cannot-have-a-username-password-port
bool URL : : cannot_have_a_username_or_password_or_port ( ) const
{
// A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
// FIXME: The spec does not mention anything to do with 'cannot be a base URL'.
2023-07-27 12:40:41 +03:00
return m_host . has < Empty > ( ) | | m_host = = String { } | | m_cannot_be_a_base_url | | m_scheme = = " file " sv ;
2023-07-26 11:54:36 +03:00
}
2021-05-29 21:46:49 +03:00
// FIXME: This is by no means complete.
// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
2020-04-12 00:07:23 +03:00
bool URL : : compute_validity ( ) const
{
2021-05-24 00:31:16 +03:00
if ( m_scheme . is_empty ( ) )
2020-04-12 00:07:23 +03:00
return false ;
2020-11-04 09:20:20 +03:00
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 20:11:58 +03:00
if ( m_cannot_be_a_base_url ) {
2021-05-29 21:46:49 +03:00
if ( m_paths . size ( ) ! = 1 )
return false ;
if ( m_paths [ 0 ] . is_empty ( ) )
return false ;
} else {
if ( m_scheme . is_one_of ( " about " , " mailto " ) )
return false ;
// NOTE: Maybe it is allowed to have a zero-segment path.
if ( m_paths . size ( ) = = 0 )
return false ;
2020-04-12 00:07:23 +03:00
}
2020-11-04 09:20:20 +03:00
2021-05-29 21:46:49 +03:00
// NOTE: A file URL's host should be the empty string for localhost, not null.
2023-07-27 12:40:41 +03:00
if ( m_scheme = = " file " & & m_host . has < Empty > ( ) )
2020-11-04 09:20:20 +03:00
return false ;
2020-04-12 00:07:23 +03:00
return true ;
}
2023-07-31 11:23:53 +03:00
// https://url.spec.whatwg.org/#default-port
2021-11-11 02:55:02 +03:00
u16 URL : : default_port_for_scheme ( StringView scheme )
2020-11-04 09:20:20 +03:00
{
2023-07-31 11:23:53 +03:00
// Spec defined mappings with port:
if ( scheme = = " ftp " )
return 21 ;
2021-05-24 00:31:16 +03:00
if ( scheme = = " http " )
2020-11-04 09:20:20 +03:00
return 80 ;
2021-05-24 00:31:16 +03:00
if ( scheme = = " https " )
2020-11-04 09:20:20 +03:00
return 443 ;
2023-07-31 11:23:53 +03:00
if ( scheme = = " ws " )
return 80 ;
if ( scheme = = " wss " )
return 443 ;
// NOTE: not in spec, but we support these too
2021-05-24 00:31:16 +03:00
if ( scheme = = " gemini " )
2020-11-04 09:20:20 +03:00
return 1965 ;
2021-05-24 00:31:16 +03:00
if ( scheme = = " irc " )
2020-11-04 09:20:20 +03:00
return 6667 ;
2021-05-24 00:31:16 +03:00
if ( scheme = = " ircs " )
2020-11-04 09:20:20 +03:00
return 6697 ;
2023-07-31 11:23:53 +03:00
2020-11-04 09:20:20 +03:00
return 0 ;
}
2022-12-04 21:02:33 +03:00
// Builds a "file" URL from an absolute filesystem path, an optional fragment and hostname.
// Returns a default-constructed URL when `path` is not absolute.
URL URL::create_with_file_scheme(DeprecatedString const& path, DeprecatedString const& fragment, DeprecatedString const& hostname)
{
    LexicalPath lexical_path(path);
    if (!lexical_path.is_absolute())
        return {};

    URL url;
    url.set_scheme("file");
    // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string.
    //       This is because a file URL always needs a non-null hostname.
    url.set_host(hostname.is_null() || hostname == "localhost" ? String {} : String::from_deprecated_string(hostname).release_value_but_fixme_should_propagate_errors());
    url.set_paths(lexical_path.parts());
    // A trailing slash in the input is re-added explicitly via append_slash().
    if (path.ends_with('/'))
        url.append_slash();
    url.set_fragment(fragment);
    return url;
}
2022-12-04 21:02:33 +03:00
// Builds a "help" URL from a path, an optional fragment and hostname.
// Unlike create_with_file_scheme(), the path is not required to be absolute.
URL URL::create_with_help_scheme(DeprecatedString const& path, DeprecatedString const& fragment, DeprecatedString const& hostname)
{
    LexicalPath lexical_path(path);

    URL url;
    url.set_scheme("help");
    // NOTE: As in create_with_file_scheme(), a null or "localhost" hostname is stored as the empty string
    //       so that the resulting URL has a non-null host.
    url.set_host(hostname.is_null() || hostname == "localhost" ? String {} : String::from_deprecated_string(hostname).release_value_but_fixme_should_propagate_errors());
    url.set_paths(lexical_path.parts());
    // A trailing slash in the input is re-added explicitly via append_slash().
    if (path.ends_with('/'))
        url.append_slash();
    url.set_fragment(fragment);
    return url;
}
2022-12-04 21:02:33 +03:00
// Interprets the argument as a URL first. If that does not yield a valid URL, the argument is
// treated as a filesystem path, canonicalized, and turned into a file URL instead.
URL URL::create_with_url_or_path(DeprecatedString const& url_or_path)
{
    URL parsed_url = url_or_path;
    if (!parsed_url.is_valid()) {
        DeprecatedString canonical_path = LexicalPath::canonicalized_path(url_or_path);
        return URL::create_with_file_scheme(canonical_path);
    }

    return parsed_url;
}
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took its own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URL.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 20:11:58 +03:00
// Builds a "data" URL whose single path entry is "<mime_type>[;base64],<payload>".
// The payload is inserted as given; it is neither base64- nor percent-encoded here.
URL URL::create_with_data(StringView mime_type, StringView payload, bool is_base64)
{
    URL url;
    url.set_cannot_be_a_base_url(true);
    url.set_scheme("data"sv);

    StringBuilder builder;
    builder.append(mime_type);
    if (is_base64)
        builder.append(";base64"sv);
    builder.append(',');
    builder.append(payload);
    url.set_paths({ builder.to_deprecated_string() });
    return url;
}
2021-05-25 23:05:01 +03:00
// https://url.spec.whatwg.org/#special-scheme
2021-11-11 02:55:02 +03:00
bool URL : : is_special_scheme ( StringView scheme )
2021-05-25 23:05:01 +03:00
{
return scheme . is_one_of ( " ftp " , " file " , " http " , " https " , " ws " , " wss " ) ;
}
2023-04-14 22:12:03 +03:00
// Serializes the path portion of the URL.
// For a cannot-be-a-base URL the single path entry is returned as stored (no decoding is applied).
// Otherwise every segment is emitted with a leading '/', percent-decoded when requested.
DeprecatedString URL::serialize_path(ApplyPercentDecoding apply_percent_decoding) const
{
    if (cannot_be_a_base_url())
        return m_paths[0];

    StringBuilder builder;
    for (auto& path : m_paths) {
        builder.append('/');
        builder.append(apply_percent_decoding == ApplyPercentDecoding::Yes ? percent_decode(path) : path);
    }
    return builder.to_deprecated_string();
}
2021-05-25 23:32:39 +03:00
// https://url.spec.whatwg.org/#concept-url-serializer
2022-12-04 21:02:33 +03:00
DeprecatedString URL : : serialize ( ExcludeFragment exclude_fragment ) const
2021-05-25 23:32:39 +03:00
{
2023-07-25 11:04:09 +03:00
// 1. Let output be url’ s scheme and U+003A (:) concatenated.
StringBuilder output ;
output . append ( m_scheme ) ;
output . append ( ' : ' ) ;
// 2. If url’ s host is non-null:
2023-07-27 12:40:41 +03:00
if ( ! m_host . has < Empty > ( ) ) {
2023-07-25 11:04:09 +03:00
// 1. Append "//" to output.
output . append ( " // " sv ) ;
2021-05-25 23:32:39 +03:00
2023-07-25 11:04:09 +03:00
// 2. If url includes credentials, then:
2021-05-25 23:32:39 +03:00
if ( includes_credentials ( ) ) {
2023-07-25 11:04:09 +03:00
// 1. Append url’ s username to output.
output . append ( m_username ) ;
// 2. If url’ s password is not the empty string, then append U+003A (:), followed by url’ s password, to output.
2021-05-25 23:32:39 +03:00
if ( ! m_password . is_empty ( ) ) {
2023-07-25 11:04:09 +03:00
output . append ( ' : ' ) ;
output . append ( m_password ) ;
2021-05-25 23:32:39 +03:00
}
2023-07-25 11:04:09 +03:00
// 3. Append U+0040 (@) to output.
output . append ( ' @ ' ) ;
2021-05-25 23:32:39 +03:00
}
2023-07-25 11:04:09 +03:00
// 3. Append url’ s host, serialized, to output.
2023-07-27 12:40:41 +03:00
output . append ( serialized_host ( ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2023-07-25 11:04:09 +03:00
// 4. If url’ s port is non-null, append U+003A (:) followed by url’ s port, serialized, to output.
2021-09-13 23:12:16 +03:00
if ( m_port . has_value ( ) )
2023-07-25 11:04:09 +03:00
output . appendff ( " :{} " , * m_port ) ;
2021-05-25 23:32:39 +03:00
}
2023-07-25 11:04:09 +03:00
// 3. If url’ s host is null, url does not have an opaque path, url’ s path’ s size is greater than 1, and url’ s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
// 4. Append the result of URL path serializing url to output.
// FIXME: Implement this closer to spec steps.
2021-05-25 23:32:39 +03:00
if ( cannot_be_a_base_url ( ) ) {
2023-07-25 11:04:09 +03:00
output . append ( m_paths [ 0 ] ) ;
2021-05-25 23:32:39 +03:00
} else {
2023-07-27 12:40:41 +03:00
if ( m_host . has < Empty > ( ) & & m_paths . size ( ) > 1 & & m_paths [ 0 ] . is_empty ( ) )
2023-07-25 11:04:09 +03:00
output . append ( " /. " sv ) ;
2021-05-27 22:40:02 +03:00
for ( auto & segment : m_paths ) {
2023-07-25 11:04:09 +03:00
output . append ( ' / ' ) ;
output . append ( segment ) ;
2021-05-25 23:32:39 +03:00
}
}
2023-07-25 11:04:09 +03:00
// 5. If url’ s query is non-null, append U+003F (?), followed by url’ s query, to output.
2021-05-25 23:32:39 +03:00
if ( ! m_query . is_null ( ) ) {
2023-07-25 11:04:09 +03:00
output . append ( ' ? ' ) ;
output . append ( m_query ) ;
2021-05-25 23:32:39 +03:00
}
2023-07-25 11:04:09 +03:00
// 6. If exclude fragment is false and url’ s fragment is non-null, then append U+0023 (#), followed by url’ s fragment, to output.
2021-05-25 23:32:39 +03:00
if ( exclude_fragment = = ExcludeFragment : : No & & ! m_fragment . is_null ( ) ) {
2023-07-25 11:04:09 +03:00
output . append ( ' # ' ) ;
output . append ( m_fragment ) ;
2021-05-25 23:32:39 +03:00
}
2023-07-25 11:04:09 +03:00
// 7. Return output.
return output . to_deprecated_string ( ) ;
2021-05-25 23:32:39 +03:00
}
// https://url.spec.whatwg.org/#url-rendering
// NOTE: This does e.g. not display credentials.
// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
// resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
2022-12-04 21:02:33 +03:00
DeprecatedString URL : : serialize_for_display ( ) const
2021-05-25 23:32:39 +03:00
{
VERIFY ( m_valid ) ;
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took it's own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URLs.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 20:11:58 +03:00
2021-05-25 23:32:39 +03:00
StringBuilder builder ;
builder . append ( m_scheme ) ;
builder . append ( ' : ' ) ;
2023-07-27 12:40:41 +03:00
if ( ! m_host . has < Empty > ( ) ) {
2022-07-11 20:32:29 +03:00
builder . append ( " // " sv ) ;
2023-07-27 12:40:41 +03:00
builder . append ( serialized_host ( ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2021-09-13 23:12:16 +03:00
if ( m_port . has_value ( ) )
builder . appendff ( " :{} " , * m_port ) ;
2021-05-25 23:32:39 +03:00
}
if ( cannot_be_a_base_url ( ) ) {
2023-04-09 16:21:00 +03:00
builder . append ( m_paths [ 0 ] ) ;
2021-05-25 23:32:39 +03:00
} else {
2023-07-27 12:40:41 +03:00
if ( m_host . has < Empty > ( ) & & m_paths . size ( ) > 1 & & m_paths [ 0 ] . is_empty ( ) )
2022-07-11 20:32:29 +03:00
builder . append ( " /. " sv ) ;
2021-05-27 22:40:02 +03:00
for ( auto & segment : m_paths ) {
builder . append ( ' / ' ) ;
2023-04-09 16:21:00 +03:00
builder . append ( segment ) ;
2021-05-25 23:32:39 +03:00
}
}
if ( ! m_query . is_null ( ) ) {
builder . append ( ' ? ' ) ;
2023-04-09 16:21:00 +03:00
builder . append ( m_query ) ;
2021-05-25 23:32:39 +03:00
}
if ( ! m_fragment . is_null ( ) ) {
builder . append ( ' # ' ) ;
2023-04-09 16:21:00 +03:00
builder . append ( m_fragment ) ;
2021-05-25 23:32:39 +03:00
}
2022-12-06 04:12:49 +03:00
return builder . to_deprecated_string ( ) ;
2021-05-25 23:32:39 +03:00
}
2023-06-17 10:15:40 +03:00
// Returns serialize() converted to a String; conversion errors are propagated to the caller.
ErrorOr<String> URL::to_string() const
{
    auto serialized = serialize();
    return String::from_deprecated_string(serialized);
}
2021-09-13 22:18:14 +03:00
// https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin
// https://url.spec.whatwg.org/#concept-url-origin
2022-12-04 21:02:33 +03:00
DeprecatedString URL : : serialize_origin ( ) const
2021-09-13 22:18:14 +03:00
{
VERIFY ( m_valid ) ;
if ( m_scheme = = " blob " sv ) {
// TODO: 1. If URL’ s blob URL entry is non-null, then return URL’ s blob URL entry’ s environment’ s origin.
// 2. Let url be the result of parsing URL’ s path[0].
VERIFY ( ! m_paths . is_empty ( ) ) ;
URL url = m_paths [ 0 ] ;
// 3. Return a new opaque origin, if url is failure, and url’ s origin otherwise.
if ( ! url . is_valid ( ) )
return " null " ;
return url . serialize_origin ( ) ;
} else if ( ! m_scheme . is_one_of ( " ftp " sv , " http " sv , " https " sv , " ws " sv , " wss " sv ) ) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin."
return " null " ;
}
StringBuilder builder ;
builder . append ( m_scheme ) ;
builder . append ( " :// " sv ) ;
2023-07-27 12:40:41 +03:00
builder . append ( serialized_host ( ) . release_value_but_fixme_should_propagate_errors ( ) ) ;
2021-09-13 23:12:16 +03:00
if ( m_port . has_value ( ) )
2022-06-10 21:37:51 +03:00
builder . appendff ( " :{} " , * m_port ) ;
2023-01-26 21:58:09 +03:00
return builder . to_deprecated_string ( ) ;
2021-09-13 22:18:14 +03:00
}
2021-06-01 11:58:27 +03:00
bool URL : : equals ( URL const & other , ExcludeFragment exclude_fragments ) const
2021-05-27 22:38:16 +03:00
{
2021-06-01 12:14:30 +03:00
if ( this = = & other )
return true ;
2021-05-27 22:38:16 +03:00
if ( ! m_valid | | ! other . m_valid )
return false ;
return serialize ( exclude_fragments ) = = other . serialize ( exclude_fragments ) ;
}
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took its own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URL.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 20:11:58 +03:00
// https://fetch.spec.whatwg.org/#data-url-processor
// Splits a "data" URL into its MIME type string and decoded body.
// Returns an error when the comma separator is missing, when base64 decoding fails,
// or when the MIME type cannot be converted to a String.
ErrorOr<URL::DataURL> URL::process_data_url() const
{
    // 1. Assert: dataURL's scheme is "data".
    VERIFY(scheme() == "data");

    // 2. Let input be the result of running the URL serializer on dataURL with exclude fragment set to true.
    auto input = serialize(URL::ExcludeFragment::Yes);

    // 3. Remove the leading "data:" from input.
    input = input.substring("data:"sv.length());

    // 4. Let position point at the start of input.

    // 5. Let mimeType be the result of collecting a sequence of code points that are not equal to U+002C (,), given position.
    auto position = input.find(',');
    auto mime_type = input.substring_view(0, position.value_or(input.length()));

    // 6. Strip leading and trailing ASCII whitespace from mimeType.
    mime_type = mime_type.trim_whitespace(TrimMode::Both);

    // 7. If position is past the end of input, then return failure.
    if (!position.has_value())
        return Error::from_string_literal("Missing a comma character");

    // 8. Advance position by 1.
    position = position.value() + 1;

    // 9. Let encodedBody be the remainder of input.
    auto encoded_body = input.substring_view(position.value());

    // 10. Let body be the percent-decoding of encodedBody.
    auto body = URL::percent_decode(encoded_body).to_byte_buffer();

    // 11. If mimeType ends with U+003B (;), followed by zero or more U+0020 SPACE, followed by an ASCII case-insensitive match for "base64", then:
    if (mime_type.ends_with("base64"sv, CaseSensitivity::CaseInsensitive)) {
        // Drop the 6 code points of "base64", then any spaces in front of it, and check for the ';'.
        auto trimmed_substring_view = mime_type.substring_view(0, mime_type.length() - 6);
        trimmed_substring_view = trimmed_substring_view.trim(" "sv, TrimMode::Right);
        if (trimmed_substring_view.ends_with(';')) {
            // 1. Let stringBody be the isomorphic decode of body.
            auto string_body = StringView(body);

            // 2. Set body to the forgiving-base64 decode of stringBody.
            // FIXME: Check if it's really forgiving.
            // 3. If body is failure, then return failure.
            body = TRY(decode_base64(string_body));

            // 4. Remove the last 6 code points from mimeType.
            // 5. Remove trailing U+0020 SPACE code points from mimeType, if any.
            // 6. Remove the last U+003B (;) from mimeType.
            mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
        }
    }

    // 12. If mimeType starts with ";", then prepend "text/plain" to mimeType.
    // NOTE: `builder` is declared outside the branch because `mime_type` may end up viewing its buffer.
    StringBuilder builder;
    if (mime_type.starts_with(';')) {
        builder.append("text/plain"sv);
        builder.append(mime_type);
        mime_type = builder.string_view();
    }

    // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
    // FIXME: 13. Let mimeTypeRecord be the result of parsing mimeType.
    auto mime_type_record = mime_type.trim("\n\r\t "sv, TrimMode::Both);

    // 14. If mimeTypeRecord is failure, then set mimeTypeRecord to text/plain;charset=US-ASCII.
    if (mime_type_record.is_empty())
        mime_type_record = "text/plain;charset=US-ASCII"sv;

    // 15. Return a new data: URL struct whose MIME type is mimeTypeRecord and body is body.
    // `body` is not used afterwards, so it is moved into the result rather than copied.
    return URL::DataURL { TRY(String::from_utf8(mime_type_record)), move(body) };
}
2021-05-25 14:50:03 +03:00
void URL : : append_percent_encoded ( StringBuilder & builder , u32 code_point )
{
if ( code_point < = 0x7f )
builder . appendff ( " %{:02X} " , code_point ) ;
else if ( code_point < = 0x07ff )
builder . appendff ( " %{:02X}%{:02X} " , ( ( code_point > > 6 ) & 0x1f ) | 0xc0 , ( code_point & 0x3f ) | 0x80 ) ;
else if ( code_point < = 0xffff )
builder . appendff ( " %{:02X}%{:02X}%{:02X} " , ( ( code_point > > 12 ) & 0x0f ) | 0xe0 , ( ( code_point > > 6 ) & 0x3f ) | 0x80 , ( code_point & 0x3f ) | 0x80 ) ;
else if ( code_point < = 0x10ffff )
builder . appendff ( " %{:02X}%{:02X}%{:02X}%{:02X} " , ( ( code_point > > 18 ) & 0x07 ) | 0xf0 , ( ( code_point > > 12 ) & 0x3f ) | 0x80 , ( ( code_point > > 6 ) & 0x3f ) | 0x80 , ( code_point & 0x3f ) | 0x80 ) ;
else
VERIFY_NOT_REACHED ( ) ;
}
// https://url.spec.whatwg.org/#c0-control-percent-encode-set
2022-04-10 01:48:15 +03:00
bool URL : : code_point_is_in_percent_encode_set ( u32 code_point , URL : : PercentEncodeSet set )
2021-05-25 14:50:03 +03:00
{
switch ( set ) {
case URL : : PercentEncodeSet : : C0Control :
return code_point < 0x20 | | code_point > 0x7E ;
case URL : : PercentEncodeSet : : Fragment :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : C0Control ) | | " \" <>` " sv . contains ( code_point ) ;
case URL : : PercentEncodeSet : : Query :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : C0Control ) | | " \" #<> " sv . contains ( code_point ) ;
case URL : : PercentEncodeSet : : SpecialQuery :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Query ) | | code_point = = ' \' ' ;
case URL : : PercentEncodeSet : : Path :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Query ) | | " ?` { } " sv.contains(code_point);
case URL : : PercentEncodeSet : : Userinfo :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Path ) | | " /: ; = @ [ \ \ ] ^ | " sv.contains(code_point);
case URL : : PercentEncodeSet : : Component :
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Userinfo ) | | " $%&+, " sv . contains ( code_point ) ;
case URL : : PercentEncodeSet : : ApplicationXWWWFormUrlencoded :
2022-06-10 21:39:08 +03:00
return code_point_is_in_percent_encode_set ( code_point , URL : : PercentEncodeSet : : Component ) | | " !'()~ " sv . contains ( code_point ) ;
2021-05-25 14:50:03 +03:00
case URL : : PercentEncodeSet : : EncodeURI :
// NOTE: This is the same percent encode set that JS encodeURI() uses.
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
2022-12-25 22:25:34 +03:00
return code_point > 0x7E | | ( ! is_ascii_alphanumeric ( code_point ) & & ! " ;,/?:@&=+$-_.!~*'()# " sv . contains ( static_cast < char > ( code_point ) ) ) ;
2021-05-25 14:50:03 +03:00
default :
VERIFY_NOT_REACHED ( ) ;
}
}
2022-04-08 16:20:30 +03:00
void URL : : append_percent_encoded_if_necessary ( StringBuilder & builder , u32 code_point , URL : : PercentEncodeSet set )
2021-05-25 14:50:03 +03:00
{
2022-04-08 16:20:30 +03:00
if ( code_point_is_in_percent_encode_set ( code_point , set ) )
2021-05-25 14:50:03 +03:00
append_percent_encoded ( builder , code_point ) ;
else
builder . append_code_point ( code_point ) ;
}
2022-12-04 21:02:33 +03:00
// Percent-encodes `input` code point by code point according to `set`.
// When `space_as_plus` is Yes, a space is written as '+' instead of going through the encode set.
DeprecatedString URL::percent_encode(StringView input, URL::PercentEncodeSet set, SpaceAsPlus space_as_plus)
{
    StringBuilder builder;
    for (auto code_point : Utf8View(input)) {
        if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ')
            builder.append('+');
        else
            append_percent_encoded_if_necessary(builder, code_point, set);
    }
    return builder.to_deprecated_string();
}
2022-12-04 21:02:33 +03:00
// Replaces every "%XX" sequence (XX being two ASCII hex digits) in `input` with the byte it encodes.
// A '%' that is not followed by two hex digits is copied through unchanged.
DeprecatedString URL::percent_decode(StringView input)
{
    // Nothing to decode without a '%'; hand the input back as-is.
    if (!input.contains('%'))
        return input;

    StringBuilder builder;
    Utf8View utf8_view(input);
    for (auto it = utf8_view.begin(); !it.done(); ++it) {
        if (*it != '%') {
            builder.append_code_point(*it);
        } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
            // Not a well-formed escape: keep the '%' literally.
            builder.append_code_point(*it);
        } else {
            // Consume both hex digits and append the single byte they encode.
            ++it;
            u8 byte = parse_ascii_hex_digit(*it) << 4;
            ++it;
            byte += parse_ascii_hex_digit(*it);
            builder.append(byte);
        }
    }
    return builder.to_deprecated_string();
}
2019-08-10 18:27:56 +03:00
}