2021-05-25 23:13:15 +03:00
/*
* Copyright ( c ) 2021 , Max Wipfli < mail @ maxwipfli . ch >
2023-07-23 12:09:29 +03:00
* Copyright ( c ) 2023 , Shannon Booth < shannon @ serenityos . org >
2021-05-25 23:13:15 +03:00
*
* SPDX - License - Identifier : BSD - 2 - Clause
*/
2023-12-16 17:19:34 +03:00
# include <AK/ByteString.h>
2021-06-01 22:18:08 +03:00
# include <AK/CharacterTypes.h>
2021-05-25 23:13:15 +03:00
# include <AK/Debug.h>
2023-07-23 12:09:29 +03:00
# include <AK/IntegralMath.h>
2021-05-25 23:13:15 +03:00
# include <AK/Optional.h>
# include <AK/SourceLocation.h>
# include <AK/StringBuilder.h>
# include <AK/StringUtils.h>
# include <AK/URLParser.h>
# include <AK/Utf8View.h>
namespace AK {
2021-06-03 13:43:08 +03:00
// Sentinel used in place of a code point once a parser's pointer has moved past the end of its input.
// NOTE: This plays the same role as the LibC macro EOF = -1. With all 32 bits set it lies above U+10FFFF,
//       so it can never compare equal to a real code point.
constexpr u32 end_of_file = 0xFFFF'FFFF;
2023-10-04 10:01:56 +03:00
// https://url.spec.whatwg.org/#url-code-points
2022-09-12 19:32:52 +03:00
static bool is_url_code_point ( u32 code_point )
2021-05-25 23:13:15 +03:00
{
2023-10-04 10:01:56 +03:00
// The URL code points are ASCII alphanumeric, U+0021 (!), U+0024 ($), U+0026 (&),
// U+0027 ('), U+0028 LEFT PARENTHESIS, U+0029 RIGHT PARENTHESIS, U+002A (*),
// U+002B (+), U+002C (,), U+002D (-), U+002E (.), U+002F (/), U+003A (:),
// U+003B (;), U+003D (=), U+003F (?), U+0040 (@), U+005F (_), U+007E (~), and code
// points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and
// noncharacters.
return is_ascii_alphanumeric ( code_point ) | | " !$&'()*+,-./: ; = ? @ _ ~ " sv.contains(code_point)
| | ( code_point > = 0x00A0 & & code_point < = 0x10FFFD & & ! is_unicode_surrogate ( code_point ) & & ! is_unicode_noncharacter ( code_point ) ) ;
2021-05-25 23:13:15 +03:00
}
2021-06-03 13:03:56 +03:00
// Logs that a URL validation error occurred, together with the given source location.
// The default argument captures the location via SourceLocation::current(), so callers normally pass nothing.
// Output is only produced when URL_PARSER_DEBUG is enabled.
static void report_validation_error(SourceLocation const& location = SourceLocation::current())
{
    if constexpr (URL_PARSER_DEBUG)
        dbgln("URLParser::basic_parse: Validation error! {}", location);
}
2023-10-04 10:02:06 +03:00
// https://url.spec.whatwg.org/#concept-opaque-host-parser
2023-07-27 12:40:41 +03:00
static Optional < URL : : Host > parse_opaque_host ( StringView input )
2021-05-25 23:13:15 +03:00
{
2023-10-04 10:02:06 +03:00
// 1. If input contains a forbidden host code point, host-invalid-code-point validation error, return failure.
2022-09-12 17:31:16 +03:00
auto forbidden_host_characters_excluding_percent = " \0 \t \n \r #/:<>?@[ \\ ]^| " sv ;
for ( auto character : forbidden_host_characters_excluding_percent ) {
if ( input . contains ( character ) ) {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
return { } ;
}
}
2023-10-04 10:02:06 +03:00
// 2. If input contains a code point that is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
// 3. If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, invalid-URL-unit validation error.
// NOTE: These steps are not implemented because they are not cheap checks and exist just to report validation errors. With how we
// currently report validation errors, they are only useful for debugging efforts in the URL parsing code.
// 4. Return the result of running UTF-8 percent-encode on input using the C0 control percent-encode set.
2023-12-16 17:19:34 +03:00
return String : : from_byte_string ( URL : : percent_encode ( input , URL : : PercentEncodeSet : : C0Control ) ) . release_value_but_fixme_should_propagate_errors ( ) ;
2021-05-25 23:13:15 +03:00
}
2023-07-23 12:09:29 +03:00
// Result of the IPv4 number parser: the parsed value paired with the spec's "validationError" flag.
struct ParsedIPv4Number {
    // The numeric value of the parsed part.
    u32 number = 0;
    // Set when the part used a hexadecimal ("0x"/"0X") or octal (leading "0") prefix.
    bool validation_error = false;
};
// https://url.spec.whatwg.org/#ipv4-number-parser
static Optional<ParsedIPv4Number> parse_ipv4_number(StringView input)
{
    // 1. If input is the empty string, then return failure.
    if (input.is_empty())
        return {};

    // 2. Let validationError be false.
    bool non_decimal_prefix_seen = false;

    // 3. Let R be 10.
    u8 base = 10;

    if (input.length() >= 2) {
        // 4. If input contains at least two code points and the first two code points are either "0X" or "0x", then:
        if (input.starts_with("0X"sv) || input.starts_with("0x"sv)) {
            // 1. Set validationError to true.
            non_decimal_prefix_seen = true;

            // 2. Remove the first two code points from input.
            input = input.substring_view(2);

            // 3. Set R to 16.
            base = 16;
        }
        // 5. Otherwise, if input contains at least two code points and the first code point is U+0030 (0), then:
        else if (input[0] == '0') {
            // 1. Set validationError to true.
            non_decimal_prefix_seen = true;

            // 2. Remove the first code point from input.
            input = input.substring_view(1);

            // 3. Set R to 8.
            base = 8;
        }
    }

    // 6. If input is the empty string, then return (0, true).
    if (input.is_empty())
        return ParsedIPv4Number { 0, true };

    // 7. If input contains a code point that is not a radix-R digit, then return failure.
    // 8. Let output be the mathematical integer value that is represented by input in radix-R notation, using ASCII hex digits for digits with values 0 through 15.
    Optional<u32> parsed_value;
    switch (base) {
    case 8:
        if (!all_of(input, [](auto character) { return is_ascii_octal_digit(character); }))
            return {};
        parsed_value = StringUtils::convert_to_uint_from_octal(input);
        break;
    case 10:
        if (!all_of(input, [](auto character) { return is_ascii_digit(character); }))
            return {};
        parsed_value = input.to_number<u32>();
        break;
    case 16:
        if (!all_of(input, [](auto character) { return is_ascii_hex_digit(character); }))
            return {};
        parsed_value = StringUtils::convert_to_uint_from_hex(input);
        break;
    default:
        VERIFY_NOT_REACHED();
    }

    // NOTE: Parsing may have failed due to overflow.
    if (!parsed_value.has_value())
        return {};

    // 9. Return (output, validationError).
    return ParsedIPv4Number { parsed_value.value(), non_decimal_prefix_seen };
}
// https://url.spec.whatwg.org/#concept-ipv4-parser
2023-07-26 11:49:49 +03:00
static Optional < URL : : IPv4Address > parse_ipv4_address ( StringView input )
2023-07-23 12:09:29 +03:00
{
// 1. Let parts be the result of strictly splitting input on U+002E (.).
auto parts = input . split_view ( " . " sv , SplitBehavior : : KeepEmpty ) ;
// 2. If the last item in parts is the empty string, then:
if ( parts . last ( ) . is_empty ( ) ) {
// 1. IPv4-empty-part validation error.
report_validation_error ( ) ;
// 2. If parts’ s size is greater than 1, then remove the last item from parts.
if ( parts . size ( ) > 1 )
parts . take_last ( ) ;
}
// 3. If parts’ s size is greater than 4, IPv4-too-many-parts validation error, return failure.
if ( parts . size ( ) > 4 ) {
report_validation_error ( ) ;
return { } ;
}
// 4. Let numbers be an empty list.
Vector < u32 , 4 > numbers ;
// 5. For each part of parts:
for ( auto const & part : parts ) {
// 1. Let result be the result of parsing part.
auto const result = parse_ipv4_number ( part ) ;
// 2. If result is failure, IPv4-non-numeric-part validation error, return failure.
if ( ! result . has_value ( ) ) {
report_validation_error ( ) ;
return { } ;
}
// 3. If result[1] is true, IPv4-non-decimal-part validation error.
if ( result - > validation_error )
report_validation_error ( ) ;
// 4. Append result[0] to numbers.
numbers . append ( result - > number ) ;
}
// 6. If any item in numbers is greater than 255, IPv4-out-of-range-part validation error.
// 7. If any but the last item in numbers is greater than 255, then return failure.
for ( size_t i = 0 ; i < numbers . size ( ) ; + + i ) {
if ( numbers [ i ] > 255 ) {
report_validation_error ( ) ;
if ( i ! = numbers . size ( ) - 1 )
return { } ;
}
}
// 8. If the last item in numbers is greater than or equal to 256^(5 − numbers’ s size), then return failure.
if ( numbers . last ( ) > = pow < size_t > ( 256 , 5 - numbers . size ( ) ) )
return { } ;
// 9. Let ipv4 be the last item in numbers.
auto ipv4 = numbers . last ( ) ;
// 10. Remove the last item from numbers.
numbers . take_last ( ) ;
// 11. Let counter be 0.
u8 counter = 0 ;
// 12. For each n of numbers:
for ( u32 n : numbers ) {
// 1. Increment ipv4 by n × 256^(3 − counter).
ipv4 + = n * pow < size_t > ( 256 , 3 - counter ) ;
// 2. Increment counter by 1.
+ + counter ;
}
// 13. Return ipv4.
return ipv4 ;
}
// https://url.spec.whatwg.org/#concept-ipv4-serializer
2023-07-26 11:49:49 +03:00
static ErrorOr < String > serialize_ipv4_address ( URL : : IPv4Address address )
2023-07-23 12:09:29 +03:00
{
// 1. Let output be the empty string.
// NOTE: Array to avoid prepend.
Array < u8 , 4 > output ;
// 2. Let n be the value of address.
u32 n = address ;
// 3. For each i in the range 1 to 4, inclusive:
for ( size_t i = 0 ; i < = 3 ; + + i ) {
// 1. Prepend n % 256, serialized, to output.
output [ 3 - i ] = n % 256 ;
// 2. If i is not 4, then prepend U+002E (.) to output.
// NOTE: done at end
// 3. Set n to floor(n / 256).
n / = 256 ;
}
// 4. Return output.
return String : : formatted ( " {}.{}.{}.{} " , output [ 0 ] , output [ 1 ] , output [ 2 ] , output [ 3 ] ) ;
2021-05-25 23:13:15 +03:00
}
2023-07-17 07:52:29 +03:00
// https://url.spec.whatwg.org/#concept-ipv6-serializer
2023-07-26 12:04:15 +03:00
static void serialize_ipv6_address ( URL : : IPv6Address const & address , StringBuilder & output )
2023-07-17 07:52:29 +03:00
{
// 1. Let output be the empty string.
// 2. Let compress be an index to the first IPv6 piece in the first longest sequences of address’ s IPv6 pieces that are 0.
Optional < size_t > compress ;
size_t longest_sequence_length = 0 ;
size_t current_sequence_length = 0 ;
size_t current_sequence_start = 0 ;
for ( size_t i = 0 ; i < 8 ; + + i ) {
if ( address [ i ] = = 0 ) {
if ( current_sequence_length = = 0 )
current_sequence_start = i ;
+ + current_sequence_length ;
} else {
if ( current_sequence_length > longest_sequence_length ) {
longest_sequence_length = current_sequence_length ;
compress = current_sequence_start ;
}
current_sequence_length = 0 ;
}
}
2023-07-31 13:57:43 +03:00
2023-08-06 10:40:09 +03:00
if ( current_sequence_length > longest_sequence_length ) {
longest_sequence_length = current_sequence_length ;
compress = current_sequence_start ;
}
2023-07-31 13:57:43 +03:00
// 3. If there is no sequence of address’ s IPv6 pieces that are 0 that is longer than 1, then set compress to null.
if ( longest_sequence_length < = 1 )
compress = { } ;
2023-07-17 07:52:29 +03:00
// 4. Let ignore0 be false.
auto ignore0 = false ;
// 5. For each pieceIndex in the range 0 to 7, inclusive:
for ( size_t piece_index = 0 ; piece_index < = 7 ; + + piece_index ) {
// 1. If ignore0 is true and address[pieceIndex] is 0, then continue.
if ( ignore0 & & address [ piece_index ] = = 0 )
continue ;
// 2. Otherwise, if ignore0 is true, set ignore0 to false.
if ( ignore0 )
ignore0 = false ;
// 3. If compress is pieceIndex, then:
if ( compress = = piece_index ) {
// 1. Let separator be "::" if pieceIndex is 0, and U+003A (:) otherwise.
auto separator = piece_index = = 0 ? " :: " sv : " : " sv ;
// 2. Append separator to output.
output . append ( separator ) ;
// 3. Set ignore0 to true and continue.
ignore0 = true ;
continue ;
}
// 4. Append address[pieceIndex], represented as the shortest possible lowercase hexadecimal number, to output.
output . appendff ( " {:x} " , address [ piece_index ] ) ;
// 5. If pieceIndex is not 7, then append U+003A (:) to output.
if ( piece_index ! = 7 )
output . append ( ' : ' ) ;
}
// 6. Return output.
}
// https://url.spec.whatwg.org/#concept-ipv6-parser
2023-07-26 11:49:49 +03:00
// Parses the textual form of an IPv6 address into its eight 16-bit pieces, following the spec's IPv6 parser.
// The host parser in this file passes the text with the surrounding square brackets already removed.
// Supports "::" compression and a trailing embedded IPv4 address (e.g. "::ffff:1.2.3.4").
// Returns the address on success. On failure a validation error is reported and an empty Optional is returned.
static Optional < URL : : IPv6Address > parse_ipv6_address ( StringView input )
2023-07-17 07:52:29 +03:00
{
// 1. Let address be a new IPv6 address whose IPv6 pieces are all 0.
Array < u16 , 8 > address { } ;
// 2. Let pieceIndex be 0.
size_t piece_index = 0 ;
// 3. Let compress be null.
Optional < size_t > compress ;
// NOTE: The input is decoded into a list of code points up front, so that pointer can be a plain index into it.
Vector < u32 > code_points ;
code_points . ensure_capacity ( input . length ( ) ) ;
for ( auto code_point : Utf8View { input } ) {
code_points . append ( code_point ) ;
}
// 4. Let pointer be a pointer for input.
size_t pointer = 0 ;
// c: the code point that pointer currently points at, or end_of_file once pointer is past the end of the input.
auto c = [ & ] ( ) - > u32 {
if ( pointer > = code_points . size ( ) )
return end_of_file ;
return code_points [ pointer ] ;
} ;
// remaining: the code points following c (empty when there are none).
auto remaining = [ & ] ( ) - > ReadonlySpan < u32 > {
if ( ( pointer + 1 ) > = code_points . size ( ) )
return { } ;
return code_points . span ( ) . slice ( pointer + 1 ) ;
} ;
// 5. If c is U+003A (:), then:
if ( c ( ) = = ' : ' ) {
// 1. If remaining does not start with U+003A (:), IPv6-invalid-compression validation error, return failure.
if ( remaining ( ) . is_empty ( ) | | remaining ( ) [ 0 ] ! = ' : ' ) {
report_validation_error ( ) ;
return { } ;
}
// 2. Increase pointer by 2.
pointer + = 2 ;
// 3. Increase pieceIndex by 1 and then set compress to pieceIndex.
+ + piece_index ;
compress = piece_index ;
}
// 6. While c is not the EOF code point:
while ( c ( ) ! = end_of_file ) {
// 1. If pieceIndex is 8, IPv6-too-many-pieces validation error, return failure.
if ( piece_index = = 8 ) {
report_validation_error ( ) ;
return { } ;
}
// 2. If c is U+003A (:), then:
if ( c ( ) = = ' : ' ) {
// 1. If compress is non-null, IPv6-multiple-compression validation error, return failure.
if ( compress . has_value ( ) ) {
report_validation_error ( ) ;
return { } ;
}
// 2. Increase pointer and pieceIndex by 1, set compress to pieceIndex, and then continue.
+ + pointer ;
+ + piece_index ;
compress = piece_index ;
continue ;
}
// 3. Let value and length be 0.
u32 value = 0 ;
size_t length = 0 ;
// 4. While length is less than 4 and c is an ASCII hex digit,
// set value to value × 0x10 + c interpreted as hexadecimal number,
// and increase pointer and length by 1.
while ( length < 4 & & is_ascii_hex_digit ( c ( ) ) ) {
value = value * 0x10 + parse_ascii_hex_digit ( c ( ) ) ;
+ + pointer ;
+ + length ;
}
// 5. If c is U+002E (.), then:
// NOTE: A dot means the digits just read were the start of an embedded IPv4 address rather than a hexadecimal piece.
if ( c ( ) = = ' . ' ) {
// 1. If length is 0, IPv4-in-IPv6-invalid-code-point validation error, return failure.
if ( length = = 0 ) {
report_validation_error ( ) ;
return { } ;
}
// 2. Decrease pointer by length.
// NOTE: This rewinds to the first digit so that it can be re-read as a decimal number below.
pointer - = length ;
// 3. If pieceIndex is greater than 6, IPv4-in-IPv6-too-many-pieces validation error, return failure.
// NOTE: The four IPv4 numbers occupy two pieces, so at least pieces 6 and 7 must still be free.
if ( piece_index > 6 ) {
report_validation_error ( ) ;
return { } ;
}
// 4. Let numbersSeen be 0.
size_t numbers_seen = 0 ;
// 5. While c is not the EOF code point:
while ( c ( ) ! = end_of_file ) {
// 1. Let ipv4Piece be null.
Optional < u32 > ipv4_piece ;
// 2. If numbersSeen is greater than 0, then:
if ( numbers_seen > 0 ) {
// 1. If c is a U+002E (.) and numbersSeen is less than 4, then increase pointer by 1.
if ( c ( ) = = ' . ' & & numbers_seen < 4 ) {
+ + pointer ;
}
// 2. Otherwise, IPv4-in-IPv6-invalid-code-point validation error, return failure.
else {
report_validation_error ( ) ;
return { } ;
}
}
// 3. If c is not an ASCII digit, IPv4-in-IPv6-invalid-code-point validation error, return failure.
if ( ! is_ascii_digit ( c ( ) ) ) {
report_validation_error ( ) ;
return { } ;
}
// 4. While c is an ASCII digit:
while ( is_ascii_digit ( c ( ) ) ) {
// 1. Let number be c interpreted as decimal number.
u32 number = parse_ascii_digit ( c ( ) ) ;
// 2. If ipv4Piece is null, then set ipv4Piece to number.
if ( ! ipv4_piece . has_value ( ) ) {
ipv4_piece = number ;
}
// Otherwise, if ipv4Piece is 0, IPv4-in-IPv6-invalid-code-point validation error, return failure.
// NOTE: This rejects numbers with a leading zero, such as "01".
else if ( ipv4_piece . value ( ) = = 0 ) {
report_validation_error ( ) ;
return { } ;
}
// Otherwise, set ipv4Piece to ipv4Piece × 10 + number.
else {
ipv4_piece = ipv4_piece . value ( ) * 10 + number ;
}
// 3. If ipv4Piece is greater than 255, IPv4-in-IPv6-out-of-range-part validation error, return failure.
if ( ipv4_piece . value ( ) > 255 ) {
report_validation_error ( ) ;
return { } ;
}
// 4. Increase pointer by 1.
+ + pointer ;
}
// 5. Set address[pieceIndex] to address[pieceIndex] × 0x100 + ipv4Piece.
address [ piece_index ] = address [ piece_index ] * 0x100 + ipv4_piece . value ( ) ;
// 6. Increase numbersSeen by 1.
+ + numbers_seen ;
// 7. If numbersSeen is 2 or 4, then increase pieceIndex by 1.
if ( numbers_seen = = 2 | | numbers_seen = = 4 )
+ + piece_index ;
}
// 6. If numbersSeen is not 4, IPv4-in-IPv6-too-few-parts validation error, return failure.
if ( numbers_seen ! = 4 ) {
report_validation_error ( ) ;
return { } ;
}
// 7. Break.
break ;
}
// 6. Otherwise, if c is U+003A (:):
else if ( c ( ) = = ' : ' ) {
// 1. Increase pointer by 1.
+ + pointer ;
// 2. If c is the EOF code point, IPv6-invalid-code-point validation error, return failure.
if ( c ( ) = = end_of_file ) {
report_validation_error ( ) ;
return { } ;
}
}
// 7. Otherwise, if c is not the EOF code point, IPv6-invalid-code-point validation error, return failure.
else if ( c ( ) ! = end_of_file ) {
report_validation_error ( ) ;
return { } ;
}
// 8. Set address[pieceIndex] to value.
address [ piece_index ] = value ;
// 9. Increase pieceIndex by 1.
+ + piece_index ;
}
// 7. If compress is non-null, then:
// NOTE: The pieces parsed after the "::" are moved to the end of the address; the zero pieces swapped into their
//       place form the compressed run.
if ( compress . has_value ( ) ) {
// 1. Let swaps be pieceIndex − compress.
size_t swaps = piece_index - compress . value ( ) ;
// 2. Set pieceIndex to 7.
piece_index = 7 ;
// 3. While pieceIndex is not 0 and swaps is greater than 0,
// swap address[pieceIndex] with address[compress + swaps − 1],
// and then decrease both pieceIndex and swaps by 1.
while ( piece_index ! = 0 & & swaps > 0 ) {
swap ( address [ piece_index ] , address [ compress . value ( ) + swaps - 1 ] ) ;
- - piece_index ;
- - swaps ;
}
}
// 8. Otherwise, if compress is null and pieceIndex is not 8, IPv6-too-few-pieces validation error, return failure.
else if ( ! compress . has_value ( ) & & piece_index ! = 8 ) {
report_validation_error ( ) ;
return { } ;
}
// 9. Return address.
return address ;
}
2023-07-25 10:43:00 +03:00
// https://url.spec.whatwg.org/#ends-in-a-number-checker
// Returns true if the last dot-separated part of input (ignoring one trailing dot) is written like an IPv4 number,
// in which case the host must be parsed as an IPv4 address rather than as a domain.
static bool ends_in_a_number_checker(StringView input)
{
    // 1. Let parts be the result of strictly splitting input on U+002E (.).
    auto parts = input.split_view("."sv, SplitBehavior::KeepEmpty);

    // NOTE: Strictly splitting the empty string yields a single empty part, for which step 2.1 returns false.
    //       Return early so that last() is never called on an empty list, should split_view() produce no parts
    //       at all for such input.
    if (parts.is_empty())
        return false;

    // 2. If the last item in parts is the empty string, then:
    if (parts.last().is_empty()) {
        // 1. If parts's size is 1, then return false.
        if (parts.size() == 1)
            return false;

        // 2. Remove the last item from parts.
        parts.take_last();
    }

    // 3. Let last be the last item in parts.
    auto last = parts.last();

    // 4. If last is non-empty and contains only ASCII digits, then return true.
    if (!last.is_empty() && all_of(last, is_ascii_digit))
        return true;

    // 5. If parsing last as an IPv4 number does not return failure, then return true.
    // NOTE: The spec states that this is equivalent to checking that last is "0X" or "0x", followed by zero or more
    //       ASCII hex digits. We implement that check directly instead of calling parse_ipv4_number(), because
    //       parse_ipv4_number() also returns failure when the value overflows a u32. A host such as "0x100000000"
    //       would then be treated as a domain instead of being rejected by the IPv4 parser.
    if (last.starts_with("0X"sv) || last.starts_with("0x"sv)) {
        auto const digits = last.substring_view(2);
        if (all_of(digits, [](auto character) { return is_ascii_hex_digit(character); }))
            return true;
    }

    // 6. Return false.
    return false;
}
2021-05-25 23:13:15 +03:00
// https://url.spec.whatwg.org/#concept-host-parser
// NOTE: This is a very bare-bones implementation.
2023-10-01 09:07:03 +03:00
static Optional < URL : : Host > parse_host ( StringView input , bool is_opaque = false )
2021-05-25 23:13:15 +03:00
{
2023-07-17 07:52:29 +03:00
// 1. If input starts with U+005B ([), then:
2021-05-25 23:13:15 +03:00
if ( input . starts_with ( ' [ ' ) ) {
2023-07-17 07:52:29 +03:00
// 1. If input does not end with U+005D (]), IPv6-unclosed validation error, return failure.
2021-05-25 23:13:15 +03:00
if ( ! input . ends_with ( ' ] ' ) ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-17 07:52:29 +03:00
// 2. Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
auto address = parse_ipv6_address ( input . substring_view ( 1 , input . length ( ) - 2 ) ) ;
if ( ! address . has_value ( ) )
return { } ;
2023-07-27 12:40:41 +03:00
return address . release_value ( ) ;
2021-05-25 23:13:15 +03:00
}
2023-10-01 09:07:03 +03:00
// 2. If isOpaque is true, then return the result of opaque-host parsing input.
if ( is_opaque )
2021-05-25 23:13:15 +03:00
return parse_opaque_host ( input ) ;
2023-07-23 12:09:29 +03:00
// 3. Assert: input is not the empty string.
2021-05-25 23:13:15 +03:00
VERIFY ( ! input . is_empty ( ) ) ;
2023-07-23 12:09:29 +03:00
// FIXME: 4. Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
2021-05-25 23:13:15 +03:00
auto domain = URL : : percent_decode ( input ) ;
2023-07-23 12:09:29 +03:00
2023-06-16 01:43:22 +03:00
// NOTE: This is handled in Unicode::create_unicode_url, to work around the fact that we can't call into LibUnicode here
// FIXME: 5. Let asciiDomain be the result of running domain to ASCII with domain and false.
2023-07-23 12:09:29 +03:00
// FIXME: 6. If asciiDomain is failure, then return failure.
2023-12-16 17:19:34 +03:00
auto ascii_domain_or_error = String : : from_byte_string ( domain ) ;
2023-07-27 12:40:41 +03:00
if ( ascii_domain_or_error . is_error ( ) )
return { } ;
auto ascii_domain = ascii_domain_or_error . release_value ( ) ;
2021-05-25 23:13:15 +03:00
2023-07-23 12:09:29 +03:00
// 7. If asciiDomain contains a forbidden domain code point, domain-invalid-code-point validation error, return failure.
2022-09-12 17:31:16 +03:00
auto forbidden_host_characters = " \0 \t \n \r #%/:<>?@[ \\ ]^| " sv ;
for ( auto character : forbidden_host_characters ) {
2023-07-27 12:40:41 +03:00
if ( ascii_domain . bytes_as_string_view ( ) . contains ( character ) ) {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
return { } ;
}
}
2023-07-23 12:09:29 +03:00
// 8. If asciiDomain ends in a number, then return the result of IPv4 parsing asciiDomain.
2023-07-25 10:43:00 +03:00
if ( ends_in_a_number_checker ( ascii_domain ) ) {
2023-07-23 12:09:29 +03:00
auto ipv4_host = parse_ipv4_address ( ascii_domain ) ;
if ( ! ipv4_host . has_value ( ) )
return { } ;
2023-07-27 12:40:41 +03:00
return ipv4_host . release_value ( ) ;
2023-07-23 12:09:29 +03:00
}
// 9. Return asciiDomain.
return ascii_domain ;
2021-05-25 23:13:15 +03:00
}
2023-07-26 12:05:35 +03:00
// https://url.spec.whatwg.org/#concept-host-serializer
ErrorOr<String> URLParser::serialize_host(URL::Host const& host)
{
    // NOTE: A host holds exactly one alternative, so the order in which the alternatives are tested does not matter.

    // 2. Otherwise, if host is an IPv6 address, return U+005B ([), followed by the result of running the IPv6 serializer on host, followed by U+005D (]).
    if (host.has<URL::IPv6Address>()) {
        StringBuilder bracketed;
        TRY(bracketed.try_append('['));
        serialize_ipv6_address(host.get<URL::IPv6Address>(), bracketed);
        TRY(bracketed.try_append(']'));
        return bracketed.to_string();
    }

    // 1. If host is an IPv4 address, return the result of running the IPv4 serializer on host.
    if (host.has<URL::IPv4Address>())
        return serialize_ipv4_address(host.get<URL::IPv4Address>());

    // 3. Otherwise, host is a domain, opaque host, or empty host, return host.
    // NOTE: Anything that is not stored as a string serializes to the empty string.
    if (!host.has<String>())
        return String {};
    return host.get<String>();
}
2022-09-20 16:38:53 +03:00
// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
2021-11-11 02:55:02 +03:00
constexpr bool starts_with_windows_drive_letter ( StringView input )
2021-05-25 23:13:15 +03:00
{
if ( input . length ( ) < 2 )
return false ;
2022-09-20 16:38:53 +03:00
if ( ! is_ascii_alpha ( input [ 0 ] ) | | ! ( input [ 1 ] = = ' : ' | | input [ 1 ] = = ' | ' ) )
2021-05-25 23:13:15 +03:00
return false ;
if ( input . length ( ) = = 2 )
return true ;
return " / \\ ?# " sv . contains ( input [ 2 ] ) ;
}
2021-11-11 02:55:02 +03:00
// True if input is exactly two code points: an ASCII alpha followed by either ':' or '|'.
constexpr bool is_windows_drive_letter(StringView input)
{
    if (input.length() != 2)
        return false;
    if (!is_ascii_alpha(input[0]))
        return false;
    return input[1] == ':' || input[1] == '|';
}
2021-11-11 02:55:02 +03:00
// True if input is exactly two code points: an ASCII alpha followed by ':' (the '|' form is not accepted here).
constexpr bool is_normalized_windows_drive_letter(StringView input)
{
    if (input.length() != 2)
        return false;
    if (!is_ascii_alpha(input[0]))
        return false;
    return input[1] == ':';
}
2021-11-11 02:55:02 +03:00
// True if input is "." or its percent-encoded form "%2e", where the latter is matched ASCII case-insensitively.
constexpr bool is_single_dot_path_segment(StringView input)
{
    if (input == "."sv)
        return true;
    return input.equals_ignoring_ascii_case("%2e"sv);
}
2021-11-11 02:55:02 +03:00
// True if input is ".." or one of the forms in which either dot (or both) is written as "%2e".
// The percent-encoded forms are matched ASCII case-insensitively.
constexpr bool is_double_dot_path_segment(StringView input)
{
    if (input == ".."sv)
        return true;
    if (input.equals_ignoring_ascii_case(".%2e"sv))
        return true;
    if (input.equals_ignoring_ascii_case("%2e."sv))
        return true;
    return input.equals_ignoring_ascii_case("%2e%2e"sv);
}
2023-09-17 04:15:52 +03:00
// https://url.spec.whatwg.org/#shorten-a-urls-path
void URLParser : : shorten_urls_path ( URL & url )
{
// 1. Assert: url does not have an opaque path.
VERIFY ( ! url . cannot_be_a_base_url ( ) ) ;
// 2. Let path be url’ s path.
auto & path = url . m_paths ;
// 3. If url’ s scheme is "file", path’ s size is 1, and path[0] is a normalized Windows drive letter, then return.
if ( url . scheme ( ) = = " file " & & path . size ( ) = = 1 & & is_normalized_windows_drive_letter ( path [ 0 ] ) )
return ;
// 4. Remove path’ s last item, if any.
if ( ! path . is_empty ( ) )
path . take_last ( ) ;
}
2022-04-10 01:48:15 +03:00
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
2023-08-14 09:49:23 +03:00
ErrorOr < String > URLParser : : percent_encode_after_encoding ( StringView input , URL : : PercentEncodeSet percent_encode_set , bool space_as_plus )
2022-04-10 01:48:15 +03:00
{
// NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
StringBuilder output ;
// 3. For each byte of encodeOutput converted to a byte sequence:
for ( auto byte : input ) {
// 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
if ( space_as_plus & & byte = = ' ' ) {
output . append ( ' + ' ) ;
continue ;
}
// 2. Let isomorph be a code point whose value is byte’ s value.
u32 isomorph = byte ;
// 3. Assert: percentEncodeSet includes all non-ASCII code points.
// 4. If isomorphic is not in percentEncodeSet, then append isomorph to output.
if ( ! URL : : code_point_is_in_percent_encode_set ( isomorph , percent_encode_set ) ) {
output . append_code_point ( isomorph ) ;
}
// 5. Otherwise, percent-encode byte and append the result to output.
else {
output . appendff ( " %{:02X} " , byte ) ;
}
}
// 6. Return output.
2023-08-14 09:49:23 +03:00
return output . to_string ( ) ;
2022-04-10 01:48:15 +03:00
}
2021-05-25 23:13:15 +03:00
// https://url.spec.whatwg.org/#concept-basic-url-parser
// NOTE: This parser assumes a UTF-8 encoding.
// NOTE: Refrain from using the URL class's setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
//       validation, which is strictly unnecessary since we set m_valid=true at the end anyway. Furthermore, this algorithm may be used in the
//       future for validation of URLs, which would then lead to infinite recursion.
//       The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
//       variables' values here, not what the URL class presents to its users.
2023-07-15 05:29:20 +03:00
URL URLParser : : basic_parse ( StringView raw_input , Optional < URL > const & base_url , Optional < URL > url , Optional < State > state_override )
2021-05-25 23:13:15 +03:00
{
dbgln_if ( URL_PARSER_DEBUG , " URLParser::parse: Parsing '{}' " , raw_input ) ;
if ( raw_input . is_empty ( ) )
2023-04-11 15:53:40 +03:00
return base_url . has_value ( ) ? * base_url : URL { } ;
2021-05-25 23:13:15 +03:00
size_t start_index = 0 ;
size_t end_index = raw_input . length ( ) ;
2023-07-03 13:52:08 +03:00
// 1. If url is not given:
2021-09-13 22:34:14 +03:00
if ( ! url . has_value ( ) ) {
2023-07-03 13:52:08 +03:00
// 1. Set url to a new URL.
2021-09-13 22:34:14 +03:00
url = URL ( ) ;
2023-07-03 13:52:08 +03:00
// 2. If input contains any leading or trailing C0 control or space, invalid-URL-unit validation error.
// 3. Remove any leading and trailing C0 control or space from input.
//
// FIXME: We aren't checking exactly for 'trailing C0 control or space' here.
2021-09-13 22:34:14 +03:00
bool has_validation_error = false ;
for ( size_t i = 0 ; i < raw_input . length ( ) ; + + i ) {
i8 ch = raw_input [ i ] ;
if ( 0 < = ch & & ch < = 0x20 ) {
+ + start_index ;
has_validation_error = true ;
} else {
break ;
}
2021-05-25 23:13:15 +03:00
}
2021-09-13 22:34:14 +03:00
for ( ssize_t i = raw_input . length ( ) - 1 ; i > = 0 ; - - i ) {
i8 ch = raw_input [ i ] ;
if ( 0 < = ch & & ch < = 0x20 ) {
- - end_index ;
has_validation_error = true ;
} else {
break ;
}
2021-05-25 23:13:15 +03:00
}
2021-09-13 22:34:14 +03:00
if ( has_validation_error )
report_validation_error ( ) ;
2021-05-25 23:13:15 +03:00
}
if ( start_index > = end_index )
return { } ;
2023-12-16 17:19:34 +03:00
ByteString processed_input = raw_input . substring_view ( start_index , end_index - start_index ) ;
2021-05-25 23:13:15 +03:00
2023-07-03 13:52:08 +03:00
// 2. If input contains any ASCII tab or newline, invalid-URL-unit validation error.
// 3. Remove all ASCII tab or newline from input.
2023-12-29 20:33:10 +03:00
for ( auto const ch : processed_input ) {
if ( ch = = ' \t ' | | ch = = ' \n ' ) {
report_validation_error ( ) ;
processed_input = processed_input . replace ( " \t " sv , " " sv , ReplaceMode : : All ) . replace ( " \n " sv , " " sv , ReplaceMode : : All ) ;
break ;
}
2021-05-25 23:13:15 +03:00
}
2023-07-03 13:52:08 +03:00
// 4. Let state be state override if given, or scheme start state otherwise.
2021-09-13 22:34:14 +03:00
State state = state_override . value_or ( State : : SchemeStart ) ;
2023-07-03 13:52:08 +03:00
// FIXME: 5. Set encoding to the result of getting an output encoding from encoding.
// 6. Let buffer be the empty string.
2021-05-25 23:13:15 +03:00
StringBuilder buffer ;
2023-07-03 13:52:08 +03:00
// 7. Let atSignSeen, insideBrackets, and passwordTokenSeen be false.
2021-05-25 23:13:15 +03:00
bool at_sign_seen = false ;
bool inside_brackets = false ;
bool password_token_seen = false ;
Utf8View input ( processed_input ) ;
2023-07-03 13:52:08 +03:00
// 8. Let pointer be a pointer for input.
2021-06-01 10:45:52 +03:00
Utf8CodePointIterator iterator = input . begin ( ) ;
2021-05-25 23:13:15 +03:00
auto get_remaining = [ & input , & iterator ] {
2021-05-30 19:52:24 +03:00
return input . substring_view ( iterator - input . begin ( ) + iterator . underlying_code_point_length_in_bytes ( ) ) . as_string ( ) ;
2021-05-25 23:13:15 +03:00
} ;
2023-08-13 02:17:02 +03:00
auto remaining_starts_with_two_ascii_hex_digits = [ & ] ( ) {
return is_ascii_hex_digit ( iterator . peek ( 1 ) . value_or ( end_of_file ) ) & & is_ascii_hex_digit ( iterator . peek ( 2 ) . value_or ( end_of_file ) ) ;
} ;
2023-07-03 13:52:08 +03:00
// 9. Keep running the following state machine by switching on state. If after a run pointer points to the EOF code point, go to the next step. Otherwise, increase pointer by 1 and continue with the state machine.
2021-05-25 23:13:15 +03:00
// NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
// ++iterator : "increase pointer by 1"
// continue : "decrease pointer by 1"
for ( ; ; ) {
2021-06-03 13:43:08 +03:00
u32 code_point = end_of_file ;
2021-05-25 23:13:15 +03:00
if ( ! iterator . done ( ) )
code_point = * iterator ;
if constexpr ( URL_PARSER_DEBUG ) {
2021-06-03 13:43:08 +03:00
if ( code_point = = end_of_file )
2023-07-15 05:29:20 +03:00
dbgln ( " URLParser::basic_parse: {} state with EOF. " , state_name ( state ) ) ;
2021-06-03 13:40:04 +03:00
else if ( is_ascii_printable ( code_point ) )
2023-07-15 05:29:20 +03:00
dbgln ( " URLParser::basic_parse: {} state with code point U+{:04X} ({:c}). " , state_name ( state ) , code_point , code_point ) ;
2021-05-25 23:13:15 +03:00
else
2023-07-15 05:29:20 +03:00
dbgln ( " URLParser::basic_parse: {} state with code point U+{:04X}. " , state_name ( state ) , code_point ) ;
2021-05-25 23:13:15 +03:00
}
switch ( state ) {
2023-07-03 13:52:08 +03:00
// -> scheme start state, https://url.spec.whatwg.org/#scheme-start-state
2021-05-25 23:13:15 +03:00
case State : : SchemeStart :
2023-07-03 13:52:08 +03:00
// 1. If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state.
2021-05-25 23:13:15 +03:00
if ( is_ascii_alpha ( code_point ) ) {
buffer . append_as_lowercase ( code_point ) ;
state = State : : Scheme ;
2023-07-03 13:52:08 +03:00
}
2023-07-04 11:34:00 +03:00
// 2. Otherwise, if state override is not given, set state to no scheme state and decrease pointer by 1.
else if ( ! state_override . has_value ( ) ) {
2021-05-25 23:13:15 +03:00
state = State : : NoScheme ;
continue ;
}
2023-07-04 11:34:00 +03:00
// 3. Otherwise, return failure.
else {
return { } ;
}
2021-05-25 23:13:15 +03:00
break ;
2023-07-03 13:52:08 +03:00
// -> scheme state, https://url.spec.whatwg.org/#scheme-state
2021-05-25 23:13:15 +03:00
case State : : Scheme :
2023-07-03 13:52:08 +03:00
// 1. If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), append c, lowercased, to buffer.
2021-05-25 23:13:15 +03:00
if ( is_ascii_alphanumeric ( code_point ) | | code_point = = ' + ' | | code_point = = ' - ' | | code_point = = ' . ' ) {
buffer . append_as_lowercase ( code_point ) ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, if c is U+003A (:), then:
else if ( code_point = = ' : ' ) {
2023-07-14 03:58:16 +03:00
// 1. If state override is given, then:
if ( state_override . has_value ( ) ) {
// 1. If url’ s scheme is a special scheme and buffer is not a special scheme, then return.
if ( URL : : is_special_scheme ( url - > scheme ( ) ) & & ! URL : : is_special_scheme ( buffer . string_view ( ) ) )
return * url ;
// 2. If url’ s scheme is not a special scheme and buffer is a special scheme, then return.
if ( ! URL : : is_special_scheme ( url - > scheme ( ) ) & & URL : : is_special_scheme ( buffer . string_view ( ) ) )
return * url ;
// 3. If url includes credentials or has a non-null port, and buffer is "file", then return.
if ( ( url - > includes_credentials ( ) | | url - > port ( ) . has_value ( ) ) & & buffer . string_view ( ) = = " file " sv )
return * url ;
// 4. If url’ s scheme is "file" and its host is an empty host, then return.
2023-07-27 12:40:41 +03:00
if ( url - > scheme ( ) = = " file " sv & & url - > host ( ) = = String { } )
2023-07-14 03:58:16 +03:00
return * url ;
2023-07-03 13:52:08 +03:00
}
// 2. Set url’ s scheme to buffer.
2023-12-29 19:35:01 +03:00
url - > m_scheme = buffer . to_string_without_validation ( ) ;
2023-07-03 13:52:08 +03:00
2023-07-14 03:58:16 +03:00
// 3. If state override is given, then:
if ( state_override . has_value ( ) ) {
// 1. If url’ s port is url’ s scheme’ s default port, then set url’ s port to null.
if ( url - > port ( ) = = URL : : default_port_for_scheme ( url - > scheme ( ) ) )
url - > m_port = { } ;
// 2. Return.
return * url ;
2023-07-03 13:52:08 +03:00
}
// 4. Set buffer to the empty string.
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
2023-07-03 13:52:08 +03:00
// 5. If url’ s scheme is "file", then:
2021-09-13 22:34:14 +03:00
if ( url - > scheme ( ) = = " file " ) {
2023-07-03 13:52:08 +03:00
// 1. If remaining does not start with "//", special-scheme-missing-following-solidus validation error.
2022-07-11 20:32:29 +03:00
if ( ! get_remaining ( ) . starts_with ( " // " sv ) ) {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
}
2023-07-03 13:52:08 +03:00
// 2. Set state to file state.
2021-05-25 23:13:15 +03:00
state = State : : File ;
2023-07-03 13:52:08 +03:00
}
// 6. Otherwise, if url is special, base is non-null, and base’ s scheme is url’ s scheme:
2023-08-13 02:08:12 +03:00
else if ( url - > is_special ( ) & & base_url . has_value ( ) & & base_url - > scheme ( ) = = url - > m_scheme ) {
// 1. Assert: base is special (and therefore does not have an opaque path).
VERIFY ( base_url - > is_special ( ) ) ;
2023-07-03 13:52:08 +03:00
// 2. Set state to special relative or authority state.
2023-08-13 02:08:12 +03:00
state = State : : SpecialRelativeOrAuthority ;
}
// 7. Otherwise, if url is special, set state to special authority slashes state.
else if ( url - > is_special ( ) ) {
state = State : : SpecialAuthoritySlashes ;
2023-07-03 13:52:08 +03:00
}
// 8. Otherwise, if remaining starts with an U+002F (/), set state to path or authority state and increase pointer by 1.
else if ( get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 23:13:15 +03:00
state = State : : PathOrAuthority ;
+ + iterator ;
2023-07-03 13:52:08 +03:00
}
// 9. Otherwise, set url’ s path to the empty string and set state to opaque path state.
else {
2021-09-13 22:34:14 +03:00
url - > m_cannot_be_a_base_url = true ;
2023-04-09 16:21:00 +03:00
url - > append_slash ( ) ;
2021-05-25 23:13:15 +03:00
state = State : : CannotBeABaseUrlPath ;
}
2023-07-03 13:52:08 +03:00
}
2023-07-04 11:34:00 +03:00
// 3. Otherwise, if state override is not given, set buffer to the empty string, state to no scheme state, and start over (from the first code point in input).
else if ( ! state_override . has_value ( ) ) {
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
state = State : : NoScheme ;
iterator = input . begin ( ) ;
continue ;
}
2023-07-04 11:34:00 +03:00
// 4. Otherwise, return failure.
else {
return { } ;
}
2021-05-25 23:13:15 +03:00
break ;
2023-07-03 13:52:08 +03:00
// -> no scheme state, https://url.spec.whatwg.org/#no-scheme-state
2021-05-25 23:13:15 +03:00
case State : : NoScheme :
2023-07-03 13:52:08 +03:00
// 1. If base is null, or base has an opaque path and c is not U+0023 (#), missing-scheme-non-relative-URL validation error, return failure.
2023-04-11 15:53:40 +03:00
if ( ! base_url . has_value ( ) | | ( base_url - > m_cannot_be_a_base_url & & code_point ! = ' # ' ) ) {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
return { } ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, if base has an opaque path and c is U+0023 (#), set url’ s scheme to base’ s scheme, url’ s path to base’ s path, url’ s query
// to base’ s query, url’ s fragment to the empty string, and set state to fragment state.
else if ( base_url - > m_cannot_be_a_base_url & & code_point = = ' # ' ) {
2021-09-13 22:34:14 +03:00
url - > m_scheme = base_url - > m_scheme ;
url - > m_paths = base_url - > m_paths ;
url - > m_query = base_url - > m_query ;
2023-08-12 07:52:42 +03:00
url - > m_fragment = String { } ;
2021-09-13 22:34:14 +03:00
url - > m_cannot_be_a_base_url = true ;
2021-05-25 23:13:15 +03:00
state = State : : Fragment ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, if base’ s scheme is not "file", set state to relative state and decrease pointer by 1.
else if ( base_url - > m_scheme ! = " file " ) {
2021-05-25 23:13:15 +03:00
state = State : : Relative ;
continue ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise, set state to file state and decrease pointer by 1.
else {
2021-05-25 23:13:15 +03:00
state = State : : File ;
continue ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> special relative or authority state, https://url.spec.whatwg.org/#special-relative-or-authority-state
2021-05-25 23:13:15 +03:00
case State : : SpecialRelativeOrAuthority :
2023-07-03 13:52:08 +03:00
// 1. If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by 1.
2022-07-11 20:32:29 +03:00
if ( code_point = = ' / ' & & get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 23:13:15 +03:00
state = State : : SpecialAuthorityIgnoreSlashes ;
+ + iterator ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error, set state to relative state and decrease pointer by 1.
else {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
state = State : : Relative ;
continue ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> path or authority state, https://url.spec.whatwg.org/#path-or-authority-state
2021-05-25 23:13:15 +03:00
case State : : PathOrAuthority :
2023-07-03 13:52:08 +03:00
// 1. If c is U+002F (/), then set state to authority state.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' / ' ) {
state = State : : Authority ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, set state to path state, and decrease pointer by 1.
else {
2021-05-25 23:13:15 +03:00
state = State : : Path ;
continue ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> relative state, https://url.spec.whatwg.org/#relative-state
2021-05-25 23:13:15 +03:00
case State : : Relative :
2023-07-04 12:11:42 +03:00
// 1. Assert: base’ s scheme is not "file".
VERIFY ( base_url - > scheme ( ) ! = " file " ) ;
2023-07-03 13:52:08 +03:00
// 2. Set url’ s scheme to base’ s scheme.
2021-09-13 22:34:14 +03:00
url - > m_scheme = base_url - > m_scheme ;
2023-07-03 13:52:08 +03:00
// 3. If c is U+002F (/), then set state to relative slash state.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' / ' ) {
state = State : : RelativeSlash ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise, if url is special and c is U+005C (\), invalid-reverse-solidus validation error, set state to relative slash state.
else if ( url - > is_special ( ) & & code_point = = ' \\ ' ) {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
state = State : : RelativeSlash ;
2023-07-03 13:52:08 +03:00
}
// 5. Otherwise:
else {
// 1. Set url’ s username to base’ s username, url’ s password to base’ s password, url’ s host to base’ s host, url’ s port to base’ s port, url’ s path to a clone of base’ s path, and url’ s query to base’ s query.
2021-09-13 22:34:14 +03:00
url - > m_username = base_url - > m_username ;
url - > m_password = base_url - > m_password ;
url - > m_host = base_url - > m_host ;
url - > m_port = base_url - > m_port ;
url - > m_paths = base_url - > m_paths ;
url - > m_query = base_url - > m_query ;
2021-05-25 23:13:15 +03:00
2023-07-03 13:52:08 +03:00
// 2. If c is U+003F (?), then set url’ s query to the empty string, and state to query state.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' ? ' ) {
2023-08-12 10:28:19 +03:00
url - > m_query = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Query ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, if c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2023-08-12 07:52:42 +03:00
url - > m_fragment = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Fragment ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set url’ s query to null.
2021-09-13 22:34:14 +03:00
url - > m_query = { } ;
2023-07-03 13:52:08 +03:00
// 2. Shorten url’ s path.
2023-09-17 04:15:52 +03:00
shorten_urls_path ( * url ) ;
2023-07-03 13:52:08 +03:00
// 3. Set state to path state and decrease pointer by 1.
2021-05-25 23:13:15 +03:00
state = State : : Path ;
continue ;
}
}
break ;
2023-07-03 13:52:08 +03:00
// -> relative slash state, https://url.spec.whatwg.org/#relative-slash-state
2021-05-25 23:13:15 +03:00
case State : : RelativeSlash :
2023-07-03 13:52:08 +03:00
// 1. If url is special and c is U+002F (/) or U+005C (\), then:
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & ( code_point = = ' / ' | | code_point = = ' \\ ' ) ) {
2023-07-03 13:52:08 +03:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 2. Set state to special authority ignore slashes state.
2021-05-25 23:13:15 +03:00
state = State : : SpecialAuthorityIgnoreSlashes ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, if c is U+002F (/), then set state to authority state.
else if ( code_point = = ' / ' ) {
2021-05-25 23:13:15 +03:00
state = State : : Authority ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, set url’ s username to base’ s username, url’ s password to base’ s password, url’ s host to base’ s host, url’ s port to base’ s port, state to path state, and then, decrease pointer by 1.
else {
2021-09-13 22:34:14 +03:00
url - > m_username = base_url - > m_username ;
url - > m_password = base_url - > m_password ;
url - > m_host = base_url - > m_host ;
url - > m_port = base_url - > m_port ;
2021-05-25 23:13:15 +03:00
state = State : : Path ;
continue ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> special authority slashes state, https://url.spec.whatwg.org/#special-authority-slashes-state
2021-05-25 23:13:15 +03:00
case State : : SpecialAuthoritySlashes :
2023-07-03 13:52:08 +03:00
// 1. If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by 1.
2022-07-11 20:32:29 +03:00
if ( code_point = = ' / ' & & get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 23:13:15 +03:00
state = State : : SpecialAuthorityIgnoreSlashes ;
+ + iterator ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error, set state to special authority ignore slashes state and decrease pointer by 1.
else {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
state = State : : SpecialAuthorityIgnoreSlashes ;
continue ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> special authority ignore slashes state, https://url.spec.whatwg.org/#special-authority-ignore-slashes-state
2021-05-25 23:13:15 +03:00
case State : : SpecialAuthorityIgnoreSlashes :
2023-07-03 13:52:08 +03:00
// 1. If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by 1.
2021-05-25 23:13:15 +03:00
if ( code_point ! = ' / ' & & code_point ! = ' \\ ' ) {
state = State : : Authority ;
continue ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error.
else {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> authority state, https://url.spec.whatwg.org/#authority-state
2021-05-25 23:13:15 +03:00
case State : : Authority :
2023-07-03 13:52:08 +03:00
// 1. If c is U+0040 (@), then:
2021-05-25 23:13:15 +03:00
if ( code_point = = ' @ ' ) {
2023-07-03 13:52:08 +03:00
// 1. Invalid-credentials validation error.
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 2. If atSignSeen is true, then prepend "%40" to buffer.
2021-05-25 23:13:15 +03:00
if ( at_sign_seen ) {
2023-12-16 17:19:34 +03:00
auto content = buffer . to_byte_string ( ) ;
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
2022-07-11 20:32:29 +03:00
buffer . append ( " %40 " sv ) ;
2021-05-25 23:13:15 +03:00
buffer . append ( content ) ;
}
2023-07-03 13:52:08 +03:00
// 3. Set atSignSeen to true.
2021-05-25 23:13:15 +03:00
at_sign_seen = true ;
2023-07-03 13:52:08 +03:00
2023-10-28 08:26:20 +03:00
StringBuilder username_builder ;
StringBuilder password_builder ;
2023-07-03 13:52:08 +03:00
2023-07-04 13:22:01 +03:00
// 4. For each codePoint in buffer:
for ( auto c : Utf8View ( buffer . string_view ( ) ) ) {
2023-07-03 13:52:08 +03:00
// 1. If codePoint is U+003A (:) and passwordTokenSeen is false, then set passwordTokenSeen to true and continue.
2021-05-25 23:13:15 +03:00
if ( c = = ' : ' & & ! password_token_seen ) {
password_token_seen = true ;
continue ;
}
2023-07-03 13:52:08 +03:00
// 2. Let encodedCodePoints be the result of running UTF-8 percent-encode codePoint using the userinfo percent-encode set.
// NOTE: This is done inside of step 3 and 4 implementation
// 3. If passwordTokenSeen is true, then append encodedCodePoints to url’ s password.
2021-05-25 23:13:15 +03:00
if ( password_token_seen ) {
2023-10-28 08:26:20 +03:00
if ( password_builder . is_empty ( ) )
password_builder . append ( url - > m_password ) ;
URL : : append_percent_encoded_if_necessary ( password_builder , c , URL : : PercentEncodeSet : : Userinfo ) ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise, append encodedCodePoints to url’ s username.
else {
2023-10-28 08:26:20 +03:00
if ( username_builder . is_empty ( ) )
username_builder . append ( url - > m_username ) ;
URL : : append_percent_encoded_if_necessary ( username_builder , c , URL : : PercentEncodeSet : : Userinfo ) ;
2021-05-25 23:13:15 +03:00
}
}
2023-07-03 13:52:08 +03:00
2023-10-28 08:26:20 +03:00
if ( username_builder . string_view ( ) . length ( ) > url - > m_username . bytes ( ) . size ( ) )
url - > m_username = username_builder . to_string ( ) . release_value_but_fixme_should_propagate_errors ( ) ;
if ( password_builder . string_view ( ) . length ( ) > url - > m_password . bytes ( ) . size ( ) )
url - > m_password = password_builder . to_string ( ) . release_value_but_fixme_should_propagate_errors ( ) ;
2023-07-03 13:52:08 +03:00
// 5. Set buffer to the empty string.
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
else if ( ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
// then:
// 1. If atSignSeen is true and buffer is the empty string, invalid-credentials validation error, return failure.
2021-05-25 23:13:15 +03:00
if ( at_sign_seen & & buffer . is_empty ( ) ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-03 13:52:08 +03:00
// 2. Decrease pointer by buffer’ s code point length + 1, set buffer to the empty string, and set state to host state.
2021-05-25 23:13:15 +03:00
iterator = input . iterator_at_byte_offset ( iterator - input . begin ( ) - buffer . length ( ) - 1 ) ;
buffer . clear ( ) ;
state = State : : Host ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, append c to buffer.
else {
2021-05-25 23:13:15 +03:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> host state, https://url.spec.whatwg.org/#host-state
// -> hostname state, https://url.spec.whatwg.org/#hostname-state
2021-05-25 23:13:15 +03:00
case State : : Host :
case State : : Hostname :
2023-07-04 11:34:00 +03:00
// 1. If state override is given and url’ s scheme is "file", then decrease pointer by 1 and set state to file host state.
if ( state_override . has_value ( ) & & url - > scheme ( ) = = " file " ) {
state = State : : FileHost ;
continue ;
}
2023-07-03 13:52:08 +03:00
// 2. Otherwise, if c is U+003A (:) and insideBrackets is false, then:
2021-05-25 23:13:15 +03:00
if ( code_point = = ' : ' & & ! inside_brackets ) {
2023-07-03 13:52:08 +03:00
// 1. If buffer is the empty string, host-missing validation error, return failure.
2021-05-25 23:13:15 +03:00
if ( buffer . is_empty ( ) ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-03 13:52:08 +03:00
2023-07-14 03:58:16 +03:00
// 2. If state override is given and state override is hostname state, then return.
if ( state_override . has_value ( ) & & * state_override = = State : : Hostname )
return * url ;
2023-07-03 13:52:08 +03:00
// 3. Let host be the result of host parsing buffer with url is not special.
2021-09-13 22:34:14 +03:00
auto host = parse_host ( buffer . string_view ( ) , ! url - > is_special ( ) ) ;
2023-07-03 13:52:08 +03:00
// 4. If host is failure, then return failure.
2021-05-25 23:13:15 +03:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 13:52:08 +03:00
// 5. Set url’ s host to host, buffer to the empty string, and state to port state.
2021-09-13 22:34:14 +03:00
url - > m_host = host . release_value ( ) ;
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
state = State : : Port ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
else if ( ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
// then decrease pointer by 1, and then:
// NOTE: pointer decrement is done by the continue below
// 1. If url is special and buffer is the empty string, host-missing validation error, return failure.
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & buffer . is_empty ( ) ) {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
return { } ;
}
2023-07-03 13:52:08 +03:00
2023-07-14 03:58:16 +03:00
// 2. Otherwise, if state override is given, buffer is the empty string, and either url includes credentials or url’ s port is non-null, return.
if ( state_override . has_value ( ) & & buffer . is_empty ( ) & & ( url - > includes_credentials ( ) | | url - > port ( ) . has_value ( ) ) )
return * url ;
2023-07-03 13:52:08 +03:00
// 3. Let host be the result of host parsing buffer with url is not special.
2021-09-13 22:34:14 +03:00
auto host = parse_host ( buffer . string_view ( ) , ! url - > is_special ( ) ) ;
2023-07-03 13:52:08 +03:00
// 4. If host is failure, then return failure.
2021-05-25 23:13:15 +03:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 13:52:08 +03:00
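// 5. Set url’ s host to host, buffer to the empty string, and state to path start state.
// NOTE: The code below sets the port state rather than the path start state. Since c is one of the
//       terminating code points checked above, buffer has just been cleared, and state override is
//       not given past the return below, the port state moves straight on to the path start state
//       without consuming any input.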
2021-09-13 22:34:14 +03:00
url - > m_host = host . value ( ) ;
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
state = State : : Port ;
2023-07-03 13:52:08 +03:00
2023-07-14 03:58:16 +03:00
// 6. If state override is given, then return.
if ( state_override . has_value ( ) )
return * url ;
2021-05-25 23:13:15 +03:00
continue ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise:
else {
2023-07-04 12:57:05 +03:00
// 1. If c is U+005B ([), then set insideBrackets to true.
if ( code_point = = ' [ ' ) {
inside_brackets = true ;
}
// 2. If c is U+005D (]), then set insideBrackets to false.
else if ( code_point = = ' ] ' ) {
inside_brackets = false ;
}
// 3. Append c to buffer.
2021-05-25 23:13:15 +03:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> port state, https://url.spec.whatwg.org/#port-state
2021-05-25 23:13:15 +03:00
case State : : Port :
2023-07-03 13:52:08 +03:00
// 1. If c is an ASCII digit, append c to buffer.
2021-05-25 23:13:15 +03:00
if ( is_ascii_digit ( code_point ) ) {
buffer . append_code_point ( code_point ) ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
2023-07-04 11:34:00 +03:00
// * state override is given
else if ( ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' )
| | state_override . has_value ( ) ) {
2023-07-03 13:52:08 +03:00
// then:
// 1. If buffer is not the empty string, then:
2021-05-25 23:13:15 +03:00
if ( ! buffer . is_empty ( ) ) {
2023-07-03 13:52:08 +03:00
// 1. Let port be the mathematical integer value that is represented by buffer in radix-10 using ASCII digits for digits with values 0 through 9.
2023-12-23 05:59:14 +03:00
auto port = buffer . string_view ( ) . to_number < u16 > ( ) ;
2023-07-03 13:52:08 +03:00
// 2. If port is greater than 2^16 − 1, port-out-of-range validation error, return failure.
2023-12-23 05:59:14 +03:00
// NOTE: This is done by to_number.
if ( ! port . has_value ( ) ) {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
return { } ;
}
2023-07-03 13:52:08 +03:00
// 3. Set url’ s port to null, if port is url’ s scheme’ s default port; otherwise to port.
2021-09-13 22:34:14 +03:00
if ( port . value ( ) = = URL : : default_port_for_scheme ( url - > scheme ( ) ) )
2021-09-13 23:12:16 +03:00
url - > m_port = { } ;
2021-05-25 23:13:15 +03:00
else
2021-09-13 22:34:14 +03:00
url - > m_port = port . value ( ) ;
2023-07-03 13:52:08 +03:00
// 4. Set buffer to the empty string.
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
}
2023-07-03 13:52:08 +03:00
2023-07-14 03:58:16 +03:00
// 2. If state override is given, then return.
if ( state_override . has_value ( ) )
return * url ;
2023-07-03 13:52:08 +03:00
// 3. Set state to path start state and decrease pointer by 1.
2021-05-25 23:13:15 +03:00
state = State : : PathStart ;
continue ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, port-invalid validation error, return failure.
else {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
return { } ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> file state, https://url.spec.whatwg.org/#file-state
2021-05-25 23:13:15 +03:00
case State : : File :
2023-07-03 13:52:08 +03:00
// 1. Set url’ s scheme to "file".
2023-08-12 07:52:41 +03:00
url - > m_scheme = String : : from_utf8 ( " file " sv ) . release_value_but_fixme_should_propagate_errors ( ) ;
2023-07-03 13:52:08 +03:00
// 2. Set url’ s host to the empty string.
2023-07-27 12:40:41 +03:00
url - > m_host = String { } ;
2023-07-03 13:52:08 +03:00
// 3. If c is U+002F (/) or U+005C (\), then:
2021-05-25 23:13:15 +03:00
if ( code_point = = ' / ' | | code_point = = ' \\ ' ) {
2023-07-03 13:52:08 +03:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 2. Set state to file slash state.
2021-05-25 23:13:15 +03:00
state = State : : FileSlash ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise, if base is non-null and base’ s scheme is "file":
else if ( base_url . has_value ( ) & & base_url - > m_scheme = = " file " ) {
// 1. Set url’ s host to base’ s host, url’ s path to a clone of base’ s path, and url’ s query to base’ s query.
2021-09-13 22:34:14 +03:00
url - > m_host = base_url - > m_host ;
url - > m_paths = base_url - > m_paths ;
url - > m_query = base_url - > m_query ;
2023-07-03 13:52:08 +03:00
// 2. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' ? ' ) {
2023-08-12 10:28:19 +03:00
url - > m_query = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Query ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, if c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2023-08-12 07:52:42 +03:00
url - > m_fragment = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Fragment ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set url’ s query to null.
2021-09-13 22:34:14 +03:00
url - > m_query = { } ;
2023-07-03 13:52:08 +03:00
// 2. If the code point substring from pointer to the end of input does not start with a Windows drive letter, then shorten url’ s path.
2021-05-25 23:13:15 +03:00
auto substring_from_pointer = input . substring_view ( iterator - input . begin ( ) ) . as_string ( ) ;
if ( ! starts_with_windows_drive_letter ( substring_from_pointer ) ) {
2023-09-17 04:15:52 +03:00
shorten_urls_path ( * url ) ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise:
else {
// 1. File-invalid-Windows-drive-letter validation error.
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 2. Set url’ s path to « ».
2021-09-13 22:34:14 +03:00
url - > m_paths . clear ( ) ;
2021-05-25 23:13:15 +03:00
}
2023-07-03 13:52:08 +03:00
// 4. Set state to path state and decrease pointer by 1.
2021-05-25 23:13:15 +03:00
state = State : : Path ;
continue ;
}
}
2023-07-04 12:06:58 +03:00
// 5. Otherwise, set state to path state, and decrease pointer by 1.
else {
state = State : : Path ;
continue ;
}
2023-07-03 13:52:08 +03:00
2021-05-25 23:13:15 +03:00
break ;
2023-07-03 13:52:08 +03:00
// -> file slash state, https://url.spec.whatwg.org/#file-slash-state
2021-05-25 23:13:15 +03:00
case State : : FileSlash :
2023-07-03 13:52:08 +03:00
// 1. If c is U+002F (/) or U+005C (\), then:
2021-05-25 23:13:15 +03:00
if ( code_point = = ' / ' | | code_point = = ' \\ ' ) {
2023-07-03 13:52:08 +03:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 2. Set state to file host state.
2021-05-25 23:13:15 +03:00
state = State : : FileHost ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise:
2023-07-04 12:12:33 +03:00
else {
// 1. If base is non-null and base’ s scheme is "file", then:
if ( base_url . has_value ( ) & & base_url - > m_scheme = = " file " ) {
// 1. Set url’ s host to base’ s host.
2023-09-17 04:47:29 +03:00
url - > m_host = base_url - > m_host ;
// FIXME: The spec does not seem to mention these steps.
2023-07-04 12:12:33 +03:00
url - > m_paths = base_url - > m_paths ;
url - > m_paths . remove ( url - > m_paths . size ( ) - 1 ) ;
2023-07-03 13:52:08 +03:00
2023-07-04 12:12:33 +03:00
// 2. If the code point substring from pointer to the end of input does not start with a Windows drive letter and base’ s path[0] is a normalized Windows drive letter, then append base’ s path[0] to url’ s path.
auto substring_from_pointer = input . substring_view ( iterator - input . begin ( ) ) . as_string ( ) ;
if ( ! starts_with_windows_drive_letter ( substring_from_pointer ) & & is_normalized_windows_drive_letter ( base_url - > m_paths [ 0 ] ) )
2023-08-06 07:13:08 +03:00
url - > m_paths . append ( base_url - > m_paths [ 0 ] ) ;
2023-07-04 12:12:33 +03:00
}
2023-07-03 13:52:08 +03:00
2023-07-04 12:12:33 +03:00
// 2. Set state to path state, and decrease pointer by 1.
2021-05-25 23:13:15 +03:00
state = State : : Path ;
continue ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> file host state, https://url.spec.whatwg.org/#file-host-state
2021-05-25 23:13:15 +03:00
case State : : FileHost :
2023-07-03 13:52:08 +03:00
// 1. If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by 1 and then:
// NOTE: decreasing the pointer is done at the bottom of this block.
2021-06-03 13:43:08 +03:00
if ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' \\ ' | | code_point = = ' ? ' | | code_point = = ' # ' ) {
2023-07-03 13:52:08 +03:00
// 1. If state override is not given and buffer is a Windows drive letter, file-invalid-Windows-drive-letter-host validation error, set state to path state.
2023-07-04 11:34:00 +03:00
if ( ! state_override . has_value ( ) & & is_windows_drive_letter ( buffer . string_view ( ) ) ) {
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
state = State : : Path ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, if buffer is the empty string, then:
else if ( buffer . is_empty ( ) ) {
// 1. Set url’ s host to the empty string.
2023-07-27 12:40:41 +03:00
url - > m_host = String { } ;
2023-07-03 13:52:08 +03:00
2023-07-14 03:58:16 +03:00
// 2. If state override is given, then return.
if ( state_override . has_value ( ) )
return * url ;
2023-07-03 13:52:08 +03:00
// 3. Set state to path start state.
2021-05-25 23:13:15 +03:00
state = State : : PathStart ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, run these steps:
else {
// 1. Let host be the result of host parsing buffer with url is not special.
// FIXME: It seems we are not passing through url is not special through here
2021-06-08 16:22:02 +03:00
auto host = parse_host ( buffer . string_view ( ) , true ) ;
2023-07-03 13:52:08 +03:00
// 2. If host is failure, then return failure.
2021-05-25 23:13:15 +03:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 13:52:08 +03:00
// 3. If host is "localhost", then set host to the empty string.
2023-07-27 12:40:41 +03:00
if ( host . value ( ) . has < String > ( ) & & host . value ( ) . get < String > ( ) = = " localhost " sv )
host = String { } ;
2023-07-03 13:52:08 +03:00
// 4. Set url’ s host to host.
2021-09-13 22:34:14 +03:00
url - > m_host = host . release_value ( ) ;
2023-07-03 13:52:08 +03:00
2023-07-14 03:58:16 +03:00
// 5. If state override is given, then return.
if ( state_override . has_value ( ) )
return * url ;
2023-07-03 13:52:08 +03:00
// 6. Set buffer to the empty string and state to path start state.
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
state = State : : PathStart ;
}
2023-07-03 13:52:08 +03:00
// NOTE: Decrement specified at the top of this 'if' statement.
2021-05-25 23:13:15 +03:00
continue ;
} else {
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> path start state, https://url.spec.whatwg.org/#path-start-state
2021-05-25 23:13:15 +03:00
case State : : PathStart :
2023-07-03 13:52:08 +03:00
// 1. If url is special, then:
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) ) {
2023-07-03 13:52:08 +03:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 2. Set state to path state.
2021-05-25 23:13:15 +03:00
state = State : : Path ;
2023-07-03 13:52:08 +03:00
// 3. If c is neither U+002F (/) nor U+005C (\), then decrease pointer by 1.
2021-05-25 23:13:15 +03:00
if ( code_point ! = ' / ' & & code_point ! = ' \\ ' )
continue ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, if state override is not given and c is U+003F (?), set url’ s query to the empty string and state to query state.
2023-07-04 11:34:00 +03:00
else if ( ! state_override . has_value ( ) & & code_point = = ' ? ' ) {
2023-08-12 10:28:19 +03:00
url - > m_query = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Query ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, if state override is not given and c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
2023-07-04 11:34:00 +03:00
else if ( ! state_override . has_value ( ) & & code_point = = ' # ' ) {
2023-08-12 07:52:42 +03:00
url - > m_fragment = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Fragment ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set state to path state.
2021-05-25 23:13:15 +03:00
state = State : : Path ;
2023-07-03 13:52:08 +03:00
// 2. If c is not U+002F (/), then decrease pointer by 1.
2021-05-25 23:13:15 +03:00
if ( code_point ! = ' / ' )
continue ;
}
2023-07-04 11:34:00 +03:00
// 5. Otherwise, if state override is given and url’ s host is null, append the empty string to url’ s path.
2023-07-27 12:40:41 +03:00
else if ( state_override . has_value ( ) & & url - > host ( ) . has < Empty > ( ) ) {
2023-07-04 11:34:00 +03:00
url - > append_slash ( ) ;
}
2021-05-25 23:13:15 +03:00
break ;
2023-07-03 13:52:08 +03:00
// -> path state, https://url.spec.whatwg.org/#path-state
2021-05-25 23:13:15 +03:00
case State : : Path :
2023-07-03 13:52:08 +03:00
// 1. If one of the following is true:
// * c is the EOF code point or U+002F (/)
// * url is special and c is U+005C (\)
2023-07-04 11:34:00 +03:00
// * state override is not given and c is U+003F (?) or U+0023 (#)
if ( ( code_point = = end_of_file | | code_point = = ' / ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' )
| | ( ! state_override . has_value ( ) & & ( code_point = = ' ? ' | | code_point = = ' # ' ) ) ) {
2023-07-03 13:52:08 +03:00
// then:
// 1. If url is special and c is U+005C (\), invalid-reverse-solidus validation error.
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & code_point = = ' \\ ' )
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 2. If buffer is a double-dot URL path segment, then:
2021-06-08 16:22:02 +03:00
if ( is_double_dot_path_segment ( buffer . string_view ( ) ) ) {
2023-07-04 12:21:33 +03:00
// 1. Shorten url’ s path.
2023-09-17 04:15:52 +03:00
shorten_urls_path ( * url ) ;
2023-07-03 13:52:08 +03:00
// 2. If neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’ s path.
2021-09-13 22:34:14 +03:00
if ( code_point ! = ' / ' & & ! ( url - > is_special ( ) & & code_point = = ' \\ ' ) )
2023-04-09 16:21:00 +03:00
url - > append_slash ( ) ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, if buffer is a single-dot URL path segment and if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’ s path.
else if ( is_single_dot_path_segment ( buffer . string_view ( ) ) & & code_point ! = ' / ' & & ! ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
2023-04-09 16:21:00 +03:00
url - > append_slash ( ) ;
2023-07-03 13:52:08 +03:00
}
// 4. Otherwise, if buffer is not a single-dot URL path segment, then:
else if ( ! is_single_dot_path_segment ( buffer . string_view ( ) ) ) {
// 1. If url’ s scheme is "file", url’ s path is empty, and buffer is a Windows drive letter, then replace the second code point in buffer with U+003A (:).
2021-09-13 22:34:14 +03:00
if ( url - > m_scheme = = " file " & & url - > m_paths . is_empty ( ) & & is_windows_drive_letter ( buffer . string_view ( ) ) ) {
2021-06-08 16:22:02 +03:00
auto drive_letter = buffer . string_view ( ) [ 0 ] ;
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
buffer . append ( drive_letter ) ;
buffer . append ( ' : ' ) ;
}
2023-07-03 13:52:08 +03:00
// 2. Append buffer to url’ s path.
2023-12-29 19:35:01 +03:00
url - > m_paths . append ( buffer . to_string_without_validation ( ) ) ;
2021-05-25 23:13:15 +03:00
}
2023-07-03 13:52:08 +03:00
// 5. Set buffer to the empty string.
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
2023-07-03 13:52:08 +03:00
// 6. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' ? ' ) {
2023-08-12 10:28:19 +03:00
url - > m_query = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Query ;
2023-07-03 13:52:08 +03:00
}
// 7. If c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2023-08-12 07:52:42 +03:00
url - > m_fragment = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Fragment ;
}
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, run these steps
else {
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 23:13:15 +03:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
2023-08-13 02:17:02 +03:00
// 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if ( code_point = = ' % ' & & ! remaining_starts_with_two_ascii_hex_digits ( ) )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 3. UTF-8 percent-encode c using the path percent-encode set and append the result to buffer.
2021-05-25 23:13:15 +03:00
URL : : append_percent_encoded_if_necessary ( buffer , code_point , URL : : PercentEncodeSet : : Path ) ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> opaque path state, https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
2021-05-25 23:13:15 +03:00
case State : : CannotBeABaseUrlPath :
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
2021-09-13 22:34:14 +03:00
VERIFY ( url - > m_paths . size ( ) = = 1 & & url - > m_paths [ 0 ] . is_empty ( ) ) ;
2023-07-03 13:52:08 +03:00
// 1. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' ? ' ) {
2023-12-29 19:35:01 +03:00
url - > m_paths [ 0 ] = buffer . to_string_without_validation ( ) ;
2023-08-12 10:28:19 +03:00
url - > m_query = String { } ;
2023-07-06 20:13:42 +03:00
buffer . clear ( ) ;
2021-05-25 23:13:15 +03:00
state = State : : Query ;
2023-07-03 13:52:08 +03:00
}
// 2. Otherwise, if c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2021-05-25 23:13:15 +03:00
// NOTE: The buffer is stored as-is here, without percent-decoding; it was filled through append_percent_encoded_if_necessary().
2023-12-29 19:35:01 +03:00
url - > m_paths [ 0 ] = buffer . to_string_without_validation ( ) ;
2023-08-12 07:52:42 +03:00
url - > m_fragment = String { } ;
2023-07-06 20:13:42 +03:00
buffer . clear ( ) ;
2021-05-25 23:13:15 +03:00
state = State : : Fragment ;
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise:
else {
// 1. If c is not the EOF code point, not a URL code point, and not U+0025 (%), invalid-URL-unit validation error.
2021-06-03 13:43:08 +03:00
if ( code_point ! = end_of_file & & ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
2021-05-25 23:13:15 +03:00
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
2023-08-13 02:17:02 +03:00
// 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if ( code_point = = ' % ' & & ! remaining_starts_with_two_ascii_hex_digits ( ) )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 3. If c is not the EOF code point, UTF-8 percent-encode c using the C0 control percent-encode set and append the result to url’ s path.
2021-06-03 13:43:08 +03:00
if ( code_point ! = end_of_file ) {
2021-05-25 23:13:15 +03:00
URL : : append_percent_encoded_if_necessary ( buffer , code_point , URL : : PercentEncodeSet : : C0Control ) ;
} else {
2023-12-29 19:35:01 +03:00
url - > m_paths [ 0 ] = buffer . to_string_without_validation ( ) ;
2023-07-06 20:13:42 +03:00
buffer . clear ( ) ;
2021-05-25 23:13:15 +03:00
}
}
break ;
2023-07-03 13:52:08 +03:00
// -> query state, https://url.spec.whatwg.org/#query-state
2021-05-25 23:13:15 +03:00
case State : : Query :
2023-07-03 13:52:08 +03:00
// FIXME: 1. If encoding is not UTF-8 and one of the following is true:
// * url is not special
// * url’ s scheme is "ws" or "wss"
// then set encoding to UTF-8.
// 2. If one of the following is true:
2023-07-04 11:34:00 +03:00
// * state override is not given and c is U+0023 (#)
2023-07-03 13:52:08 +03:00
// * c is the EOF code point
2023-07-04 11:34:00 +03:00
if ( ( ! state_override . has_value ( ) & & code_point = = ' # ' )
| | code_point = = end_of_file ) {
2023-07-03 13:52:08 +03:00
// then:
// 1. Let queryPercentEncodeSet be the special-query percent-encode set if url is special; otherwise the query percent-encode set.
2021-09-13 22:34:14 +03:00
auto query_percent_encode_set = url - > is_special ( ) ? URL : : PercentEncodeSet : : SpecialQuery : URL : : PercentEncodeSet : : Query ;
2023-07-03 13:52:08 +03:00
// 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to url’ s query.
2023-08-14 09:49:23 +03:00
url - > m_query = percent_encode_after_encoding ( buffer . string_view ( ) , query_percent_encode_set ) . release_value_but_fixme_should_propagate_errors ( ) ;
2023-07-03 13:52:08 +03:00
// 3. Set buffer to the empty string.
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
2023-07-03 13:52:08 +03:00
// 4. If c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
2021-05-25 23:13:15 +03:00
if ( code_point = = ' # ' ) {
2023-08-12 07:52:42 +03:00
url - > m_fragment = String { } ;
2021-05-25 23:13:15 +03:00
state = State : : Fragment ;
}
2023-07-03 13:52:08 +03:00
}
// 3. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 23:13:15 +03:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
2023-08-13 02:17:02 +03:00
// 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if ( code_point = = ' % ' & & ! remaining_starts_with_two_ascii_hex_digits ( ) )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
// 3. Append c to buffer.
2021-05-25 23:13:15 +03:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 13:52:08 +03:00
// -> fragment state, https://url.spec.whatwg.org/#fragment-state
2021-05-25 23:13:15 +03:00
case State : : Fragment :
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
2023-07-03 13:52:08 +03:00
// 1. If c is not the EOF code point, then:
2021-06-03 13:43:08 +03:00
if ( code_point ! = end_of_file ) {
2023-07-03 13:52:08 +03:00
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 23:13:15 +03:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
2023-08-13 02:17:02 +03:00
// 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if ( code_point = = ' % ' & & ! remaining_starts_with_two_ascii_hex_digits ( ) )
report_validation_error ( ) ;
2023-07-03 13:52:08 +03:00
2023-08-14 07:25:21 +03:00
// 3. UTF-8 percent-encode c using the fragment percent-encode set and append the result to url’ s fragment.
// NOTE: The percent-encode is done on EOF on the entire buffer.
2021-05-25 23:13:15 +03:00
buffer . append_code_point ( code_point ) ;
} else {
2023-08-14 09:49:23 +03:00
url - > m_fragment = percent_encode_after_encoding ( buffer . string_view ( ) , URL : : PercentEncodeSet : : Fragment ) . release_value_but_fixme_should_propagate_errors ( ) ;
2021-05-25 23:13:15 +03:00
buffer . clear ( ) ;
}
break ;
default :
VERIFY_NOT_REACHED ( ) ;
}
if ( iterator . done ( ) )
break ;
+ + iterator ;
}
2021-09-13 22:34:14 +03:00
url - > m_valid = true ;
dbgln_if ( URL_PARSER_DEBUG , " URLParser::parse: Parsed URL to be '{}'. " , url - > serialize ( ) ) ;
2023-07-03 13:52:08 +03:00
// 10. Return url.
2021-09-13 22:34:14 +03:00
return url . release_value ( ) ;
2021-05-25 23:13:15 +03:00
}
}