ladybird/AK/URLParser.cpp
2023-01-06 13:59:17 -07:00

719 lines
31 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/DeprecatedString.h>
#include <AK/Optional.h>
#include <AK/SourceLocation.h>
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
#include <AK/URLParser.h>
#include <AK/Utf8View.h>
namespace AK {
// NOTE: This is similar to the LibC macro EOF = -1.
constexpr u32 end_of_file = 0xFFFFFFFF;
static bool is_url_code_point(u32 code_point)
{
// FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
return is_ascii_alphanumeric(code_point) || code_point >= 0xA0 || "!$&'()*+,-./:;=?@_~"sv.contains(code_point);
}
static void report_validation_error(SourceLocation const& location = SourceLocation::current())
{
dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Validation error! {}", location);
}
static Optional<DeprecatedString> parse_opaque_host(StringView input)
{
auto forbidden_host_characters_excluding_percent = "\0\t\n\r #/:<>?@[\\]^|"sv;
for (auto character : forbidden_host_characters_excluding_percent) {
if (input.contains(character)) {
report_validation_error();
return {};
}
}
// FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error.
// FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error.
return URL::percent_encode(input, URL::PercentEncodeSet::C0Control);
}
static Optional<DeprecatedString> parse_ipv4_address(StringView input)
{
// FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser.
return input;
}
// https://url.spec.whatwg.org/#concept-host-parser
// NOTE: This is a very bare-bones implementation.
static Optional<DeprecatedString> parse_host(StringView input, bool is_not_special = false)
{
if (input.starts_with('[')) {
if (!input.ends_with(']')) {
report_validation_error();
return {};
}
// FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
TODO();
}
if (is_not_special)
return parse_opaque_host(input);
VERIFY(!input.is_empty());
// FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
auto domain = URL::percent_decode(input);
// FIXME: Let asciiDomain be the result of running domain to ASCII on domain.
auto& ascii_domain = domain;
auto forbidden_host_characters = "\0\t\n\r #%/:<>?@[\\]^|"sv;
for (auto character : forbidden_host_characters) {
if (ascii_domain.view().contains(character)) {
report_validation_error();
return {};
}
}
auto ipv4_host = parse_ipv4_address(ascii_domain);
return ipv4_host;
}
// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
constexpr bool starts_with_windows_drive_letter(StringView input)
{
if (input.length() < 2)
return false;
if (!is_ascii_alpha(input[0]) || !(input[1] == ':' || input[1] == '|'))
return false;
if (input.length() == 2)
return true;
return "/\\?#"sv.contains(input[2]);
}
constexpr bool is_windows_drive_letter(StringView input)
{
return input.length() == 2 && is_ascii_alpha(input[0]) && (input[1] == ':' || input[1] == '|');
}
constexpr bool is_normalized_windows_drive_letter(StringView input)
{
return input.length() == 2 && is_ascii_alpha(input[0]) && input[1] == ':';
}
constexpr bool is_single_dot_path_segment(StringView input)
{
return input == "."sv || input.equals_ignoring_case("%2e"sv);
}
constexpr bool is_double_dot_path_segment(StringView input)
{
return input == ".."sv || input.equals_ignoring_case(".%2e"sv) || input.equals_ignoring_case("%2e."sv) || input.equals_ignoring_case("%2e%2e"sv);
}
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
static DeprecatedString percent_encode_after_encoding(StringView input, URL::PercentEncodeSet percent_encode_set, bool space_as_plus = false)
{
// NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
StringBuilder output;
// 3. For each byte of encodeOutput converted to a byte sequence:
for (auto byte : input) {
// 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
if (space_as_plus && byte == ' ') {
output.append('+');
continue;
}
// 2. Let isomorph be a code point whose value is bytes value.
u32 isomorph = byte;
// 3. Assert: percentEncodeSet includes all non-ASCII code points.
// 4. If isomorphic is not in percentEncodeSet, then append isomorph to output.
if (!URL::code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) {
output.append_code_point(isomorph);
}
// 5. Otherwise, percent-encode byte and append the result to output.
else {
output.appendff("%{:02X}", byte);
}
}
// 6. Return output.
return output.to_deprecated_string();
}
// https://fetch.spec.whatwg.org/#data-urls
// FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec.
Optional<URL> URLParser::parse_data_url(StringView raw_input)
{
dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input);
VERIFY(raw_input.starts_with("data:"sv));
auto input = raw_input.substring_view(5);
auto comma_offset = input.find(',');
if (!comma_offset.has_value())
return {};
auto mime_type = StringUtils::trim(input.substring_view(0, comma_offset.value()), "\t\n\f\r "sv, TrimMode::Both);
auto encoded_body = input.substring_view(comma_offset.value() + 1);
auto body = URL::percent_decode(encoded_body);
bool is_base64_encoded = false;
if (mime_type.ends_with("base64"sv, CaseSensitivity::CaseInsensitive)) {
auto substring_view = mime_type.substring_view(0, mime_type.length() - 6);
auto trimmed_substring_view = StringUtils::trim(substring_view, " "sv, TrimMode::Right);
if (trimmed_substring_view.ends_with(';')) {
is_base64_encoded = true;
mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
}
}
StringBuilder builder;
if (mime_type.starts_with(";"sv) || mime_type.is_empty()) {
builder.append("text/plain"sv);
builder.append(mime_type);
mime_type = builder.string_view();
}
// FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
URL url { StringUtils::trim(mime_type, "\n\r\t "sv, TrimMode::Both), move(body), is_base64_encoded };
dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize());
return url;
}
// https://url.spec.whatwg.org/#concept-basic-url-parser
// NOTE: This parser assumes a UTF-8 encoding.
// NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
// validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the
// future for validation of URLs, which would then lead to infinite recursion.
// The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
// variables' values here, not what the URL class presents to its users.
// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting
// some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode
// everything before setting the member variables.
URL URLParser::parse(StringView raw_input, URL const* base_url, Optional<URL> url, Optional<State> state_override)
{
dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input);
if (raw_input.is_empty())
return base_url ? *base_url : URL {};
if (raw_input.starts_with("data:"sv)) {
auto maybe_url = parse_data_url(raw_input);
if (!maybe_url.has_value())
return {};
return maybe_url.release_value();
}
size_t start_index = 0;
size_t end_index = raw_input.length();
if (!url.has_value()) {
url = URL();
// NOTE: This removes all leading and trailing C0 control or space characters.
bool has_validation_error = false;
for (size_t i = 0; i < raw_input.length(); ++i) {
i8 ch = raw_input[i];
if (0 <= ch && ch <= 0x20) {
++start_index;
has_validation_error = true;
} else {
break;
}
}
for (ssize_t i = raw_input.length() - 1; i >= 0; --i) {
i8 ch = raw_input[i];
if (0 <= ch && ch <= 0x20) {
--end_index;
has_validation_error = true;
} else {
break;
}
}
if (has_validation_error)
report_validation_error();
}
if (start_index >= end_index)
return {};
DeprecatedString processed_input = raw_input.substring_view(start_index, end_index - start_index);
// NOTE: This replaces all tab and newline characters with nothing.
if (processed_input.contains("\t"sv) || processed_input.contains("\n"sv)) {
report_validation_error();
processed_input = processed_input.replace("\t"sv, ""sv, ReplaceMode::All).replace("\n"sv, ""sv, ReplaceMode::All);
}
State state = state_override.value_or(State::SchemeStart);
StringBuilder buffer;
bool at_sign_seen = false;
bool inside_brackets = false;
bool password_token_seen = false;
Utf8View input(processed_input);
Utf8CodePointIterator iterator = input.begin();
auto get_remaining = [&input, &iterator] {
return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string();
};
// NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
// ++iterator : "increase pointer by 1"
// continue : "decrease pointer by 1"
for (;;) {
u32 code_point = end_of_file;
if (!iterator.done())
code_point = *iterator;
if constexpr (URL_PARSER_DEBUG) {
if (code_point == end_of_file)
dbgln("URLParser::parse: {} state with EOF.", state_name(state));
else if (is_ascii_printable(code_point))
dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
else
dbgln("URLParser::parse: {} state with code point U+{:04X}.", state_name(state), code_point);
}
switch (state) {
case State::SchemeStart:
if (is_ascii_alpha(code_point)) {
buffer.append_as_lowercase(code_point);
state = State::Scheme;
} else {
state = State::NoScheme;
continue;
}
break;
case State::Scheme:
if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') {
buffer.append_as_lowercase(code_point);
} else if (code_point == ':') {
url->m_scheme = buffer.to_deprecated_string();
buffer.clear();
if (url->scheme() == "file") {
if (!get_remaining().starts_with("//"sv)) {
report_validation_error();
}
state = State::File;
} else if (url->is_special()) {
if (base_url && base_url->m_scheme == url->m_scheme)
state = State::SpecialRelativeOrAuthority;
else
state = State::SpecialAuthoritySlashes;
} else if (get_remaining().starts_with("/"sv)) {
state = State::PathOrAuthority;
++iterator;
} else {
url->m_cannot_be_a_base_url = true;
url->append_path("");
state = State::CannotBeABaseUrlPath;
}
} else {
buffer.clear();
state = State::NoScheme;
iterator = input.begin();
continue;
}
break;
case State::NoScheme:
if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) {
report_validation_error();
return {};
} else if (base_url->m_cannot_be_a_base_url && code_point == '#') {
url->m_scheme = base_url->m_scheme;
url->m_paths = base_url->m_paths;
url->m_query = base_url->m_query;
url->m_fragment = "";
url->m_cannot_be_a_base_url = true;
state = State::Fragment;
} else if (base_url->m_scheme != "file") {
state = State::Relative;
continue;
} else {
state = State::File;
continue;
}
break;
case State::SpecialRelativeOrAuthority:
if (code_point == '/' && get_remaining().starts_with("/"sv)) {
state = State::SpecialAuthorityIgnoreSlashes;
++iterator;
} else {
report_validation_error();
state = State::Relative;
continue;
}
break;
case State::PathOrAuthority:
if (code_point == '/') {
state = State::Authority;
} else {
state = State::Path;
continue;
}
break;
case State::Relative:
url->m_scheme = base_url->m_scheme;
if (code_point == '/') {
state = State::RelativeSlash;
} else if (url->is_special() && code_point == '\\') {
report_validation_error();
state = State::RelativeSlash;
} else {
url->m_username = base_url->m_username;
url->m_password = base_url->m_password;
url->m_host = base_url->m_host;
url->m_port = base_url->m_port;
url->m_paths = base_url->m_paths;
url->m_query = base_url->m_query;
if (code_point == '?') {
url->m_query = "";
state = State::Query;
} else if (code_point == '#') {
url->m_fragment = "";
state = State::Fragment;
} else if (code_point != end_of_file) {
url->m_query = {};
if (url->m_paths.size())
url->m_paths.remove(url->m_paths.size() - 1);
state = State::Path;
continue;
}
}
break;
case State::RelativeSlash:
if (url->is_special() && (code_point == '/' || code_point == '\\')) {
if (code_point == '\\')
report_validation_error();
state = State::SpecialAuthorityIgnoreSlashes;
} else if (code_point == '/') {
state = State::Authority;
} else {
url->m_username = base_url->m_username;
url->m_password = base_url->m_password;
url->m_host = base_url->m_host;
url->m_port = base_url->m_port;
state = State::Path;
continue;
}
break;
case State::SpecialAuthoritySlashes:
if (code_point == '/' && get_remaining().starts_with("/"sv)) {
state = State::SpecialAuthorityIgnoreSlashes;
++iterator;
} else {
report_validation_error();
state = State::SpecialAuthorityIgnoreSlashes;
continue;
}
break;
case State::SpecialAuthorityIgnoreSlashes:
if (code_point != '/' && code_point != '\\') {
state = State::Authority;
continue;
} else {
report_validation_error();
}
break;
case State::Authority:
if (code_point == '@') {
report_validation_error();
if (at_sign_seen) {
auto content = buffer.to_deprecated_string();
buffer.clear();
buffer.append("%40"sv);
buffer.append(content);
}
at_sign_seen = true;
StringBuilder builder;
for (auto c : Utf8View(builder.string_view())) {
if (c == ':' && !password_token_seen) {
password_token_seen = true;
continue;
}
builder.clear();
if (password_token_seen) {
builder.append(url->password());
URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
// NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
url->m_password = URL::percent_decode(builder.string_view());
} else {
builder.append(url->username());
URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
// NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
url->m_username = URL::percent_decode(builder.string_view());
}
}
buffer.clear();
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
if (at_sign_seen && buffer.is_empty()) {
report_validation_error();
return {};
}
// NOTE: This decreases the iterator by the number of code points in buffer plus one.
iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1);
buffer.clear();
state = State::Host;
} else {
buffer.append_code_point(code_point);
}
break;
case State::Host:
case State::Hostname:
if (code_point == ':' && !inside_brackets) {
if (buffer.is_empty()) {
report_validation_error();
return {};
}
auto host = parse_host(buffer.string_view(), !url->is_special());
if (!host.has_value())
return {};
url->m_host = host.release_value();
buffer.clear();
state = State::Port;
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
if (url->is_special() && buffer.is_empty()) {
report_validation_error();
return {};
}
auto host = parse_host(buffer.string_view(), !url->is_special());
if (!host.has_value())
return {};
url->m_host = host.value();
buffer.clear();
state = State::Port;
continue;
} else if (code_point == '[') {
inside_brackets = true;
} else if (code_point == ']') {
inside_brackets = false;
} else {
buffer.append_code_point(code_point);
}
break;
case State::Port:
if (is_ascii_digit(code_point)) {
buffer.append_code_point(code_point);
} else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
if (!buffer.is_empty()) {
auto port = buffer.string_view().to_uint();
if (!port.has_value() || port.value() > 65535) {
report_validation_error();
return {};
}
if (port.value() == URL::default_port_for_scheme(url->scheme()))
url->m_port = {};
else
url->m_port = port.value();
buffer.clear();
}
state = State::PathStart;
continue;
} else {
report_validation_error();
return {};
}
break;
case State::File:
url->m_scheme = "file";
url->m_host = "";
if (code_point == '/' || code_point == '\\') {
if (code_point == '\\')
report_validation_error();
state = State::FileSlash;
} else if (base_url && base_url->m_scheme == "file") {
url->m_host = base_url->m_host;
url->m_paths = base_url->m_paths;
url->m_query = base_url->m_query;
if (code_point == '?') {
url->m_query = "";
state = State::Query;
} else if (code_point == '#') {
url->m_fragment = "";
state = State::Fragment;
} else if (code_point != end_of_file) {
url->m_query = {};
auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
if (!starts_with_windows_drive_letter(substring_from_pointer)) {
if (!url->paths().is_empty() && !(url->scheme() == "file" && url->paths().size() == 1 && is_normalized_windows_drive_letter(url->paths()[0])))
url->m_paths.remove(url->m_paths.size() - 1);
} else {
report_validation_error();
url->m_paths.clear();
}
state = State::Path;
continue;
}
}
break;
case State::FileSlash:
if (code_point == '/' || code_point == '\\') {
if (code_point == '\\')
report_validation_error();
state = State::FileHost;
} else if (base_url && base_url->m_scheme == "file") {
url->m_host = base_url->m_host;
auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0]))
url->append_path(base_url->m_paths[0]);
state = State::Path;
continue;
}
break;
case State::FileHost:
if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
if (is_windows_drive_letter(buffer.string_view())) {
report_validation_error();
state = State::Path;
} else if (buffer.is_empty()) {
url->m_host = "";
state = State::PathStart;
} else {
auto host = parse_host(buffer.string_view(), true);
if (!host.has_value())
return {};
if (host.value() == "localhost")
host = "";
url->m_host = host.release_value();
buffer.clear();
state = State::PathStart;
}
continue;
} else {
buffer.append_code_point(code_point);
}
break;
case State::PathStart:
if (url->is_special()) {
if (code_point == '\\')
report_validation_error();
state = State::Path;
if (code_point != '/' && code_point != '\\')
continue;
} else if (code_point == '?') {
url->m_query = "";
state = State::Query;
} else if (code_point == '#') {
url->m_fragment = "";
state = State::Fragment;
} else if (code_point != end_of_file) {
state = State::Path;
if (code_point != '/')
continue;
}
break;
case State::Path:
if (code_point == end_of_file || code_point == '/' || (url->is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
if (url->is_special() && code_point == '\\')
report_validation_error();
if (is_double_dot_path_segment(buffer.string_view())) {
if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0])))
url->m_paths.remove(url->m_paths.size() - 1);
if (code_point != '/' && !(url->is_special() && code_point == '\\'))
url->append_path("");
} else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) {
url->append_path("");
} else if (!is_single_dot_path_segment(buffer.string_view())) {
if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) {
auto drive_letter = buffer.string_view()[0];
buffer.clear();
buffer.append(drive_letter);
buffer.append(':');
}
// NOTE: This needs to be percent decoded since the member variables contain decoded data.
url->append_path(URL::percent_decode(buffer.string_view()));
}
buffer.clear();
if (code_point == '?') {
url->m_query = "";
state = State::Query;
} else if (code_point == '#') {
url->m_fragment = "";
state = State::Fragment;
}
} else {
if (!is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path);
}
break;
case State::CannotBeABaseUrlPath:
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
// NOTE: Verify that the assumptions required for this simplification are correct.
VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty());
if (code_point == '?') {
// NOTE: This needs to be percent decoded since the member variables contain decoded data.
url->m_paths[0] = URL::percent_decode(buffer.string_view());
url->m_query = "";
state = State::Query;
} else if (code_point == '#') {
// NOTE: This needs to be percent decoded since the member variables contain decoded data.
url->m_paths[0] = URL::percent_decode(buffer.string_view());
url->m_fragment = "";
state = State::Fragment;
} else {
if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
if (code_point != end_of_file) {
URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
} else {
// NOTE: This needs to be percent decoded since the member variables contain decoded data.
url->m_paths[0] = URL::percent_decode(buffer.string_view());
}
}
break;
case State::Query:
// https://url.spec.whatwg.org/#query-state
if (code_point == end_of_file || code_point == '#') {
VERIFY(url->m_query == "");
auto query_percent_encode_set = url->is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
url->m_query = percent_encode_after_encoding(buffer.string_view(), query_percent_encode_set);
buffer.clear();
if (code_point == '#') {
url->m_fragment = "";
state = State::Fragment;
}
} else if (code_point != end_of_file) {
if (!is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
buffer.append_code_point(code_point);
}
break;
case State::Fragment:
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
if (code_point != end_of_file) {
if (!is_url_code_point(code_point) && code_point != '%')
report_validation_error();
// FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
buffer.append_code_point(code_point);
} else {
// NOTE: This needs to be percent decoded since the member variables contain decoded data.
url->m_fragment = URL::percent_decode(buffer.string_view());
buffer.clear();
}
break;
default:
VERIFY_NOT_REACHED();
}
if (iterator.done())
break;
++iterator;
}
url->m_valid = true;
dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url->serialize());
return url.release_value();
}
}