diff --git a/AK/URL.cpp b/AK/URL.cpp
index cbe45e8ad3a..8e6d5f32e01 100644
--- a/AK/URL.cpp
+++ b/AK/URL.cpp
@@ -9,9 +9,34 @@
 #include
 #include
 #include
+#include
 
 namespace AK {
 
+// Returns true if code_point is an ASCII letter (a-z or A-Z).
+constexpr bool is_ascii_alpha(u32 code_point)
+{
+    return ('a' <= code_point && code_point <= 'z') || ('A' <= code_point && code_point <= 'Z');
+}
+
+// Returns true if code_point is an ASCII decimal digit (0-9).
+constexpr bool is_ascii_digit(u32 code_point)
+{
+    return '0' <= code_point && code_point <= '9';
+}
+
+// Returns true if code_point is an ASCII letter or an ASCII decimal digit.
+constexpr bool is_ascii_alphanumeric(u32 code_point)
+{
+    return is_ascii_alpha(code_point) || is_ascii_digit(code_point);
+}
+
+// Returns true if code_point is an ASCII hexadecimal digit (0-9, a-f or A-F).
+constexpr bool is_ascii_hex_digit(u32 code_point)
+{
+    return is_ascii_digit(code_point) || (code_point >= 'a' && code_point <= 'f') || (code_point >= 'A' && code_point <= 'F');
+}
+
 static inline bool is_valid_scheme_character(char ch)
 {
     return ch >= 'a' && ch <= 'z';
@@ -467,4 +492,109 @@ String URL::basename() const
     return LexicalPath(m_path).basename();
 }
 
+// Appends the UTF-8 encoding of code_point to builder, writing every byte as "%XX" with uppercase hex digits.
+// Code points above U+10FFFF have no UTF-8 encoding and trip VERIFY_NOT_REACHED().
+void URL::append_percent_encoded(StringBuilder& builder, u32 code_point)
+{
+    if (code_point <= 0x7f)
+        builder.appendff("%{:02X}", code_point);
+    else if (code_point <= 0x07ff)
+        builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80);
+    else if (code_point <= 0xffff)
+        builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
+    else if (code_point <= 0x10ffff)
+        builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
+    else
+        VERIFY_NOT_REACHED();
+}
+
+// Returns true if code_point has to be percent-encoded under the given set.
+// Most sets are a smaller set plus a few extra characters, hence the recursive calls.
+// https://url.spec.whatwg.org/#c0-control-percent-encode-set
+constexpr bool code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set)
+{
+    switch (set) {
+    case URL::PercentEncodeSet::C0Control:
+        return code_point < 0x20 || code_point > 0x7E;
+    case URL::PercentEncodeSet::Fragment:
+        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"<>`"sv.contains(code_point);
+    case URL::PercentEncodeSet::Query:
+        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"#<>"sv.contains(code_point);
+    case URL::PercentEncodeSet::SpecialQuery:
+        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || code_point == '\'';
+    case URL::PercentEncodeSet::Path:
+        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || "?`{}"sv.contains(code_point);
+    case URL::PercentEncodeSet::Userinfo:
+        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(code_point);
+    case URL::PercentEncodeSet::Component:
+        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(code_point);
+    case URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded:
+        // The URL spec defines this set as the component set plus !, ', (, ) and ~, which leaves only
+        // ASCII alphanumerics and the characters *-._ unencoded.
+        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Component) || "!'()~"sv.contains(code_point);
+    case URL::PercentEncodeSet::EncodeURI:
+        // NOTE: This is the same percent encode set that JS encodeURI() uses.
+        // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
+        // '~' (0x7E) is in the unescaped list below, so only code points strictly above it are always encoded.
+        return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(code_point));
+    default:
+        VERIFY_NOT_REACHED();
+    }
+}
+
+// Appends code_point to builder: percent-encoded if it belongs to the given set, verbatim otherwise.
+void URL::append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, URL::PercentEncodeSet set)
+{
+    if (code_point_is_in_percent_encode_set(code_point, set))
+        append_percent_encoded(builder, code_point);
+    else
+        builder.append_code_point(code_point);
+}
+
+// Returns input with every code point that belongs to the given set percent-encoded; input is iterated as UTF-8.
+String URL::percent_encode(const StringView& input, URL::PercentEncodeSet set)
+{
+    StringBuilder builder;
+    for (auto code_point : Utf8View(input)) {
+        append_percent_encoded_if_necessary(builder, code_point, set);
+    }
+    return builder.to_string();
+}
+
+// Returns the numeric value (0-15) of an ASCII hex digit; any other input trips VERIFY_NOT_REACHED().
+constexpr u8 parse_hex_digit(u8 digit)
+{
+    if (digit >= '0' && digit <= '9')
+        return digit - '0';
+    if (digit >= 'a' && digit <= 'f')
+        return digit - 'a' + 10;
+    if (digit >= 'A' && digit <= 'F')
+        return digit - 'A' + 10;
+    VERIFY_NOT_REACHED();
+}
+
+// Returns input with every "%XX" sequence (XX being two hex digits) replaced by the byte it encodes.
+// A '%' that is not followed by two hex digits is copied through unchanged.
+String URL::percent_decode(const StringView& input)
+{
+    if (!input.contains('%'))
+        return input;
+    StringBuilder builder;
+    Utf8View utf8_view(input);
+    for (auto it = utf8_view.begin(); !it.done(); ++it) {
+        if (*it != '%') {
+            builder.append_code_point(*it);
+        } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
+            builder.append_code_point(*it);
+        } else {
+            ++it;
+            u8 byte = parse_hex_digit(*it) << 4;
+            ++it;
+            byte += parse_hex_digit(*it);
+            builder.append(byte);
+        }
+    }
+    return builder.to_string();
+}
+
 }
diff --git a/AK/URL.h b/AK/URL.h
index 2ce309239d9..a14dd7bbd95 100644
--- a/AK/URL.h
+++ b/AK/URL.h
@@ -17,6 +17,20 @@ namespace AK {
 
 class URL {
 public:
+    // Selects which characters percent_encode() escapes. Apart from EncodeURI (the JS encodeURI() set),
+    // the enumerators are named after the percent-encode sets of the WHATWG URL specification.
+    enum class PercentEncodeSet {
+        C0Control,
+        Fragment,
+        Query,
+        SpecialQuery,
+        Path,
+        Userinfo,
+        Component,
+        ApplicationXWWWFormUrlencoded,
+        EncodeURI
+    };
+
     URL() = default;
     URL(const StringView&);
     URL(const char* string)
@@ -67,6 +81,11 @@ public:
     static bool scheme_requires_port(const StringView&);
     static u16 default_port_for_scheme(const StringView&);
 
+    // Percent-encodes every code point of input that belongs to the given set.
+    static String percent_encode(const StringView& input, PercentEncodeSet set = PercentEncodeSet::Userinfo);
+    // Decodes every "%XX" sequence in input; a '%' not followed by two hex digits is kept as-is.
+    static String percent_decode(const StringView& input);
+
     bool operator==(const URL& other) const
     {
         if (this == &other)
@@ -78,6 +97,9 @@ private:
     bool parse(const StringView&);
     bool compute_validity() const;
 
+    static void append_percent_encoded_if_necessary(StringBuilder&, u32 code_point, PercentEncodeSet set = PercentEncodeSet::Userinfo);
+    static void append_percent_encoded(StringBuilder&, u32 code_point);
+
     bool m_valid { false };
     u16 m_port { 0 };
     bool m_data_payload_is_base64 { false };