AK: Implement more conforming URL percent encode/decode mechanism

This adds a few new functions to percent encode/decode strings according
to the URL specification. The functions allow specifying a
PercentEncodeSet, which is defined by the specification. It will be used
to replace the current urlencode() and urldecode() functions in a
further commit.

This commit adds a few duplicate helper functions in the URL class, such
as is_digit() and is_ascii_digit(). This will be cleaned up as soon as
the upcoming new URL parser replaces the current one.
This commit is contained in:
Max Wipfli 2021-05-25 13:50:03 +02:00 committed by Andreas Kling
parent 0e4f7aa8e8
commit 2a6c9bc5f7
Notes: sideshowbarker 2024-07-18 17:04:44 +09:00
2 changed files with 132 additions and 0 deletions

View File

@ -9,9 +9,30 @@
#include <AK/StringBuilder.h>
#include <AK/URL.h>
#include <AK/URLParser.h>
#include <AK/Utf8View.h>
namespace AK {
// Returns true if the code point is an ASCII letter, i.e. within 'A'..'Z' or 'a'..'z'.
constexpr bool is_ascii_alpha(u32 code_point)
{
    if (code_point >= 'A' && code_point <= 'Z')
        return true;
    return code_point >= 'a' && code_point <= 'z';
}
// Returns true if the code point is one of the ASCII decimal digits '0'..'9'.
constexpr bool is_ascii_digit(u32 code_point)
{
    return code_point >= '0' && code_point <= '9';
}
// Returns true if the code point is an ASCII letter or an ASCII decimal digit.
constexpr bool is_ascii_alphanumeric(u32 code_point)
{
    if (is_ascii_alpha(code_point))
        return true;
    return is_ascii_digit(code_point);
}
// Returns true if the code point is a hexadecimal digit: '0'..'9', 'a'..'f' or 'A'..'F'.
constexpr bool is_ascii_hex_digit(u32 code_point)
{
    if (is_ascii_digit(code_point))
        return true;
    if ('a' <= code_point && code_point <= 'f')
        return true;
    return 'A' <= code_point && code_point <= 'F';
}
static inline bool is_valid_scheme_character(char ch)
{
return ch >= 'a' && ch <= 'z';
@ -467,4 +488,97 @@ String URL::basename() const
return LexicalPath(m_path).basename();
}
// Appends the UTF-8 encoding of `code_point` to `builder`, writing every byte as "%XX"
// with two uppercase hexadecimal digits. Code points above 0x10ffff trip VERIFY_NOT_REACHED().
void URL::append_percent_encoded(StringBuilder& builder, u32 code_point)
{
    // A UTF-8 continuation byte: six payload bits taken at `shift`, tagged with the 10xxxxxx prefix.
    auto continuation_byte = [code_point](unsigned shift) {
        return ((code_point >> shift) & 0x3f) | 0x80;
    };

    // One byte: the code point itself.
    if (code_point <= 0x7f) {
        builder.appendff("%{:02X}", code_point);
        return;
    }
    // Two bytes: 110xxxxx 10xxxxxx.
    if (code_point <= 0x07ff) {
        builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, continuation_byte(0));
        return;
    }
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    if (code_point <= 0xffff) {
        builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, continuation_byte(6), continuation_byte(0));
        return;
    }
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    if (code_point <= 0x10ffff) {
        builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, continuation_byte(12), continuation_byte(6), continuation_byte(0));
        return;
    }
    VERIFY_NOT_REACHED();
}
// Returns true if `code_point` has to be percent-encoded under the percent-encode set `set`.
// The URL sets are defined in https://url.spec.whatwg.org/#percent-encoded-bytes, starting
// with https://url.spec.whatwg.org/#c0-control-percent-encode-set. Each set extends a smaller
// one, which is why most cases recurse into the set they are built upon.
// NOTE: Every contains() lookup below is only reached for code points <= 0x7E (anything larger
//       short-circuits to true first), so handing the code point over as a single character
//       does not lose information.
constexpr bool code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set)
{
    switch (set) {
    case URL::PercentEncodeSet::C0Control:
        // C0 controls (U+0000..U+001F) and everything above U+007E (~).
        return code_point < 0x20 || code_point > 0x7E;
    case URL::PercentEncodeSet::Fragment:
        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"<>`"sv.contains(code_point);
    case URL::PercentEncodeSet::Query:
        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"#<>"sv.contains(code_point);
    case URL::PercentEncodeSet::SpecialQuery:
        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || code_point == '\'';
    case URL::PercentEncodeSet::Path:
        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || "?`{}"sv.contains(code_point);
    case URL::PercentEncodeSet::Userinfo:
        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(code_point);
    case URL::PercentEncodeSet::Component:
        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(code_point);
    case URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded:
        // The spec defines this set as the component set plus '!', '\'', '(', ')' and '~'.
        // Those characters are *added* to the encode set; ASCII alphanumerics and "*-._"
        // are what remains unencoded.
        return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Component) || "!'()~"sv.contains(code_point);
    case URL::PercentEncodeSet::EncodeURI:
        // NOTE: This is the same percent encode set that JS encodeURI() uses.
        // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
        // '~' (0x7E) is one of the characters encodeURI() leaves untouched, so only code points
        // strictly above it are encoded unconditionally.
        return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(code_point));
    default:
        VERIFY_NOT_REACHED();
    }
}
// Appends `code_point` to `builder`: percent-encoded when it belongs to `set`,
// otherwise as the plain code point.
void URL::append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, URL::PercentEncodeSet set)
{
    if (!code_point_is_in_percent_encode_set(code_point, set)) {
        builder.append_code_point(code_point);
        return;
    }
    append_percent_encoded(builder, code_point);
}
// Walks `input` code point by code point (via Utf8View) and returns a new string in which
// every code point belonging to `set` has been percent-encoded.
String URL::percent_encode(const StringView& input, URL::PercentEncodeSet set)
{
    StringBuilder encoded;
    for (auto current_code_point : Utf8View(input))
        append_percent_encoded_if_necessary(encoded, current_code_point, set);
    return encoded.to_string();
}
// Converts one ASCII hexadecimal digit ('0'..'9', 'a'..'f', 'A'..'F') to its value 0..15.
// Any other input trips VERIFY_NOT_REACHED(), so callers must validate the digit first.
constexpr u8 parse_hex_digit(u8 digit)
{
    if ('A' <= digit && digit <= 'F')
        return 10 + (digit - 'A');
    if ('a' <= digit && digit <= 'f')
        return 10 + (digit - 'a');
    if ('0' <= digit && digit <= '9')
        return digit - '0';
    VERIFY_NOT_REACHED();
}
// Returns `input` with every "%XX" sequence (XX being two hexadecimal digits) replaced by the
// byte it denotes. A '%' that is not followed by two hex digits is copied through unchanged,
// as is every other code point.
String URL::percent_decode(const StringView& input)
{
    // Without any '%' there is nothing to decode, so hand the input back as-is.
    if (!input.contains('%'))
        return input;

    StringBuilder decoded;
    Utf8View code_points(input);
    for (auto it = code_points.begin(); !it.done(); ++it) {
        // Only a '%' directly followed by two hex digits forms an escape sequence.
        bool is_escape_sequence = *it == '%'
            && is_ascii_hex_digit(it.peek(1).value_or(0))
            && is_ascii_hex_digit(it.peek(2).value_or(0));
        if (!is_escape_sequence) {
            builder_passthrough:
            decoded.append_code_point(*it);
            continue;
        }

        // Step over the '%' and fold the two digits into one byte, high nibble first.
        ++it;
        u8 decoded_byte = parse_hex_digit(*it) << 4;
        ++it;
        decoded_byte += parse_hex_digit(*it);
        decoded.append(decoded_byte);
    }
    return decoded.to_string();
}
}

View File

@ -17,6 +17,18 @@ namespace AK {
class URL {
public:
// Selects which set of code points percent_encode() escapes.
// The first seven values, plus ApplicationXWWWFormUrlencoded, are named after the
// percent-encode sets of the URL specification (https://url.spec.whatwg.org/#percent-encoded-bytes);
// EncodeURI is the set used by JavaScript's encodeURI().
enum class PercentEncodeSet {
C0Control,
Fragment,
Query,
SpecialQuery,
Path,
Userinfo,
Component,
ApplicationXWWWFormUrlencoded,
EncodeURI
};
URL() = default;
URL(const StringView&);
URL(const char* string)
@ -67,6 +79,9 @@ public:
static bool scheme_requires_port(const StringView&);
static u16 default_port_for_scheme(const StringView&);
// Returns `input` with every code point that belongs to `set` percent-encoded.
// `set` defaults to PercentEncodeSet::Userinfo.
static String percent_encode(const StringView& input, PercentEncodeSet set = PercentEncodeSet::Userinfo);
// Returns `input` with each "%XX" sequence (XX = two hexadecimal digits) replaced by the byte it denotes.
static String percent_decode(const StringView& input);
bool operator==(const URL& other) const
{
if (this == &other)
@ -78,6 +93,9 @@ private:
bool parse(const StringView&);
bool compute_validity() const;
// Appends `code_point` to the builder, percent-encoding it only when it belongs to `set`
// (PercentEncodeSet::Userinfo by default).
static void append_percent_encoded_if_necessary(StringBuilder&, u32 code_point, PercentEncodeSet set = PercentEncodeSet::Userinfo);
// Appends `code_point` to the builder in percent-encoded form, unconditionally.
static void append_percent_encoded(StringBuilder&, u32 code_point);
bool m_valid { false };
u16 m_port { 0 };
bool m_data_payload_is_base64 { false };