LibURL+LibWeb: Move data URL processing to LibWeb's fetch infrastructure

Data URL processing is a Fetch abstract operation (AO) that is only used by
LibWeb in the context of fetch tasks. Move it to LibWeb alongside the other
fetch methods.

The main reason for this is that it requires the use of other LibWeb AOs
such as the forgiving Base64 decoder and MIME sniffing. These AOs aren't
available within LibURL.
This commit is contained in:
Timothy Flynn 2024-03-24 08:37:33 -04:00 committed by Andreas Kling
parent 2118cdfcaa
commit 24ecf31ff5
Notes: sideshowbarker 2024-07-16 20:39:14 +09:00
10 changed files with 219 additions and 187 deletions

View File

@ -12,6 +12,15 @@ unittest("TestCSSPixels") {
deps = [ "//Userland/Libraries/LibWeb" ]
}
# Builds TestFetchURL.cpp as a unit test that depends on both LibURL and LibWeb.
unittest("TestFetchURL") {
include_dirs = [ "//Userland/Libraries" ]
sources = [ "TestFetchURL.cpp" ]
deps = [
"//Userland/Libraries/LibURL",
"//Userland/Libraries/LibWeb",
]
}
unittest("TestHTMLTokenizer") {
include_dirs = [ "//Userland/Libraries" ]
sources = [ "TestHTMLTokenizer.cpp" ]
@ -41,6 +50,7 @@ group("LibWeb") {
deps = [
":TestCSSIDSpeed",
":TestCSSPixels",
":TestFetchURL",
":TestHTMLTokenizer",
":TestMicrosyntax",
":TestMimeSniff",

View File

@ -238,109 +238,6 @@ TEST_CASE(mailto_url_with_subject)
EXPECT_EQ(url.serialize(), "mailto:mail@example.com?subject=test");
}
// A data: URL with an explicit MIME type and a plain body: the URL must parse as a host-less
// "data" URL, round-trip through serialize(), and process to that MIME type and body.
TEST_CASE(data_url)
{
URL::URL url("data:text/html,test"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:text/html,test");
auto data_url = TRY_OR_FAIL(url.process_data_url());
EXPECT_EQ(data_url.mime_type, "text/html");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
// A data: URL with nothing before the comma: the processed MIME type is expected to be the
// "text/plain;charset=US-ASCII" default.
TEST_CASE(data_url_default_mime_type)
{
URL::URL url("data:,test"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:,test");
auto data_url = TRY_OR_FAIL(url.process_data_url());
EXPECT_EQ(data_url.mime_type, "text/plain;charset=US-ASCII");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
// A percent-encoded body: serialize() is expected to keep the escapes as written, while the
// processed body has the valid escapes (%20, %2C) decoded and the malformed ones (%0X, %X0)
// left verbatim.
TEST_CASE(data_url_encoded)
{
URL::URL url("data:text/html,Hello%20friends%2C%0X%X0"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:text/html,Hello%20friends%2C%0X%X0");
auto data_url = TRY_OR_FAIL(url.process_data_url());
EXPECT_EQ(data_url.mime_type, "text/html");
EXPECT_EQ(StringView(data_url.body.bytes()), "Hello friends,%0X%X0"sv);
}
// A ";base64" data: URL: the marker must be stripped from the MIME type and the body must be
// base64-decoded ("dGVzdA==" -> "test").
TEST_CASE(data_url_base64_encoded)
{
URL::URL url("data:text/html;base64,dGVzdA=="sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:text/html;base64,dGVzdA==");
auto data_url = TRY_OR_FAIL(url.process_data_url());
EXPECT_EQ(data_url.mime_type, "text/html");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
// A ";base64" data: URL with no MIME type in front of the marker: the body is still decoded
// and the MIME type is expected to be the "text/plain;charset=US-ASCII" default.
TEST_CASE(data_url_base64_encoded_default_mime_type)
{
URL::URL url("data:;base64,dGVzdA=="sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:;base64,dGVzdA==");
auto data_url = TRY_OR_FAIL(url.process_data_url());
EXPECT_EQ(data_url.mime_type, "text/plain;charset=US-ASCII");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
// Literal spaces around the MIME type, the ';', a mixed-case "bAsE64" marker, and inside the
// payload: processing is still expected to yield "text/html" and "test".
// The expected serialization differs from the input only by the missing trailing space.
TEST_CASE(data_url_base64_encoded_with_whitespace)
{
URL::URL url("data: text/html ; bAsE64 , dGVz dA== "sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data: text/html ; bAsE64 , dGVz dA==");
auto data_url = TRY_OR_FAIL(url.process_data_url());
EXPECT_EQ(data_url.mime_type, "text/html");
EXPECT_EQ(StringView(data_url.body.bytes()), "test");
}
// A base64 payload interleaved with percent-encoded spaces (%20) and CRLF pairs (%0D%0A):
// the body is expected to decode to "d4 = 'four';" regardless.
TEST_CASE(data_url_base64_encoded_with_inline_whitespace)
{
URL::URL url("data:text/javascript;base64,%20ZD%20Qg%0D%0APS%20An%20Zm91cic%0D%0A%207%20"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
auto data_url = TRY_OR_FAIL(url.process_data_url());
EXPECT_EQ(data_url.mime_type, "text/javascript");
EXPECT_EQ(StringView(data_url.body.bytes()), "d4 = 'four';"sv);
}
// Completing a data: URL with "#a" gives it the fragment "a"; the processed body is still
// expected to be just "test", i.e. the fragment does not end up in the body.
TEST_CASE(data_url_completed_with_fragment)
{
auto url = URL::URL("data:text/plain,test"sv).complete_url("#a"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT_EQ(url.fragment(), "a");
EXPECT(url.host().has<Empty>());
auto data_url = TRY_OR_FAIL(url.process_data_url());
EXPECT_EQ(data_url.mime_type, "text/plain");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
TEST_CASE(trailing_slash_with_complete_url)
{
EXPECT_EQ(URL::URL("http://a/b/"sv).complete_url("c/"sv).serialize(), "http://a/b/c/");

View File

@ -1,6 +1,7 @@
set(TEST_SOURCES
TestCSSIDSpeed.cpp
TestCSSPixels.cpp
TestFetchURL.cpp
TestHTMLTokenizer.cpp
TestMicrosyntax.cpp
TestMimeSniff.cpp
@ -11,4 +12,6 @@ foreach(source IN LISTS TEST_SOURCES)
serenity_test("${source}" LibWeb LIBS LibWeb)
endforeach()
# TestFetchURL additionally links LibURL, on top of the LibWeb libraries passed to serenity_test() above.
target_link_libraries(TestFetchURL PRIVATE LibURL)
install(FILES tokenizer-test.html DESTINATION usr/Tests/LibWeb)

View File

@ -0,0 +1,113 @@
/*
* Copyright (c) 2023, Karol Kosek <krkk@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <LibURL/URL.h>
#include <LibWeb/Fetch/Infrastructure/URL.h>
// A data: URL with an explicit MIME type and a plain body: the URL must parse as a host-less
// "data" URL, round-trip through serialize(), and process to that MIME type and body.
TEST_CASE(data_url)
{
URL::URL url("data:text/html,test"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:text/html,test");
auto data_url = TRY_OR_FAIL(Web::Fetch::Infrastructure::process_data_url(url));
EXPECT_EQ(data_url.mime_type, "text/html");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
// A data: URL with nothing before the comma: the processed MIME type is expected to be the
// "text/plain;charset=US-ASCII" default.
TEST_CASE(data_url_default_mime_type)
{
URL::URL url("data:,test"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:,test");
auto data_url = TRY_OR_FAIL(Web::Fetch::Infrastructure::process_data_url(url));
EXPECT_EQ(data_url.mime_type, "text/plain;charset=US-ASCII");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
// A percent-encoded body: serialize() is expected to keep the escapes as written, while the
// processed body has the valid escapes (%20, %2C) decoded and the malformed ones (%0X, %X0)
// left verbatim.
TEST_CASE(data_url_encoded)
{
URL::URL url("data:text/html,Hello%20friends%2C%0X%X0"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:text/html,Hello%20friends%2C%0X%X0");
auto data_url = TRY_OR_FAIL(Web::Fetch::Infrastructure::process_data_url(url));
EXPECT_EQ(data_url.mime_type, "text/html");
EXPECT_EQ(StringView(data_url.body.bytes()), "Hello friends,%0X%X0"sv);
}
// A ";base64" data: URL: the marker must be stripped from the MIME type and the body must be
// base64-decoded ("dGVzdA==" -> "test").
TEST_CASE(data_url_base64_encoded)
{
URL::URL url("data:text/html;base64,dGVzdA=="sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:text/html;base64,dGVzdA==");
auto data_url = TRY_OR_FAIL(Web::Fetch::Infrastructure::process_data_url(url));
EXPECT_EQ(data_url.mime_type, "text/html");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
// A ";base64" data: URL with no MIME type in front of the marker: the body is still decoded
// and the MIME type is expected to be the "text/plain;charset=US-ASCII" default.
TEST_CASE(data_url_base64_encoded_default_mime_type)
{
URL::URL url("data:;base64,dGVzdA=="sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data:;base64,dGVzdA==");
auto data_url = TRY_OR_FAIL(Web::Fetch::Infrastructure::process_data_url(url));
EXPECT_EQ(data_url.mime_type, "text/plain;charset=US-ASCII");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}
// Literal spaces around the MIME type, the ';', a mixed-case "bAsE64" marker, and inside the
// payload: processing is still expected to yield "text/html" and "test".
// The expected serialization differs from the input only by the missing trailing space.
TEST_CASE(data_url_base64_encoded_with_whitespace)
{
URL::URL url("data: text/html ; bAsE64 , dGVz dA== "sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
EXPECT_EQ(url.serialize(), "data: text/html ; bAsE64 , dGVz dA==");
auto data_url = TRY_OR_FAIL(Web::Fetch::Infrastructure::process_data_url(url));
EXPECT_EQ(data_url.mime_type, "text/html");
EXPECT_EQ(StringView(data_url.body.bytes()), "test");
}
// A base64 payload interleaved with percent-encoded spaces (%20) and CRLF pairs (%0D%0A):
// the body is expected to decode to "d4 = 'four';" regardless.
TEST_CASE(data_url_base64_encoded_with_inline_whitespace)
{
URL::URL url("data:text/javascript;base64,%20ZD%20Qg%0D%0APS%20An%20Zm91cic%0D%0A%207%20"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT(url.host().has<Empty>());
auto data_url = TRY_OR_FAIL(Web::Fetch::Infrastructure::process_data_url(url));
EXPECT_EQ(data_url.mime_type, "text/javascript");
EXPECT_EQ(StringView(data_url.body.bytes()), "d4 = 'four';"sv);
}
// Completing a data: URL with "#a" gives it the fragment "a"; the processed body is still
// expected to be just "test", i.e. the fragment does not end up in the body.
TEST_CASE(data_url_completed_with_fragment)
{
auto url = URL::URL("data:text/plain,test"sv).complete_url("#a"sv);
EXPECT(url.is_valid());
EXPECT_EQ(url.scheme(), "data");
EXPECT_EQ(url.fragment(), "a");
EXPECT(url.host().has<Empty>());
auto data_url = TRY_OR_FAIL(Web::Fetch::Infrastructure::process_data_url(url));
EXPECT_EQ(data_url.mime_type, "text/plain");
EXPECT_EQ(StringView(data_url.body.bytes()), "test"sv);
}

View File

@ -419,80 +419,6 @@ bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
return serialize(exclude_fragments) == other.serialize(exclude_fragments);
}
// https://fetch.spec.whatwg.org/#data-url-processor
//
// Runs the data: URL processor on this URL, whose scheme must be "data" (enforced with VERIFY).
// Returns a DataURL carrying the MIME type string and the decoded body on success. Returns an
// error when there is no ',' after the MIME type, when a ";base64" body fails to decode, or when
// the MIME type cannot be turned into a String.
ErrorOr<DataURL> URL::process_data_url() const
{
    // 1. Assert: dataURL's scheme is "data".
    VERIFY(scheme() == "data");

    // 2. Let input be the result of running the URL serializer on dataURL with exclude fragment set to true.
    auto input = serialize(ExcludeFragment::Yes);

    // 3. Remove the leading "data:" from input.
    input = input.substring("data:"sv.length());

    // 4. Let position point at the start of input.
    // 5. Let mimeType be the result of collecting a sequence of code points that are not equal to U+002C (,), given position.
    // NOTE: mime_type and encoded_body below are views into input, so input must outlive them.
    auto position = input.find(',');
    auto mime_type = input.substring_view(0, position.value_or(input.length()));

    // 6. Strip leading and trailing ASCII whitespace from mimeType.
    mime_type = mime_type.trim_whitespace(TrimMode::Both);

    // 7. If position is past the end of input, then return failure.
    if (!position.has_value())
        return Error::from_string_literal("Missing a comma character");

    // 8. Advance position by 1.
    position = position.value() + 1;

    // 9. Let encodedBody be the remainder of input.
    auto encoded_body = input.substring_view(position.value());

    // 10. Let body be the percent-decoding of encodedBody.
    auto body = percent_decode(encoded_body).to_byte_buffer();

    // 11. If mimeType ends with U+003B (;), followed by zero or more U+0020 SPACE, followed by an ASCII case-insensitive match for "base64", then:
    constexpr auto base64_marker = "base64"sv;
    if (mime_type.ends_with(base64_marker, CaseSensitivity::CaseInsensitive)) {
        // Peel the marker and any spaces in front of it off a scratch view first; mimeType itself is
        // only updated once the preceding ';' has been confirmed.
        auto trimmed_substring_view = mime_type.substring_view(0, mime_type.length() - base64_marker.length());
        trimmed_substring_view = trimmed_substring_view.trim(" "sv, TrimMode::Right);
        if (trimmed_substring_view.ends_with(';')) {
            // 1. Let stringBody be the isomorphic decode of body.
            auto string_body = StringView(body);

            // 2. Set body to the forgiving-base64 decode of stringBody.
            // FIXME: Check if it's really forgiving.
            // 3. If body is failure, then return failure.
            body = TRY(decode_base64(string_body));

            // 4. Remove the last 6 code points from mimeType.
            // 5. Remove trailing U+0020 SPACE code points from mimeType, if any.
            // 6. Remove the last U+003B (;) from mimeType.
            mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
        }
    }

    // 12. If mimeType starts with ";", then prepend "text/plain" to mimeType.
    // NOTE: builder lives at function scope because mime_type may end up viewing its buffer.
    StringBuilder builder;
    if (mime_type.starts_with(';')) {
        builder.append("text/plain"sv);
        builder.append(mime_type);
        mime_type = builder.string_view();
    }

    // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
    // FIXME: 13. Let mimeTypeRecord be the result of parsing mimeType.
    auto mime_type_record = mime_type.trim("\n\r\t "sv, TrimMode::Both);

    // 14. If mimeTypeRecord is failure, then set mimeTypeRecord to text/plain;charset=US-ASCII.
    if (mime_type_record.is_empty())
        mime_type_record = "text/plain;charset=US-ASCII"sv;

    // 15. Return a new data: URL struct whose MIME type is mimeTypeRecord and body is body.
    // The body is moved rather than copied, since it is not used again after this point.
    return DataURL { TRY(String::from_utf8(mime_type_record)), move(body) };
}
void append_percent_encoded(StringBuilder& builder, u32 code_point)
{
if (code_point <= 0x7f)

View File

@ -58,11 +58,6 @@ enum class ApplyPercentDecoding {
No
};
// Result of processing a data: URL: a MIME type string together with the body bytes.
struct DataURL {
// MIME type of the resource, stored as a string.
String mime_type;
// Body of the resource as raw bytes.
ByteBuffer body;
};
void append_percent_encoded_if_necessary(StringBuilder&, u32 code_point, PercentEncodeSet set = PercentEncodeSet::Userinfo);
void append_percent_encoded(StringBuilder&, u32 code_point);
bool code_point_is_in_percent_encode_set(u32 code_point, PercentEncodeSet);
@ -143,8 +138,6 @@ public:
URL complete_url(StringView) const;
ErrorOr<DataURL> process_data_url() const;
bool operator==(URL const& other) const { return equals(other, ExcludeFragment::No); }
String const& raw_username() const { return m_username; }

View File

@ -805,7 +805,7 @@ WebIDL::ExceptionOr<JS::NonnullGCPtr<PendingResponse>> scheme_fetch(JS::Realm& r
// -> "data"
else if (request->current_url().scheme() == "data"sv) {
// 1. Let dataURLStruct be the result of running the data: URL processor on request's current URL.
auto data_url_struct = request->current_url().process_data_url();
auto data_url_struct = Infrastructure::process_data_url(request->current_url());
// 2. If dataURLStruct is failure, then return a network error.
if (data_url_struct.is_error())

View File

@ -1,10 +1,13 @@
/*
* Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
* Copyright (c) 2022, Andreas Kling <kling@serenityos.org>
* Copyright (c) 2023, Karol Kosek <krkk@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Base64.h>
#include <LibURL/URL.h>
#include <LibWeb/Fetch/Infrastructure/URL.h>
namespace Web::Fetch::Infrastructure {
@ -30,4 +33,79 @@ bool is_http_or_https_scheme(StringView scheme)
return any_of(HTTP_SCHEMES, [&](auto http_scheme) { return scheme == http_scheme; });
}
// https://fetch.spec.whatwg.org/#data-url-processor
//
// Runs the data: URL processor on data_url, whose scheme must be "data" (enforced with VERIFY).
// Returns a DataURL carrying the MIME type string and the decoded body on success. Returns an
// error when there is no ',' after the MIME type, when a ";base64" body fails to decode, or when
// the MIME type cannot be turned into a String.
ErrorOr<DataURL> process_data_url(URL::URL const& data_url)
{
    // 1. Assert: dataURL's scheme is "data".
    VERIFY(data_url.scheme() == "data");

    // 2. Let input be the result of running the URL serializer on dataURL with exclude fragment set to true.
    // NOTE: The serialized string is kept in its own variable because every view below points into it.
    auto input_serialized = data_url.serialize(URL::ExcludeFragment::Yes);
    StringView input = input_serialized;

    // 3. Remove the leading "data:" from input.
    input = input.substring_view("data:"sv.length());

    // 4. Let position point at the start of input.
    // 5. Let mimeType be the result of collecting a sequence of code points that are not equal to U+002C (,), given position.
    auto position = input.find(',');
    auto mime_type = input.substring_view(0, position.value_or(input.length()));

    // 6. Strip leading and trailing ASCII whitespace from mimeType.
    mime_type = mime_type.trim_whitespace(TrimMode::Both);

    // 7. If position is past the end of input, then return failure.
    if (!position.has_value())
        return Error::from_string_literal("Missing a comma character");

    // 8. Advance position by 1.
    position = position.value() + 1;

    // 9. Let encodedBody be the remainder of input.
    auto encoded_body = input.substring_view(position.value());

    // 10. Let body be the percent-decoding of encodedBody.
    auto body = URL::percent_decode(encoded_body).to_byte_buffer();

    // 11. If mimeType ends with U+003B (;), followed by zero or more U+0020 SPACE, followed by an ASCII case-insensitive match for "base64", then:
    constexpr auto base64_marker = "base64"sv;
    if (mime_type.ends_with(base64_marker, CaseSensitivity::CaseInsensitive)) {
        // Peel the marker and any spaces in front of it off a scratch view first; mimeType itself is
        // only updated once the preceding ';' has been confirmed.
        auto trimmed_substring_view = mime_type.substring_view(0, mime_type.length() - base64_marker.length());
        trimmed_substring_view = trimmed_substring_view.trim(" "sv, TrimMode::Right);
        if (trimmed_substring_view.ends_with(';')) {
            // 1. Let stringBody be the isomorphic decode of body.
            auto string_body = StringView(body);

            // 2. Set body to the forgiving-base64 decode of stringBody.
            // FIXME: Check if it's really forgiving.
            // 3. If body is failure, then return failure.
            body = TRY(decode_base64(string_body));

            // 4. Remove the last 6 code points from mimeType.
            // 5. Remove trailing U+0020 SPACE code points from mimeType, if any.
            // 6. Remove the last U+003B (;) from mimeType.
            mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
        }
    }

    // 12. If mimeType starts with ";", then prepend "text/plain" to mimeType.
    // NOTE: builder lives at function scope because mime_type may end up viewing its buffer.
    StringBuilder builder;
    if (mime_type.starts_with(';')) {
        builder.append("text/plain"sv);
        builder.append(mime_type);
        mime_type = builder.string_view();
    }

    // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
    // FIXME: 13. Let mimeTypeRecord be the result of parsing mimeType.
    auto mime_type_record = mime_type.trim("\n\r\t "sv, TrimMode::Both);

    // 14. If mimeTypeRecord is failure, then set mimeTypeRecord to text/plain;charset=US-ASCII.
    if (mime_type_record.is_empty())
        mime_type_record = "text/plain;charset=US-ASCII"sv;

    // 15. Return a new data: URL struct whose MIME type is mimeTypeRecord and body is body.
    // The body is moved rather than copied, since it is not used again after this point.
    return DataURL { TRY(String::from_utf8(mime_type_record)), move(body) };
}
}

View File

@ -8,7 +8,11 @@
#pragma once
#include <AK/Array.h>
#include <LibURL/URL.h>
#include <AK/ByteBuffer.h>
#include <AK/Error.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <LibURL/Forward.h>
namespace Web::Fetch::Infrastructure {
@ -33,8 +37,15 @@ inline constexpr Array FETCH_SCHEMES = {
"resource"sv
};
// https://fetch.spec.whatwg.org/#data-url-struct
// Result type of process_data_url(): a MIME type string together with the body bytes.
struct DataURL {
// MIME type of the resource, stored as a string.
String mime_type;
// Body of the resource as raw bytes.
ByteBuffer body;
};
[[nodiscard]] bool is_local_url(URL::URL const&);
[[nodiscard]] bool is_fetch_scheme(StringView);
[[nodiscard]] bool is_http_or_https_scheme(StringView);
// https://fetch.spec.whatwg.org/#data-url-processor
// Processes the given data: URL into a DataURL (MIME type and body), or returns an error on failure.
ErrorOr<DataURL> process_data_url(URL::URL const&);
}

View File

@ -13,6 +13,7 @@
#include <LibCore/Resource.h>
#include <LibWeb/Cookie/Cookie.h>
#include <LibWeb/Cookie/ParsedCookie.h>
#include <LibWeb/Fetch/Infrastructure/URL.h>
#include <LibWeb/Loader/ContentFilter.h>
#include <LibWeb/Loader/GeneratedPagesLoader.h>
#include <LibWeb/Loader/LoadRequest.h>
@ -251,7 +252,7 @@ void ResourceLoader::load(LoadRequest& request, SuccessCallback success_callback
}
if (url.scheme() == "data") {
auto data_url_or_error = url.process_data_url();
auto data_url_or_error = Fetch::Infrastructure::process_data_url(url);
if (data_url_or_error.is_error()) {
auto error_message = data_url_or_error.error().string_literal();
log_failure(request, error_message);