ladybird/Userland/Libraries/LibWeb/DOM/DocumentLoading.cpp
Timothy Flynn 1ffda6a805 LibWeb: Propagate OOM in Body::fully_read() through its error callback
Fetched bodies can be on the order of gigabytes, so rather than crashing
when we hit OOM here, we can simply invoke the error callback with a DOM
exception. We use "UnknownError" here as the spec directly supports this
for OOM errors:

    UnknownError: The operation failed for an unknown transient reason
                  (e.g. out of memory).

This is still an ad-hoc implementation. We should be using streams, and
we do have the AOs available to do so. But they need to be massaged to
be compatible with callers of Body::fully_read. And once we do use
streams, this function will become infallible - so making it infallible
here is at least a step in the right direction.
2024-04-27 07:08:14 +02:00

550 lines
28 KiB
C++

/*
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
* Copyright (c) 2023, Aliaksandr Kalenik <kalenik.aliaksandr@gmail.com>
* Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Debug.h>
#include <AK/LexicalPath.h>
#include <LibGemini/Document.h>
#include <LibGfx/ImageFormats/ImageDecoder.h>
#include <LibMarkdown/Document.h>
#include <LibTextCodec/Decoder.h>
#include <LibWeb/DOM/Document.h>
#include <LibWeb/DOM/DocumentLoading.h>
#include <LibWeb/HTML/HTMLHeadElement.h>
#include <LibWeb/HTML/Navigable.h>
#include <LibWeb/HTML/NavigationParams.h>
#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
#include <LibWeb/HTML/Parser/HTMLParser.h>
#include <LibWeb/Loader/GeneratedPagesLoader.h>
#include <LibWeb/Namespace.h>
#include <LibWeb/Platform/EventLoopPlugin.h>
#include <LibWeb/XML/XMLDocumentBuilder.h>
namespace Web {
// Replaces a document's content with a simple error message.
static void convert_to_xml_error_document(DOM::Document& document, String error_string)
{
auto html_element = MUST(DOM::create_element(document, HTML::TagNames::html, Namespace::HTML));
auto body_element = MUST(DOM::create_element(document, HTML::TagNames::body, Namespace::HTML));
MUST(html_element->append_child(body_element));
MUST(body_element->append_child(document.heap().allocate<DOM::Text>(document.realm(), document, error_string)));
document.remove_all_children();
MUST(document.append_child(html_element));
}
static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_markdown_document(HTML::NavigationParams const& navigation_params)
{
auto extra_head_contents = R"~~~(
<style>
.zoomable {
cursor: zoom-in;
max-width: 100%;
}
.zoomable.zoomed-in {
cursor: zoom-out;
max-width: none;
}
</style>
<script>
function imageClickEventListener(event) {
let image = event.target;
if (image.classList.contains("zoomable")) {
image.classList.toggle("zoomed-in");
}
}
function processImages() {
let images = document.querySelectorAll("img");
let windowWidth = window.innerWidth;
images.forEach((image) => {
if (image.naturalWidth > windowWidth) {
image.classList.add("zoomable");
} else {
image.classList.remove("zoomable");
image.classList.remove("zoomed-in");
}
image.addEventListener("click", imageClickEventListener);
});
}
document.addEventListener("load", () => {
processImages();
});
window.addEventListener("resize", () => {
processImages();
});
</script>
)~~~"sv;
return create_document_for_inline_content(navigation_params.navigable.ptr(), navigation_params.id, [&](DOM::Document& document) {
auto& realm = document.realm();
auto process_body = JS::create_heap_function(realm.heap(), [&document, url = navigation_params.response->url().value(), extra_head_contents](ByteBuffer data) {
auto markdown_document = Markdown::Document::parse(data);
if (!markdown_document)
return;
auto parser = HTML::HTMLParser::create(document, markdown_document->render_to_html(extra_head_contents), "utf-8"sv);
parser->run(url);
});
auto process_body_error = JS::create_heap_function(realm.heap(), [](JS::GCPtr<WebIDL::DOMException>) {
dbgln("FIXME: Load html page with an error if read of body failed.");
});
navigation_params.response->body()->fully_read(
realm,
process_body,
process_body_error,
JS::NonnullGCPtr { realm.global_object() });
});
}
bool build_xml_document(DOM::Document& document, ByteBuffer const& data, Optional<String> content_encoding)
{
Optional<TextCodec::Decoder&> decoder;
// The actual HTTP headers and other metadata, not the headers as mutated or implied by the algorithms given in this specification,
// are the ones that must be used when determining the character encoding according to the rules given in the above specifications.
if (content_encoding.has_value())
decoder = TextCodec::decoder_for(*content_encoding);
if (!decoder.has_value()) {
auto encoding = HTML::run_encoding_sniffing_algorithm(document, data);
decoder = TextCodec::decoder_for(encoding);
}
VERIFY(decoder.has_value());
// Well-formed XML documents contain only properly encoded characters
if (!decoder->validate(data)) {
convert_to_xml_error_document(document, "XML Document contains improperly-encoded characters"_string);
return false;
}
auto source = decoder->to_utf8(data).release_value_but_fixme_should_propagate_errors();
XML::Parser parser(source, { .resolve_external_resource = resolve_xml_resource });
XMLDocumentBuilder builder { document };
auto result = parser.parse_with_listener(builder);
return !result.is_error() && !builder.has_error();
}
// https://html.spec.whatwg.org/multipage/document-lifecycle.html#navigate-html
static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_html_document(HTML::NavigationParams const& navigation_params)
{
// To load an HTML document, given navigation params navigationParams:
// 1. Let document be the result of creating and initializing a Document object given "html", "text/html", and navigationParams.
auto document = TRY(DOM::Document::create_and_initialize(DOM::Document::Type::HTML, "text/html"_string, navigation_params));
// 2. If document's URL is about:blank, then populate with html/head/body given document.
// FIXME: The additional check for a non-empty body fixes issues with loading javascript urls in iframes, which
// default to an "about:blank" url. Is this a spec bug?
if (document->url_string() == "about:blank"_string
&& navigation_params.response->body()->length().value_or(0) == 0) {
TRY(document->populate_with_html_head_and_body());
// Nothing else is added to the document, so mark it as loaded.
HTML::HTMLParser::the_end(document);
}
// 3. Otherwise, create an HTML parser and associate it with the document.
// Each task that the networking task source places on the task queue while fetching runs must then fill the
// parser's input byte stream with the fetched bytes and cause the HTML parser to perform the appropriate
// processing of the input stream.
// The first task that the networking task source places on the task queue while fetching runs must process link
// headers given document, navigationParams's response, and "media", after the task has been processed by the
// HTML parser.
// Before any script execution occurs, the user agent must wait for scripts may run for the newly-created
// document to be true for document.
// When no more bytes are available, the user agent must queue a global task on the networking task source given
// document's relevant global object to have the parser to process the implied EOF character, which eventually
// causes a load event to be fired.
else {
// FIXME: Parse as we receive the document data, instead of waiting for the whole document to be fetched first.
auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value()](ByteBuffer data) {
Platform::EventLoopPlugin::the().deferred_invoke([document = document, data = move(data), url = url] {
auto parser = HTML::HTMLParser::create_with_uncertain_encoding(document, data);
parser->run(url);
});
});
auto process_body_error = JS::create_heap_function(document->heap(), [](JS::GCPtr<WebIDL::DOMException>) {
dbgln("FIXME: Load html page with an error if read of body failed.");
});
auto& realm = document->realm();
navigation_params.response->body()->fully_read(realm, process_body, process_body_error, JS::NonnullGCPtr { realm.global_object() });
}
// 4. Return document.
return document;
}
// https://html.spec.whatwg.org/multipage/document-lifecycle.html#read-xml
static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_xml_document(HTML::NavigationParams const& navigation_params, MimeSniff::MimeType type)
{
// When faced with displaying an XML file inline, provided navigation params navigationParams and a string type, user agents
// must follow the requirements defined in XML and Namespaces in XML, XML Media Types, DOM, and other relevant specifications
// to create and initialize a Document object document, given "xml", type, and navigationParams, and return that Document.
// They must also create a corresponding XML parser. [XML] [XMLNS] [RFC7303] [DOM]
//
// Note: At the time of writing, the XML specification community had not actually yet specified how XML and the DOM interact.
//
// The first task that the networking task source places on the task queue while fetching runs must process link headers
// given document, navigationParams's response, and "media", after the task has been processed by the XML parser.
//
// The actual HTTP headers and other metadata, not the headers as mutated or implied by the algorithms given in this
// specification, are the ones that must be used when determining the character encoding according to the rules given in the
// above specifications. Once the character encoding is established, the document's character encoding must be set to that
// character encoding.
//
// Before any script execution occurs, the user agent must wait for scripts may run for the newly-created document to be
// true for the newly-created Document.
//
// Once parsing is complete, the user agent must set document's during-loading navigation ID for WebDriver BiDi to null.
//
// Note: For HTML documents this is reset when parsing is complete, after firing the load event.
//
// Error messages from the parse process (e.g., XML namespace well-formedness errors) may be reported inline by mutating
// the Document.
// FIXME: Actually follow the spec! This is just the ad-hoc code we had before, modified somewhat.
auto document = TRY(DOM::Document::create_and_initialize(DOM::Document::Type::XML, type.essence(), navigation_params));
Optional<String> content_encoding;
if (auto maybe_encoding = type.parameters().get("charset"sv); maybe_encoding.has_value())
content_encoding = maybe_encoding.value();
auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value(), content_encoding = move(content_encoding)](ByteBuffer data) {
Optional<TextCodec::Decoder&> decoder;
// The actual HTTP headers and other metadata, not the headers as mutated or implied by the algorithms given in this specification,
// are the ones that must be used when determining the character encoding according to the rules given in the above specifications.
if (content_encoding.has_value())
decoder = TextCodec::decoder_for(*content_encoding);
if (!decoder.has_value()) {
auto encoding = HTML::run_encoding_sniffing_algorithm(document, data);
decoder = TextCodec::decoder_for(encoding);
}
VERIFY(decoder.has_value());
// Well-formed XML documents contain only properly encoded characters
if (!decoder->validate(data)) {
// FIXME: Insert error message into the document.
dbgln("XML Document contains improperly-encoded characters");
convert_to_xml_error_document(document, "XML Document contains improperly-encoded characters"_string);
// NOTE: This ensures that the `load` event gets fired for the frame loading this document.
document->completely_finish_loading();
return;
}
auto source = decoder->to_utf8(data);
if (source.is_error()) {
// FIXME: Insert error message into the document.
dbgln("Failed to decode XML document: {}", source.error());
convert_to_xml_error_document(document, MUST(String::formatted("Failed to decode XML document: {}", source.error())));
// NOTE: This ensures that the `load` event gets fired for the frame loading this document.
document->completely_finish_loading();
return;
}
XML::Parser parser(source.value(), { .resolve_external_resource = resolve_xml_resource });
XMLDocumentBuilder builder { document };
auto result = parser.parse_with_listener(builder);
if (result.is_error()) {
// FIXME: Insert error message into the document.
dbgln("Failed to parse XML document: {}", result.error());
convert_to_xml_error_document(document, MUST(String::formatted("Failed to parse XML document: {}", result.error())));
// NOTE: XMLDocumentBuilder ensures that the `load` event gets fired. We don't need to do anything else here.
}
});
auto process_body_error = JS::create_heap_function(document->heap(), [](JS::GCPtr<WebIDL::DOMException>) {
dbgln("FIXME: Load html page with an error if read of body failed.");
});
auto& realm = document->realm();
navigation_params.response->body()->fully_read(realm, process_body, process_body_error, JS::NonnullGCPtr { realm.global_object() });
return document;
}
// https://html.spec.whatwg.org/multipage/document-lifecycle.html#navigate-text
static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_text_document(HTML::NavigationParams const& navigation_params, MimeSniff::MimeType type)
{
// To load a text document, given a navigation params navigationParams and a string type:
// 1. Let document be the result of creating and initializing a Document object given "html", type, and navigationParams.
auto document = TRY(DOM::Document::create_and_initialize(DOM::Document::Type::XML, type.essence(), navigation_params));
// FIXME: 2. Set document's parser cannot change the mode flag to true.
// 3. Set document's mode to "no-quirks".
document->set_quirks_mode(DOM::QuirksMode::No);
// 4. Create an HTML parser and associate it with the document. Act as if the tokenizer had emitted a start tag token
// with the tag name "pre" followed by a single U+000A LINE FEED (LF) character, and switch the HTML parser's tokenizer
// to the PLAINTEXT state. Each task that the networking task source places on the task queue while fetching runs must
// then fill the parser's input byte stream with the fetched bytes and cause the HTML parser to perform the appropriate
// processing of the input stream.
// document's encoding must be set to the character encoding used to decode the document during parsing.
// The first task that the networking task source places on the task queue while fetching runs must process link
// headers given document, navigationParams's response, and "media", after the task has been processed by the HTML parser.
// Before any script execution occurs, the user agent must wait for scripts may run for the newly-created document to be
// true for document.
// When no more bytes are available, the user agent must queue a global task on the networking task source given
// document's relevant global object to have the parser to process the implied EOF character, which eventually causes a
// load event to be fired.
// FIXME: Parse as we receive the document data, instead of waiting for the whole document to be fetched first.
auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value()](ByteBuffer data) {
auto encoding = run_encoding_sniffing_algorithm(document, data);
dbgln_if(HTML_PARSER_DEBUG, "The encoding sniffing algorithm returned encoding '{}'", encoding);
auto parser = HTML::HTMLParser::create_for_scripting(document);
parser->tokenizer().update_insertion_point();
parser->tokenizer().insert_input_at_insertion_point("<pre>\n"sv);
parser->run();
parser->tokenizer().switch_to(HTML::HTMLTokenizer::State::PLAINTEXT);
parser->tokenizer().insert_input_at_insertion_point(data);
parser->tokenizer().insert_eof();
parser->run(url);
document->set_encoding(MUST(String::from_byte_string(encoding)));
// 5. User agents may add content to the head element of document, e.g., linking to a style sheet, providing
// script, or giving the document a title.
auto title = MUST(String::from_byte_string(LexicalPath::basename(url.to_byte_string())));
auto title_element = MUST(DOM::create_element(document, HTML::TagNames::title, Namespace::HTML));
MUST(document->head()->append_child(title_element));
auto title_text = document->heap().allocate<DOM::Text>(document->realm(), document, title);
MUST(title_element->append_child(*title_text));
});
auto process_body_error = JS::create_heap_function(document->heap(), [](JS::GCPtr<WebIDL::DOMException>) {
dbgln("FIXME: Load html page with an error if read of body failed.");
});
auto& realm = document->realm();
navigation_params.response->body()->fully_read(realm, process_body, process_body_error, JS::NonnullGCPtr { realm.global_object() });
// 6. Return document.
return document;
}
// https://html.spec.whatwg.org/multipage/document-lifecycle.html#navigate-media
static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_media_document(HTML::NavigationParams const& navigation_params, MimeSniff::MimeType type)
{
// To load a media document, given navigationParams and a string type:
// 1. Let document be the result of creating and initializing a Document object given "html", type, and navigationParams.
auto document = TRY(DOM::Document::create_and_initialize(DOM::Document::Type::XML, type.essence(), navigation_params));
// 2. Set document's mode to "no-quirks".
document->set_quirks_mode(DOM::QuirksMode::No);
// 3. Populate with html/head/body given document.
TRY(document->populate_with_html_head_and_body());
// 4. Append an element host element for the media, as described below, to the body element.
// 5. Set the appropriate attribute of the element host element, as described below, to the address of the image,
// video, or audio resource.
// 6. User agents may add content to the head element of document, or attributes to host element, e.g., to link
// to a style sheet, to provide a script, to give the document a title, or to make the media autoplay.
auto insert_title = [](auto& document, auto title) -> WebIDL::ExceptionOr<void> {
auto title_element = TRY(DOM::create_element(document, HTML::TagNames::title, Namespace::HTML));
TRY(document->head()->append_child(title_element));
auto title_text = document->heap().template allocate<DOM::Text>(document->realm(), document, title);
TRY(title_element->append_child(*title_text));
return {};
};
auto style_element = TRY(DOM::create_element(document, HTML::TagNames::style, Namespace::HTML));
style_element->set_text_content(R"~~~(
:root {
background-color: #222;
}
img, video, audio {
position: absolute;
inset: 0;
max-width: 100vw;
max-height: 100vh;
margin: auto;
}
img {
background-color: #fff;
}
)~~~"_string);
TRY(document->head()->append_child(style_element));
auto url_string = document->url_string();
if (type.is_image()) {
auto img_element = TRY(DOM::create_element(document, HTML::TagNames::img, Namespace::HTML));
TRY(img_element->set_attribute(HTML::AttributeNames::src, url_string));
TRY(document->body()->append_child(img_element));
TRY(insert_title(document, MUST(String::from_byte_string(LexicalPath::basename(url_string.to_byte_string())))));
} else if (type.type() == "video"sv) {
auto video_element = TRY(DOM::create_element(document, HTML::TagNames::video, Namespace::HTML));
TRY(video_element->set_attribute(HTML::AttributeNames::src, url_string));
TRY(video_element->set_attribute(HTML::AttributeNames::autoplay, String {}));
TRY(video_element->set_attribute(HTML::AttributeNames::controls, String {}));
TRY(document->body()->append_child(video_element));
TRY(insert_title(document, MUST(String::from_byte_string(LexicalPath::basename(url_string.to_byte_string())))));
} else if (type.type() == "audio"sv) {
auto audio_element = TRY(DOM::create_element(document, HTML::TagNames::audio, Namespace::HTML));
TRY(audio_element->set_attribute(HTML::AttributeNames::src, url_string));
TRY(audio_element->set_attribute(HTML::AttributeNames::autoplay, String {}));
TRY(audio_element->set_attribute(HTML::AttributeNames::controls, String {}));
TRY(document->body()->append_child(audio_element));
TRY(insert_title(document, MUST(String::from_byte_string(LexicalPath::basename(url_string.to_byte_string())))));
} else {
// FIXME: According to https://mimesniff.spec.whatwg.org/#audio-or-video-mime-type we might have to deal with
// "application/ogg" and figure out whether it's audio or video.
VERIFY_NOT_REACHED();
}
// FIXME: 7. Process link headers given document, navigationParams's response, and "media".
// 8. Act as if the user agent had stopped parsing document.
// FIXME: We should not need to force the media file to load before saying that parsing has completed!
// However, if we don't, then we get stuck in HTMLParser::the_end() waiting for the media file to load, which
// never happens.
auto& realm = document->realm();
navigation_params.response->body()->fully_read(
realm,
JS::create_heap_function(document->heap(), [document](ByteBuffer) { HTML::HTMLParser::the_end(document); }),
JS::create_heap_function(document->heap(), [](JS::GCPtr<WebIDL::DOMException>) {}),
JS::NonnullGCPtr { realm.global_object() });
// 9. Return document.
return document;
// The element host element to create for the media is the element given in the table below in the second cell of
// the row whose first cell describes the media. The appropriate attribute to set is the one given by the third cell
// in that same row.
// Type of media | Element for the media | Appropriate attribute
// -------------------------------------------------------------
// Image | img | src
// Video | video | src
// Audio | audio | src
// Before any script execution occurs, the user agent must wait for scripts may run for the newly-created document to
// be true for the Document.
}
bool can_load_document_with_type(MimeSniff::MimeType const& type)
{
if (type.is_html())
return true;
if (type.is_xml())
return true;
if (type.is_javascript()
|| type.is_json()
|| type.essence() == "text/css"_string
|| type.essence() == "text/plain"_string
|| type.essence() == "text/vtt"_string) {
return true;
}
if (type.essence() == "multipart/x-mixed-replace"_string)
return true;
if (type.is_image() || type.is_audio_or_video())
return true;
if (type.essence() == "application/pdf"_string || type.essence() == "text/pdf"_string)
return true;
if (type.essence() == "text/markdown"sv)
return true;
return false;
}
// https://html.spec.whatwg.org/multipage/browsing-the-web.html#loading-a-document
JS::GCPtr<DOM::Document> load_document(HTML::NavigationParams const& navigation_params)
{
// To load a document given navigation params navigationParams, source snapshot params sourceSnapshotParams,
// and origin initiatorOrigin, perform the following steps. They return a Document or null.
// 1. Let type be the computed type of navigationParams's response.
auto extracted_mime_type = navigation_params.response->header_list()->extract_mime_type();
if (!extracted_mime_type.has_value())
return nullptr;
auto type = extracted_mime_type.release_value();
VERIFY(navigation_params.response->body());
// 2. If the user agent has been configured to process resources of the given type using some mechanism other than
// rendering the content in a navigable, then skip this step.
// Otherwise, if the type is one of the following types:
// -> an HTML MIME type
if (type.is_html()) {
// Return the result of loading an HTML document, given navigationParams.
return load_html_document(navigation_params).release_value_but_fixme_should_propagate_errors();
}
// -> an XML MIME type that is not an explicitly supported XML MIME type
// FIXME: that is not an explicitly supported XML MIME type
if (type.is_xml()) {
// Return the result of loading an XML document given navigationParams and type.
return load_xml_document(navigation_params, type).release_value_but_fixme_should_propagate_errors();
}
// -> a JavaScript MIME type
// -> a JSON MIME type that is not an explicitly supported JSON MIME type
// -> "text/css"
// -> "text/plain"
// -> "text/vtt"
if (type.is_javascript()
|| type.is_json()
|| type.essence() == "text/css"_string
|| type.essence() == "text/plain"_string
|| type.essence() == "text/vtt"_string) {
// Return the result of loading a text document given navigationParams and type.
return load_text_document(navigation_params, type).release_value_but_fixme_should_propagate_errors();
}
// -> "multipart/x-mixed-replace"
if (type.essence() == "multipart/x-mixed-replace"_string) {
// FIXME: Return the result of loading a multipart/x-mixed-replace document, given navigationParams,
// sourceSnapshotParams, and initiatorOrigin.
}
// -> A supported image, video, or audio type
if (type.is_image()
|| type.is_audio_or_video()) {
// Return the result of loading a media document given navigationParams and type.
return load_media_document(navigation_params, type).release_value_but_fixme_should_propagate_errors();
}
// -> "application/pdf"
// -> "text/pdf"
if (type.essence() == "application/pdf"_string
|| type.essence() == "text/pdf"_string) {
// FIXME: If the user agent's PDF viewer supported is true, return the result of creating a document for inline
// content that doesn't have a DOM given navigationParams's navigable.
}
// Otherwise, proceed onward.
// 3. If, given type, the new resource is to be handled by displaying some sort of inline content, e.g., a
// native rendering of the content or an error message because the specified type is not supported, then
// return the result of creating a document for inline content that doesn't have a DOM given navigationParams's
// navigable, navigationParams's id, and navigationParams's navigation timing type.
if (type.essence() == "text/markdown"sv)
return load_markdown_document(navigation_params).release_value_but_fixme_should_propagate_errors();
// FIXME: 4. Otherwise, the document's type is such that the resource will not affect navigationParams's navigable,
// e.g., because the resource is to be handed to an external application or because it is an unknown type
// that will be processed as a download. Hand-off to external software given navigationParams's response,
// navigationParams's navigable, navigationParams's final sandboxing flag set, sourceSnapshotParams's has
// transient activation, and initiatorOrigin.
// 5. Return null.
return nullptr;
}
}