mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-13 01:59:14 +03:00
LibWeb: HTML Parser, handle html escaped characters
Convert HTML numeric character references (decimal &#NNN; and hexadecimal &#xHHH;) to their UTF-8 encoded characters.
This commit is contained in:
parent
738235574f
commit
241df7206e
Notes:
sideshowbarker
2024-07-19 06:18:28 +09:00
Author: https://github.com/asliturk Commit: https://github.com/SerenityOS/serenity/commit/241df7206ec Pull-request: https://github.com/SerenityOS/serenity/pull/2310 Reviewed-by: https://github.com/linusg
@ -27,6 +27,7 @@
|
||||
#include <AK/Function.h>
|
||||
#include <AK/NonnullRefPtrVector.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/StringUtils.h>
|
||||
#include <LibTextCodec/Decoder.h>
|
||||
#include <LibWeb/DOM/Comment.h>
|
||||
#include <LibWeb/DOM/DocumentFragment.h>
|
||||
@ -64,6 +65,23 @@ static bool is_void_element(const StringView& tag_name)
|
||||
|| tag_name == "wbr";
|
||||
}
|
||||
|
||||
// Encodes a single Unicode code point as UTF-8.
//
// Returns the encoded bytes in output order (lead byte first). One to four
// bytes are produced depending on the magnitude of the code point:
//   U+0000   .. U+007F   -> 1 byte
//   U+0080   .. U+07FF   -> 2 bytes
//   U+0800   .. U+FFFF   -> 3 bytes (UTF-16 surrogates excluded)
//   U+10000  .. U+10FFFF -> 4 bytes
//
// An empty vector is returned for values that have no well-formed UTF-8
// encoding (surrogates U+D800..U+DFFF and anything above U+10FFFF), so an
// empty result means "could not convert".
static Vector<char> codepoint_to_bytes(const u32 codepoint)
{
    Vector<char> bytes;

    if (codepoint < 0x80) {
        // ASCII range: the code point is its own encoding.
        bytes.append(static_cast<char>(codepoint));
    } else if (codepoint < 0x800) {
        // 110xxxxx 10xxxxxx
        bytes.append(static_cast<char>(0xC0 | (codepoint >> 6)));
        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else if (codepoint < 0x10000) {
        // Surrogate halves are not scalar values and must not be encoded.
        if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
            return bytes;

        // 1110xxxx 10xxxxxx 10xxxxxx
        bytes.append(static_cast<char>(0xE0 | (codepoint >> 12)));
        bytes.append(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else if (codepoint <= 0x10FFFF) {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes.append(static_cast<char>(0xF0 | (codepoint >> 18)));
        bytes.append(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
        bytes.append(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }

    return bytes;
}
|
||||
|
||||
static bool parse_html_document(const StringView& html, Document& document, ParentNode& root)
|
||||
{
|
||||
NonnullRefPtrVector<ParentNode> node_stack;
|
||||
@ -213,6 +231,7 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
|
||||
};
|
||||
auto rest_of_html = html.substring_view(i, html.length() - i);
|
||||
bool found = false;
|
||||
|
||||
for (auto& escape : escapes) {
|
||||
if (rest_of_html.starts_with(escape.code)) {
|
||||
text_buffer.append(escape.value);
|
||||
@ -221,8 +240,43 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
dbg() << "Unhandled escape sequence";
|
||||
|
||||
if (!found) {
|
||||
char num_sign = html[i + 1];
|
||||
if (num_sign && num_sign == '#') {
|
||||
int j = 2; // skip '&#' and search for ';'
|
||||
while (html[i + j] != ';' && j < 7) {
|
||||
j++;
|
||||
}
|
||||
|
||||
if (j < 7) { // We found ; char
|
||||
bool ok;
|
||||
u32 codepoint;
|
||||
String str_code_point = html.substring_view(i + 2, j - 2);
|
||||
if (str_code_point.starts_with('x')) {
|
||||
String str = str_code_point.substring(1, str_code_point.length() - 1);
|
||||
codepoint = AK::StringUtils::convert_to_uint_from_hex(str, ok);
|
||||
} else {
|
||||
codepoint = str_code_point.to_uint(ok);
|
||||
}
|
||||
|
||||
if (ok) {
|
||||
Vector<char> bytes = codepoint_to_bytes(codepoint);
|
||||
if (bytes.size() > 0) {
|
||||
for (size_t i = 0; i < bytes.size(); i++) {
|
||||
text_buffer.append(bytes.at(i));
|
||||
}
|
||||
found = true;
|
||||
i = i + j;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
dbg() << "Unhandled escape sequence:" << html.substring_view(i, min((size_t)5, html.length()));
|
||||
}
|
||||
}
|
||||
break;
|
||||
case State::BeforeTagName:
|
||||
|
Loading…
Reference in New Issue
Block a user