From 0c42aece362edfbd71f3b149601c065b5c675e80 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 20 Jul 2021 10:46:53 -0400 Subject: [PATCH] LibJS: Transcode UTF-8 strings to UTF-16 and add UTF-16 accessors LibJS parses JavaScript as UTF-8, so when creating a string, we must transcode it to UTF-16 to handle encoded surrogate pairs. For example, consider the following string: "\ud83d\ude00" The UTF-8 encoding of this surrogate pair is: 0xf0 0x9f 0x98 0x80 However, LibJS will currently store the two surrogates individually as UTF-8 encoded bytes, rather than combining the pair: 0xed 0xa0 0xbd, 0xed 0xb8 0x80 These are not equivalent. So, as String.prototype becomes UTF-16 aware, this encoding will no longer work for abstractions like strict equality. --- .../LibJS/Runtime/PrimitiveString.cpp | 52 ++++++++++++++++++- .../Libraries/LibJS/Runtime/PrimitiveString.h | 10 +++- Userland/Libraries/LibJS/Runtime/Value.cpp | 13 +++++ Userland/Libraries/LibJS/Runtime/Value.h | 1 + 4 files changed, 73 insertions(+), 3 deletions(-) diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp index 7a1f1e75809..b35982fafa2 100644 --- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp +++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp @@ -4,6 +4,8 @@ * SPDX-License-Identifier: BSD-2-Clause */ +#include +#include #include #include @@ -18,13 +20,59 @@ PrimitiveString::~PrimitiveString() { } +Vector const& PrimitiveString::utf16_string() const +{ + if (m_utf16_string.is_empty() && !m_string.is_empty()) + m_utf16_string = AK::utf8_to_utf16(m_string); + return m_utf16_string; +} + +Utf16View PrimitiveString::utf16_string_view() const +{ + return Utf16View { utf16_string() }; +} + +PrimitiveString* js_string(Heap& heap, Utf16View const& string) +{ + if (string.is_empty()) + return &heap.vm().empty_string(); + + if (string.length_in_code_units() == 1) { + u16 code_unit = string.code_unit_at(0); + if 
(is_ascii(code_unit)) + return &heap.vm().single_ascii_character_string(static_cast(code_unit)); + } + + auto utf8_string = string.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); + return heap.allocate_without_global_object(move(utf8_string)); +} + +PrimitiveString* js_string(VM& vm, Utf16View const& string) +{ + return js_string(vm.heap(), string); +} + PrimitiveString* js_string(Heap& heap, String string) { if (string.is_empty()) return &heap.vm().empty_string(); - if (string.length() == 1 && (u8)string.characters()[0] < 0x80) - return &heap.vm().single_ascii_character_string(string.characters()[0]); + if (string.length() == 1) { + auto ch = static_cast(string.characters()[0]); + if (is_ascii(ch)) + return &heap.vm().single_ascii_character_string(ch); + } + + // UTF-8 strings must first be transcoded to UTF-16, even though they are stored as String objects + // internally, to parse encoded surrogate pairs. As an optimization to reduce string copying, only + // perform that transcoding if there are non-ASCII codepoints in the string. 
+ for (auto it : string) { + auto ch = static_cast(it); + if (!is_ascii(ch)) { + auto utf16_string = AK::utf8_to_utf16(string); + return js_string(heap, Utf16View { utf16_string }); + } + } return heap.allocate_without_global_object(move(string)); } diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.h b/Userland/Libraries/LibJS/Runtime/PrimitiveString.h index b9e24d7c5f3..743ab796495 100644 --- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.h +++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.h @@ -7,6 +7,7 @@ #pragma once #include +#include #include namespace JS { @@ -16,14 +17,21 @@ public: explicit PrimitiveString(String); virtual ~PrimitiveString(); - const String& string() const { return m_string; } + String const& string() const { return m_string; } + + Vector const& utf16_string() const; + Utf16View utf16_string_view() const; private: virtual const char* class_name() const override { return "PrimitiveString"; } String m_string; + mutable Vector m_utf16_string; }; +PrimitiveString* js_string(Heap&, Utf16View const&); +PrimitiveString* js_string(VM&, Utf16View const&); + PrimitiveString* js_string(Heap&, String); PrimitiveString* js_string(VM&, String); diff --git a/Userland/Libraries/LibJS/Runtime/Value.cpp b/Userland/Libraries/LibJS/Runtime/Value.cpp index 5666b28e713..cf54fa09fc1 100644 --- a/Userland/Libraries/LibJS/Runtime/Value.cpp +++ b/Userland/Libraries/LibJS/Runtime/Value.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -365,6 +366,18 @@ String Value::to_string(GlobalObject& global_object, bool legacy_null_to_empty_s } } +Vector Value::to_utf16_string(GlobalObject& global_object) const +{ + if (m_type == Type::String) + return m_value.as_string->utf16_string(); + + auto utf8_string = to_string(global_object); + if (global_object.vm().exception()) + return {}; + + return AK::utf8_to_utf16(utf8_string); +} + // 7.1.2 ToBoolean ( argument ), https://tc39.es/ecma262/#sec-toboolean bool 
Value::to_boolean() const { diff --git a/Userland/Libraries/LibJS/Runtime/Value.h b/Userland/Libraries/LibJS/Runtime/Value.h index 8126660982a..ee3a6fd74d0 100644 --- a/Userland/Libraries/LibJS/Runtime/Value.h +++ b/Userland/Libraries/LibJS/Runtime/Value.h @@ -246,6 +246,7 @@ public: u64 encoded() const { return m_value.encoded; } String to_string(GlobalObject&, bool legacy_null_to_empty_string = false) const; + Vector to_utf16_string(GlobalObject&) const; PrimitiveString* to_primitive_string(GlobalObject&); Value to_primitive(GlobalObject&, PreferredType preferred_type = PreferredType::Default) const; Object* to_object(GlobalObject&) const;