From a3e82eaad3b808d96d5d83abab94ecb8124a49ef Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Thu, 1 Dec 2022 13:27:43 +0100 Subject: [PATCH] AK: Introduce the new String, replacement for DeprecatedString DeprecatedString (formerly String) has been with us since the start, and it has served us well. However, it has a number of shortcomings that I'd like to address. Some of these issues are hard if not impossible to solve incrementally inside of DeprecatedString, so instead of doing that, let's build a new String class and then incrementally move over to it instead. Problems in DeprecatedString: - It assumes string allocation never fails. This makes it impossible to use in allocation-sensitive contexts, and is the reason we had to ban DeprecatedString from the kernel entirely. - The awkward null state. DeprecatedString can be null. It's different from the empty state, although null strings are considered empty. All code is immediately nicer when using Optional but DeprecatedString came before Optional, which is how we ended up like this. - The encoding of the underlying data is ambiguous. For the most part, we use it as if it's always UTF-8, but there have been cases where we pass around strings in other encodings (e.g ISO8859-1) - operator[] and length() are used to iterate over DeprecatedString one byte at a time. This is done all over the codebase, and will *not* give the right results unless the string is all ASCII. How we solve these issues in the new String: - Functions that may allocate now return ErrorOr so that ENOMEM errors can be passed to the caller. - String has no null state. Use Optional when needed. - String is always UTF-8. This is validated when constructing a String. We may need to add a bypass for this in the future, for cases where you have a known-good string, but for now: validate all the things! - There is no operator[] or length(). You can get the underlying data with bytes(), but for iterating over code points, you should be using an UTF-8 iterator. Furthermore, it has two nifty new features: - String implements a small string optimization (SSO) for strings that can fit entirely within a pointer. This means up to 3 bytes on 32-bit platforms, and 7 bytes on 64-bit platforms. Such small strings will not be heap-allocated. - String can create substrings without making a deep copy of the substring. Instead, the superstring gets +1 refcount from the substring, and it acts like a view into the superstring. To make substrings like this, use the substring_with_shared_superstring() API. One caveat: - String does not guarantee that the underlying data is null-terminated like DeprecatedString does today. While this was nifty in a handful of places where we were calling C functions, it did stand in the way of shared-superstring substrings. --- AK/CMakeLists.txt | 1 + AK/Forward.h | 2 + AK/String.cpp | 337 ++++++++++++++++++++++++++++++++++++++++ AK/String.h | 132 ++++++++++++++++ AK/StringBuilder.cpp | 6 + AK/StringBuilder.h | 2 + AK/StringUtils.cpp | 32 +++- AK/StringUtils.h | 2 + Tests/AK/CMakeLists.txt | 1 + Tests/AK/TestString.cpp | 102 ++++++++++++ 10 files changed, 616 insertions(+), 1 deletion(-) create mode 100644 AK/String.cpp create mode 100644 AK/String.h create mode 100644 Tests/AK/TestString.cpp diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt index 46ec2ee1a4c..295e41a3284 100644 --- a/AK/CMakeLists.txt +++ b/AK/CMakeLists.txt @@ -15,6 +15,7 @@ set(AK_SOURCES LexicalPath.cpp Random.cpp StackInfo.cpp + String.cpp StringBuilder.cpp StringFloatingPointConversions.cpp StringImpl.cpp diff --git a/AK/Forward.h b/AK/Forward.h index a981707994f..31561e6de47 100644 --- a/AK/Forward.h +++ b/AK/Forward.h @@ -31,6 +31,7 @@ class StringView; class Time; class URL; class FlyString; +class String; class Utf16View; class Utf32View; class Utf8CodePointIterator; @@ -188,6 +189,7 @@ using AK::RefPtr; using AK::SinglyLinkedList; using AK::Span; using AK::StackInfo; +using AK::String; using AK::StringBuilder; using AK::StringImpl; using AK::StringView; diff --git a/AK/String.cpp b/AK/String.cpp new file mode 100644 index 00000000000..ac443a0ce59 --- /dev/null +++ b/AK/String.cpp @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2018-2022, Andreas Kling + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace AK { + +namespace Detail { + +class StringData final : public RefCounted { +public: + static ErrorOr> create_uninitialized(size_t, u8*& buffer); + static ErrorOr> create_substring(StringData const& superstring, size_t start, size_t byte_count); + static ErrorOr> from_utf8(char const* utf8_bytes, size_t); + + struct SubstringData { + StringData const* superstring { nullptr }; + u32 start_offset { 0 }; + }; + + void operator delete(void* ptr); + + ~StringData(); + + SubstringData const& substring_data() const + { + return *reinterpret_cast(m_bytes_or_substring_data); + } + + // NOTE: There is no guarantee about null-termination. + ReadonlyBytes bytes() const + { + if (m_substring) { + auto const& data = substring_data(); + return data.superstring->bytes().slice(data.start_offset, m_byte_count); + } + return { &m_bytes_or_substring_data[0], m_byte_count }; + } + + StringView bytes_as_string_view() const { return { bytes() }; } + + bool operator==(StringData const& other) const + { + return bytes_as_string_view() == other.bytes_as_string_view(); + } + + unsigned hash() const + { + if (!m_has_hash) + compute_hash(); + return m_hash; + } + +private: + explicit StringData(size_t byte_count); + StringData(StringData const& superstring, size_t start, size_t byte_count); + + void compute_hash() const; + + u32 m_byte_count { 0 }; + mutable unsigned m_hash { 0 }; + mutable bool m_has_hash { false }; + bool m_substring { false }; + + u8 m_bytes_or_substring_data[0]; +}; + +void StringData::operator delete(void* ptr) +{ + free(ptr); +} + +StringData::StringData(size_t byte_count) + : m_byte_count(byte_count) +{ +} + +StringData::StringData(StringData const& superstring, size_t start, size_t byte_count) + : m_byte_count(byte_count) + , m_substring(true) +{ + auto& data = const_cast(substring_data()); + data.start_offset = start; + data.superstring = &superstring; + superstring.ref(); +} + +StringData::~StringData() +{ + if (m_substring) + substring_data().superstring->unref(); +} + +constexpr size_t allocation_size_for_string_data(size_t length) +{ + return sizeof(StringData) + (sizeof(char) * length) + sizeof(char); +} + +ErrorOr> StringData::create_uninitialized(size_t byte_count, u8*& buffer) +{ + VERIFY(byte_count); + void* slot = malloc(allocation_size_for_string_data(byte_count)); + if (!slot) { + return Error::from_errno(ENOMEM); + } + auto new_string_data = adopt_ref(*new (slot) StringData(byte_count)); + buffer = const_cast(new_string_data->bytes().data()); + return new_string_data; +} + +ErrorOr> StringData::from_utf8(char const* utf8_data, size_t byte_count) +{ + // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. + VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); + + Utf8View view(StringView(utf8_data, byte_count)); + if (!view.validate()) + return Error::from_string_literal("StringData::from_utf8: Input was not valid UTF-8"); + + VERIFY(utf8_data); + u8* buffer = nullptr; + auto new_string_data = TRY(create_uninitialized(byte_count, buffer)); + memcpy(buffer, utf8_data, byte_count * sizeof(char)); + return new_string_data; +} + +ErrorOr> StringData::create_substring(StringData const& superstring, size_t start, size_t byte_count) +{ + // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. + VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); + + void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData)); + if (!slot) { + return Error::from_errno(ENOMEM); + } + return adopt_ref(*new (slot) StringData(superstring, start, byte_count)); +} + +void StringData::compute_hash() const +{ + auto bytes = this->bytes(); + if (bytes.size() == 0) + m_hash = 0; + else + m_hash = string_hash(reinterpret_cast(bytes.data()), bytes.size()); + m_has_hash = true; +} + +} + +String::String(NonnullRefPtr data) + : m_data(&data.leak_ref()) +{ +} + +String::String(ShortString short_string) + : m_short_string(short_string) +{ +} + +String::String(String const& other) + : m_data(other.m_data) +{ + if (!is_short_string()) + m_data->ref(); +} + +String::String(String&& other) + : m_data(exchange(other.m_data, nullptr)) +{ +} + +String& String::operator=(String&& other) +{ + m_data = exchange(other.m_data, nullptr); + return *this; +} + +String& String::operator=(String const& other) +{ + if (&other != this) { + m_data = other.m_data; + if (!is_short_string()) + m_data->ref(); + } + return *this; +} + +String::~String() +{ + if (!is_short_string() && m_data) + m_data->unref(); +} + +String::String() +{ + // This is an empty string, it's always short and zero-length. + m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG; +} + +ErrorOr String::from_utf8(StringView view) +{ + if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) { + ShortString short_string; + if (!view.is_empty()) + memcpy(short_string.storage, view.characters_without_null_termination(), view.length()); + short_string.byte_count_and_short_string_flag = (view.length() << 1) | SHORT_STRING_FLAG; + return String { short_string }; + } + auto data = TRY(Detail::StringData::from_utf8(view.characters_without_null_termination(), view.length())); + return String { move(data) }; +} + +StringView String::bytes_as_string_view() const +{ + return StringView(bytes()); +} + +ReadonlyBytes String::bytes() const +{ + if (is_short_string()) + return m_short_string.bytes(); + return m_data->bytes(); +} + +bool String::is_empty() const +{ + return bytes().size() == 0; +} + +ErrorOr String::vformatted(StringView fmtstr, TypeErasedFormatParams& params) +{ + StringBuilder builder; + TRY(vformat(builder, fmtstr, params)); + return builder.to_string(); +} + +bool String::operator==(String const& other) const +{ + if (is_short_string()) + return m_data == other.m_data; + return bytes_as_string_view() == other.bytes_as_string_view(); +} + +bool String::operator==(StringView other) const +{ + return bytes_as_string_view() == other; +} + +ErrorOr String::substring_from_byte_offset(size_t start, size_t byte_count) const +{ + if (!byte_count) + return String {}; + return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count)); +} + +ErrorOr String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const +{ + if (!byte_count) + return String {}; + if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) + return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count)); + return String { TRY(Detail::StringData::create_substring(*m_data, start, byte_count)) }; +} + +bool String::operator==(char const* c_string) const +{ + return bytes_as_string_view() == c_string; +} + +u32 String::hash() const +{ + if (is_short_string()) { + auto bytes = this->bytes(); + return string_hash(reinterpret_cast(bytes.data()), bytes.size()); + } + return m_data->hash(); +} + +Utf8View String::code_points() const +{ + return Utf8View(bytes_as_string_view()); +} + +ErrorOr Formatter::format(FormatBuilder& builder, String const& utf8_string) +{ + return Formatter::format(builder, utf8_string.bytes_as_string_view()); +} + +ErrorOr String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const +{ + return StringUtils::replace(*this, needle, replacement, replace_mode); +} + +bool String::is_short_string() const +{ + return reinterpret_cast(m_data) & SHORT_STRING_FLAG; +} + +ReadonlyBytes String::ShortString::bytes() const +{ + return { storage, byte_count() }; +} + +size_t String::ShortString::byte_count() const +{ + return byte_count_and_short_string_flag >> 1; +} + +unsigned Traits::hash(String const& string) +{ + return string.hash(); +} + +DeprecatedString String::to_deprecated_string() const +{ + return DeprecatedString(bytes_as_string_view()); +} + +ErrorOr String::from_deprecated_string(DeprecatedString const& deprecated_string) +{ + Utf8View view(deprecated_string); + if (!view.validate()) + return Error::from_string_literal("String::from_deprecated_string: Input was not valid UTF-8"); + return String::from_utf8(deprecated_string.view()); +} + +} diff --git a/AK/String.h b/AK/String.h new file mode 100644 index 00000000000..3ce07bdb895 --- /dev/null +++ b/AK/String.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2018-2022, Andreas Kling + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace AK { + +namespace Detail { +class StringData; +} + +// String is a strongly owned sequence of Unicode code points encoded as UTF-8. +// The data may or may not be heap-allocated, and may or may not be reference counted. +// There is no guarantee that the underlying bytes are null-terminated. +class String { +public: + // NOTE: For short strings, we avoid heap allocations by storing them in the data pointer slot. + static constexpr size_t MAX_SHORT_STRING_BYTE_COUNT = sizeof(Detail::StringData*) - 1; + + String(String const&); + String(String&&); + + String& operator=(String&&); + String& operator=(String const&); + + ~String(); + + // Creates an empty (zero-length) String. + String(); + + // Creates a new String from a sequence of UTF-8 encoded code points. + static ErrorOr from_utf8(StringView); + + // Creates a substring with a deep copy of the specified data window. + ErrorOr substring_from_byte_offset(size_t start, size_t byte_count) const; + + // Creates a substring that strongly references the origin superstring instead of making a deep copy of the data. + ErrorOr substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const; + + // Returns an iterable view over the Unicode code points. + [[nodiscard]] Utf8View code_points() const; + + // Returns the underlying UTF-8 encoded bytes. + // NOTE: There is no guarantee about null-termination. + [[nodiscard]] ReadonlyBytes bytes() const; + + // Returns true if the String is zero-length. + [[nodiscard]] bool is_empty() const; + + // Returns a StringView covering the full length of the string. Note that iterating this will go byte-at-a-time, not code-point-at-a-time. + [[nodiscard]] StringView bytes_as_string_view() const; + + ErrorOr replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const; + + [[nodiscard]] bool operator==(String const&) const; + [[nodiscard]] bool operator!=(String const& other) const { return !(*this == other); } + + [[nodiscard]] bool operator==(StringView) const; + [[nodiscard]] bool operator!=(StringView other) const { return !(*this == other); } + + [[nodiscard]] bool operator==(char const* cstring) const; + [[nodiscard]] bool operator!=(char const* cstring) const { return !(*this == cstring); } + + [[nodiscard]] u32 hash() const; + + template + static ErrorOr number(T value) + requires IsArithmetic + { + return formatted("{}", value); + } + + static ErrorOr vformatted(StringView fmtstr, TypeErasedFormatParams&); + + template + static ErrorOr formatted(CheckedFormatString&& fmtstr, Parameters const&... parameters) + { + VariadicFormatParams variadic_format_parameters { parameters... }; + return vformatted(fmtstr.view(), variadic_format_parameters); + } + + // NOTE: This is primarily interesting to unit tests. + [[nodiscard]] bool is_short_string() const; + + // FIXME: Remove these once all code has been ported to String + [[nodiscard]] DeprecatedString to_deprecated_string() const; + static ErrorOr from_deprecated_string(DeprecatedString const&); + +private: + // NOTE: If the least significant bit of the pointer is set, this is a short string. + static constexpr uintptr_t SHORT_STRING_FLAG = 1; + + struct ShortString { + ReadonlyBytes bytes() const; + size_t byte_count() const; + + // NOTE: This is the byte count shifted left 1 step and or'ed with a 1 (the SHORT_STRING_FLAG) + u8 byte_count_and_short_string_flag { 0 }; + u8 storage[MAX_SHORT_STRING_BYTE_COUNT] = { 0 }; + }; + + explicit String(NonnullRefPtr); + explicit String(ShortString); + + union { + ShortString m_short_string; + Detail::StringData* m_data { nullptr }; + }; +}; + +template<> +struct Traits : public GenericTraits { + static unsigned hash(String const&); +}; + +template<> +struct Formatter : Formatter { + ErrorOr format(FormatBuilder&, String const&); +}; + +} diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp index 9db7c9f23ef..932699e208d 100644 --- a/AK/StringBuilder.cpp +++ b/AK/StringBuilder.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -115,6 +116,11 @@ DeprecatedString StringBuilder::build() const { return to_deprecated_string(); } + +ErrorOr StringBuilder::to_string() const +{ + return String::from_utf8(string_view()); +} #endif StringView StringBuilder::string_view() const diff --git a/AK/StringBuilder.h b/AK/StringBuilder.h index e5c62cb4d0f..9bde5b102d7 100644 --- a/AK/StringBuilder.h +++ b/AK/StringBuilder.h @@ -62,7 +62,9 @@ public: #ifndef KERNEL [[nodiscard]] DeprecatedString build() const; [[nodiscard]] DeprecatedString to_deprecated_string() const; + ErrorOr to_string() const; #endif + [[nodiscard]] ByteBuffer to_byte_buffer() const; [[nodiscard]] StringView string_view() const; diff --git a/AK/StringUtils.cpp b/AK/StringUtils.cpp index f72d3ec6eae..58157d148e6 100644 --- a/AK/StringUtils.cpp +++ b/AK/StringUtils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, Andreas Kling + * Copyright (c) 2018-2022, Andreas Kling * Copyright (c) 2020, Fei Wu * * SPDX-License-Identifier: BSD-2-Clause @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -533,6 +534,35 @@ DeprecatedString replace(StringView str, StringView needle, StringView replaceme replaced_string.append(str.substring_view(last_position, str.length() - last_position)); return replaced_string.build(); } + +ErrorOr replace(String const& haystack, StringView needle, StringView replacement, ReplaceMode replace_mode) +{ + if (haystack.is_empty()) + return haystack; + + // FIXME: Propagate Vector allocation failures (or do this without putting positions in a vector) + Vector positions; + if (replace_mode == ReplaceMode::All) { + positions = haystack.bytes_as_string_view().find_all(needle); + if (!positions.size()) + return haystack; + } else { + auto pos = haystack.bytes_as_string_view().find(needle); + if (!pos.has_value()) + return haystack; + positions.append(pos.value()); + } + + StringBuilder replaced_string; + size_t last_position = 0; + for (auto& position : positions) { + replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, position - last_position)); + replaced_string.append(replacement); + last_position = position + needle.length(); + } + replaced_string.append(haystack.bytes_as_string_view().substring_view(last_position, haystack.bytes_as_string_view().length() - last_position)); + return replaced_string.to_string(); +} #endif // TODO: Benchmark against KMP (AK/MemMem.h) and switch over if it's faster for short strings too diff --git a/AK/StringUtils.h b/AK/StringUtils.h index a5313803237..173d8c941bf 100644 --- a/AK/StringUtils.h +++ b/AK/StringUtils.h @@ -103,6 +103,8 @@ DeprecatedString to_titlecase(StringView); DeprecatedString invert_case(StringView); DeprecatedString replace(StringView, StringView needle, StringView replacement, ReplaceMode); +ErrorOr replace(String const&, StringView needle, StringView replacement, ReplaceMode); + size_t count(StringView, StringView needle); } diff --git a/Tests/AK/CMakeLists.txt b/Tests/AK/CMakeLists.txt index be5538afa00..c0d5d18763f 100644 --- a/Tests/AK/CMakeLists.txt +++ b/Tests/AK/CMakeLists.txt @@ -62,6 +62,7 @@ set(AK_TEST_SOURCES TestSpan.cpp TestStack.cpp TestStdLibExtras.cpp + TestString.cpp TestStringFloatingPointConversions.cpp TestStringUtils.cpp TestStringView.cpp diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp new file mode 100644 index 00000000000..c453e6d74c7 --- /dev/null +++ b/Tests/AK/TestString.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2022, Andreas Kling + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +#include +#include +#include +#include +#include + +TEST_CASE(construct_empty) +{ + String empty; + EXPECT(empty.is_empty()); + EXPECT_EQ(empty.bytes().size(), 0u); + + auto empty2 = MUST(String::from_utf8(""sv)); + EXPECT(empty2.is_empty()); + EXPECT_EQ(empty, empty2); + EXPECT_EQ(empty, ""sv); +} + +TEST_CASE(short_strings) +{ +#ifdef AK_ARCH_64_BIT + auto string = MUST(String::from_utf8("abcdefg"sv)); + EXPECT_EQ(string.is_short_string(), true); + EXPECT_EQ(string.bytes().size(), 7u); + EXPECT_EQ(string.bytes_as_string_view(), "abcdefg"sv); +#else + auto string = MUST(String::from_utf8("abc"sv)); + EXPECT_EQ(string.is_short_string(), true); + EXPECT_EQ(string.bytes().size(), 3u); + EXPECT_EQ(string.bytes_as_string_view(), "abc"sv); +#endif +} + +TEST_CASE(long_strings) +{ + auto string = MUST(String::from_utf8("abcdefgh"sv)); + EXPECT_EQ(string.is_short_string(), false); + EXPECT_EQ(string.bytes().size(), 8u); + EXPECT_EQ(string.bytes_as_string_view(), "abcdefgh"sv); +} + +TEST_CASE(substring) +{ + auto superstring = MUST(String::from_utf8("Hello I am a long string"sv)); + auto short_substring = MUST(superstring.substring_from_byte_offset(0, 5)); + EXPECT_EQ(short_substring, "Hello"sv); + + auto long_substring = MUST(superstring.substring_from_byte_offset(0, 10)); + EXPECT_EQ(long_substring, "Hello I am"sv); +} + +TEST_CASE(code_points) +{ + auto string = MUST(String::from_utf8("🦬🪒"sv)); + + Vector code_points; + for (auto code_point : string.code_points()) + code_points.append(code_point); + + EXPECT_EQ(code_points[0], 0x1f9acu); + EXPECT_EQ(code_points[1], 0x1fa92u); +} + +TEST_CASE(string_builder) +{ + StringBuilder builder; + builder.append_code_point(0x1f9acu); + builder.append_code_point(0x1fa92u); + + auto string = MUST(builder.to_string()); + EXPECT_EQ(string, "🦬🪒"sv); + EXPECT_EQ(string.bytes().size(), 8u); +} + +TEST_CASE(ak_format) +{ + auto foo = MUST(String::formatted("Hello {}", MUST(String::from_utf8("friends"sv)))); + EXPECT_EQ(foo, "Hello friends"sv); +} + +TEST_CASE(replace) +{ + { + auto haystack = MUST(String::from_utf8("Hello enemies"sv)); + auto result = MUST(haystack.replace("enemies"sv, "friends"sv, ReplaceMode::All)); + EXPECT_EQ(result, "Hello friends"sv); + } + + { + auto base_title = MUST(String::from_utf8("anon@courage:~"sv)); + auto result = MUST(base_title.replace("[*]"sv, "(*)"sv, ReplaceMode::FirstOnly)); + EXPECT_EQ(result, "anon@courage:~"sv); + } +}