diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt
index 46ec2ee1a4c..295e41a3284 100644
--- a/AK/CMakeLists.txt
+++ b/AK/CMakeLists.txt
@@ -15,6 +15,7 @@ set(AK_SOURCES
     LexicalPath.cpp
     Random.cpp
     StackInfo.cpp
+    String.cpp
     StringBuilder.cpp
     StringFloatingPointConversions.cpp
     StringImpl.cpp
diff --git a/AK/Forward.h b/AK/Forward.h
index a981707994f..31561e6de47 100644
--- a/AK/Forward.h
+++ b/AK/Forward.h
@@ -31,6 +31,7 @@ class StringView;
 class Time;
 class URL;
 class FlyString;
+class String;
 class Utf16View;
 class Utf32View;
 class Utf8CodePointIterator;
@@ -188,6 +189,7 @@ using AK::RefPtr;
 using AK::SinglyLinkedList;
 using AK::Span;
 using AK::StackInfo;
+using AK::String;
 using AK::StringBuilder;
 using AK::StringImpl;
 using AK::StringView;
diff --git a/AK/String.cpp b/AK/String.cpp
new file mode 100644
index 00000000000..ac443a0ce59
--- /dev/null
+++ b/AK/String.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2018-2022, Andreas Kling
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace AK {
+
+namespace Detail {
+
+// Reference-counted heap storage behind every String that does not fit the short string optimization.
+// The object is over-allocated: the trailing array holds either the UTF-8 bytes themselves or,
+// when m_substring is set, a SubstringData record that points into another StringData.
+class StringData final : public RefCounted<StringData> {
+public:
+    static ErrorOr<NonnullRefPtr<StringData>> create_uninitialized(size_t, u8*& buffer);
+    static ErrorOr<NonnullRefPtr<StringData>> create_substring(StringData const& superstring, size_t start, size_t byte_count);
+    static ErrorOr<NonnullRefPtr<StringData>> from_utf8(char const* utf8_bytes, size_t);
+
+    struct SubstringData {
+        StringData const* superstring { nullptr };
+        u32 start_offset { 0 };
+    };
+
+    void operator delete(void* ptr);
+
+    ~StringData();
+
+    SubstringData const& substring_data() const
+    {
+        return *reinterpret_cast<SubstringData const*>(m_bytes_or_substring_data);
+    }
+
+    // NOTE: There is no guarantee about null-termination.
+    ReadonlyBytes bytes() const
+    {
+        if (m_substring) {
+            auto const& data = substring_data();
+            return data.superstring->bytes().slice(data.start_offset, m_byte_count);
+        }
+        return { &m_bytes_or_substring_data[0], m_byte_count };
+    }
+
+    StringView bytes_as_string_view() const { return { bytes() }; }
+
+    bool operator==(StringData const& other) const
+    {
+        return bytes_as_string_view() == other.bytes_as_string_view();
+    }
+
+    unsigned hash() const
+    {
+        if (!m_has_hash)
+            compute_hash();
+        return m_hash;
+    }
+
+private:
+    explicit StringData(size_t byte_count);
+    StringData(StringData const& superstring, size_t start, size_t byte_count);
+
+    void compute_hash() const;
+
+    u32 m_byte_count { 0 };
+    mutable unsigned m_hash { 0 };
+    mutable bool m_has_hash { false };
+    bool m_substring { false };
+
+    // NOTE: Aligned for SubstringData, because substring_data() reinterprets this storage as one.
+    alignas(SubstringData) u8 m_bytes_or_substring_data[0];
+};
+
+void StringData::operator delete(void* ptr)
+{
+    free(ptr);
+}
+
+StringData::StringData(size_t byte_count)
+    : m_byte_count(byte_count)
+{
+}
+
+StringData::StringData(StringData const& superstring, size_t start, size_t byte_count)
+    : m_byte_count(byte_count)
+    , m_substring(true)
+{
+    auto& data = const_cast<SubstringData&>(substring_data());
+    data.start_offset = start;
+    data.superstring = &superstring;
+    // Keep the superstring alive for as long as this substring exists; ~StringData() drops the reference again.
+    superstring.ref();
+}
+
+StringData::~StringData()
+{
+    if (m_substring)
+        substring_data().superstring->unref();
+}
+
+constexpr size_t allocation_size_for_string_data(size_t length)
+{
+    return sizeof(StringData) + (sizeof(char) * length) + sizeof(char);
+}
+
+ErrorOr<NonnullRefPtr<StringData>> StringData::create_uninitialized(size_t byte_count, u8*& buffer)
+{
+    VERIFY(byte_count);
+    void* slot = malloc(allocation_size_for_string_data(byte_count));
+    if (!slot) {
+        return Error::from_errno(ENOMEM);
+    }
+    auto new_string_data = adopt_ref(*new (slot) StringData(byte_count));
+    buffer = const_cast<u8*>(new_string_data->bytes().data());
+    return new_string_data;
+}
+
+ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data, size_t byte_count)
+{
+    // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
+    VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
+    VERIFY(utf8_data);
+
+    Utf8View view(StringView(utf8_data, byte_count));
+    if (!view.validate())
+        return Error::from_string_literal("StringData::from_utf8: Input was not valid UTF-8");
+
+    u8* buffer = nullptr;
+    auto new_string_data = TRY(create_uninitialized(byte_count, buffer));
+    memcpy(buffer, utf8_data, byte_count * sizeof(char));
+    return new_string_data;
+}
+
+ErrorOr<NonnullRefPtr<StringData>> StringData::create_substring(StringData const& superstring, size_t start, size_t byte_count)
+{
+    // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
+    VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
+
+    void* slot = malloc(sizeof(StringData) + sizeof(StringData::SubstringData));
+    if (!slot) {
+        return Error::from_errno(ENOMEM);
+    }
+    return adopt_ref(*new (slot) StringData(superstring, start, byte_count));
+}
+
+void StringData::compute_hash() const
+{
+    auto bytes = this->bytes();
+    if (bytes.size() == 0)
+        m_hash = 0;
+    else
+        m_hash = string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
+    m_has_hash = true;
+}
+
+}
+
+String::String(NonnullRefPtr<Detail::StringData> data)
+    : m_data(&data.leak_ref())
+{
+}
+
+String::String(ShortString short_string)
+    : m_short_string(short_string)
+{
+}
+
+String::String(String const& other)
+    : m_data(other.m_data)
+{
+    if (!is_short_string())
+        m_data->ref();
+}
+
+String::String(String&& other)
+    : m_data(exchange(other.m_data, nullptr))
+{
+    // Leave the moved-from String as a valid empty short string instead of a null pointer that bytes() would dereference.
+    other.m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
+}
+
+String& String::operator=(String&& other)
+{
+    if (&other != this) {
+        // Drop our reference to the data being replaced; overwriting m_data without this leaks it.
+        if (!is_short_string() && m_data)
+            m_data->unref();
+        m_data = exchange(other.m_data, nullptr);
+        other.m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
+    }
+    return *this;
+}
+
+String& String::operator=(String const& other)
+{
+    if (&other != this) {
+        // Drop our reference to the data being replaced; overwriting m_data without this leaks it.
+        if (!is_short_string() && m_data)
+            m_data->unref();
+        m_data = other.m_data;
+        if (!is_short_string())
+            m_data->ref();
+    }
+    return *this;
+}
+
+String::~String()
+{
+    if (!is_short_string() && m_data)
+        m_data->unref();
+}
+
+String::String()
+{
+    // This is an empty string, it's always short and zero-length.
+    m_short_string.byte_count_and_short_string_flag = SHORT_STRING_FLAG;
+}
+
+ErrorOr<String> String::from_utf8(StringView view)
+{
+    if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) {
+        // Short strings never reach StringData::from_utf8(), so they have to be validated here.
+        if (!Utf8View(view).validate())
+            return Error::from_string_literal("String::from_utf8: Input was not valid UTF-8");
+        ShortString short_string;
+        if (!view.is_empty())
+            memcpy(short_string.storage, view.characters_without_null_termination(), view.length());
+        short_string.byte_count_and_short_string_flag = (view.length() << 1) | SHORT_STRING_FLAG;
+        return String { short_string };
+    }
+    auto data = TRY(Detail::StringData::from_utf8(view.characters_without_null_termination(), view.length()));
+    return String { move(data) };
+}
+
+StringView String::bytes_as_string_view() const
+{
+    return StringView(bytes());
+}
+
+ReadonlyBytes String::bytes() const
+{
+    if (is_short_string())
+        return m_short_string.bytes();
+    return m_data->bytes();
+}
+
+bool String::is_empty() const
+{
+    return bytes().size() == 0;
+}
+
+ErrorOr<String> String::vformatted(StringView fmtstr, TypeErasedFormatParams& params)
+{
+    StringBuilder builder;
+    TRY(vformat(builder, fmtstr, params));
+    return builder.to_string();
+}
+
+bool String::operator==(String const& other) const
+{
+    if (is_short_string())
+        return m_data == other.m_data;
+    return bytes_as_string_view() == other.bytes_as_string_view();
+}
+
+bool String::operator==(StringView other) const
+{
+    return bytes_as_string_view() == other;
+}
+
+ErrorOr<String> String::substring_from_byte_offset(size_t start, size_t byte_count) const
+{
+    if (!byte_count)
+        return String {};
+    return String::from_utf8(bytes_as_string_view().substring_view(start, byte_count));
+}
+
+ErrorOr<String> String::substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const
+{
+    if (!byte_count)
+        return String {};
+    // A window reaching past the end is a caller bug. Catching it here also guarantees that a short string,
+    // which has no StringData behind m_data, can never reach create_substring() below.
+    auto const available_bytes = bytes().size();
+    VERIFY(start <= available_bytes && byte_count <= available_bytes - start);
+    auto window = bytes_as_string_view().substring_view(start, byte_count);
+    if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT)
+        return String::from_utf8(window);
+    // The shared path does not go through from_utf8(), so reject a window that splits a code point here.
+    if (!Utf8View(window).validate())
+        return Error::from_string_literal("String::substring_from_byte_offset_with_shared_superstring: Input was not valid UTF-8");
+    return String { TRY(Detail::StringData::create_substring(*m_data, start, byte_count)) };
+}
+
+bool String::operator==(char const* c_string) const
+{
+    return bytes_as_string_view() == c_string;
+}
+
+u32 String::hash() const
+{
+    if (is_short_string()) {
+        auto bytes = this->bytes();
+        return string_hash(reinterpret_cast<char const*>(bytes.data()), bytes.size());
+    }
+    return m_data->hash();
+}
+
+Utf8View String::code_points() const
+{
+    return Utf8View(bytes_as_string_view());
+}
+
+ErrorOr<void> Formatter<String>::format(FormatBuilder& builder, String const& utf8_string)
+{
+    return Formatter<StringView>::format(builder, utf8_string.bytes_as_string_view());
+}
+
+ErrorOr<String> String::replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const
+{
+    return StringUtils::replace(*this, needle, replacement, replace_mode);
+}
+
+bool String::is_short_string() const
+{
+    return reinterpret_cast<uintptr_t>(m_data) & SHORT_STRING_FLAG;
+}
+
+ReadonlyBytes String::ShortString::bytes() const
+{
+    return { storage, byte_count() };
+}
+
+size_t String::ShortString::byte_count() const
+{
+    return byte_count_and_short_string_flag >> 1;
+}
+
+unsigned Traits<String>::hash(String const& string)
+{
+    return string.hash();
+}
+
+DeprecatedString String::to_deprecated_string() const
+{
+    return DeprecatedString(bytes_as_string_view());
+}
+
+ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string)
+{
+    Utf8View view(deprecated_string);
+    if (!view.validate())
+        return Error::from_string_literal("String::from_deprecated_string: Input was not valid UTF-8");
+    return String::from_utf8(deprecated_string.view());
+}
+
+}
diff --git a/AK/String.h b/AK/String.h
new file mode 100644
index 00000000000..3ce07bdb895
--- /dev/null
+++ b/AK/String.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018-2022, Andreas Kling
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace AK {
+
+namespace Detail {
+class StringData;
+}
+
+// String is a strongly owned sequence of Unicode code points encoded as UTF-8.
+// The data may or may not be heap-allocated, and may or may not be reference counted.
+// There is no guarantee that the underlying bytes are null-terminated.
+class String {
+public:
+    // NOTE: For short strings, we avoid heap allocations by storing them in the data pointer slot.
+    static constexpr size_t MAX_SHORT_STRING_BYTE_COUNT = sizeof(Detail::StringData*) - 1;
+
+    String(String const&);
+    String(String&&);
+
+    String& operator=(String&&);
+    String& operator=(String const&);
+
+    ~String();
+
+    // Creates an empty (zero-length) String.
+    String();
+
+    // Creates a new String from a sequence of UTF-8 encoded code points.
+    static ErrorOr<String> from_utf8(StringView);
+
+    // Creates a substring with a deep copy of the specified data window.
+    ErrorOr<String> substring_from_byte_offset(size_t start, size_t byte_count) const;
+
+    // Creates a substring that strongly references the origin superstring instead of making a deep copy of the data.
+    ErrorOr<String> substring_from_byte_offset_with_shared_superstring(size_t start, size_t byte_count) const;
+
+    // Returns an iterable view over the Unicode code points.
+    [[nodiscard]] Utf8View code_points() const;
+
+    // Returns the underlying UTF-8 encoded bytes.
+    // NOTE: There is no guarantee about null-termination.
+    [[nodiscard]] ReadonlyBytes bytes() const;
+
+    // Returns true if the String is zero-length.
+    [[nodiscard]] bool is_empty() const;
+
+    // Returns a StringView covering the full length of the string. Note that iterating this will go byte-at-a-time, not code-point-at-a-time.
+    [[nodiscard]] StringView bytes_as_string_view() const;
+
+    ErrorOr<String> replace(StringView needle, StringView replacement, ReplaceMode replace_mode) const;
+
+    [[nodiscard]] bool operator==(String const&) const;
+    [[nodiscard]] bool operator!=(String const& other) const { return !(*this == other); }
+
+    [[nodiscard]] bool operator==(StringView) const;
+    [[nodiscard]] bool operator!=(StringView other) const { return !(*this == other); }
+
+    [[nodiscard]] bool operator==(char const* cstring) const;
+    [[nodiscard]] bool operator!=(char const* cstring) const { return !(*this == cstring); }
+
+    [[nodiscard]] u32 hash() const;
+
+    template<typename T>
+    static ErrorOr<String> number(T value)
+    requires IsArithmetic<T>
+    {
+        return formatted("{}", value);
+    }
+
+    static ErrorOr<String> vformatted(StringView fmtstr, TypeErasedFormatParams&);
+
+    template<typename... Parameters>
+    static ErrorOr<String> formatted(CheckedFormatString<Parameters...>&& fmtstr, Parameters const&... parameters)
+    {
+        VariadicFormatParams variadic_format_parameters { parameters... };
+        return vformatted(fmtstr.view(), variadic_format_parameters);
+    }
+
+    // NOTE: This is primarily interesting to unit tests.
+    [[nodiscard]] bool is_short_string() const;
+
+    // FIXME: Remove these once all code has been ported to String
+    [[nodiscard]] DeprecatedString to_deprecated_string() const;
+    static ErrorOr<String> from_deprecated_string(DeprecatedString const&);
+
+private:
+    // NOTE: If the least significant bit of the pointer is set, this is a short string.
+    static constexpr uintptr_t SHORT_STRING_FLAG = 1;
+
+    struct ShortString {
+        ReadonlyBytes bytes() const;
+        size_t byte_count() const;
+
+        // NOTE: This is the byte count shifted left 1 step and or'ed with a 1 (the SHORT_STRING_FLAG)
+        u8 byte_count_and_short_string_flag { 0 };
+        u8 storage[MAX_SHORT_STRING_BYTE_COUNT] = { 0 };
+    };
+
+    // NOTE: is_short_string() tests the low bit of the pointer that shares the union below with ShortString.
+    //       The flag byte is the first byte of ShortString, which holds the pointer's least significant
+    //       bits only on a little-endian target. ShortString must also stay exactly pointer-sized.
+    static_assert(sizeof(ShortString) == sizeof(Detail::StringData*));
+
+    explicit String(NonnullRefPtr<Detail::StringData>);
+    explicit String(ShortString);
+
+    union {
+        ShortString m_short_string;
+        Detail::StringData* m_data { nullptr };
+    };
+};
+
+template<>
+struct Traits<String> : public GenericTraits<String> {
+    static unsigned hash(String const&);
+};
+
+template<>
+struct Formatter<String> : Formatter<StringView> {
+    ErrorOr<void> format(FormatBuilder&, String const&);
+};
+
+}
diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp
index 9db7c9f23ef..932699e208d 100644
--- a/AK/StringBuilder.cpp
+++ b/AK/StringBuilder.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -115,6 +116,11 @@ DeprecatedString StringBuilder::build() const
 {
     return to_deprecated_string();
 }
+
+ErrorOr<String> StringBuilder::to_string() const
+{
+    return String::from_utf8(string_view());
+}
 #endif
 
 StringView StringBuilder::string_view() const
diff --git a/AK/StringBuilder.h b/AK/StringBuilder.h
index e5c62cb4d0f..9bde5b102d7 100644
--- a/AK/StringBuilder.h
+++ b/AK/StringBuilder.h
@@ -62,7 +62,9 @@ public:
 #ifndef KERNEL
     [[nodiscard]] DeprecatedString build() const;
     [[nodiscard]] DeprecatedString to_deprecated_string() const;
+    ErrorOr<String> to_string() const;
 #endif
+
     [[nodiscard]] ByteBuffer to_byte_buffer() const;
 
     [[nodiscard]] StringView string_view() const;
diff --git a/AK/StringUtils.cpp b/AK/StringUtils.cpp
index f72d3ec6eae..58157d148e6 100644
--- a/AK/StringUtils.cpp
+++ b/AK/StringUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, Andreas Kling
+ * Copyright (c) 2018-2022, Andreas Kling
  * Copyright (c) 2020, Fei Wu
  *
  * SPDX-License-Identifier: BSD-2-Clause
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ 
-533,6 +534,42 @@ DeprecatedString replace(StringView str, StringView needle, StringView replaceme
     replaced_string.append(str.substring_view(last_position, str.length() - last_position));
     return replaced_string.build();
 }
+
+// Returns a copy of haystack in which occurrences of needle are replaced by replacement: every occurrence for
+// ReplaceMode::All, only the first one otherwise. Yields haystack itself when it is empty or contains no match.
+ErrorOr<String> replace(String const& haystack, StringView needle, StringView replacement, ReplaceMode replace_mode)
+{
+    if (haystack.is_empty())
+        return haystack;
+
+    auto const haystack_view = haystack.bytes_as_string_view();
+
+    // FIXME: find_all() cannot report Vector allocation failures yet (or do this without putting positions in a vector)
+    Vector<size_t> positions;
+    if (replace_mode == ReplaceMode::All) {
+        positions = haystack_view.find_all(needle);
+        if (!positions.size())
+            return haystack;
+    } else {
+        auto pos = haystack_view.find(needle);
+        if (!pos.has_value())
+            return haystack;
+        TRY(positions.try_append(pos.value()));
+    }
+
+    StringBuilder replaced_string;
+    size_t last_position = 0;
+    for (auto& position : positions) {
+        // Skip a match that starts before the end of the previous one; "position - last_position" would underflow.
+        if (position < last_position)
+            continue;
+        TRY(replaced_string.try_append(haystack_view.substring_view(last_position, position - last_position)));
+        TRY(replaced_string.try_append(replacement));
+        last_position = position + needle.length();
+    }
+    TRY(replaced_string.try_append(haystack_view.substring_view(last_position, haystack_view.length() - last_position)));
+    return replaced_string.to_string();
+}
 #endif
 
 // TODO: Benchmark against KMP (AK/MemMem.h) and switch over if it's faster for short strings too
diff --git a/AK/StringUtils.h b/AK/StringUtils.h
index a5313803237..173d8c941bf 100644
--- a/AK/StringUtils.h
+++ b/AK/StringUtils.h
@@ -103,6 +103,8 @@ DeprecatedString to_titlecase(StringView);
 DeprecatedString invert_case(StringView);
 
 DeprecatedString replace(StringView, StringView needle, StringView replacement, ReplaceMode);
+ErrorOr<String> replace(String const&, StringView needle, StringView replacement, ReplaceMode);
+
 size_t count(StringView, StringView needle);
 
 }
diff --git a/Tests/AK/CMakeLists.txt b/Tests/AK/CMakeLists.txt
index be5538afa00..c0d5d18763f 100644
--- 
a/Tests/AK/CMakeLists.txt
+++ b/Tests/AK/CMakeLists.txt
@@ -62,6 +62,7 @@ set(AK_TEST_SOURCES
     TestSpan.cpp
     TestStack.cpp
     TestStdLibExtras.cpp
+    TestString.cpp
     TestStringFloatingPointConversions.cpp
     TestStringUtils.cpp
     TestStringView.cpp
diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp
new file mode 100644
index 00000000000..c453e6d74c7
--- /dev/null
+++ b/Tests/AK/TestString.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2022, Andreas Kling
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+TEST_CASE(construct_empty)
+{
+    String empty;
+    EXPECT(empty.is_empty());
+    EXPECT_EQ(empty.bytes().size(), 0u);
+
+    auto empty2 = MUST(String::from_utf8(""sv));
+    EXPECT(empty2.is_empty());
+    EXPECT_EQ(empty, empty2);
+    EXPECT_EQ(empty, ""sv);
+}
+
+TEST_CASE(short_strings)
+{
+#ifdef AK_ARCH_64_BIT
+    auto string = MUST(String::from_utf8("abcdefg"sv));
+    EXPECT_EQ(string.is_short_string(), true);
+    EXPECT_EQ(string.bytes().size(), 7u);
+    EXPECT_EQ(string.bytes_as_string_view(), "abcdefg"sv);
+#else
+    auto string = MUST(String::from_utf8("abc"sv));
+    EXPECT_EQ(string.is_short_string(), true);
+    EXPECT_EQ(string.bytes().size(), 3u);
+    EXPECT_EQ(string.bytes_as_string_view(), "abc"sv);
+#endif
+}
+
+TEST_CASE(long_strings)
+{
+    auto string = MUST(String::from_utf8("abcdefgh"sv));
+    EXPECT_EQ(string.is_short_string(), false);
+    EXPECT_EQ(string.bytes().size(), 8u);
+    EXPECT_EQ(string.bytes_as_string_view(), "abcdefgh"sv);
+}
+
+TEST_CASE(substring)
+{
+    auto superstring = MUST(String::from_utf8("Hello I am a long string"sv));
+    auto short_substring = MUST(superstring.substring_from_byte_offset(0, 5));
+    EXPECT_EQ(short_substring, "Hello"sv);
+
+    auto long_substring = MUST(superstring.substring_from_byte_offset(0, 10));
+    EXPECT_EQ(long_substring, "Hello I am"sv);
+}
+
+TEST_CASE(code_points)
+{
+    auto string = MUST(String::from_utf8("🦬🪒"sv));
+
+    Vector<u32> code_points;
+    for (auto code_point : string.code_points())
+        code_points.append(code_point);
+
+    // Check the count first, and bail out rather than index past the end if decoding produced something else.
+    EXPECT_EQ(code_points.size(), 2u);
+    if (code_points.size() != 2)
+        return;
+    EXPECT_EQ(code_points[0], 0x1f9acu);
+    EXPECT_EQ(code_points[1], 0x1fa92u);
+}
+
+TEST_CASE(string_builder)
+{
+    StringBuilder builder;
+    builder.append_code_point(0x1f9acu);
+    builder.append_code_point(0x1fa92u);
+
+    auto string = MUST(builder.to_string());
+    EXPECT_EQ(string, "🦬🪒"sv);
+    EXPECT_EQ(string.bytes().size(), 8u);
+}
+
+TEST_CASE(ak_format)
+{
+    auto foo = MUST(String::formatted("Hello {}", MUST(String::from_utf8("friends"sv))));
+    EXPECT_EQ(foo, "Hello friends"sv);
+}
+
+TEST_CASE(replace)
+{
+    {
+        auto haystack = MUST(String::from_utf8("Hello enemies"sv));
+        auto result = MUST(haystack.replace("enemies"sv, "friends"sv, ReplaceMode::All));
+        EXPECT_EQ(result, "Hello friends"sv);
+    }
+
+    {
+        auto base_title = MUST(String::from_utf8("anon@courage:~"sv));
+        auto result = MUST(base_title.replace("[*]"sv, "(*)"sv, ReplaceMode::FirstOnly));
+        EXPECT_EQ(result, "anon@courage:~"sv);
+    }
+}