AK: Ensure short String instances are valid UTF-8

We are currently only validating long strings.
This commit is contained in:
Timothy Flynn 2023-03-03 09:03:45 -05:00 committed by Tim Flynn
parent 434ca78425
commit da0d000909
Notes: sideshowbarker 2024-07-16 23:32:35 +09:00
3 changed files with 37 additions and 15 deletions

View File

@ -11,7 +11,6 @@
#include <AK/MemMem.h> #include <AK/MemMem.h>
#include <AK/Stream.h> #include <AK/Stream.h>
#include <AK/String.h> #include <AK/String.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h> #include <AK/Vector.h>
#include <stdlib.h> #include <stdlib.h>
@ -132,10 +131,6 @@ ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data,
// Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT); VERIFY(byte_count > String::MAX_SHORT_STRING_BYTE_COUNT);
Utf8View view(StringView(utf8_data, byte_count));
if (!view.validate())
return Error::from_string_literal("StringData::from_utf8: Input was not valid UTF-8");
VERIFY(utf8_data); VERIFY(utf8_data);
u8* buffer = nullptr; u8* buffer = nullptr;
auto new_string_data = TRY(create_uninitialized(byte_count, buffer)); auto new_string_data = TRY(create_uninitialized(byte_count, buffer));
@ -143,6 +138,16 @@ ErrorOr<NonnullRefPtr<StringData>> StringData::from_utf8(char const* utf8_data,
return new_string_data; return new_string_data;
} }
// Fills `buffer` completely from `stream`, then checks that the bytes read are valid UTF-8.
// Any error from the stream read is propagated; bytes that fail validation yield an error.
static ErrorOr<void> read_stream_into_buffer(Stream& stream, Bytes buffer)
{
    TRY(stream.read_entire_buffer(buffer));

    Utf8View utf8_view { StringView { buffer } };
    if (utf8_view.validate())
        return {};

    return Error::from_string_literal("String::from_stream: Input was not valid UTF-8");
}
ErrorOr<NonnullRefPtr<StringData>> StringData::from_stream(Stream& stream, size_t byte_count) ErrorOr<NonnullRefPtr<StringData>> StringData::from_stream(Stream& stream, size_t byte_count)
{ {
// Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization. // Strings of MAX_SHORT_STRING_BYTE_COUNT bytes or less should be handled by the String short string optimization.
@ -150,12 +155,7 @@ ErrorOr<NonnullRefPtr<StringData>> StringData::from_stream(Stream& stream, size_
u8* buffer = nullptr; u8* buffer = nullptr;
auto new_string_data = TRY(create_uninitialized(byte_count, buffer)); auto new_string_data = TRY(create_uninitialized(byte_count, buffer));
Bytes new_string_bytes = { buffer, byte_count }; TRY(read_stream_into_buffer(stream, { buffer, byte_count }));
TRY(stream.read_entire_buffer(new_string_bytes));
Utf8View view(StringView { new_string_bytes });
if (!view.validate())
return Error::from_string_literal("StringData::from_stream: Input was not valid UTF-8");
return new_string_data; return new_string_data;
} }
@ -230,6 +230,9 @@ void String::destroy_string()
ErrorOr<String> String::from_utf8(StringView view) ErrorOr<String> String::from_utf8(StringView view)
{ {
if (!Utf8View { view }.validate())
return Error::from_string_literal("String::from_utf8: Input was not valid UTF-8");
if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) { if (view.length() <= MAX_SHORT_STRING_BYTE_COUNT) {
ShortString short_string; ShortString short_string;
if (!view.is_empty()) if (!view.is_empty())
@ -246,7 +249,7 @@ ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) { if (byte_count <= MAX_SHORT_STRING_BYTE_COUNT) {
ShortString short_string; ShortString short_string;
if (byte_count > 0) if (byte_count > 0)
TRY(stream.read_entire_buffer({ short_string.storage, byte_count })); TRY(Detail::read_stream_into_buffer(stream, { short_string.storage, byte_count }));
short_string.byte_count_and_short_string_flag = (byte_count << 1) | SHORT_STRING_FLAG; short_string.byte_count_and_short_string_flag = (byte_count << 1) | SHORT_STRING_FLAG;
return String { short_string }; return String { short_string };
} }
@ -587,9 +590,6 @@ DeprecatedString String::to_deprecated_string() const
ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string) ErrorOr<String> String::from_deprecated_string(DeprecatedString const& deprecated_string)
{ {
Utf8View view(deprecated_string);
if (!view.validate())
return Error::from_string_literal("String::from_deprecated_string: Input was not valid UTF-8");
return String::from_utf8(deprecated_string.view()); return String::from_utf8(deprecated_string.view());
} }

View File

@ -20,6 +20,7 @@
#include <AK/Traits.h> #include <AK/Traits.h>
#include <AK/Types.h> #include <AK/Types.h>
#include <AK/UnicodeUtils.h> #include <AK/UnicodeUtils.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h> #include <AK/Vector.h>
namespace AK { namespace AK {
@ -72,6 +73,7 @@ public:
static AK_SHORT_STRING_CONSTEVAL String from_utf8_short_string(StringView string) static AK_SHORT_STRING_CONSTEVAL String from_utf8_short_string(StringView string)
{ {
VERIFY(string.length() <= MAX_SHORT_STRING_BYTE_COUNT); VERIFY(string.length() <= MAX_SHORT_STRING_BYTE_COUNT);
VERIFY(Utf8View { string }.validate());
ShortString short_string; ShortString short_string;
for (size_t i = 0; i < string.length(); ++i) for (size_t i = 0; i < string.length(); ++i)

View File

@ -140,6 +140,26 @@ TEST_CASE(long_streams)
} }
} }
// Checks that String::from_utf8 and String::from_stream report an error for input that is not valid UTF-8.
TEST_CASE(invalid_utf8)
{
    // Ill-formed sequence: the trailing 0xC0 is not a UTF-8 continuation byte. (These are the bytes of
    // U+10FFFF, F4 8F BF BF, with the last byte bumped by one; U+110000 itself would be F4 90 80 80.)
    auto string1 = String::from_utf8("long string \xf4\x8f\xbf\xc0"sv);
    EXPECT(string1.is_error());
    EXPECT(string1.error().string_literal().contains("Input was not valid UTF-8"sv));

    // Structurally well-formed four-byte sequence whose value, U+121C3D, lies beyond U+10FFFF.
    auto string2 = String::from_utf8("\xf4\xa1\xb0\xbd"sv);
    EXPECT(string2.is_error());
    EXPECT(string2.error().string_literal().contains("Input was not valid UTF-8"sv));

    // The same four bytes as string2, delivered through a stream instead of a StringView.
    AllocatingMemoryStream stream;
    MUST(stream.write_value<u8>(0xf4));
    MUST(stream.write_value<u8>(0xa1));
    MUST(stream.write_value<u8>(0xb0));
    MUST(stream.write_value<u8>(0xbd));

    auto string3 = String::from_stream(stream, stream.used_buffer_size());
    EXPECT(string3.is_error());
    EXPECT(string3.error().string_literal().contains("Input was not valid UTF-8"sv));
}
TEST_CASE(from_code_points) TEST_CASE(from_code_points)
{ {
for (u32 code_point = 0; code_point < 0x80; ++code_point) { for (u32 code_point = 0; code_point < 0x80; ++code_point) {