From 9c2a7c0e03fcfcf32c89b7e51f921c88afbe41cf Mon Sep 17 00:00:00 2001 From: Jelle Raaijmakers Date: Tue, 8 Mar 2022 14:27:11 +0100 Subject: [PATCH] LibTextCodec: Add support for the UTF16-LE encoding --- Userland/Libraries/LibTextCodec/Decoder.cpp | 29 ++++++++++++++++++--- Userland/Libraries/LibTextCodec/Decoder.h | 7 +++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp index 35be9c52efd..2855bff677d 100644 --- a/Userland/Libraries/LibTextCodec/Decoder.cpp +++ b/Userland/Libraries/LibTextCodec/Decoder.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2020, Andreas Kling + * Copyright (c) 2022, Jelle Raaijmakers * * SPDX-License-Identifier: BSD-2-Clause */ @@ -14,6 +15,7 @@ namespace { Latin1Decoder s_latin1_decoder; UTF8Decoder s_utf8_decoder; UTF16BEDecoder s_utf16be_decoder; +UTF16LEDecoder s_utf16le_decoder; Latin2Decoder s_latin2_decoder; HebrewDecoder s_hebrew_decoder; CyrillicDecoder s_cyrillic_decoder; @@ -33,6 +35,8 @@ Decoder* decoder_for(const String& a_encoding) return &s_utf8_decoder; if (encoding.value().equals_ignoring_case("utf-16be")) return &s_utf16be_decoder; + if (encoding.value().equals_ignoring_case("utf-16le")) + return &s_utf16le_decoder; if (encoding.value().equals_ignoring_case("iso-8859-2")) return &s_latin2_decoder; if (encoding.value().equals_ignoring_case("windows-1255")) @@ -172,8 +176,7 @@ Decoder* bom_sniff_to_decoder(StringView input) case 0xFE: // UTF-16BE return bytes[1] == 0xFF ? &s_utf16be_decoder : nullptr; case 0xFF: // UTF-16LE - // FIXME: There is currently no UTF-16LE decoder. - TODO(); + return bytes[1] == 0xFE ? &s_utf16le_decoder : nullptr; } return nullptr; @@ -241,9 +244,29 @@ String UTF16BEDecoder::to_utf8(StringView input) { // Discard the BOM auto bomless_input = input; - if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) { + if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) bomless_input = input.substring_view(2); + + StringBuilder builder(bomless_input.length() / 2); + process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); }); + return builder.to_string(); +} + +void UTF16LEDecoder::process(StringView input, Function on_code_point) +{ + size_t utf16_length = input.length() - (input.length() % 2); + for (size_t i = 0; i < utf16_length; i += 2) { + u16 code_point = input[i] | (input[i + 1] << 8); + on_code_point(code_point); } +} + +String UTF16LEDecoder::to_utf8(StringView input) +{ + // Discard the BOM + auto bomless_input = input; + if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE) + bomless_input = input.substring_view(2); StringBuilder builder(bomless_input.length() / 2); process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); }); diff --git a/Userland/Libraries/LibTextCodec/Decoder.h b/Userland/Libraries/LibTextCodec/Decoder.h index 7c7c47c7fbf..110648b2d63 100644 --- a/Userland/Libraries/LibTextCodec/Decoder.h +++ b/Userland/Libraries/LibTextCodec/Decoder.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2020-2021, Andreas Kling + * Copyright (c) 2022, Jelle Raaijmakers * * SPDX-License-Identifier: BSD-2-Clause */ @@ -32,6 +33,12 @@ public: virtual String to_utf8(StringView) override; }; +class UTF16LEDecoder final : public Decoder { +public: + virtual void process(StringView, Function on_code_point) override; + virtual String to_utf8(StringView) override; +}; + class Latin1Decoder final : public Decoder { public: virtual void process(StringView, Function on_code_point) override;