From 477e3946e50cdfcbd3ee3688e4529a7fa1d9e442 Mon Sep 17 00:00:00 2001
From: Matthew Olsson
Date: Sat, 22 May 2021 20:44:18 -0700
Subject: [PATCH] LibPDF: Add support for stream filters

This commit also splits up StreamObject into PlainTextStreamObject and
EncodedStreamObject, which is essentially just a stream object which
does not own its bytes vs one which does.
---
 Userland/Libraries/LibPDF/CMakeLists.txt |   2 +-
 Userland/Libraries/LibPDF/Filter.cpp     | 208 +++++++++++++++++++++++
 Userland/Libraries/LibPDF/Filter.h       |  34 ++++
 Userland/Libraries/LibPDF/Forward.h      |   3 +
 Userland/Libraries/LibPDF/Object.h       |  42 +++++-
 Userland/Libraries/LibPDF/Parser.cpp     |  14 +-
 6 files changed, 296 insertions(+), 7 deletions(-)
 create mode 100644 Userland/Libraries/LibPDF/Filter.cpp
 create mode 100644 Userland/Libraries/LibPDF/Filter.h

diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt
index 2da7ea56b51..6d52363eb66 100644
--- a/Userland/Libraries/LibPDF/CMakeLists.txt
+++ b/Userland/Libraries/LibPDF/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(SOURCES
-    Object.cpp
     Document.cpp
+    Filter.cpp
     Object.cpp
     Parser.cpp
     Renderer.cpp
diff --git a/Userland/Libraries/LibPDF/Filter.cpp b/Userland/Libraries/LibPDF/Filter.cpp
new file mode 100644
index 00000000000..2b2ab8ba846
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Filter.cpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Hex.h>
+#include <LibCompress/Deflate.h>
+#include <LibPDF/Filter.h>
+
+namespace PDF {
+
+// Dispatches to the decoder for the stream filter named `encoding_type`.
+// Returns an empty Optional when the name matches none of the filters below
+// or when the selected decoder itself returns an empty Optional.
+Optional<ByteBuffer> Filter::decode(const ReadonlyBytes& bytes, const FlyString& encoding_type)
+{
+    if (encoding_type == "ASCIIHexDecode")
+        return decode_ascii_hex(bytes);
+    if (encoding_type == "ASCII85Decode")
+        return decode_ascii85(bytes);
+    if (encoding_type == "LZWDecode")
+        return decode_lzw(bytes);
+    if (encoding_type == "FlateDecode")
+        return decode_flate(bytes);
+    if (encoding_type == "RunLengthDecode")
+        return decode_run_length(bytes);
+    if (encoding_type == "CCITTFaxDecode")
+        return decode_ccitt(bytes);
+    if (encoding_type == "JBIG2Decode")
+        return decode_jbig2(bytes);
+    if (encoding_type == "DCTDecode")
+        return decode_dct(bytes);
+    if (encoding_type == "JPXDecode")
+        return decode_jpx(bytes);
+    if (encoding_type == "Crypt")
+        return decode_crypt(bytes);
+
+    return {};
+}
+
+// Decodes pairs of hexadecimal digits into bytes. An odd number of digits is
+// handled as if a trailing '0' had been appended. Even-length input is handed
+// to AK's decode_hex(); the odd-length path returns an empty Optional as soon
+// as it meets a character that is not a hexadecimal digit.
+// FIXME: White-space and the '>' end-of-data marker are not handled yet.
+Optional<ByteBuffer> Filter::decode_ascii_hex(const ReadonlyBytes& bytes)
+{
+    if (bytes.size() % 2 == 0)
+        return decode_hex(bytes);
+
+    // FIXME: Integrate this padding into AK/Hex?
+
+    auto output = ByteBuffer::create_zeroed(bytes.size() / 2 + 1);
+
+    for (size_t i = 0; i < bytes.size() / 2; ++i) {
+        const auto c1 = decode_hex_digit(static_cast<char>(bytes[i * 2]));
+        if (c1 >= 16)
+            return {};
+
+        const auto c2 = decode_hex_digit(static_cast<char>(bytes[i * 2 + 1]));
+        if (c2 >= 16)
+            return {};
+
+        output[i] = (c1 << 4) + c2;
+    }
+
+    // Process last byte with a padded zero. The digit is validated like every
+    // other one so that an invalid character cannot slip into the output.
+    const auto last_digit = decode_hex_digit(static_cast<char>(bytes[bytes.size() - 1]));
+    if (last_digit >= 16)
+        return {};
+    output[output.size() - 1] = last_digit << 4;
+
+    return output;
+}
+
+// Decodes ASCII base-85 data: every group of five characters in the range
+// '!'..'u' encodes four bytes, most significant byte first, and 'z' stands for
+// four zero bytes. A shorter final group of n characters encodes n - 1 bytes.
+// White-space is skipped and decoding stops at the '~' that begins the "~>"
+// end-of-data marker. Returns an empty Optional if the data contains any other
+// character, a 'z' inside of a group, or a group that overflows 32 bits.
+Optional<ByteBuffer> Filter::decode_ascii85(const ReadonlyBytes& bytes)
+{
+    Vector<u8> buff;
+    buff.ensure_capacity(bytes.size());
+
+    // Appends the `count` most significant bytes of `number`. Shifting instead
+    // of reinterpreting the integer's memory keeps this endian-independent.
+    auto append_leading_bytes = [&buff](u32 number, size_t count) {
+        for (size_t i = 0; i < count; i++)
+            buff.append(static_cast<u8>(number >> (24 - 8 * i)));
+    };
+
+    // Base-85 digits of the group currently being collected. Counting digits
+    // rather than raw input bytes keeps interleaved white-space from skewing
+    // the group boundaries or causing reads past the end of the input.
+    u8 digits[5] = {};
+    size_t digit_count = 0;
+
+    for (size_t byte_index = 0; byte_index < bytes.size(); byte_index++) {
+        const auto byte = bytes[byte_index];
+
+        // The PDF white-space characters: space, HT, LF, CR, FF and NUL.
+        if (byte == ' ' || byte == '\t' || byte == '\n' || byte == '\r' || byte == '\f' || byte == '\0')
+            continue;
+
+        if (byte == '~')
+            break;
+
+        if (byte == 'z') {
+            if (digit_count != 0)
+                return {};
+            append_leading_bytes(0, 4);
+            continue;
+        }
+
+        if (byte < '!' || byte > 'u')
+            return {};
+
+        digits[digit_count++] = static_cast<u8>(byte - '!');
+        if (digit_count < 5)
+            continue;
+
+        u64 number = 0;
+        for (auto digit : digits)
+            number = number * 85 + digit;
+        if (number > 0xffffffff)
+            return {};
+
+        append_leading_bytes(static_cast<u32>(number), 4);
+        digit_count = 0;
+    }
+
+    // A trailing partial group is padded with 'u' (digit value 84) and yields
+    // one byte less than the number of characters it contains.
+    if (digit_count > 0) {
+        u64 number = 0;
+        for (size_t i = 0; i < 5; i++)
+            number = number * 85 + (i < digit_count ? digits[i] : 84);
+        if (number > 0xffffffff)
+            return {};
+
+        append_leading_bytes(static_cast<u32>(number), digit_count - 1);
+    }
+
+    return ByteBuffer::copy(buff.span());
+}
+
+// FIXME: Support LZW decoding. Until then, reaching this decoder aborts.
+Optional<ByteBuffer> Filter::decode_lzw(const ReadonlyBytes&)
+{
+    dbgln("LZW decoding is not supported");
+    VERIFY_NOT_REACHED();
+}
+
+// Inflates zlib-wrapped deflate data. Returns an empty Optional if the input
+// is too short to contain the zlib header or if DeflateDecompressor fails.
+Optional<ByteBuffer> Filter::decode_flate(const ReadonlyBytes& bytes)
+{
+    // A zlib stream (RFC 1950) starts with a two byte header that precedes the
+    // raw deflate data, so that header is skipped here.
+    // FIXME: Validate the zlib header and checksum instead of ignoring them.
+    static constexpr size_t zlib_header_size = 2;
+    if (bytes.size() < zlib_header_size)
+        return {};
+
+    return Compress::DeflateDecompressor::decompress_all(bytes.slice(zlib_header_size));
+}
+
+Optional<ByteBuffer> Filter::decode_run_length(const ReadonlyBytes&)
+{
+    // FIXME: Support RunLength decoding
+    TODO();
+}
+
+Optional<ByteBuffer> Filter::decode_ccitt(const ReadonlyBytes&)
+{
+    // FIXME: Support CCITT decoding
+    TODO();
+}
+
+Optional<ByteBuffer> Filter::decode_jbig2(const ReadonlyBytes&)
+{
+    // FIXME: Support JBIG2 decoding
+    TODO();
+}
+
+Optional<ByteBuffer> Filter::decode_dct(const ReadonlyBytes&)
+{
+    // FIXME: Support dct decoding
+    TODO();
+}
+
+Optional<ByteBuffer> Filter::decode_jpx(const ReadonlyBytes&)
+{
+    // FIXME: Support JPX decoding
+    TODO();
+}
+
+Optional<ByteBuffer> Filter::decode_crypt(const ReadonlyBytes&)
+{
+    // FIXME: Support Crypt decoding
+    TODO();
+}
+
+}
diff --git a/Userland/Libraries/LibPDF/Filter.h b/Userland/Libraries/LibPDF/Filter.h
new file mode 100644
index 00000000000..876656c38aa
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Filter.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/ByteBuffer.h>
+#include <AK/FlyString.h>
+
+namespace PDF {
+
+// Static-only entry point for decoding the contents of PDF stream objects.
+class Filter {
+public:
+    // Decodes `bytes` with the filter named `encoding_type` (for example
+    // "FlateDecode"). Returns an empty Optional on failure.
+    static Optional<ByteBuffer> decode(const ReadonlyBytes& bytes, const FlyString& encoding_type);
+
+private:
+    static Optional<ByteBuffer> decode_ascii_hex(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_ascii85(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_lzw(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_flate(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_run_length(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_ccitt(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_jbig2(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_dct(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_jpx(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_crypt(const ReadonlyBytes& bytes);
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/Forward.h b/Userland/Libraries/LibPDF/Forward.h
index f821af05cbc..05ef8249f21 100644
--- a/Userland/Libraries/LibPDF/Forward.h
+++ b/Userland/Libraries/LibPDF/Forward.h
@@ -11,6 +11,9 @@ namespace PDF {
 class Document;
 class Object;
 
+// Note: This macro doesn't care about PlainTextStreamObject and EncodedStreamObject because
+// we never need to work directly with either of them.
+
 #define ENUMERATE_DIRECT_OBJECT_TYPES(V) \
     V(StringObject, string) \
     V(NameObject, name) \
diff --git a/Userland/Libraries/LibPDF/Object.h b/Userland/Libraries/LibPDF/Object.h
index d185c6fdfd9..84a7bfb2a09 100644
--- a/Userland/Libraries/LibPDF/Object.h
+++ b/Userland/Libraries/LibPDF/Object.h
@@ -147,18 +147,17 @@ private:
     HashMap<FlyString, Value> m_map;
 };
 
-class StreamObject final : public Object {
+class StreamObject : public Object {
 public:
-    StreamObject(const NonnullRefPtr<DictObject>& dict, const ReadonlyBytes& bytes)
+    explicit StreamObject(const NonnullRefPtr<DictObject>& dict)
         : m_dict(dict)
-        , m_bytes(bytes)
     {
     }
 
-    ~StreamObject() override = default;
+    virtual ~StreamObject() override = default;
 
     [[nodiscard]] ALWAYS_INLINE NonnullRefPtr<DictObject> dict() const { return m_dict; }
-    [[nodiscard]] ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; }
+    [[nodiscard]] virtual ReadonlyBytes bytes() const = 0;
 
     ALWAYS_INLINE bool is_stream() const override { return true; }
     ALWAYS_INLINE const char* type_name() const override { return "stream"; }
@@ -166,9 +165,42 @@ public:
 
 private:
     NonnullRefPtr<DictObject> m_dict;
+};
+
+// A stream that only holds a non-owning view of its bytes.
+class PlainTextStreamObject final : public StreamObject {
+public:
+    PlainTextStreamObject(const NonnullRefPtr<DictObject>& dict, const ReadonlyBytes& bytes)
+        : StreamObject(dict)
+        , m_bytes(bytes)
+    {
+    }
+
+    virtual ~PlainTextStreamObject() override = default;
+
+    [[nodiscard]] ALWAYS_INLINE virtual ReadonlyBytes bytes() const override { return m_bytes; }
+
+private:
     ReadonlyBytes m_bytes;
 };
 
+// A stream that owns its bytes; the buffer is moved in rather than copied.
+class EncodedStreamObject final : public StreamObject {
+public:
+    EncodedStreamObject(const NonnullRefPtr<DictObject>& dict, ByteBuffer&& buffer)
+        : StreamObject(dict)
+        , m_buffer(move(buffer))
+    {
+    }
+
+    virtual ~EncodedStreamObject() override = default;
+
+    [[nodiscard]] ALWAYS_INLINE virtual ReadonlyBytes bytes() const override { return m_buffer.bytes(); }
+
+private:
+    ByteBuffer m_buffer;
+};
+
 class IndirectValue final : public Object {
 public:
     IndirectValue(u32 index, u32 generation_index, const Value& value)
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
index fc4ec2b6666..07e57046fe3 100644
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -7,6 +7,7 @@
 #include <AK/ScopeGuard.h>
 #include <AK/TypeCasts.h>
 #include <LibPDF/Document.h>
+#include <LibPDF/Filter.h>
 #include <LibPDF/Parser.h>
 #include <LibTextCodec/Decoder.h>
 #include <ctype.h>
@@ -657,7 +658,18 @@ NonnullRefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
     m_reader.move_by(9);
     consume_whitespace();
 
-    return make_object<StreamObject>(dict, bytes);
+    // FIXME: "Filter" may also be an array of filter names, and filters can
+    //        take parameters ("DecodeParms"). Only a single filter name
+    //        without parameters is handled here.
+    if (dict->contains("Filter")) {
+        auto filter_type = dict->get_name(m_document, "Filter")->name();
+        auto maybe_bytes = Filter::decode(bytes, filter_type);
+        // FIXME: Handle error condition
+        VERIFY(maybe_bytes.has_value());
+        return make_object<EncodedStreamObject>(dict, maybe_bytes.release_value());
+    }
+
+    return make_object<PlainTextStreamObject>(dict, bytes);
 }
 
 Vector<Command> Parser::parse_graphics_commands()