LibPDF: Add support for stream filters

This commit also splits StreamObject into PlainTextStreamObject and EncodedStreamObject: the former is a stream object that does not own its bytes, while the latter owns the buffer holding its decoded bytes.
Author: https://github.com/mattco98 Commit: https://github.com/SerenityOS/serenity/commit/477e3946e50 Pull-request: https://github.com/SerenityOS/serenity/pull/7436 Reviewed-by: https://github.com/alimpfard
2024-09-20 09:49:15 +03:00 · 2021-05-22 20:44:18 -07:00 · 2021-05-22 20:44:18 -07:00 · 477e3946e5 · 2024-07-18 17:26:46 +09:00
commit 477e3946e5
parent 97cc482087
6 changed files with 253 additions and 7 deletions
--- a/Userland/Libraries/LibPDF/CMakeLists.txt
+++ b/Userland/Libraries/LibPDF/CMakeLists.txt
@ -1,6 +1,6 @@
 set(SOURCES
-    Object.cpp
    Document.cpp
+    Filter.cpp
    Object.cpp
    Parser.cpp
    Renderer.cpp
--- a/Userland/Libraries/LibPDF/Filter.cpp
+++ b/Userland/Libraries/LibPDF/Filter.cpp
@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Hex.h>
+#include <LibCompress/Deflate.h>
+#include <LibPDF/Filter.h>
+
+namespace PDF {
+
+// Decodes `bytes` with the stream filter named by `encoding_type`.
+// Returns an empty Optional when the name matches none of the filters listed below;
+// otherwise returns whatever the selected decoder returns.
+Optional<ByteBuffer> Filter::decode(const ReadonlyBytes& bytes, const FlyString& encoding_type)
+{
+    using Decoder = Optional<ByteBuffer> (*)(const ReadonlyBytes&);
+
+    struct FilterEntry {
+        const char* name;
+        Decoder decoder;
+    };
+
+    // Checked in order; the first entry whose name matches handles the data.
+    static constexpr FilterEntry known_filters[] = {
+        { "ASCIIHexDecode", decode_ascii_hex },
+        { "ASCII85Decode", decode_ascii85 },
+        { "LZWDecode", decode_lzw },
+        { "FlateDecode", decode_flate },
+        { "RunLengthDecode", decode_run_length },
+        { "CCITTFaxDecode", decode_ccitt },
+        { "JBIG2Decode", decode_jbig2 },
+        { "DCTDecode", decode_dct },
+        { "JPXDecode", decode_jpx },
+        { "Crypt", decode_crypt },
+    };
+
+    for (const auto& filter : known_filters) {
+        if (encoding_type == filter.name)
+            return filter.decoder(bytes);
+    }
+
+    return {};
+}
+
+// Decodes hexadecimal text into bytes.
+//
+// Even-length input is handed to decode_hex() and its result is returned as-is.
+// Odd-length input is decoded as if a trailing '0' digit had been appended, and an
+// empty Optional is returned as soon as any character is not a hex digit.
+//
+// NOTE(review): the PDF spec also permits whitespace and a '>' end-of-data marker in
+// ASCIIHexDecode data; neither is handled here - confirm whether callers strip them.
+Optional<ByteBuffer> Filter::decode_ascii_hex(const ReadonlyBytes& bytes)
+{
+    if (bytes.size() % 2 == 0)
+        return decode_hex(bytes);
+
+    // FIXME: Integrate this padding into AK/Hex?
+
+    auto output = ByteBuffer::create_zeroed(bytes.size() / 2 + 1);
+
+    // Decode every complete pair of digits; the unpaired last digit is handled below.
+    for (size_t i = 0; i < bytes.size() / 2; ++i) {
+        const auto c1 = decode_hex_digit(static_cast<char>(bytes[i * 2]));
+        if (c1 >= 16)
+            return {};
+
+        const auto c2 = decode_hex_digit(static_cast<char>(bytes[i * 2 + 1]));
+        if (c2 >= 16)
+            return {};
+
+        output[i] = (c1 << 4) + c2;
+    }
+
+    // Process the last digit as the high nibble of a byte whose low nibble is zero.
+    // It must be validated like every other digit: otherwise an invalid character
+    // would be multiplied and truncated into a garbage output byte.
+    const auto last_digit = decode_hex_digit(static_cast<char>(bytes[bytes.size() - 1]));
+    if (last_digit >= 16)
+        return {};
+
+    output[output.size() - 1] = last_digit * 16;
+
+    return output;
+}
+
+// Decodes ASCII85 (base-85) text into bytes.
+//
+// Every group of five digits yields four bytes (most significant first), 'z' at a
+// group boundary is shorthand for four zero bytes, and spaces are skipped wherever
+// they appear. A final group of n < 5 digits is padded with 'u' and yields n - 1 bytes.
+//
+// FIXME: Only ' ' is skipped (no other whitespace), the "~>" end-of-data marker is
+//        not recognized, and digits are not range-checked.
+Optional<ByteBuffer> Filter::decode_ascii85(const ReadonlyBytes& bytes)
+{
+    Vector<u8> buff;
+    buff.ensure_capacity(bytes.size());
+
+    size_t byte_index = 0;
+
+    while (byte_index < bytes.size()) {
+        if (bytes[byte_index] == ' ') {
+            byte_index++;
+            continue;
+        }
+
+        if (bytes[byte_index] == 'z') {
+            byte_index++;
+            for (int i = 0; i < 4; i++)
+                buff.append(0);
+            continue;
+        }
+
+        u32 number = 0;
+        // Number of digits actually taken from the input for this group. Spaces and
+        // padding are deliberately not counted, so the output length stays correct
+        // when a group contains embedded spaces.
+        size_t digits_read = 0;
+
+        for (int i = 0; i < 5;) {
+            if (byte_index >= bytes.size()) {
+                // Input exhausted mid-group: pad with the highest digit. The bounds
+                // check also keeps us from indexing past the end of `bytes` when
+                // spaces push the fifth digit beyond the input.
+                number = number * 85 + ('u' - 33);
+                i++;
+                continue;
+            }
+
+            auto byte = bytes[byte_index++];
+            if (byte == ' ')
+                continue;
+
+            number = number * 85 + byte - 33;
+            digits_read++;
+            i++;
+        }
+
+        // A group of n digits encodes n - 1 bytes. Shifting (rather than viewing the
+        // integer's memory) keeps the byte order independent of host endianness.
+        for (size_t i = 0; i + 1 < digits_read; i++)
+            buff.append(static_cast<u8>(number >> (24 - 8 * i)));
+    }
+
+    return ByteBuffer::copy(buff.span());
+}
+
+// LZWDecode is not implemented: the input is ignored, a debug message is logged,
+// and VERIFY_NOT_REACHED() is hit.
+Optional<ByteBuffer> Filter::decode_lzw(const ReadonlyBytes&)
+{
+    dbgln("LZW decoding is not supported");
+    VERIFY_NOT_REACHED();
+}
+
+// Inflates Flate-encoded stream data.
+//
+// The first two bytes of the input are skipped before the remainder is handed to
+// the raw Deflate decompressor (presumably they are the two-byte zlib header -
+// see the FIXME below). Returns an empty Optional when the input is too short to
+// contain those two bytes or when decompression fails, so the caller decides how
+// to react to malformed data instead of this function asserting.
+Optional<ByteBuffer> Filter::decode_flate(const ReadonlyBytes& bytes)
+{
+    // FIXME: The spec says Flate decoding is "based on" zlib, does that mean they
+    // aren't exactly the same?
+
+    if (bytes.size() < 2)
+        return {};
+
+    return Compress::DeflateDecompressor::decompress_all(bytes.slice(2));
+}
+
+// Placeholder for the RunLengthDecode filter; the input is ignored and TODO() is hit.
+Optional<ByteBuffer> Filter::decode_run_length(const ReadonlyBytes&)
+{
+    // FIXME: Implement RunLength decoding
+    TODO();
+}
+
+// Placeholder for the CCITTFaxDecode filter; the input is ignored and TODO() is hit.
+Optional<ByteBuffer> Filter::decode_ccitt(const ReadonlyBytes&)
+{
+    // FIXME: Implement CCITT decoding
+    TODO();
+}
+
+// Placeholder for the JBIG2Decode filter; the input is ignored and TODO() is hit.
+Optional<ByteBuffer> Filter::decode_jbig2(const ReadonlyBytes&)
+{
+    // FIXME: Implement JBIG2 decoding
+    TODO();
+}
+
+// Placeholder for the DCTDecode filter; the input is ignored and TODO() is hit.
+Optional<ByteBuffer> Filter::decode_dct(const ReadonlyBytes&)
+{
+    // FIXME: Implement DCT decoding
+    TODO();
+}
+
+// Placeholder for the JPXDecode filter; the input is ignored and TODO() is hit.
+Optional<ByteBuffer> Filter::decode_jpx(const ReadonlyBytes&)
+{
+    // FIXME: Implement JPX decoding
+    TODO();
+}
+
+// Placeholder for the Crypt filter; the input is ignored and TODO() is hit.
+Optional<ByteBuffer> Filter::decode_crypt(const ReadonlyBytes&)
+{
+    // FIXME: Implement Crypt decoding
+    TODO();
+}
+
+}
--- a/Userland/Libraries/LibPDF/Filter.h
+++ b/Userland/Libraries/LibPDF/Filter.h
@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/ByteBuffer.h>
+#include <AK/FlyString.h>
+
+namespace PDF {
+
+// Static helpers that decode PDF stream data according to a named stream filter.
+class Filter {
+public:
+    // Decodes `bytes` with the filter named by `encoding_type`.
+    // Yields an empty Optional when the filter name is not recognized.
+    static Optional<ByteBuffer> decode(const ReadonlyBytes& bytes, const FlyString& encoding_type);
+
+private:
+    // One decoder per filter name that decode() dispatches on. Each takes the
+    // encoded bytes and returns the decoded buffer.
+    static Optional<ByteBuffer> decode_ascii_hex(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_ascii85(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_lzw(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_flate(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_run_length(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_ccitt(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_jbig2(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_dct(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_jpx(const ReadonlyBytes&);
+    static Optional<ByteBuffer> decode_crypt(const ReadonlyBytes&);
+};
+
+}
--- a/Userland/Libraries/LibPDF/Forward.h
+++ b/Userland/Libraries/LibPDF/Forward.h
@ -11,6 +11,9 @@ namespace PDF {
 class Document;
 class Object;

+// Note: This macro doesn't care about PlainTextStreamObject and EncodedStreamObject because
+//       we never need to work directly with either of them.
+
 #define ENUMERATE_DIRECT_OBJECT_TYPES(V) \
    V(StringObject, string)              \
    V(NameObject, name)                  \
--- a/Userland/Libraries/LibPDF/Object.h
+++ b/Userland/Libraries/LibPDF/Object.h
@ -147,18 +147,17 @@ private:
    HashMap<FlyString, Value> m_map;
 };

-class StreamObject final : public Object {
+class StreamObject : public Object {
 public:
-    StreamObject(const NonnullRefPtr<DictObject>& dict, const ReadonlyBytes& bytes)
+    explicit StreamObject(const NonnullRefPtr<DictObject>& dict)
        : m_dict(dict)
-        , m_bytes(bytes)
    {
    }

-    ~StreamObject() override = default;
+    virtual ~StreamObject() override = default;

    [[nodiscard]] ALWAYS_INLINE NonnullRefPtr<DictObject> dict() const { return m_dict; }
-    [[nodiscard]] ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; }
+    [[nodiscard]] virtual ReadonlyBytes bytes() const = 0;

    ALWAYS_INLINE bool is_stream() const override { return true; }
    ALWAYS_INLINE const char* type_name() const override { return "stream"; }
@ -166,9 +165,40 @@ public:

 private:
    NonnullRefPtr<DictObject> m_dict;
+};
+
+// A stream object that does not own its data: it stores the ReadonlyBytes view it
+// is constructed with and hands that same view back from bytes().
+// NOTE(review): the storage behind the view presumably has to outlive this object -
+// confirm against the caller that creates it.
+class PlainTextStreamObject final : public StreamObject {
+public:
+    // `dict` is forwarded to StreamObject; `bytes` is kept as a view, not copied.
+    PlainTextStreamObject(const NonnullRefPtr<DictObject>& dict, const ReadonlyBytes& bytes)
+        : StreamObject(dict)
+        , m_bytes(bytes)
+    {
+    }
+
+    virtual ~PlainTextStreamObject() override = default;
+
+    // Returns the view supplied at construction.
+    [[nodiscard]] ALWAYS_INLINE virtual ReadonlyBytes bytes() const override { return m_bytes; }
+
+private:
    ReadonlyBytes m_bytes;
 };

+// A stream object that owns its data: the ByteBuffer handed to the constructor is
+// stored inside the object, and bytes() returns a view into that buffer.
+class EncodedStreamObject final : public StreamObject {
+public:
+    // `dict` is forwarded to StreamObject; `buffer` is moved into the object.
+    // Inside the constructor `buffer` is a named rvalue reference and therefore an
+    // lvalue, so move() is required - without it the member is copy-constructed.
+    EncodedStreamObject(const NonnullRefPtr<DictObject>& dict, ByteBuffer&& buffer)
+        : StreamObject(dict)
+        , m_buffer(move(buffer))
+    {
+    }
+
+    virtual ~EncodedStreamObject() override = default;
+
+    // Returns a view over the owned buffer; it is only valid while this object lives.
+    [[nodiscard]] ALWAYS_INLINE virtual ReadonlyBytes bytes() const override { return m_buffer.bytes(); }
+
+private:
+    ByteBuffer m_buffer;
+};
+
 class IndirectValue final : public Object {
 public:
    IndirectValue(u32 index, u32 generation_index, const Value& value)
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@ -7,6 +7,7 @@
 #include <AK/ScopeGuard.h>
 #include <AK/TypeCasts.h>
 #include <LibPDF/Document.h>
+#include <LibPDF/Filter.h>
 #include <LibPDF/Parser.h>
 #include <ctype.h>
 #include <math.h>
@ -657,7 +658,15 @@ NonnullRefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
    m_reader.move_by(9);
    consume_whitespace();

-    return make_object<StreamObject>(dict, bytes);
+    if (dict->contains("Filter")) {
+        auto filter_type = dict->get_name(m_document, "Filter")->name();
+        auto maybe_bytes = Filter::decode(bytes, filter_type);
+        // FIXME: Handle error condition
+        VERIFY(maybe_bytes.has_value());
+        return make_object<EncodedStreamObject>(dict, move(maybe_bytes.value()));
+    }
+
+    return make_object<PlainTextStreamObject>(dict, bytes);
 }

 Vector<Command> Parser::parse_graphics_commands()