From 477e3946e50cdfcbd3ee3688e4529a7fa1d9e442 Mon Sep 17 00:00:00 2001
From: Matthew Olsson <matthewcolsson@gmail.com>
Date: Sat, 22 May 2021 20:44:18 -0700
Subject: [PATCH] LibPDF: Add support for stream filters

This commit also makes StreamObject an abstract base class with two
subclasses: PlainTextStreamObject, which does not own its bytes, and
EncodedStreamObject, which does.
---
 Userland/Libraries/LibPDF/CMakeLists.txt |   2 +-
 Userland/Libraries/LibPDF/Filter.cpp     | 173 +++++++++++++++++++++++
 Userland/Libraries/LibPDF/Filter.h       |  31 ++++
 Userland/Libraries/LibPDF/Forward.h      |   3 +
 Userland/Libraries/LibPDF/Object.h       |  40 +++++-
 Userland/Libraries/LibPDF/Parser.cpp     |  11 +-
 6 files changed, 253 insertions(+), 7 deletions(-)
 create mode 100644 Userland/Libraries/LibPDF/Filter.cpp
 create mode 100644 Userland/Libraries/LibPDF/Filter.h
diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt
index 2da7ea56b51..6d52363eb66 100644
--- a/Userland/Libraries/LibPDF/CMakeLists.txt
+++ b/Userland/Libraries/LibPDF/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(SOURCES
-    Object.cpp
     Document.cpp
+    Filter.cpp
     Object.cpp
     Parser.cpp
     Renderer.cpp
diff --git a/Userland/Libraries/LibPDF/Filter.cpp b/Userland/Libraries/LibPDF/Filter.cpp
new file mode 100644
index 00000000000..2b2ab8ba846
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Filter.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Hex.h>
+#include <LibCompress/Deflate.h>
+#include <LibPDF/Filter.h>
+
+namespace PDF {
+
+Optional<ByteBuffer> Filter::decode(const ReadonlyBytes& bytes, const FlyString& encoding_type)
+{
+    if (encoding_type == "ASCIIHexDecode")
+        return decode_ascii_hex(bytes);
+    if (encoding_type == "ASCII85Decode")
+        return decode_ascii85(bytes);
+    if (encoding_type == "LZWDecode")
+        return decode_lzw(bytes);
+    if (encoding_type == "FlateDecode")
+        return decode_flate(bytes);
+    if (encoding_type == "RunLengthDecode")
+        return decode_run_length(bytes);
+    if (encoding_type == "CCITTFaxDecode")
+        return decode_ccitt(bytes);
+    if (encoding_type == "JBIG2Decode")
+        return decode_jbig2(bytes);
+    if (encoding_type == "DCTDecode")
+        return decode_dct(bytes);
+    if (encoding_type == "JPXDecode")
+        return decode_jpx(bytes);
+    if (encoding_type == "Crypt")
+        return decode_crypt(bytes);
+
+    return {}; // No filter with this name: report failure as an empty Optional.
+}
+
+Optional<ByteBuffer> Filter::decode_ascii_hex(const ReadonlyBytes& bytes)
+{
+    if (bytes.size() % 2 == 0)
+        return decode_hex(bytes);
+
+    // FIXME: Integrate this padding into AK/Hex?
+    // An odd number of digits is decoded as if a trailing '0' had been appended.
+    auto output = ByteBuffer::create_zeroed(bytes.size() / 2 + 1);
+
+    for (size_t i = 0; i < bytes.size() / 2; ++i) {
+        const auto high = decode_hex_digit(static_cast<char>(bytes[i * 2]));
+        const auto low = decode_hex_digit(static_cast<char>(bytes[i * 2 + 1]));
+        if (high >= 16 || low >= 16)
+            return {};
+
+        output[i] = (high << 4) + low;
+    }
+
+    // The unpaired final digit is the high nibble; it must be validated too (invalid digits decode to >= 16).
+    const auto last = decode_hex_digit(static_cast<char>(bytes[bytes.size() - 1]));
+    if (last >= 16)
+        return {};
+
+    output[output.size() - 1] = last << 4;
+    return output;
+}
+
+Optional<ByteBuffer> Filter::decode_ascii85(const ReadonlyBytes& bytes)
+{
+    // ASCII85 packs four bytes into five base-85 digits ('!' through 'u'). Decoding is done one
+    // digit at a time so that whitespace may appear anywhere, including inside the final group.
+    Vector<u8> buff;
+    buff.ensure_capacity(bytes.size());
+
+    // Appends the `count` most significant bytes of `value`, highest byte first. Shifting (rather
+    // than reinterpreting the integer's memory) keeps the result independent of host endianness.
+    auto append_high_bytes = [&buff](u32 value, size_t count) {
+        for (size_t i = 0; i < count; ++i)
+            buff.append(static_cast<u8>(value >> (24 - 8 * i)));
+    };
+
+    u32 number = 0;
+    size_t digit_count = 0;
+
+    for (size_t byte_index = 0; byte_index < bytes.size(); ++byte_index) {
+        const auto byte = bytes[byte_index];
+        // PDF whitespace (NUL, HT, LF, FF, CR, SP) is not part of the encoded data.
+        if (byte == '\0' || byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ')
+            continue;
+
+        // '~' starts the "~>" end-of-data marker; nothing after it belongs to the stream.
+        if (byte == '~')
+            break;
+
+        // 'z' abbreviates an all-zero group and is only accepted between groups.
+        if (byte == 'z' && digit_count == 0) {
+            append_high_bytes(0, 4);
+            continue;
+        }
+
+        if (byte < '!' || byte > 'u')
+            return {};
+
+        number = number * 85 + (byte - '!');
+        if (++digit_count < 5)
+            continue;
+
+        append_high_bytes(number, 4);
+        number = 0;
+        digit_count = 0;
+    }
+
+    // A final partial group of n digits is padded with 'u' and yields n - 1 bytes.
+    if (digit_count > 0) {
+        for (size_t i = digit_count; i < 5; ++i)
+            number = number * 85 + ('u' - '!');
+        append_high_bytes(number, digit_count - 1);
+    }
+
+    return ByteBuffer::copy(buff.span());
+}
+
+Optional<ByteBuffer> Filter::decode_lzw(const ReadonlyBytes&)
+{
+    dbgln("LZW decoding is not supported");
+    VERIFY_NOT_REACHED(); // FIXME: Support LZW decoding; for now the message above is logged and this line is hit.
+};
+
+Optional<ByteBuffer> Filter::decode_flate(const ReadonlyBytes& bytes)
+{
+    // The payload is treated as a zlib stream: a two-byte header followed by raw DEFLATE data.
+    // FIXME: The header is skipped without being validated.
+    constexpr size_t zlib_header_size = 2;
+    if (bytes.size() < zlib_header_size)
+        return {};
+    return Compress::DeflateDecompressor::decompress_all(bytes.slice(zlib_header_size));
+}
+
+Optional<ByteBuffer> Filter::decode_run_length(const ReadonlyBytes&)
+{
+    // FIXME: Support RunLength decoding. Until then, a stream using this filter hits TODO() here.
+    TODO();
+};
+
+Optional<ByteBuffer> Filter::decode_ccitt(const ReadonlyBytes&)
+{
+    // FIXME: Support CCITT decoding. Until then, a stream using this filter hits TODO() here.
+    TODO();
+};
+
+Optional<ByteBuffer> Filter::decode_jbig2(const ReadonlyBytes&)
+{
+    // FIXME: Support JBIG2 decoding. Until then, a stream using this filter hits TODO() here.
+    TODO();
+};
+
+Optional<ByteBuffer> Filter::decode_dct(const ReadonlyBytes&)
+{
+    // FIXME: Support DCT decoding. Until then, a stream using this filter hits TODO() here.
+    TODO();
+};
+
+Optional<ByteBuffer> Filter::decode_jpx(const ReadonlyBytes&)
+{
+    // FIXME: Support JPX decoding. Until then, a stream using this filter hits TODO() here.
+    TODO();
+};
+
+Optional<ByteBuffer> Filter::decode_crypt(const ReadonlyBytes&)
+{
+    // FIXME: Support Crypt decoding. Until then, a stream using this filter hits TODO() here.
+    TODO();
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/Filter.h b/Userland/Libraries/LibPDF/Filter.h
new file mode 100644
index 00000000000..876656c38aa
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Filter.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/ByteBuffer.h>
+#include <AK/FlyString.h>
+
+namespace PDF {
+
+class Filter { // Static-only collection of decoders for PDF stream filters.
+public:
+    static Optional<ByteBuffer> decode(const ReadonlyBytes& bytes, const FlyString& encoding_type); // Dispatches on the filter name.
+
+private:
+    static Optional<ByteBuffer> decode_ascii_hex(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_ascii85(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_lzw(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_flate(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_run_length(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_ccitt(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_jbig2(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_dct(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_jpx(const ReadonlyBytes& bytes);
+    static Optional<ByteBuffer> decode_crypt(const ReadonlyBytes& bytes);
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/Forward.h b/Userland/Libraries/LibPDF/Forward.h
index f821af05cbc..05ef8249f21 100644
--- a/Userland/Libraries/LibPDF/Forward.h
+++ b/Userland/Libraries/LibPDF/Forward.h
@@ -11,6 +11,9 @@ namespace PDF {
 class Document;
 class Object;
 
+// Note: PlainTextStreamObject and EncodedStreamObject are intentionally left out of this macro
+//       because no code ever needs to work with either of them directly.
+
 #define ENUMERATE_DIRECT_OBJECT_TYPES(V) \
     V(StringObject, string)              \
     V(NameObject, name)                  \
diff --git a/Userland/Libraries/LibPDF/Object.h b/Userland/Libraries/LibPDF/Object.h
index d185c6fdfd9..84a7bfb2a09 100644
--- a/Userland/Libraries/LibPDF/Object.h
+++ b/Userland/Libraries/LibPDF/Object.h
@@ -147,18 +147,17 @@ private:
     HashMap<FlyString, Value> m_map;
 };
 
-class StreamObject final : public Object {
+class StreamObject : public Object {
 public:
-    StreamObject(const NonnullRefPtr<DictObject>& dict, const ReadonlyBytes& bytes)
+    explicit StreamObject(const NonnullRefPtr<DictObject>& dict)
         : m_dict(dict)
-        , m_bytes(bytes)
     {
     }
 
-    ~StreamObject() override = default;
+    virtual ~StreamObject() override = default;
 
     [[nodiscard]] ALWAYS_INLINE NonnullRefPtr<DictObject> dict() const { return m_dict; }
-    [[nodiscard]] ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; }
+    [[nodiscard]] virtual ReadonlyBytes bytes() const = 0;
 
     ALWAYS_INLINE bool is_stream() const override { return true; }
     ALWAYS_INLINE const char* type_name() const override { return "stream"; }
@@ -166,9 +165,40 @@ public:
 
 private:
     NonnullRefPtr<DictObject> m_dict;
+};
+
+class PlainTextStreamObject final : public StreamObject { // Stream that keeps a view of its bytes rather than a copy.
+public:
+    PlainTextStreamObject(const NonnullRefPtr<DictObject>& dict, const ReadonlyBytes& bytes)
+        : StreamObject(dict)
+        , m_bytes(bytes) // NOTE(review): presumably the viewed data must outlive this object -- confirm against the parser.
+    {
+    }
+
+    virtual ~PlainTextStreamObject() override = default;
+
+    [[nodiscard]] ALWAYS_INLINE virtual ReadonlyBytes bytes() const override { return m_bytes; }
+
+private:
     ReadonlyBytes m_bytes;
 };
 
+class EncodedStreamObject final : public StreamObject { // Stream that owns the buffer its bytes() are served from.
+public:
+    EncodedStreamObject(const NonnullRefPtr<DictObject>& dict, ByteBuffer&& buffer)
+        : StreamObject(dict)
+        , m_buffer(move(buffer)) // A named rvalue reference is an lvalue; without move() the buffer would be copied.
+    {
+    }
+
+    virtual ~EncodedStreamObject() override = default;
+
+    [[nodiscard]] ALWAYS_INLINE virtual ReadonlyBytes bytes() const override { return m_buffer.bytes(); }
+
+private:
+    ByteBuffer m_buffer;
+};
+
 class IndirectValue final : public Object {
 public:
     IndirectValue(u32 index, u32 generation_index, const Value& value)
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
index fc4ec2b6666..07e57046fe3 100644
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -7,6 +7,7 @@
 #include <AK/ScopeGuard.h>
 #include <AK/TypeCasts.h>
 #include <LibPDF/Document.h>
+#include <LibPDF/Filter.h>
 #include <LibPDF/Parser.h>
 #include <ctype.h>
 #include <math.h>
@@ -657,7 +658,15 @@ NonnullRefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
     m_reader.move_by(9);
     consume_whitespace();
 
-    return make_object<StreamObject>(dict, bytes);
+    if (dict->contains("Filter")) {
+        auto filter_type = dict->get_name(m_document, "Filter")->name();
+        auto maybe_bytes = Filter::decode(bytes, filter_type);
+        // FIXME: Handle error condition
+        VERIFY(maybe_bytes.has_value());
+        return make_object<EncodedStreamObject>(dict, move(maybe_bytes.value()));
+    }
+
+    return make_object<PlainTextStreamObject>(dict, bytes);
 }
 
 Vector<Command> Parser::parse_graphics_commands()