LibPDF: Initial work on parsing xref streams

Since PDF version 1.5, a document may omit the xref table in favor of a new kind of xref stream object. This is used to reference so-called "compressed" objects that are part of an object stream. With this patch we are able to parse this new kind of xref object, but we'll have to implement object streams to use them correctly.
Author: https://github.com/janso3 Commit: https://github.com/SerenityOS/serenity/commit/f9beff7b5e Pull-request: https://github.com/SerenityOS/serenity/pull/14873 Reviewed-by: https://github.com/mattco98 ✅
2024-09-20 09:49:15 +03:00 · 2022-08-15 12:04:59 +02:00 · 2022-08-15 12:04:59 +02:00 · f9beff7b5e · 2024-07-17 07:11:12 +09:00
commit f9beff7b5e
parent 4887aacec7
4 changed files with 108 additions and 4 deletions
--- a/Userland/Libraries/LibPDF/CommonNames.h
+++ b/Userland/Libraries/LibPDF/CommonNames.h
@ -70,6 +70,7 @@
    A(HTO)                        \
    A(ICCBased)                   \
    A(ID)                         \
+    A(Index)                      \
    A(JBIG2Decode)                \
    A(JPXDecode)                  \
    A(Kids)                       \
--- a/Userland/Libraries/LibPDF/DocumentParser.cpp
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@ -6,6 +6,7 @@

 #include <AK/BitStream.h>
 #include <AK/MemoryStream.h>
+#include <AK/Tuple.h>
 #include <LibPDF/CommonNames.h>
 #include <LibPDF/Document.h>
 #include <LibPDF/DocumentParser.h>
@ -178,7 +179,8 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
    // The linearization parameter dictionary has just been parsed, and the xref table
    // comes immediately after it. We are in the correct spot.
    m_xref_table = TRY(parse_xref_table());
-    m_trailer = TRY(parse_file_trailer());
+    if (!m_trailer)
+        m_trailer = TRY(parse_file_trailer());

    // Also parse the main xref table and merge into the first-page xref table. Note
    // that we don't use the main xref table offset from the linearization dict because
@ -188,6 +190,7 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
    m_reader.move_to(main_xref_table_offset);
    auto main_xref_table = TRY(parse_xref_table());
    TRY(m_xref_table->merge(move(*main_xref_table)));
+
    return {};
 }

@ -264,14 +267,96 @@ PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table()

    m_reader.move_to(xref_offset);
    m_xref_table = TRY(parse_xref_table());
-    m_trailer = TRY(parse_file_trailer());
+    if (!m_trailer)
+        m_trailer = TRY(parse_file_trailer());
    return {};
 }

+// Parses a cross-reference stream object ("<num> <gen> obj << /Type /XRef ... >> stream ...")
+// located at the reader's current position and converts it into an XRefTable.
+//
+// The stream dictionary is validated (/Type must be "XRef", /W must hold exactly three
+// field widths), the optional /Index array is turned into a list of subsections (the
+// default is a single subsection covering objects 0 .. /Size - 1), and every fixed-width
+// binary entry is decoded into an XRefEntry. On success the stream dictionary is also
+// stored as the document trailer (m_trailer).
+//
+// Returns an error for a malformed header, dictionary or truncated stream data.
+PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
+{
+    // The two numbers of the "<num> <gen> obj" header are parsed only to advance the
+    // reader; their values are not used.
+    auto first_number = TRY(parse_number());
+    auto second_number = TRY(parse_number());
+
+    if (!m_reader.matches("obj"))
+        return error("Malformed xref object");
+    m_reader.move_by(3);
+    if (m_reader.matches_eol())
+        m_reader.consume_eol();
+
+    auto dict = TRY(parse_dict());
+    auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name();
+    if (type != "XRef")
+        return error("Malformed xref dictionary");
+
+    auto field_sizes = TRY(dict->get_array(m_document, "W"));
+    if (field_sizes->size() != 3)
+        return error("Malformed xref dictionary");
+
+    // Read the three field widths once. They come from the (untrusted) file, so reject
+    // widths that do not fit into the `long` each field is decoded into; a larger width
+    // would make the big-endian decoding below shift out of range.
+    Array<size_t, 3> field_widths;
+    size_t entry_size = 0;
+    for (size_t field_index = 0; field_index < 3; ++field_index) {
+        field_widths[field_index] = field_sizes->at(field_index).get_u32();
+        if (field_widths[field_index] > sizeof(long))
+            return error("Malformed xref dictionary");
+        entry_size += field_widths[field_index];
+    }
+
+    // NOTE(review): get<int>() presumably asserts when /Size is missing or not an
+    // integer - confirm, and consider turning that into a parse error as well.
+    auto object_count = dict->get_value("Size").get<int>();
+    if (object_count < 0)
+        return error("Malformed xref dictionary");
+
+    // Each tuple is (first object number, entry count - 1) of one subsection.
+    Vector<Tuple<int, int>> subsection_indices;
+    if (dict->contains(CommonNames::Index)) {
+        auto index_array = TRY(dict->get_array(m_document, CommonNames::Index));
+        if (index_array->size() % 2 != 0)
+            return error("Malformed xref dictionary");
+
+        for (size_t i = 0; i < index_array->size(); i += 2) {
+            auto first_object = index_array->at(i).get<int>();
+            auto entry_count = index_array->at(i + 1).get<int>();
+            if (first_object < 0 || entry_count < 0)
+                return error("Malformed xref dictionary");
+            subsection_indices.append({ first_object, entry_count - 1 });
+        }
+    } else {
+        subsection_indices.append({ 0, object_count - 1 });
+    }
+    auto stream = TRY(parse_stream(dict));
+    auto table = adopt_ref(*new XRefTable());
+
+    // Decodes a big-endian unsigned integer; an empty field decodes to 0.
+    auto field_to_long = [](Span<const u8> field) -> long {
+        long value = 0;
+        for (size_t i = 0; i < field.size(); ++i)
+            value = (value << 8) | static_cast<long>(field[i]);
+        return value;
+    };
+
+    size_t byte_index = 0;
+
+    // The number of entries in the stream is dictated by the subsections, not by /Size:
+    // walk every subsection and read exactly as many entries as it announces.
+    for (auto subsection : subsection_indices) {
+        auto const entry_count = subsection.get<1>() + 1;
+        if (entry_count <= 0)
+            continue;
+
+        Vector<XRefEntry> entries;
+        for (int entry_index = 0; entry_index < entry_count; ++entry_index) {
+            // byte_index never exceeds the stream size, so the subtraction cannot wrap.
+            if (entry_size > stream->bytes().size() - byte_index)
+                return error("Malformed xref stream");
+
+            Array<long, 3> fields;
+            for (size_t field_index = 0; field_index < 3; ++field_index) {
+                auto field = stream->bytes().slice(byte_index, field_widths[field_index]);
+                fields[field_index] = field_to_long(field);
+                byte_index += field_widths[field_index];
+            }
+
+            // A zero-width type field means the entry type defaults to 1.
+            u8 type = fields[0];
+            if (!field_widths[0])
+                type = 1;
+
+            // Field 2 -> byte_offset, field 3 -> generation_number,
+            // in_use = (type != 0), compressed = (type == 2).
+            entries.append({ fields[1], static_cast<u16>(fields[2]), type != 0, type == 2 });
+        }
+
+        // NOTE(review): the second member passed here is "entry count - 1", exactly as
+        // before this change - confirm that this is what XRefSection expects.
+        table->add_section({ subsection.get<0>(), subsection.get<1>(), entries });
+    }
+
+    m_trailer = dict;
+
+    return table;
+}
+
 PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
 {
-    if (!m_reader.matches("xref"))
-        return error("Expected \"xref\"");
+    if (!m_reader.matches("xref")) {
+        // Since version 1.5, there may be a cross-reference stream instead
+        return parse_xref_stream();
+    }
+
    m_reader.move_by(4);
    if (!m_reader.consume_eol())
        return error("Expected newline after \"xref\"");
--- a/Userland/Libraries/LibPDF/DocumentParser.h
+++ b/Userland/Libraries/LibPDF/DocumentParser.h
@ -82,6 +82,7 @@ private:
    PDFErrorOr<void> initialize_hint_tables();
    PDFErrorOr<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes);
    Vector<PageOffsetHintTableEntry> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes);
+    PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_stream();
    PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_table();
    PDFErrorOr<NonnullRefPtr<DictObject>> parse_file_trailer();

--- a/Userland/Libraries/LibPDF/XRefTable.h
+++ b/Userland/Libraries/LibPDF/XRefTable.h
@ -19,6 +19,7 @@ struct XRefEntry {
    long byte_offset { invalid_byte_offset };
    u16 generation_number { 0 };
    bool in_use { false };
+    bool compressed { false };
 };

 struct XRefSection {
@ -77,18 +78,34 @@ public:
        return m_entries[index].byte_offset;
    }

+    // Alias of byte_offset_for_object() for entries flagged as compressed: the xref
+    // stream parser stores an entry's second field in byte_offset, which for such an
+    // entry presumably identifies the containing object stream - confirm against callers.
+    [[nodiscard]] ALWAYS_INLINE long object_stream_for_object(size_t index) const
+    {
+        auto const second_field = byte_offset_for_object(index);
+        return second_field;
+    }
+
    [[nodiscard]] ALWAYS_INLINE u16 generation_number_for_object(size_t index) const
    {
        VERIFY(has_object(index));
        return m_entries[index].generation_number;
    }

+    // Alias of generation_number_for_object() for entries flagged as compressed: the
+    // xref stream parser stores an entry's third field in generation_number, which for
+    // such an entry presumably is its index inside the object stream - confirm against
+    // callers.
+    [[nodiscard]] ALWAYS_INLINE u16 object_stream_index_for_object(size_t index) const
+    {
+        auto const third_field = generation_number_for_object(index);
+        return third_field;
+    }
+
    [[nodiscard]] ALWAYS_INLINE bool is_object_in_use(size_t index) const
    {
        VERIFY(has_object(index));
        return m_entries[index].in_use;
    }

+    // Returns the `compressed` flag of the entry at `index`.
+    // The entry must exist; has_object(index) is VERIFY'd.
+    [[nodiscard]] ALWAYS_INLINE bool is_object_compressed(size_t index) const
+    {
+        VERIFY(has_object(index));
+        auto const& entry = m_entries[index];
+        return entry.compressed;
+    }
+
 private:
    friend struct AK::Formatter<PDF::XRefTable>;