diff --git a/Userland/Libraries/LibPDF/CommonNames.h b/Userland/Libraries/LibPDF/CommonNames.h index 790690d45de..a436d1b36ed 100644 --- a/Userland/Libraries/LibPDF/CommonNames.h +++ b/Userland/Libraries/LibPDF/CommonNames.h @@ -70,6 +70,7 @@ A(HTO) \ A(ICCBased) \ A(ID) \ + A(Index) \ A(JBIG2Decode) \ A(JPXDecode) \ A(Kids) \ diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp index e34302928a1..0dee9638390 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.cpp +++ b/Userland/Libraries/LibPDF/DocumentParser.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -178,7 +179,8 @@ PDFErrorOr DocumentParser::initialize_linearized_xref_table() // The linearization parameter dictionary has just been parsed, and the xref table // comes immediately after it. We are in the correct spot. m_xref_table = TRY(parse_xref_table()); - m_trailer = TRY(parse_file_trailer()); + if (!m_trailer) + m_trailer = TRY(parse_file_trailer()); // Also parse the main xref table and merge into the first-page xref table. Note // that we don't use the main xref table offset from the linearization dict because @@ -188,6 +190,7 @@ PDFErrorOr DocumentParser::initialize_linearized_xref_table() m_reader.move_to(main_xref_table_offset); auto main_xref_table = TRY(parse_xref_table()); TRY(m_xref_table->merge(move(*main_xref_table))); + return {}; } @@ -264,14 +267,96 @@ PDFErrorOr DocumentParser::initialize_non_linearized_xref_table() m_reader.move_to(xref_offset); m_xref_table = TRY(parse_xref_table()); - m_trailer = TRY(parse_file_trailer()); + if (!m_trailer) + m_trailer = TRY(parse_file_trailer()); return {}; } +PDFErrorOr> DocumentParser::parse_xref_stream() +{ + auto first_number = TRY(parse_number()); + auto second_number = TRY(parse_number()); + + if (!m_reader.matches("obj")) + return error("Malformed xref object"); + m_reader.move_by(3); + if (m_reader.matches_eol()) + m_reader.consume_eol(); + + auto dict = TRY(parse_dict()); + auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name(); + if (type != "XRef") + return error("Malformed xref dictionary"); + + auto field_sizes = TRY(dict->get_array(m_document, "W")); + if (field_sizes->size() != 3) + return error("Malformed xref dictionary"); + + auto object_count = dict->get_value("Size").get(); + + Vector> subsection_indices; + if (dict->contains(CommonNames::Index)) { + auto index_array = TRY(dict->get_array(m_document, CommonNames::Index)); + if (index_array->size() % 2 != 0) + return error("Malformed xref dictionary"); + + for (size_t i = 0; i < index_array->size(); i += 2) + subsection_indices.append({ index_array->at(i).get(), index_array->at(i + 1).get() - 1 }); + } else { + subsection_indices.append({ 0, object_count - 1 }); + } + auto stream = TRY(parse_stream(dict)); + auto table = adopt_ref(*new XRefTable()); + + auto field_to_long = [](Span field) -> long { + long value = 0; + const u8 max = (field.size() - 1) * 8; + for (size_t i = 0; i < field.size(); ++i) { + value |= static_cast(field[i]) << (max - (i * 8)); + } + return value; + }; + + size_t byte_index = 0; + size_t subsection_index = 0; + + Vector entries; + + for (int entry_index = 0; entry_index < object_count; ++entry_index) { + Array fields; + for (size_t field_index = 0; field_index < 3; ++field_index) { + auto field_size = field_sizes->at(field_index).get_u32(); + auto field = stream->bytes().slice(byte_index, field_size); + fields[field_index] = field_to_long(field); + byte_index += field_size; + } + + u8 type = fields[0]; + if (!field_sizes->at(0).get_u32()) + type = 1; + + entries.append({ fields[1], static_cast(fields[2]), type != 0, type == 2 }); + + auto indices = subsection_indices[subsection_index]; + if (entry_index >= indices.get<1>()) { + table->add_section({ indices.get<0>(), indices.get<1>(), entries }); + entries.clear(); + subsection_index++; + } + } + + m_trailer = dict; + + return table; +} + PDFErrorOr> DocumentParser::parse_xref_table() { - if (!m_reader.matches("xref")) - return error("Expected \"xref\""); + if (!m_reader.matches("xref")) { + // Since version 1.5, there may be a cross-reference stream instead + return parse_xref_stream(); + } + m_reader.move_by(4); if (!m_reader.consume_eol()) return error("Expected newline after \"xref\""); diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h index fef5f40ad42..6b6814bb755 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.h +++ b/Userland/Libraries/LibPDF/DocumentParser.h @@ -82,6 +82,7 @@ private: PDFErrorOr initialize_hint_tables(); PDFErrorOr parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes); Vector parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes); + PDFErrorOr> parse_xref_stream(); PDFErrorOr> parse_xref_table(); PDFErrorOr> parse_file_trailer(); diff --git a/Userland/Libraries/LibPDF/XRefTable.h b/Userland/Libraries/LibPDF/XRefTable.h index 01c77197dbd..8c5abe14c9c 100644 --- a/Userland/Libraries/LibPDF/XRefTable.h +++ b/Userland/Libraries/LibPDF/XRefTable.h @@ -19,6 +19,7 @@ struct XRefEntry { long byte_offset { invalid_byte_offset }; u16 generation_number { 0 }; bool in_use { false }; + bool compressed { false }; }; struct XRefSection { @@ -77,18 +78,34 @@ public: return m_entries[index].byte_offset; } + [[nodiscard]] ALWAYS_INLINE long object_stream_for_object(size_t index) const + { + return byte_offset_for_object(index); + } + [[nodiscard]] ALWAYS_INLINE u16 generation_number_for_object(size_t index) const { VERIFY(has_object(index)); return m_entries[index].generation_number; } + [[nodiscard]] ALWAYS_INLINE u16 object_stream_index_for_object(size_t index) const + { + return generation_number_for_object(index); + } + [[nodiscard]] ALWAYS_INLINE bool is_object_in_use(size_t index) const { VERIFY(has_object(index)); return m_entries[index].in_use; } + [[nodiscard]] ALWAYS_INLINE bool is_object_compressed(size_t index) const + { + VERIFY(has_object(index)); + return m_entries[index].compressed; + } + private: friend struct AK::Formatter;