LibPDF: Try to repair XRef tables with broken indices

An XRef table usually starts with an object number of zero. While it
could technically start at any other number, this is a tell-tale sign
of a broken table.

For the "broken" documents I encountered, this always meant that some
objects must have been removed from the start of the table, without
updating the following indices. When this is the case, the document
cannot be read normally.

However, most other PDF parsers seem to know of this quirk and fix the
XRef table automatically.

Likewise, we now check for this exact case, and if it matches up with
what we expect, we update the XRef table such that all object numbers
match the actual objects found in the file again.
This commit is contained in:
Julian Offenhäuser 2022-11-23 11:33:14 +01:00 committed by Andreas Kling
parent e06a065594
commit d1bc89e30b
Notes: sideshowbarker 2024-07-17 04:08:19 +09:00
3 changed files with 54 additions and 1 deletions

View File

@ -197,7 +197,7 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
auto main_xref_table = TRY(parse_xref_table());
TRY(m_xref_table->merge(move(*main_xref_table)));
return {};
return validate_xref_table_and_fix_if_necessary();
}
PDFErrorOr<void> DocumentParser::initialize_hint_tables()
@ -275,6 +275,56 @@ PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table()
m_xref_table = TRY(parse_xref_table());
if (!m_trailer)
m_trailer = TRY(parse_file_trailer());
return validate_xref_table_and_fix_if_necessary();
}
// Detects an xref table whose leading entries have no valid byte offset and, if every
// in-use entry is shifted by exactly that many slots relative to the object number found
// in the file, removes the leading entries so that indices and object numbers line up.
//
// Returns an empty result when the table is left alone or was repaired. Errors from
// parse_number() / parse_indirect_value() are propagated to the caller via TRY.
// NOTE: m_reader is repositioned while probing objects and is not moved back afterwards.
PDFErrorOr<void> DocumentParser::validate_xref_table_and_fix_if_necessary()
{
    /* While an xref table may start with an object number other than zero, this is
       very uncommon and likely a sign of a document with broken indices.
       Like most other PDF parsers seem to do, we still try to salvage the situation.
       NOTE: This is probably not spec-compliant behavior.*/
    auto& entries = m_xref_table->entries();

    // Count the leading entries that have no valid byte offset. The scan is bounded by the
    // table size so that an empty table, or one without a single valid entry, can never
    // make us index past the end of the entry list.
    size_t first_valid_index = 0;
    while (first_valid_index < entries.size()
        && m_xref_table->byte_offset_for_object(first_valid_index) == invalid_byte_offset)
        first_valid_index++;

    // Either the table already starts at index zero, or there is no valid entry to
    // compare against; in both cases there is nothing to repair.
    if (first_valid_index == 0 || first_valid_index == entries.size())
        return {};

    bool need_to_rebuild_table = true;
    for (size_t i = first_valid_index; i < entries.size(); ++i) {
        if (!entries[i].in_use)
            continue;

        size_t actual_object_number = 0;
        if (entries[i].compressed) {
            // For a compressed entry, read the first number found at the byte offset of
            // the object stream that holds it.
            auto object_stream_index = m_xref_table->object_stream_for_object(i);
            auto stream_offset = m_xref_table->byte_offset_for_object(object_stream_index);
            m_reader.move_to(stream_offset);
            auto first_number = TRY(parse_number());
            actual_object_number = first_number.get_u32();
        } else {
            // For a regular entry, parse the indirect value at the recorded offset and
            // take the index it reports.
            auto byte_offset = m_xref_table->byte_offset_for_object(i);
            m_reader.move_to(byte_offset);
            auto indirect_value = TRY(parse_indirect_value());
            actual_object_number = indirect_value->index();
        }

        if (actual_object_number != i - first_valid_index) {
            /* Our suspicion was wrong, not all object numbers are shifted equally.
               This could mean that the document is hopelessly broken, or it just
               starts at a non-zero object index for some reason. */
            need_to_rebuild_table = false;
            break;
        }
    }

    if (need_to_rebuild_table) {
        warnln("Broken xref table detected, trying to fix it.");
        // Dropping the invalid prefix shifts every remaining entry down to its real number.
        entries.remove(0, first_valid_index);
    }

    return {};
}

View File

@ -79,6 +79,7 @@ private:
PDFErrorOr<LinearizationResult> initialize_linearization_dict();
PDFErrorOr<void> initialize_linearized_xref_table();
PDFErrorOr<void> initialize_non_linearized_xref_table();
// Checks whether the xref table begins with entries lacking a valid byte offset and, when
// every in-use entry is offset from its object number in the file by exactly that count,
// removes those leading entries. Parse errors hit while probing objects are propagated.
PDFErrorOr<void> validate_xref_table_and_fix_if_necessary();
PDFErrorOr<void> initialize_hint_tables();
PDFErrorOr<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes);
Vector<PageOffsetHintTableEntry> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes);

View File

@ -68,6 +68,8 @@ public:
m_entries.append(entry);
}
// Mutable access to the underlying entry list, e.g. so the parser can drop leading
// entries when repairing a table.
ALWAYS_INLINE Vector<XRefEntry>& entries()
{
    return m_entries;
}
[[nodiscard]] ALWAYS_INLINE bool has_object(size_t index) const
{
return index < m_entries.size() && m_entries[index].byte_offset != -1;