From e7f7c434f79adc199f08ca30f5f6b49d82e92a5f Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Sat, 21 Oct 2023 16:30:33 -0400
Subject: [PATCH] LibPDF: Don't check for `startxref` after trailer dict

Several files have a comment after the trailer dict, with `startxref`
only following that comment. We really should add a
consume_whitespace_and_comments() function and call that in most places
where we currently call consume_whitespace().

But in this case, for non-linearized files, we first jump to the end of
the file, read `startxref`, then jump to `xref` from the offset there,
and then read the trailer after the `xref`, only to read `startxref`
again. So we can just not do that.

(For linearized files, we now completely ignore `startxref`. But we
don't use the data in `startxref` in linearized files anyway, so it's
fine to not read it there either.)

Reduces the number of crashes on 300 random PDFs from the web (the
first 300 from 0000.zip from
https://pdfa.org/new-large-scale-pdf-corpus-now-publicly-available/)
from 25 (8%) to 23 (7%).
--- Userland/Libraries/LibPDF/DocumentParser.cpp | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp index 0731da6c6d1..a4f775fdf21 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.cpp +++ b/Userland/Libraries/LibPDF/DocumentParser.cpp @@ -525,21 +525,7 @@ PDFErrorOr> DocumentParser::parse_file_trailer() return error("Expected \"trailer\" keyword"); m_reader.move_by(7); m_reader.consume_whitespace(); - auto dict = TRY(parse_dict()); - - if (!m_reader.matches("startxref")) - return error("Expected \"startxref\""); - m_reader.move_by(9); - m_reader.consume_whitespace(); - - m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); - VERIFY(m_reader.consume_eol()); - if (!m_reader.matches("%%EOF")) - return error("Expected \"%%EOF\""); - - m_reader.move_by(5); - m_reader.consume_whitespace(); - return dict; + return parse_dict(); } PDFErrorOr DocumentParser::parse_compressed_object_with_index(u32 index)