LibPDF: accept PDF files whose "%PDF-" header is preceded by leading bytes.

Document::create() now asks DocumentParser::scan_for_header_start() for the
offset of the header marker and, when it is non-zero, slices the input there
before constructing the parser. scan_for_header_start() returns a parse error
when no "%PDF-" marker is found in the searched prefix of the file.

diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp
index 38a6aedcbc9..bdf0eea509f 100644
--- a/Userland/Libraries/LibPDF/Document.cpp
+++ b/Userland/Libraries/LibPDF/Document.cpp
@@ -97,6 +97,12 @@ ByteString Document::text_string_to_utf8(ByteString const& text_string)
 
 PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
 {
+    size_t offset_to_start = TRY(DocumentParser::scan_for_header_start(bytes));
+    if (offset_to_start != 0) {
+        dbgln("warning: PDF header not at start of file, skipping {} bytes", offset_to_start);
+        bytes = bytes.slice(offset_to_start);
+    }
+
     auto parser = adopt_ref(*new DocumentParser({}, bytes));
     auto document = adopt_ref(*new Document(parser));
 
diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp
index d147d854d5c..d772f57f613 100644
--- a/Userland/Libraries/LibPDF/DocumentParser.cpp
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@@ -72,6 +72,19 @@ PDFErrorOr<Value> DocumentParser::parse_object_with_index(u32 index)
     return indirect_value->value();
 }
 
+PDFErrorOr<size_t> DocumentParser::scan_for_header_start(ReadonlyBytes bytes)
+{
+    // PDF 1.7 spec, APPENDIX H, 3.4.1 "File Header":
+    // "13. Acrobat viewers require only that the header appear somewhere within the first 1024 bytes of the file."
+    // ...which of course means files depend on it.
+    // All offsets in the file are relative to the header start, not to the start of the file.
+    StringView first_bytes { bytes.data(), min(bytes.size(), 1024 - "1.4"sv.length()) };
+    Optional<size_t> start_offset = first_bytes.find("%PDF-"sv);
+    if (!start_offset.has_value())
+        return Error { Error::Type::Parse, "Failed to find PDF start" };
+    return start_offset.value();
+}
+
 PDFErrorOr<Version> DocumentParser::parse_header()
 {
     m_reader.move_to(0);
diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h
index cbb127417aa..16ecd5d1992 100644
--- a/Userland/Libraries/LibPDF/DocumentParser.h
+++ b/Userland/Libraries/LibPDF/DocumentParser.h
@@ -18,6 +18,8 @@ struct Version {
 class DocumentParser final : public RefCounted<DocumentParser>
     , public Parser {
 public:
+    static PDFErrorOr<size_t> scan_for_header_start(ReadonlyBytes);
+
     DocumentParser(Document*, ReadonlyBytes);
 
     enum class LinearizationResult {