LibPDF: Add support for stream filters

This commit also splits StreamObject into PlainTextStreamObject and
EncodedStreamObject: the former is a stream object that does not own its
bytes, while the latter is one that does.
This commit is contained in:
Matthew Olsson 2021-05-22 20:44:18 -07:00 committed by Ali Mohammad Pur
parent 97cc482087
commit 477e3946e5
Notes: sideshowbarker 2024-07-18 17:26:46 +09:00
6 changed files with 253 additions and 7 deletions

View File

@ -1,6 +1,6 @@
set(SOURCES
Object.cpp
Document.cpp
Filter.cpp
Object.cpp
Parser.cpp
Renderer.cpp

View File

@ -0,0 +1,173 @@
/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Hex.h>
#include <LibCompress/Deflate.h>
#include <LibPDF/Filter.h>
namespace PDF {
// Decodes `bytes` with the stream filter named by `encoding_type` (the value
// of a stream dictionary's /Filter entry).
//
// Returns whatever the matching decode_* helper returns, or an empty Optional
// when `encoding_type` is not one of the names listed below.
Optional<ByteBuffer> Filter::decode(const ReadonlyBytes& bytes, const FlyString& encoding_type)
{
    using Decoder = Optional<ByteBuffer> (*)(const ReadonlyBytes&);

    struct FilterEntry {
        const char* name;
        Decoder decoder;
    };

    // Names are matched in this order; the first hit wins.
    static constexpr FilterEntry known_filters[] = {
        { "ASCIIHexDecode", decode_ascii_hex },
        { "ASCII85Decode", decode_ascii85 },
        { "LZWDecode", decode_lzw },
        { "FlateDecode", decode_flate },
        { "RunLengthDecode", decode_run_length },
        { "CCITTFaxDecode", decode_ccitt },
        { "JBIG2Decode", decode_jbig2 },
        { "DCTDecode", decode_dct },
        { "JPXDecode", decode_jpx },
        { "Crypt", decode_crypt },
    };

    for (const auto& entry : known_filters) {
        if (encoding_type == entry.name)
            return entry.decoder(bytes);
    }

    // Unknown filter name.
    return {};
}
// Decodes ASCIIHexDecode data: every pair of hexadecimal digits in `bytes`
// becomes one output byte.
//
// An even number of digits is handed straight to AK's decode_hex(). For an odd
// number of digits the final digit is treated as if it were followed by '0'.
//
// Returns an empty Optional if any character (including the trailing odd one)
// is not a hexadecimal digit.
Optional<ByteBuffer> Filter::decode_ascii_hex(const ReadonlyBytes& bytes)
{
    if (bytes.size() % 2 == 0)
        return decode_hex(bytes);

    // FIXME: Integrate this padding into AK/Hex?
    const size_t full_pair_count = bytes.size() / 2;
    auto output = ByteBuffer::create_zeroed(full_pair_count + 1);

    for (size_t i = 0; i < full_pair_count; ++i) {
        // decode_hex_digit() yields a value >= 16 for a non-hex character.
        const auto c1 = decode_hex_digit(static_cast<char>(bytes[i * 2]));
        if (c1 >= 16)
            return {};

        const auto c2 = decode_hex_digit(static_cast<char>(bytes[i * 2 + 1]));
        if (c2 >= 16)
            return {};

        output[i] = (c1 << 4) + c2;
    }

    // Process the last digit with a padded zero. It must be validated like
    // every other digit; otherwise an invalid character would silently be
    // turned into a bogus byte.
    const auto last_digit = decode_hex_digit(static_cast<char>(bytes[bytes.size() - 1]));
    if (last_digit >= 16)
        return {};
    output[full_pair_count] = last_digit << 4;

    return output;
}
// Decodes ASCII85Decode (base-85) data.
//
// Every group of five digits ('!' through 'u') encodes four bytes, most
// significant byte first. The single character 'z' stands for four zero bytes.
// A final group of n digits (2 <= n <= 4) is padded with 'u' and yields n - 1
// bytes; a lone trailing digit yields nothing.
//
// All PDF white-space characters are skipped wherever they occur, and decoding
// stops at the '~' that starts the "~>" end-of-data marker.
//
// Returns an empty Optional if the input contains a character outside the
// base-85 alphabet, a 'z' in the middle of a group, or a group whose value does
// not fit into 32 bits.
Optional<ByteBuffer> Filter::decode_ascii85(const ReadonlyBytes& bytes)
{
    Vector<u8> buff;
    buff.ensure_capacity(bytes.size());

    // Appends the `count` most significant bytes of `value`. Uses shifts rather
    // than peeking at the object representation so the result does not depend
    // on host endianness.
    auto append_leading_bytes = [&](u32 value, size_t count) {
        for (size_t i = 0; i < count; ++i)
            buff.append(static_cast<u8>(value >> (24 - 8 * i)));
    };

    auto is_whitespace = [](u8 byte) {
        return byte == ' ' || byte == '\t' || byte == '\n' || byte == '\r' || byte == '\f' || byte == '\0';
    };

    constexpr u64 max_group_value = 0xffffffff;

    // Accumulated in 64 bits so that an out-of-range group can be detected
    // instead of silently wrapping around.
    u64 group = 0;
    size_t digit_count = 0;

    for (size_t byte_index = 0; byte_index < bytes.size(); ++byte_index) {
        const u8 byte = bytes[byte_index];

        if (is_whitespace(byte))
            continue;

        // Start of the "~>" end-of-data marker: nothing after it is data.
        if (byte == '~')
            break;

        if (byte == 'z') {
            // The zero shorthand is only meaningful between groups.
            if (digit_count != 0)
                return {};
            append_leading_bytes(0, 4);
            continue;
        }

        if (byte < '!' || byte > 'u')
            return {};

        group = group * 85 + (byte - '!');
        if (++digit_count == 5) {
            if (group > max_group_value)
                return {};
            append_leading_bytes(static_cast<u32>(group), 4);
            group = 0;
            digit_count = 0;
        }
    }

    // Trailing partial group: pad with the highest digit ('u' == 84) and keep
    // one byte fewer than the number of digits that were actually present.
    if (digit_count >= 2) {
        const size_t bytes_to_write = digit_count - 1;
        for (size_t i = digit_count; i < 5; ++i)
            group = group * 85 + 84;
        if (group > max_group_value)
            return {};
        append_leading_bytes(static_cast<u32>(group), bytes_to_write);
    }

    return ByteBuffer::copy(buff.span());
}
// LZWDecode is not implemented: the input is ignored, a debug message is
// logged and VERIFY_NOT_REACHED() is hit.
Optional<ByteBuffer> Filter::decode_lzw(const ReadonlyBytes&)
{
    dbgln("LZW decoding is not supported");
    VERIFY_NOT_REACHED();
}
// Decodes FlateDecode data by skipping the first two bytes of `bytes` (the
// zlib stream header) and inflating the remainder with
// Compress::DeflateDecompressor.
//
// Returns an empty Optional if the input is too short to contain the header or
// if decompression fails, so the caller decides how to handle corrupt data.
Optional<ByteBuffer> Filter::decode_flate(const ReadonlyBytes& bytes)
{
    // FIXME: The spec says Flate decoding is "based on" zlib, does that mean they
    //        aren't exactly the same?
    constexpr size_t zlib_header_size = 2;

    // Slicing past the end of the span is not valid, so reject short input up front.
    if (bytes.size() < zlib_header_size)
        return {};

    auto buff = Compress::DeflateDecompressor::decompress_all(bytes.slice(zlib_header_size));
    if (!buff.has_value())
        return {};

    return buff.value();
}
// Placeholder for the RunLengthDecode filter; the input is ignored.
Optional<ByteBuffer> Filter::decode_run_length(const ReadonlyBytes&)
{
    // FIXME: RunLength decoding still has to be implemented.
    TODO();
}
// Placeholder for the CCITTFaxDecode filter; the input is ignored.
Optional<ByteBuffer> Filter::decode_ccitt(const ReadonlyBytes&)
{
    // FIXME: CCITT decoding still has to be implemented.
    TODO();
}
// Placeholder for the JBIG2Decode filter; the input is ignored.
Optional<ByteBuffer> Filter::decode_jbig2(const ReadonlyBytes&)
{
    // FIXME: JBIG2 decoding still has to be implemented.
    TODO();
}
// Placeholder for the DCTDecode filter; the input is ignored.
Optional<ByteBuffer> Filter::decode_dct(const ReadonlyBytes&)
{
    // FIXME: DCT decoding still has to be implemented.
    TODO();
}
// Placeholder for the JPXDecode filter; the input is ignored.
Optional<ByteBuffer> Filter::decode_jpx(const ReadonlyBytes&)
{
    // FIXME: JPX decoding still has to be implemented.
    TODO();
}
// Placeholder for the Crypt filter; the input is ignored.
Optional<ByteBuffer> Filter::decode_crypt(const ReadonlyBytes&)
{
    // FIXME: Crypt decoding still has to be implemented.
    TODO();
}
}

View File

@ -0,0 +1,31 @@
/*
* Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/ByteBuffer.h>
#include <AK/FlyString.h>
namespace PDF {
// Stateless collection of PDF stream filter decoders. All members are static;
// the class only serves as a scope for the functions.
class Filter {
public:
// Decodes `bytes` using the filter whose name is `encoding_type`
// (e.g. "ASCIIHexDecode", "FlateDecode"). Returns an empty Optional when the
// name is not recognised or when the selected decoder reports a failure.
static Optional<ByteBuffer> decode(const ReadonlyBytes& bytes, const FlyString& encoding_type);
private:
// One helper per filter name handled by decode(). Each takes the raw encoded
// stream bytes and returns the decoded data.
static Optional<ByteBuffer> decode_ascii_hex(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_ascii85(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_lzw(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_flate(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_run_length(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_ccitt(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_jbig2(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_dct(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_jpx(const ReadonlyBytes& bytes);
static Optional<ByteBuffer> decode_crypt(const ReadonlyBytes& bytes);
};
}

View File

@ -11,6 +11,9 @@ namespace PDF {
class Document;
class Object;
// Note: This macro doesn't care about PlainTextStreamObject and EncodedStreamObject because
// we never need to work directly with either of them.
#define ENUMERATE_DIRECT_OBJECT_TYPES(V) \
V(StringObject, string) \
V(NameObject, name) \

View File

@ -147,18 +147,17 @@ private:
HashMap<FlyString, Value> m_map;
};
class StreamObject final : public Object {
class StreamObject : public Object {
public:
StreamObject(const NonnullRefPtr<DictObject>& dict, const ReadonlyBytes& bytes)
explicit StreamObject(const NonnullRefPtr<DictObject>& dict)
: m_dict(dict)
, m_bytes(bytes)
{
}
~StreamObject() override = default;
virtual ~StreamObject() override = default;
[[nodiscard]] ALWAYS_INLINE NonnullRefPtr<DictObject> dict() const { return m_dict; }
[[nodiscard]] ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; }
[[nodiscard]] virtual ReadonlyBytes bytes() const = 0;
ALWAYS_INLINE bool is_stream() const override { return true; }
ALWAYS_INLINE const char* type_name() const override { return "stream"; }
@ -166,9 +165,40 @@ public:
private:
NonnullRefPtr<DictObject> m_dict;
};
// A stream object whose contents are used exactly as they appear in the
// source data. It stores only a ReadonlyBytes view and therefore does not own
// the bytes; whatever backs `bytes` has to stay alive for as long as this
// object is used.
class PlainTextStreamObject final : public StreamObject {
public:
// `dict` is the stream dictionary, `bytes` a view of the stream's contents.
PlainTextStreamObject(const NonnullRefPtr<DictObject>& dict, const ReadonlyBytes& bytes)
: StreamObject(dict)
, m_bytes(bytes)
{
}
virtual ~PlainTextStreamObject() override = default;
// Returns the view that was passed to the constructor.
[[nodiscard]] ALWAYS_INLINE virtual ReadonlyBytes bytes() const override { return m_bytes; }
private:
// Non-owning view of the stream contents.
ReadonlyBytes m_bytes;
};
class EncodedStreamObject final : public StreamObject {
public:
EncodedStreamObject(const NonnullRefPtr<DictObject>& dict, ByteBuffer&& buffer)
: StreamObject(dict)
, m_buffer(buffer)
{
}
virtual ~EncodedStreamObject() override = default;
[[nodiscard]] ALWAYS_INLINE virtual ReadonlyBytes bytes() const override { return m_buffer.bytes(); }
private:
ByteBuffer m_buffer;
};
class IndirectValue final : public Object {
public:
IndirectValue(u32 index, u32 generation_index, const Value& value)

View File

@ -7,6 +7,7 @@
#include <AK/ScopeGuard.h>
#include <AK/TypeCasts.h>
#include <LibPDF/Document.h>
#include <LibPDF/Filter.h>
#include <LibPDF/Parser.h>
#include <ctype.h>
#include <math.h>
@ -657,7 +658,15 @@ NonnullRefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
m_reader.move_by(9);
consume_whitespace();
return make_object<StreamObject>(dict, bytes);
if (dict->contains("Filter")) {
auto filter_type = dict->get_name(m_document, "Filter")->name();
auto maybe_bytes = Filter::decode(bytes, filter_type);
// FIXME: Handle error condition
VERIFY(maybe_bytes.has_value());
return make_object<EncodedStreamObject>(dict, move(maybe_bytes.value()));
}
return make_object<PlainTextStreamObject>(dict, bytes);
}
Vector<Command> Parser::parse_graphics_commands()