1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-09-11 20:47:56 +03:00

Add API to compress generated PDF files

This feature compresses PDF streams (as was already the case) and asks pydyf
to use a compact PDF structure with a compressed object stream and
cross-reference object (for PDF version >= 1.5).
This commit is contained in:
Guillaume Ayoub 2023-03-26 13:08:19 +02:00
parent e49d955509
commit fdbdfc150c
14 changed files with 100 additions and 57 deletions

View File

@ -513,7 +513,8 @@ WeasyPrint provides two options to deal with images: ``optimize_size`` and
``optimize_size`` can enable size optimization for images, but also for fonts.
When enabled, the generated PDF will include smaller images and fonts, but the
rendering time may be slightly increased.
rendering time may be slightly increased. The whole structure of the PDF can be
compressed too.
.. code-block:: python
@ -523,7 +524,7 @@ rendering time may be slightly increased.
# Full size optimization, slower, but generated PDF is smaller
HTML('https://example.org/').write_pdf(
'example.pdf', optimize_size=('fonts', 'images'))
'example.pdf', optimize_size=('fonts', 'images', 'pdf'))
``image_cache`` makes it possible to use a cache for images, so that they are
not downloaded, parsed and optimized each time they are used.

View File

@ -74,7 +74,7 @@ def document_write_png(self, target=None, resolution=96, antialiasing=1,
def html_write_png(self, target=None, stylesheets=None, resolution=96,
presentational_hints=False, optimize_size=('fonts',),
presentational_hints=False, optimize_size=('fonts', 'pdf'),
font_config=None, counter_style=None, image_cache=None):
return self.render(
stylesheets, presentational_hints=presentational_hints,

View File

@ -303,11 +303,12 @@ def test_command_line_render(tmpdir):
tmpdir.join(name).write_binary(pattern_bytes)
# Reference
optimize_size = ('fonts', 'pdf')
html_obj = FakeHTML(string=combined, base_url='dummy.html')
pdf_bytes = html_obj.write_pdf()
pdf_bytes = html_obj.write_pdf(optimize_size=optimize_size)
rotated_pdf_bytes = FakeHTML(
string=combined, base_url='dummy.html',
media_type='screen').write_pdf()
media_type='screen').write_pdf(optimize_size=optimize_size)
tmpdir.join('no_css.html').write_binary(html)
tmpdir.join('combined.html').write_binary(combined)
@ -386,9 +387,9 @@ def test_command_line_render(tmpdir):
assert stdout.count(b'attachment') == 0
stdout = _run('combined.html -')
assert stdout.count(b'attachment') == 0
stdout = _run('-a pattern.png combined.html -')
stdout = _run('-O none -a pattern.png combined.html -')
assert stdout.count(b'attachment') == 1
stdout = _run('-a style.css -a pattern.png combined.html -')
stdout = _run('-O none -a style.css -a pattern.png combined.html -')
assert stdout.count(b'attachment') == 2
os.mkdir('subdirectory')
@ -423,42 +424,58 @@ def test_command_line_render(tmpdir):
(4, '2.0'),
))
def test_pdfa(version, pdf_version):
stdout = _run(f'--pdf-variant=pdf/a-{version}b - -', b'test')
stdout = _run(f'--pdf-variant=pdf/a-{version}b -O none - -', b'test')
assert f'PDF-{pdf_version}'.encode() in stdout
assert f'part="{version}"'.encode() in stdout
@pytest.mark.parametrize('version, pdf_version', (
    (1, '1.4'),
    (2, '1.7'),
    (3, '1.7'),
    (4, '2.0'),
))
def test_pdfa_compressed(version, pdf_version):
    """Smoke-test PDF/A generation without passing ``-O none``.

    The command line is run with the PDF/A variant for ``version`` and no
    explicit optimization flag. The output is not inspected, so this only
    checks that the run completes; ``pdf_version`` is accepted but unused.
    """
    arguments = f'--pdf-variant=pdf/a-{version}b - -'
    _run(arguments, b'test')
def test_pdfua():
stdout = _run('--pdf-variant=pdf/ua-1 - -', b'test')
stdout = _run('--pdf-variant=pdf/ua-1 -O none - -', b'test')
assert b'part="1"' in stdout
def test_pdfua_compressed():
    """Smoke-test PDF/UA generation without passing ``-O none``.

    The output is not inspected, so this only checks that the run completes.
    """
    arguments = '--pdf-variant=pdf/ua-1 - -'
    _run(arguments, b'test')
def test_pdf_identifier():
stdout = _run('--pdf-identifier=abc - -', b'test')
stdout = _run('--pdf-identifier=abc -O none - -', b'test')
assert b'abc' in stdout
def test_pdf_version():
stdout = _run('--pdf-version=1.4 - -', b'test')
stdout = _run('--pdf-version=1.4 -O none - -', b'test')
assert b'PDF-1.4' in stdout
def test_pdf_custom_metadata():
stdout = _run('--custom-metadata - -', b'<meta name=key content=value />')
stdout = _run(
'--custom-metadata -O none - -',
b'<meta name=key content=value />')
assert b'/key' in stdout
assert b'value' in stdout
def test_bad_pdf_custom_metadata():
stdout = _run(
'--custom-metadata - -',
'--custom-metadata -O none - -',
'<meta name=é content=value />'.encode('latin1'))
assert b'value' not in stdout
def test_partial_pdf_custom_metadata():
stdout = _run(
'--custom-metadata - -',
'--custom-metadata -O none - -',
'<meta name=a.b/céd0 content=value />'.encode('latin1'))
assert b'/abcd0' in stdout
assert b'value' in stdout
@ -470,7 +487,7 @@ def test_partial_pdf_custom_metadata():
(b'<textarea></textarea>', b'/Tx'),
))
def test_pdf_inputs(html, field):
stdout = _run('--pdf-forms - -', html)
stdout = _run('--pdf-forms -O none - -', html)
assert b'AcroForm' in stdout
assert field in stdout
stdout = _run('- -', html)
@ -484,8 +501,8 @@ def test_pdf_inputs(html, field):
))
def test_appearance(css, with_forms, without_forms):
html = f'<input style="{css}">'.encode()
assert (b'AcroForm' in _run('--pdf-forms - -', html)) is with_forms
assert (b'AcroForm' in _run('- -', html)) is without_forms
assert (b'AcroForm' in _run('--pdf-forms -O none - -', html)) is with_forms
assert (b'AcroForm' in _run(' -O none - -', html)) is without_forms
def test_reproducible():

View File

@ -53,6 +53,18 @@ class FakeHTML(HTML):
TEST_UA_STYLESHEET if stylesheet == HTML5_UA_STYLESHEET
else stylesheet for stylesheet in super()._ua_stylesheets(forms)]
def write_pdf(self, target=None, stylesheets=None, zoom=1,
              attachments=None, finisher=None, presentational_hints=False,
              optimize_size=('fonts',), font_config=None,
              counter_style=None, image_cache=None, identifier=None,
              variant=None, version=None, forms=False,
              custom_metadata=False):
    """Delegate to the parent ``write_pdf`` with a different default.

    This override only changes the default value of ``optimize_size`` to
    ``('fonts',)``, so the ``'pdf'`` size optimization is off unless the
    caller asks for it. Every argument is forwarded unchanged.
    """
    # Forward by name so each value visibly lands on the matching parameter.
    return super().write_pdf(
        target=target,
        stylesheets=stylesheets,
        zoom=zoom,
        attachments=attachments,
        finisher=finisher,
        presentational_hints=presentational_hints,
        optimize_size=optimize_size,
        font_config=font_config,
        counter_style=counter_style,
        image_cache=image_cache,
        identifier=identifier,
        variant=variant,
        version=version,
        forms=forms,
        custom_metadata=custom_metadata,
    )
def resource_filename(basename):
"""Return the absolute path of the resource called ``basename``."""

View File

@ -118,8 +118,8 @@ class HTML:
return [HTML5_PH_STYLESHEET]
def render(self, stylesheets=None, presentational_hints=False,
optimize_size=('fonts',), font_config=None, counter_style=None,
image_cache=None, forms=False):
optimize_size=('fonts', 'pdf'), font_config=None,
counter_style=None, image_cache=None, forms=False):
"""Lay out and paginate the document, but do not (yet) export it.
This returns a :class:`document.Document` object which provides
@ -133,7 +133,8 @@ class HTML:
:param bool presentational_hints:
Whether HTML presentational hints are followed.
:param tuple optimize_size:
Optimize size of generated PDF. Can contain "images" and "fonts".
Optimize size of generated PDF. Can contain "images", "fonts" and
"pdf".
:type font_config: :class:`text.fonts.FontConfiguration`
:param font_config: A font configuration handling ``@font-face`` rules.
:type counter_style: :class:`css.counters.CounterStyle`
@ -153,7 +154,7 @@ class HTML:
def write_pdf(self, target=None, stylesheets=None, zoom=1,
attachments=None, finisher=None, presentational_hints=False,
optimize_size=('fonts',), font_config=None,
optimize_size=('fonts', 'pdf'), font_config=None,
counter_style=None, image_cache=None, identifier=None,
variant=None, version=None, forms=False,
custom_metadata=False):
@ -185,7 +186,8 @@ class HTML:
:param bool presentational_hints: Whether HTML presentational hints are
followed.
:param tuple optimize_size:
Optimize size of generated PDF. Can contain "images" and "fonts".
Optimize size of generated PDF. Can contain "images", "fonts" and
"pdf".
:type font_config: :class:`text.fonts.FontConfiguration`
:param font_config: A font configuration handling ``@font-face`` rules.
:type counter_style: :class:`css.counters.CounterStyle`

View File

@ -90,9 +90,10 @@ def main(argv=None, stdout=None, stdin=None):
.. option:: -O <type>, --optimize-size <type>
Optimize the size of generated documents. Supported types are
``images``, ``fonts``, ``all`` and ``none``. This option can be used
multiple times, ``all`` adds all allowed values, ``none`` removes all
previously set values.
``images``, ``fonts``, ``pdf``, ``all`` and ``none``. This option can
be used multiple times, ``all`` adds all allowed values, ``none``
removes all previously set values (including the default ones,
``fonts`` and ``pdf``).
.. option:: -c <folder>, --cache-folder <folder>
@ -160,7 +161,8 @@ def main(argv=None, stdout=None, stdin=None):
parser.add_argument(
'-O', '--optimize-size', action='append',
help='optimize output size for specified features',
choices=('images', 'fonts', 'all', 'none'), default=['fonts'])
choices=('images', 'fonts', 'pdf', 'all', 'none'),
default=['fonts', 'pdf'])
parser.add_argument(
'-c', '--cache-folder',
help='Store cache on disk instead of memory. The ``folder`` is '
@ -198,7 +200,7 @@ def main(argv=None, stdout=None, stdin=None):
if arg == 'none':
optimize_size.clear()
elif arg == 'all':
optimize_size |= {'images', 'fonts'}
optimize_size |= {'images', 'fonts', 'pdf'}
else:
optimize_size.add(arg)

View File

@ -296,8 +296,8 @@ class Document:
# rendering is destroyed. This is needed as font_config.__del__ removes
# fonts that may be used when rendering
self.font_config = font_config
# Set of flags for PDF size optimization. Can contain "images" and
# "fonts".
# Set of flags for PDF size optimization. Can contain "images", "fonts"
# and "pdf".
self._optimize_size = optimize_size
def build_element_structure(self, structure, etree_element=None):
@ -414,13 +414,15 @@ class Document:
if finisher:
finisher(self, pdf)
compress = 'pdf' in self._optimize_size
if target is None:
output = io.BytesIO()
pdf.write(output, version=pdf.version, identifier=identifier)
pdf.write(output, pdf.version, identifier, compress)
return output.getvalue()
if hasattr(target, 'write'):
pdf.write(target, version=pdf.version, identifier=identifier)
pdf.write(target, pdf.version, identifier, compress)
else:
with open(target, 'wb') as fd:
pdf.write(fd, version=pdf.version, identifier=identifier)
pdf.write(fd, pdf.version, identifier, compress)

View File

@ -153,9 +153,10 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
page_rectangle = (
left / scale, top / scale,
(right - left) / scale, (bottom - top) / scale)
compress = 'pdf' in optimize_size
stream = Stream(
document.fonts, page_rectangle, states, x_objects, patterns,
shadings, images, mark)
shadings, images, mark, compress=compress)
stream.transform(d=-1, f=(page.height * scale))
pdf.add_object(stream)
page_streams.append(stream)
@ -175,10 +176,11 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, mark)
add_annotations(
links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files)
links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files,
compress)
add_inputs(
page.inputs, matrix, pdf, pdf_page, resources, stream,
document.font_config.font_map)
document.font_config.font_map, compress)
page.paint(stream, scale=scale)
# Bleed
@ -281,6 +283,7 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
# Apply PDF variants functions
if variant:
variant_function(pdf, metadata, document, page_streams)
compress = 'pdf' in optimize_size
variant_function(pdf, metadata, document, page_streams, compress)
return pdf

View File

@ -91,7 +91,8 @@ def add_outlines(pdf, bookmarks, parent=None):
return outlines, count
def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map,
compress):
"""Include form inputs in PDF."""
if not inputs:
return
@ -118,7 +119,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
input_name = pydyf.String(element.attrib.get('name', default_name))
# TODO: where does this 0.75 scale come from?
font_size = style['font_size'] * 0.75
field_stream = pydyf.Stream()
field_stream = pydyf.Stream(compress=compress)
field_stream.set_color_rgb(*style['color'][:3])
if input_type == 'checkbox':
# Checkboxes
@ -129,7 +130,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
'Type': '/XObject',
'Subtype': '/Form',
'BBox': pydyf.Array((0, 0, width, height)),
})
}, compress=compress)
checked_stream.push_state()
checked_stream.begin_text()
checked_stream.set_color_rgb(*style['color'][:3])
@ -194,7 +195,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
pdf.catalog['AcroForm']['Fields'].append(field.reference)
def add_annotations(links, matrix, document, pdf, page, annot_files):
def add_annotations(links, matrix, document, pdf, page, annot_files, compress):
"""Include annotations in PDF."""
# TODO: splitting a link into multiple independent rectangular
# annotations works well for pure links, but rather mediocre for
@ -225,8 +226,7 @@ def add_annotations(links, matrix, document, pdf, page, annot_files):
'Type': '/XObject',
'Subtype': '/Form',
'BBox': pydyf.Array(rectangle),
'Length': 0,
})
}, compress)
pdf.add_object(stream)
annot = pydyf.Dictionary({
'Type': '/Annot',
@ -278,7 +278,7 @@ def write_pdf_attachment(pdf, attachment, url_fetcher):
'Size': uncompressed_length,
})
})
file_stream = pydyf.Stream([stream], file_extra)
file_stream = pydyf.Stream([stream], file_extra, compress)
pdf.add_object(file_stream)
except URLFetchingError as exception:

View File

@ -8,6 +8,7 @@ from ..logger import LOGGER
def build_fonts_dictionary(pdf, fonts, optimize_size):
compress = 'pdf' in optimize_size
pdf_fonts = pydyf.Dictionary()
fonts_by_file_hash = {}
for font in fonts.values():
@ -32,7 +33,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size):
else:
font_extra = pydyf.Dictionary({'Length1': len(font.file_content)})
font_stream = pydyf.Stream(
[font.file_content], font_extra, compress=True)
[font.file_content], font_extra, compress=compress)
pdf.add_object(font_stream)
font_references_by_file_hash[file_hash] = font_stream.reference
@ -80,7 +81,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size):
b'1 begincodespacerange',
b'<0000> <ffff>',
b'endcodespacerange',
f'{len(cmap)} beginbfchar'.encode()])
f'{len(cmap)} beginbfchar'.encode()], compress=compress)
for glyph, text in cmap.items():
unicode_codepoints = ''.join(
f'{letter.encode("utf-16-be").hex()}' for letter in text)
@ -125,7 +126,8 @@ def build_fonts_dictionary(pdf, fonts, optimize_size):
for cid in cids:
bits[cid] = '1'
stream = pydyf.Stream(
(int(''.join(bits), 2).to_bytes(padded_width, 'big'),))
(int(''.join(bits), 2).to_bytes(padded_width, 'big'),),
compress=compress)
pdf.add_object(stream)
font_descriptor['CIDSet'] = stream.reference
if font.type == 'otf':
@ -156,6 +158,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size):
def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
optimize_size):
compress = 'pdf' in optimize_size
# https://docs.microsoft.com/typography/opentype/spec/ebdt
font_dictionary['FontBBox'] = pydyf.Array([0, 0, 1, 1])
font_dictionary['FontMatrix'] = pydyf.Array([1, 0, 0, 1, 0, 0])
@ -308,7 +311,7 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
b'/BPC 1',
b'/D [1 0]',
b'ID', bitmap, b'EI'
])
], compress=compress)
pdf.add_object(bitmap_stream)
char_procs[glyph_id] = bitmap_stream.reference

View File

@ -20,7 +20,7 @@ for key, value in NS.items():
register_namespace(key, value)
def add_metadata(pdf, metadata, variant, version, conformance):
def add_metadata(pdf, metadata, variant, version, conformance, compress):
"""Add PDF stream of metadata.
Described in ISO-32000-1:2008, 14.3.2.
@ -88,6 +88,6 @@ def add_metadata(pdf, metadata, variant, version, conformance):
footer = b'<?xpacket end="r"?>'
stream_content = b'\n'.join((header, xml, footer))
extra = {'Type': '/Metadata', 'Subtype': '/XML'}
metadata = pydyf.Stream([stream_content], extra=extra)
metadata = pydyf.Stream([stream_content], extra, compress)
pdf.add_object(metadata)
pdf.catalog['Metadata'] = metadata.reference

View File

@ -18,7 +18,7 @@ from ..logger import LOGGER
from .metadata import add_metadata
def pdfa(pdf, metadata, document, page_streams, version):
def pdfa(pdf, metadata, document, page_streams, compress, version):
"""Set metadata for PDF/A documents."""
LOGGER.warning(
'PDF/A support is experimental, '
@ -29,7 +29,7 @@ def pdfa(pdf, metadata, document, page_streams, version):
profile = pydyf.Stream(
[read_binary(__package__, 'sRGB2014.icc')],
pydyf.Dictionary({'N': 3, 'Alternate': '/DeviceRGB'}),
compress=True)
compress=compress)
pdf.add_object(profile)
pdf.catalog['OutputIntents'] = pydyf.Array([
pydyf.Dictionary({
@ -46,7 +46,7 @@ def pdfa(pdf, metadata, document, page_streams, version):
pdf_object['F'] = 2 ** (3 - 1)
# Common PDF metadata stream
add_metadata(pdf, metadata, 'a', version, 'B')
add_metadata(pdf, metadata, 'a', version, 'B', compress)
VARIANTS = {

View File

@ -6,7 +6,7 @@ from ..logger import LOGGER
from .metadata import add_metadata
def pdfua(pdf, metadata, document, page_streams):
def pdfua(pdf, metadata, document, page_streams, compress):
"""Set metadata for PDF/UA documents."""
LOGGER.warning(
'PDF/UA support is experimental, '
@ -117,7 +117,7 @@ def pdfua(pdf, metadata, document, page_streams):
annotation['F'] = 2 ** (2 - 1)
# Common PDF metadata stream
add_metadata(pdf, metadata, 'ua', version=1, conformance=None)
add_metadata(pdf, metadata, 'ua', 1, conformance=None, compress=compress)
# PDF document extra metadata
if 'Lang' not in pdf.catalog:

View File

@ -195,7 +195,6 @@ class Stream(pydyf.Stream):
def __init__(self, fonts, page_rectangle, states, x_objects, patterns,
shadings, images, mark, *args, **kwargs):
super().__init__(*args, **kwargs)
self.compress = True
self.page_rectangle = page_rectangle
self.marked = []
self._fonts = fonts
@ -356,7 +355,8 @@ class Stream(pydyf.Stream):
})
group = Stream(
self._fonts, self.page_rectangle, states, x_objects, patterns,
shadings, self._images, self._mark, extra=extra)
shadings, self._images, self._mark, extra=extra,
compress=self.compress)
group.id = f'x{len(self._x_objects)}'
self._x_objects[group.id] = group
return group
@ -407,7 +407,8 @@ class Stream(pydyf.Stream):
})
pattern = Stream(
self._fonts, self.page_rectangle, states, x_objects, patterns,
shadings, self._images, self._mark, extra=extra)
shadings, self._images, self._mark, extra=extra,
compress=self.compress)
pattern.id = f'p{len(self._patterns)}'
self._patterns[pattern.id] = pattern
return pattern