Add API to compress generated PDF files

This feature compresses PDF streams (as was already the case) and asks pydyf to use a compact PDF structure with compressed object streams and a cross-reference object (for PDF version >= 1.5).
2024-09-11 20:47:56 +03:00 · 2023-03-26 13:08:19 +02:00 · 2023-03-26 13:08:19 +02:00 · fdbdfc150c
commit fdbdfc150c
parent e49d955509
14 changed files with 100 additions and 57 deletions
--- a/docs/first_steps.rst
+++ b/docs/first_steps.rst
@ -513,7 +513,8 @@ WeasyPrint provides two options to deal with images: ``optimize_size`` and

 ``optimize_size`` can enable size optimization for images, but also for fonts.
 When enabled, the generated PDF will include smaller images and fonts, but the
-rendering time may be slightly increased.
+rendering time may be slightly increased. The whole structure of the PDF can be
+compressed too.

 .. code-block:: python

@ -523,7 +524,7 @@ rendering time may be slightly increased.

    # Full size optimization, slower, but generated PDF is smaller
    HTML('https://example.org/').write_pdf(
-        'example.pdf', optimize_size=('fonts', 'images'))
+        'example.pdf', optimize_size=('fonts', 'images', 'pdf'))

 ``image_cache`` gives the possibility to use a cache for images, avoiding to
 download, parse and optimize them each time they are used.
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -74,7 +74,7 @@ def document_write_png(self, target=None, resolution=96, antialiasing=1,


 def html_write_png(self, target=None, stylesheets=None, resolution=96,
-                   presentational_hints=False, optimize_size=('fonts',),
+                   presentational_hints=False, optimize_size=('fonts', 'pdf'),
                   font_config=None, counter_style=None, image_cache=None):
    return self.render(
        stylesheets, presentational_hints=presentational_hints,
--- a/tests/test_api.py
+++ b/tests/test_api.py
@ -303,11 +303,12 @@ def test_command_line_render(tmpdir):
        tmpdir.join(name).write_binary(pattern_bytes)

    # Reference
+    optimize_size = ('fonts', 'pdf')
    html_obj = FakeHTML(string=combined, base_url='dummy.html')
-    pdf_bytes = html_obj.write_pdf()
+    pdf_bytes = html_obj.write_pdf(optimize_size=optimize_size)
    rotated_pdf_bytes = FakeHTML(
        string=combined, base_url='dummy.html',
-        media_type='screen').write_pdf()
+        media_type='screen').write_pdf(optimize_size=optimize_size)

    tmpdir.join('no_css.html').write_binary(html)
    tmpdir.join('combined.html').write_binary(combined)
@ -386,9 +387,9 @@ def test_command_line_render(tmpdir):
    assert stdout.count(b'attachment') == 0
    stdout = _run('combined.html -')
    assert stdout.count(b'attachment') == 0
-    stdout = _run('-a pattern.png combined.html -')
+    stdout = _run('-O none -a pattern.png combined.html -')
    assert stdout.count(b'attachment') == 1
-    stdout = _run('-a style.css -a pattern.png combined.html -')
+    stdout = _run('-O none -a style.css -a pattern.png combined.html -')
    assert stdout.count(b'attachment') == 2

    os.mkdir('subdirectory')
@ -423,42 +424,58 @@ def test_command_line_render(tmpdir):
    (4, '2.0'),
 ))
 def test_pdfa(version, pdf_version):
-    stdout = _run(f'--pdf-variant=pdf/a-{version}b - -', b'test')
+    stdout = _run(f'--pdf-variant=pdf/a-{version}b -O none - -', b'test')
    assert f'PDF-{pdf_version}'.encode() in stdout
    assert f'part="{version}"'.encode() in stdout


+@pytest.mark.parametrize('version, pdf_version', (
+    (1, '1.4'),
+    (2, '1.7'),
+    (3, '1.7'),
+    (4, '2.0'),
+))
+def test_pdfa_compressed(version, pdf_version):
+    _run(f'--pdf-variant=pdf/a-{version}b - -', b'test')
+
+
 def test_pdfua():
-    stdout = _run('--pdf-variant=pdf/ua-1 - -', b'test')
+    stdout = _run('--pdf-variant=pdf/ua-1 -O none - -', b'test')
    assert b'part="1"' in stdout


+def test_pdfua_compressed():
+    _run('--pdf-variant=pdf/ua-1 - -', b'test')
+
+
 def test_pdf_identifier():
-    stdout = _run('--pdf-identifier=abc - -', b'test')
+    stdout = _run('--pdf-identifier=abc -O none - -', b'test')
    assert b'abc' in stdout


 def test_pdf_version():
-    stdout = _run('--pdf-version=1.4 - -', b'test')
+    stdout = _run('--pdf-version=1.4 -O none - -', b'test')
    assert b'PDF-1.4' in stdout


 def test_pdf_custom_metadata():
-    stdout = _run('--custom-metadata - -', b'<meta name=key content=value />')
+    stdout = _run(
+        '--custom-metadata -O none - -',
+        b'<meta name=key content=value />')
    assert b'/key' in stdout
    assert b'value' in stdout


 def test_bad_pdf_custom_metadata():
    stdout = _run(
-        '--custom-metadata - -',
+        '--custom-metadata -O none - -',
        '<meta name=é content=value />'.encode('latin1'))
    assert b'value' not in stdout


 def test_partial_pdf_custom_metadata():
    stdout = _run(
-        '--custom-metadata - -',
+        '--custom-metadata -O none - -',
        '<meta name=a.b/céd0 content=value />'.encode('latin1'))
    assert b'/abcd0' in stdout
    assert b'value' in stdout
@ -470,7 +487,7 @@ def test_partial_pdf_custom_metadata():
    (b'<textarea></textarea>', b'/Tx'),
 ))
 def test_pdf_inputs(html, field):
-    stdout = _run('--pdf-forms - -', html)
+    stdout = _run('--pdf-forms -O none - -', html)
    assert b'AcroForm' in stdout
    assert field in stdout
    stdout = _run('- -', html)
@ -484,8 +501,8 @@ def test_pdf_inputs(html, field):
 ))
 def test_appearance(css, with_forms, without_forms):
    html = f'<input style="{css}">'.encode()
-    assert (b'AcroForm' in _run('--pdf-forms - -', html)) is with_forms
-    assert (b'AcroForm' in _run('- -', html)) is without_forms
+    assert (b'AcroForm' in _run('--pdf-forms -O none - -', html)) is with_forms
+    assert (b'AcroForm' in _run(' -O none - -', html)) is without_forms


 def test_reproducible():
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@ -53,6 +53,18 @@ class FakeHTML(HTML):
            TEST_UA_STYLESHEET if stylesheet == HTML5_UA_STYLESHEET
            else stylesheet for stylesheet in super()._ua_stylesheets(forms)]

+    def write_pdf(self, target=None, stylesheets=None, zoom=1,
+                  attachments=None, finisher=None, presentational_hints=False,
+                  optimize_size=('fonts',), font_config=None,
+                  counter_style=None, image_cache=None, identifier=None,
+                  variant=None, version=None, forms=False,
+                  custom_metadata=False):
+        # Override to keep "pdf" out of the default size optimizations
+        return super().write_pdf(
+            target, stylesheets, zoom, attachments, finisher,
+            presentational_hints, optimize_size, font_config, counter_style,
+            image_cache, identifier, variant, version, forms, custom_metadata)
+

 def resource_filename(basename):
    """Return the absolute path of the resource called ``basename``."""
--- a/weasyprint/init.py
+++ b/weasyprint/init.py
@ -118,8 +118,8 @@ class HTML:
        return [HTML5_PH_STYLESHEET]

    def render(self, stylesheets=None, presentational_hints=False,
-               optimize_size=('fonts',), font_config=None, counter_style=None,
-               image_cache=None, forms=False):
+               optimize_size=('fonts', 'pdf'), font_config=None,
+               counter_style=None, image_cache=None, forms=False):
        """Lay out and paginate the document, but do not (yet) export it.

        This returns a :class:`document.Document` object which provides
@ -133,7 +133,8 @@ class HTML:
        :param bool presentational_hints:
            Whether HTML presentational hints are followed.
        :param tuple optimize_size:
-            Optimize size of generated PDF. Can contain "images" and "fonts".
+            Optimize size of generated PDF. Can contain "images", "fonts" and
+            "pdf".
        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
@ -153,7 +154,7 @@ class HTML:

    def write_pdf(self, target=None, stylesheets=None, zoom=1,
                  attachments=None, finisher=None, presentational_hints=False,
-                  optimize_size=('fonts',), font_config=None,
+                  optimize_size=('fonts', 'pdf'), font_config=None,
                  counter_style=None, image_cache=None, identifier=None,
                  variant=None, version=None, forms=False,
                  custom_metadata=False):
@ -185,7 +186,8 @@ class HTML:
        :param bool presentational_hints: Whether HTML presentational hints are
            followed.
        :param tuple optimize_size:
-            Optimize size of generated PDF. Can contain "images" and "fonts".
+            Optimize size of generated PDF. Can contain "images", "fonts" and
+            "pdf".
        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
--- a/weasyprint/main.py
+++ b/weasyprint/main.py
@ -90,9 +90,10 @@ def main(argv=None, stdout=None, stdin=None):
    .. option:: -O <type>, --optimize-size <type>

        Optimize the size of generated documents. Supported types are
-        ``images``, ``fonts``, ``all`` and ``none``. This option can be used
-        multiple times, ``all`` adds all allowed values, ``none`` removes all
-        previously set values.
+        ``images``, ``fonts``, ``pdf``, ``all`` and ``none``. This option can
+        be used multiple times, ``all`` adds all allowed values, ``none``
+        removes all previously set values (including the default ones,
+        ``fonts`` and ``pdf``).

    .. option:: -c <folder>, --cache-folder <folder>

@ -160,7 +161,8 @@ def main(argv=None, stdout=None, stdin=None):
    parser.add_argument(
        '-O', '--optimize-size', action='append',
        help='optimize output size for specified features',
-        choices=('images', 'fonts', 'all', 'none'), default=['fonts'])
+        choices=('images', 'fonts', 'pdf', 'all', 'none'),
+        default=['fonts', 'pdf'])
    parser.add_argument(
        '-c', '--cache-folder',
        help='Store cache on disk instead of memory. The ``folder`` is '
@ -198,7 +200,7 @@ def main(argv=None, stdout=None, stdin=None):
        if arg == 'none':
            optimize_size.clear()
        elif arg == 'all':
-            optimize_size |= {'images', 'fonts'}
+            optimize_size |= {'images', 'fonts', 'pdf'}
        else:
            optimize_size.add(arg)

--- a/weasyprint/document.py
+++ b/weasyprint/document.py
@ -296,8 +296,8 @@ class Document:
        # rendering is destroyed. This is needed as font_config.__del__ removes
        # fonts that may be used when rendering
        self.font_config = font_config
-        # Set of flags for PDF size optimization. Can contain "images" and
-        # "fonts".
+        # Set of flags for PDF size optimization. Can contain "images", "fonts"
+        # and "pdf".
        self._optimize_size = optimize_size

    def build_element_structure(self, structure, etree_element=None):
@ -414,13 +414,15 @@ class Document:
        if finisher:
            finisher(self, pdf)

+        compress = 'pdf' in self._optimize_size
+
        if target is None:
            output = io.BytesIO()
-            pdf.write(output, version=pdf.version, identifier=identifier)
+            pdf.write(output, pdf.version, identifier, compress)
            return output.getvalue()

        if hasattr(target, 'write'):
-            pdf.write(target, version=pdf.version, identifier=identifier)
+            pdf.write(target, pdf.version, identifier, compress)
        else:
            with open(target, 'wb') as fd:
-                pdf.write(fd, version=pdf.version, identifier=identifier)
+                pdf.write(fd, pdf.version, identifier, compress)
--- a/weasyprint/pdf/init.py
+++ b/weasyprint/pdf/init.py
@ -153,9 +153,10 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
        page_rectangle = (
            left / scale, top / scale,
            (right - left) / scale, (bottom - top) / scale)
+        compress = 'pdf' in optimize_size
        stream = Stream(
            document.fonts, page_rectangle, states, x_objects, patterns,
-            shadings, images, mark)
+            shadings, images, mark, compress=compress)
        stream.transform(d=-1, f=(page.height * scale))
        pdf.add_object(stream)
        page_streams.append(stream)
@ -175,10 +176,11 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,

        add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, mark)
        add_annotations(
-            links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files)
+            links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files,
+            compress)
        add_inputs(
            page.inputs, matrix, pdf, pdf_page, resources, stream,
-            document.font_config.font_map)
+            document.font_config.font_map, compress)
        page.paint(stream, scale=scale)

        # Bleed
@ -281,6 +283,7 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,

    # Apply PDF variants functions
    if variant:
-        variant_function(pdf, metadata, document, page_streams)
+        compress = 'pdf' in optimize_size
+        variant_function(pdf, metadata, document, page_streams, compress)

    return pdf
--- a/weasyprint/pdf/anchors.py
+++ b/weasyprint/pdf/anchors.py
@ -91,7 +91,8 @@ def add_outlines(pdf, bookmarks, parent=None):
    return outlines, count


-def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
+def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map,
+               compress):
    """Include form inputs in PDF."""
    if not inputs:
        return
@ -118,7 +119,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
        input_name = pydyf.String(element.attrib.get('name', default_name))
        # TODO: where does this 0.75 scale come from?
        font_size = style['font_size'] * 0.75
-        field_stream = pydyf.Stream()
+        field_stream = pydyf.Stream(compress=compress)
        field_stream.set_color_rgb(*style['color'][:3])
        if input_type == 'checkbox':
            # Checkboxes
@ -129,7 +130,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
                'Type': '/XObject',
                'Subtype': '/Form',
                'BBox': pydyf.Array((0, 0, width, height)),
-            })
+            }, compress=compress)
            checked_stream.push_state()
            checked_stream.begin_text()
            checked_stream.set_color_rgb(*style['color'][:3])
@ -194,7 +195,7 @@ def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
        pdf.catalog['AcroForm']['Fields'].append(field.reference)


-def add_annotations(links, matrix, document, pdf, page, annot_files):
+def add_annotations(links, matrix, document, pdf, page, annot_files, compress):
    """Include annotations in PDF."""
    # TODO: splitting a link into multiple independent rectangular
    # annotations works well for pure links, but rather mediocre for
@ -225,8 +226,7 @@ def add_annotations(links, matrix, document, pdf, page, annot_files):
            'Type': '/XObject',
            'Subtype': '/Form',
            'BBox': pydyf.Array(rectangle),
-            'Length': 0,
-        })
+        }, compress)
        pdf.add_object(stream)
        annot = pydyf.Dictionary({
            'Type': '/Annot',
@ -278,7 +278,7 @@ def write_pdf_attachment(pdf, attachment, url_fetcher):
                    'Size': uncompressed_length,
                })
            })
-            file_stream = pydyf.Stream([stream], file_extra)
+            file_stream = pydyf.Stream([stream], file_extra, compress)
            pdf.add_object(file_stream)

    except URLFetchingError as exception:
--- a/weasyprint/pdf/fonts.py
+++ b/weasyprint/pdf/fonts.py
@ -8,6 +8,7 @@ from ..logger import LOGGER


 def build_fonts_dictionary(pdf, fonts, optimize_size):
+    compress = 'pdf' in optimize_size
    pdf_fonts = pydyf.Dictionary()
    fonts_by_file_hash = {}
    for font in fonts.values():
@ -32,7 +33,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size):
        else:
            font_extra = pydyf.Dictionary({'Length1': len(font.file_content)})
        font_stream = pydyf.Stream(
-            [font.file_content], font_extra, compress=True)
+            [font.file_content], font_extra, compress=compress)
        pdf.add_object(font_stream)
        font_references_by_file_hash[file_hash] = font_stream.reference

@ -80,7 +81,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size):
            b'1 begincodespacerange',
            b'<0000> <ffff>',
            b'endcodespacerange',
-            f'{len(cmap)} beginbfchar'.encode()])
+            f'{len(cmap)} beginbfchar'.encode()], compress=compress)
        for glyph, text in cmap.items():
            unicode_codepoints = ''.join(
                f'{letter.encode("utf-16-be").hex()}' for letter in text)
@ -125,7 +126,8 @@ def build_fonts_dictionary(pdf, fonts, optimize_size):
                for cid in cids:
                    bits[cid] = '1'
                stream = pydyf.Stream(
-                    (int(''.join(bits), 2).to_bytes(padded_width, 'big'),))
+                    (int(''.join(bits), 2).to_bytes(padded_width, 'big'),),
+                    compress=compress)
                pdf.add_object(stream)
                font_descriptor['CIDSet'] = stream.reference
            if font.type == 'otf':
@ -156,6 +158,7 @@ def build_fonts_dictionary(pdf, fonts, optimize_size):

 def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
                                  optimize_size):
+    compress = 'pdf' in optimize_size
    # https://docs.microsoft.com/typography/opentype/spec/ebdt
    font_dictionary['FontBBox'] = pydyf.Array([0, 0, 1, 1])
    font_dictionary['FontMatrix'] = pydyf.Array([1, 0, 0, 1, 0, 0])
@ -308,7 +311,7 @@ def _build_bitmap_font_dictionary(font_dictionary, pdf, font, widths,
            b'/BPC 1',
            b'/D [1 0]',
            b'ID', bitmap, b'EI'
-        ])
+        ], compress=compress)
        pdf.add_object(bitmap_stream)
        char_procs[glyph_id] = bitmap_stream.reference

--- a/weasyprint/pdf/metadata.py
+++ b/weasyprint/pdf/metadata.py
@ -20,7 +20,7 @@ for key, value in NS.items():
    register_namespace(key, value)


-def add_metadata(pdf, metadata, variant, version, conformance):
+def add_metadata(pdf, metadata, variant, version, conformance, compress):
    """Add PDF stream of metadata.

    Described in ISO-32000-1:2008, 14.3.2.
@ -88,6 +88,6 @@ def add_metadata(pdf, metadata, variant, version, conformance):
    footer = b'<?xpacket end="r"?>'
    stream_content = b'\n'.join((header, xml, footer))
    extra = {'Type': '/Metadata', 'Subtype': '/XML'}
-    metadata = pydyf.Stream([stream_content], extra=extra)
+    metadata = pydyf.Stream([stream_content], extra, compress)
    pdf.add_object(metadata)
    pdf.catalog['Metadata'] = metadata.reference
--- a/weasyprint/pdf/pdfa.py
+++ b/weasyprint/pdf/pdfa.py
@ -18,7 +18,7 @@ from ..logger import LOGGER
 from .metadata import add_metadata


-def pdfa(pdf, metadata, document, page_streams, version):
+def pdfa(pdf, metadata, document, page_streams, compress, version):
    """Set metadata for PDF/A documents."""
    LOGGER.warning(
        'PDF/A support is experimental, '
@ -29,7 +29,7 @@ def pdfa(pdf, metadata, document, page_streams, version):
    profile = pydyf.Stream(
        [read_binary(__package__, 'sRGB2014.icc')],
        pydyf.Dictionary({'N': 3, 'Alternate': '/DeviceRGB'}),
-        compress=True)
+        compress=compress)
    pdf.add_object(profile)
    pdf.catalog['OutputIntents'] = pydyf.Array([
        pydyf.Dictionary({
@ -46,7 +46,7 @@ def pdfa(pdf, metadata, document, page_streams, version):
            pdf_object['F'] = 2 ** (3 - 1)

    # Common PDF metadata stream
-    add_metadata(pdf, metadata, 'a', version, 'B')
+    add_metadata(pdf, metadata, 'a', version, 'B', compress)


 VARIANTS = {
--- a/weasyprint/pdf/pdfua.py
+++ b/weasyprint/pdf/pdfua.py
@ -6,7 +6,7 @@ from ..logger import LOGGER
 from .metadata import add_metadata


-def pdfua(pdf, metadata, document, page_streams):
+def pdfua(pdf, metadata, document, page_streams, compress):
    """Set metadata for PDF/UA documents."""
    LOGGER.warning(
        'PDF/UA support is experimental, '
@ -117,7 +117,7 @@ def pdfua(pdf, metadata, document, page_streams):
        annotation['F'] = 2 ** (2 - 1)

    # Common PDF metadata stream
-    add_metadata(pdf, metadata, 'ua', version=1, conformance=None)
+    add_metadata(pdf, metadata, 'ua', 1, conformance=None, compress=compress)

    # PDF document extra metadata
    if 'Lang' not in pdf.catalog:
--- a/weasyprint/pdf/stream.py
+++ b/weasyprint/pdf/stream.py
@ -195,7 +195,6 @@ class Stream(pydyf.Stream):
    def __init__(self, fonts, page_rectangle, states, x_objects, patterns,
                 shadings, images, mark, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self.compress = True
        self.page_rectangle = page_rectangle
        self.marked = []
        self._fonts = fonts
@ -356,7 +355,8 @@ class Stream(pydyf.Stream):
        })
        group = Stream(
            self._fonts, self.page_rectangle, states, x_objects, patterns,
-            shadings, self._images, self._mark, extra=extra)
+            shadings, self._images, self._mark, extra=extra,
+            compress=self.compress)
        group.id = f'x{len(self._x_objects)}'
        self._x_objects[group.id] = group
        return group
@ -407,7 +407,8 @@ class Stream(pydyf.Stream):
        })
        pattern = Stream(
            self._fonts, self.page_rectangle, states, x_objects, patterns,
-            shadings, self._images, self._mark, extra=extra)
+            shadings, self._images, self._mark, extra=extra,
+            compress=self.compress)
        pattern.id = f'p{len(self._patterns)}'
        self._patterns[pattern.id] = pattern
        return pattern