From f243dbcca35812aa7e1627a2bf0b22a28eb94d0f Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 20 Dec 2012 20:10:48 +0100 Subject: [PATCH] Embed JPEG-encoded images in PDF. Fix #6 If an image is in JPEG format, embed it as-is in the PDF output. This often results in smaller PDF file size. (The image is still decoded however, so there is no rendering speed improvement.) --- weasyprint/document.py | 2 ++ weasyprint/images.py | 50 ++++++++++++++++++--------- weasyprint/tests/test_draw.py | 2 +- weasyprint/tests/test_pdf.py | 14 +++++++- weasyprint/tests/w3_test_suite/run.py | 2 +- 5 files changed, 50 insertions(+), 20 deletions(-) diff --git a/weasyprint/document.py b/weasyprint/document.py index b1502474..ca2aa02c 100644 --- a/weasyprint/document.py +++ b/weasyprint/document.py @@ -97,6 +97,7 @@ class _TaggedTuple(tuple): """ + def _get_metadata(box, bookmarks, links, anchors, matrix): bookmark_label = box.bookmark_label bookmark_level = box.bookmark_level @@ -128,6 +129,7 @@ def _get_metadata(box, bookmarks, links, anchors, matrix): if has_anchor: anchors[anchor_name] = pos_x, pos_y + def _prepare(box, bookmarks, links, anchors, matrix): transform = _get_matrix(box) if transform: diff --git a/weasyprint/images.py b/weasyprint/images.py index 4e4df2e2..1dea5273 100644 --- a/weasyprint/images.py +++ b/weasyprint/images.py @@ -27,7 +27,7 @@ from .text import USING_INTROSPECTION # Do not try to import PyGObject 3 if we already have PyGTK # that tends to segfault. 
if not USING_INTROSPECTION: - # Use PyGObject introspection + # Use PyGTK try: from gtk import gdk from gtk.gdk import PixbufLoader @@ -38,7 +38,7 @@ if not USING_INTROSPECTION: else: def gdkpixbuf_loader(file_obj, string): """Load raster images with gdk-pixbuf through PyGTK.""" - pixbuf = get_pixbuf(file_obj, string) + pixbuf, jpeg_data = get_pixbuf(file_obj, string) dummy_context = cairo.Context(cairo.ImageSurface( cairo.FORMAT_ARGB32, 1, 1)) gdk.CairoContext(dummy_context).set_source_pixbuf(pixbuf, 0, 0) @@ -48,7 +48,14 @@ if not USING_INTROSPECTION: # It is therefore not thread-safe and state must be reset # before any use. get_pattern = dummy_context.get_source + if cairo.version_info >= (1, 10, 0): + add_jpeg_data(get_pattern().get_surface(), jpeg_data) return get_pattern, pixbuf.get_width(), pixbuf.get_height() + + def pixbuf_format(loader): + format_ = loader.get_format() + if format_: + return format_['name'] else: # Use PyGObject introspection try: @@ -58,6 +65,11 @@ else: def gdkpixbuf_loader(file_obj, string, pixbuf_error=exception): raise pixbuf_error else: + def pixbuf_format(loader): + format_ = loader.get_format() + if format_: + return format_.get_name() + PIXBUF_VERSION = (GdkPixbuf.PIXBUF_MAJOR, GdkPixbuf.PIXBUF_MINOR, GdkPixbuf.PIXBUF_MICRO) @@ -75,7 +87,7 @@ else: and Gdk. """ - pixbuf = get_pixbuf(file_obj, string) + pixbuf, jpeg_data = get_pixbuf(file_obj, string) dummy_context = cairo.Context(cairo.ImageSurface( cairo.FORMAT_ARGB32, 1, 1)) Gdk.cairo_set_source_pixbuf(dummy_context, pixbuf, 0, 0) @@ -85,6 +97,8 @@ else: # It is therefore not thread-safe and state must be reset # before any use. get_pattern = dummy_context.get_source + if cairo.version_info >= (1, 10, 0): + add_jpeg_data(get_pattern().get_surface(), jpeg_data) return get_pattern, pixbuf.get_width(), pixbuf.get_height() except ImportError: @@ -94,38 +108,40 @@ else: without Gdk and going through PNG. 
""" - pixbuf = get_pixbuf(file_obj, string) + pixbuf, jpeg_data = get_pixbuf(file_obj, string) _, png = pixbuf.save_to_bufferv('png', ['compression'], ['0']) - return cairo_png_loader(None, png) + return cairo_png_loader(None, png, jpeg_data) def get_pixbuf(file_obj=None, string=None, chunck_size=16 * 1024): """Create a Pixbuf object.""" + if file_obj: + string = file_obj.read() + if not string: + raise ValueError('Could not load image: empty content') loader = PixbufLoader() try: - if file_obj: - while 1: - chunck = file_obj.read(chunck_size) - if not chunck: - break - loader.write(chunck) - elif string: - loader.write(string) - else: - raise ValueError('Could not load image: empty content') + loader.write(string) finally: # Pixbuf is really unhappy if we don’t do this: loader.close() - return loader.get_pixbuf() + jpeg_data = string if pixbuf_format(loader) == 'jpeg' else None + return loader.get_pixbuf(), jpeg_data -def cairo_png_loader(file_obj, string): +def cairo_png_loader(file_obj, string, jpeg_data=None): """Return a cairo Surface from a PNG byte stream.""" surface = cairo.ImageSurface.create_from_png(file_obj or BytesIO(string)) + add_jpeg_data(surface, jpeg_data) get_pattern = lambda: cairo.SurfacePattern(surface) return get_pattern, surface.get_width(), surface.get_height() +def add_jpeg_data(surface, jpeg_data): + if jpeg_data and hasattr(surface, 'set_mime_data'): + surface.set_mime_data('image/jpeg', jpeg_data) + + def cairosvg_loader(file_obj, string, uri): """Return a cairo Surface from a SVG byte stream. 
diff --git a/weasyprint/tests/test_draw.py b/weasyprint/tests/test_draw.py index a51f9e5e..61114f4a 100644 --- a/weasyprint/tests/test_draw.py +++ b/weasyprint/tests/test_draw.py @@ -151,7 +151,7 @@ def document_to_pixels(document, name, expected_width, expected_height): def png_to_pixels(png_bytes, width, height): - pixbuf = get_pixbuf(string=png_bytes) + pixbuf, _ = get_pixbuf(string=png_bytes) assert (pixbuf.get_width(), pixbuf.get_height()) == (width, height) if not pixbuf.get_has_alpha(): pixbuf = pixbuf.add_alpha(False, 0, 0, 0) # no substitute color diff --git a/weasyprint/tests/test_pdf.py b/weasyprint/tests/test_pdf.py index bd69488d..e8fa0b9c 100644 --- a/weasyprint/tests/test_pdf.py +++ b/weasyprint/tests/test_pdf.py @@ -15,6 +15,7 @@ from __future__ import division, unicode_literals import io import cairo +import pytest from .. import CSS from .. import pdf @@ -203,7 +204,6 @@ def test_bookmarks(): label='a', target=(0, 75, 1425))] - @assert_no_logs def test_links(): links = get_links('') @@ -294,3 +294,15 @@ def test_missing_links(): assert links == [[('internal', (0, 50, 935), (50, 950, 450, 935))]] assert len(logs) == 1 assert 'WARNING: No anchor #missing for internal URI reference' in logs[0] + + +@assert_no_logs +def test_jpeg(): + if not hasattr(cairo.ImageSurface, 'set_mime_data'): + pytest.xfail() + def render(html): + return TestHTML(base_url=resource_filename('dummy.html'), + string=html).write_pdf() + assert b'/Filter /DCTDecode' not in render('') + # JPEG-encoded image, embedded in PDF: + assert b'/Filter /DCTDecode' in render('') diff --git a/weasyprint/tests/w3_test_suite/run.py b/weasyprint/tests/w3_test_suite/run.py index af06c67d..43a58cdb 100644 --- a/weasyprint/tests/w3_test_suite/run.py +++ b/weasyprint/tests/w3_test_suite/run.py @@ -84,7 +84,7 @@ def make_test_suite(): HTML(BASE_URL + name).write_png( png_filename, stylesheets=[PAGE_SIZE_STYLESHEET]) with open(png_filename, 'rb') as image: - raw = 
get_pixbuf(file_obj=image).get_pixels() + raw = get_pixbuf(file_obj=image)[0].get_pixels() rendered[name] = raw return raw