Clean lazy image loading

2024-09-11 20:47:56 +03:00 · 2023-03-05 14:40:26 +01:00 · 2023-03-05 14:40:26 +01:00 · 07e43dc4c2
commit 07e43dc4c2
parent a2231ba147
5 changed files with 187 additions and 267 deletions
--- a/weasyprint/init.py
+++ b/weasyprint/init.py
@ -133,9 +133,7 @@ class HTML:
        :param bool presentational_hints:
            Whether HTML presentational hints are followed.
        :param tuple optimize_size:
-            Optimize size of generated PDF. Can contain: "fonts";
-            "images" applies `optimize=True` parameter to image compression;
-            "not_jpegs" tries to prevent keep jpeg data same as in file.
+            Optimize size of generated PDF. Can contain "images" and "fonts".
        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
@ -183,9 +181,7 @@ class HTML:
        :param bool presentational_hints: Whether HTML presentational hints are
            followed.
        :param tuple optimize_size:
-            Optimize size of generated PDF. Can contain: "fonts";
-            "images" applies `optimize=True` parameter to image compression;
-            "not_jpegs" tries to prevent keep jpeg data same as in file.
+            Optimize size of generated PDF. Can contain "images" and "fonts".
        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
--- a/weasyprint/document.py
+++ b/weasyprint/document.py
@ -2,10 +2,6 @@

 import functools
 import io
-import os
-import shutil
-from pathlib import Path
-from tempfile import NamedTemporaryFile

 from . import CSS
 from .anchors import gather_anchors, make_page_bookmark_tree
@ -367,26 +363,13 @@ class Document:
        if finisher:
            finisher(self, pdf)

+        if target is None:
+            output = io.BytesIO()
+            pdf.write(output, version=pdf.version, identifier=identifier)
+            return output.getvalue()
+
        if hasattr(target, 'write'):
            pdf.write(target, version=pdf.version, identifier=identifier)
-            return
-
-        if target is None:
-            # TODO: Should we make None value for target parameter deprecated in write_pdf()?
-            # Returning bytes.
-            # You should avoid target=None value if you may run out of RAM.
-            # Consumes a double amount of memory. It creates document in BinaryIO and returns bytes copy from it.
-            # Just for a moment two copies of PDF document will be in memory.
-            # Also pydyf.PDF object is in a memory.
-            bytes_io = io.BytesIO()
-            pdf.write(bytes_io, version=pdf.version, identifier=identifier)
-            return bytes_io.getvalue()
-
-        temp_file = NamedTemporaryFile(buffering=8388608, dir=Path(target).parent, delete=False, suffix=".pdf~")
-        try:
-            pdf.write(temp_file, version=pdf.version, identifier=identifier)
-            temp_file.close()
-            shutil.move(temp_file.name, target)
-        finally:
-            if os.path.exists(temp_file.name):
-                os.remove(temp_file.name)
+        else:
+            with open(target, 'wb') as fd:
+                pdf.write(fd, version=pdf.version, identifier=identifier)
--- a/weasyprint/images.py
+++ b/weasyprint/images.py
@ -2,21 +2,18 @@

 import io
 import math
+import struct
 from hashlib import md5
 from io import BytesIO
 from itertools import cycle
 from math import inf
-from pathlib import Path
-from typing import Optional, Union, Collection
-from urllib.parse import unquote
 from xml.etree import ElementTree

-from PIL import Image, ImageFile
-
 import pydyf
+from PIL import Image, ImageFile, ImageOps
+
 from .layout.percent import percentage
 from .logger import LOGGER
-from .rotate_fn import rotate_pillow_image
 from .svg import SVG
 from .urls import URLFetchingError, fetch

@ -38,54 +35,137 @@ class ImageLoadingError(ValueError):
        return cls(f'{name}: {value}' if value else name)


-
-RAM_SAVE = True
-"""
-This is temporary constant that defines RAM Saving when work with images.
-When it is True, local images will not be kept im memory.
-PROS: Lowers memory consumption.
-CONS: Had to read images twice from disk. Might affect speed a little.
-"""
-# TODO: Should RAM_SAVE be introduced as a parameter into render() method?
-#       It was my temp quick-fix ugly solution to put it as module constant.
-
-
 class RasterImage:
-    def __init__(self, pillow_image, image_id, optimize_size,
-                 url: Optional[str] = None, orientation: Optional[str] = None):
+    def __init__(self, pillow_image, image_id, optimize_size, cache_path=None):
+        self.id = image_id
+        self._cache_path = cache_path

-        pillow_image.id = image_id
+        if 'transparency' in pillow_image.info:
+            pillow_image = pillow_image.convert('RGBA')
+        elif pillow_image.mode in ('1', 'P', 'I'):
+            pillow_image = pillow_image.convert('RGB')

-        if RAM_SAVE and url and url.startswith("file://"):
-            quoted_path = url[7:]
-            path = unquote(quoted_path)
-            self._pillow_image = DelayedPillowImage(pillow_image, path, orientation, optimize_size)
+        self.width = pillow_image.width
+        self.height = pillow_image.height
+        self.ratio = (self.width / self.height) if self.height != 0 else inf
+
+        if pillow_image.mode in ('RGB', 'RGBA'):
+            color_space = '/DeviceRGB'
+        elif pillow_image.mode in ('L', 'LA'):
+            color_space = '/DeviceGray'
+        elif pillow_image.mode == 'CMYK':
+            color_space = '/DeviceCMYK'
        else:
-            self._pillow_image = pillow_image
+            LOGGER.warning('Unknown image mode: %s', pillow_image.mode)
+            color_space = '/DeviceRGB'

-        self._optimize_size = optimize_size
-        self._intrinsic_width = pillow_image.width
-        self._intrinsic_height = pillow_image.height
-        self._intrinsic_ratio = (
-            self._intrinsic_width / self._intrinsic_height
-            if self._intrinsic_height != 0 else inf)
+        self.extra = pydyf.Dictionary({
+            'Type': '/XObject',
+            'Subtype': '/Image',
+            'Width': self.width,
+            'Height': self.height,
+            'ColorSpace': color_space,
+            'BitsPerComponent': 8,
+        })
+        optimize = 'images' in optimize_size
+        if pillow_image.format in ('JPEG', 'MPO'):
+            self.extra['Filter'] = '/DCTDecode'
+            image_file = io.BytesIO()
+            pillow_image.save(image_file, format='JPEG', optimize=optimize)
+            self.stream = self.get_stream(image_file.getvalue())
+        else:
+            self.extra['Filter'] = '/FlateDecode'
+            self.extra['DecodeParms'] = pydyf.Dictionary({
+                # Predictor 15 specifies that we're providing PNG data,
+                # ostensibly using an "optimum predictor", but doesn't actually
+                # matter as long as the predictor value is 10+ according to the
+                # spec. (Other PNG predictor values assert that we're using
+                # specific predictors that we don't want to commit to, but
+                # "optimum" can vary.)
+                'Predictor': 15,
+                'Columns': self.width,
+            })
+            if pillow_image.mode in ('RGB', 'RGBA'):
+                # Defaults to 1.
+                self.extra['DecodeParms']['Colors'] = 3
+            if pillow_image.mode in ('RGBA', 'LA'):
+                alpha = pillow_image.getchannel('A')
+                pillow_image = pillow_image.convert(pillow_image.mode[:-1])
+                alpha_data = self._get_png_data(alpha, optimize)
+                stream = self.get_stream(alpha_data)
+                self.extra['SMask'] = pydyf.Stream(stream, extra={
+                    'Filter': '/FlateDecode',
+                    'Type': '/XObject',
+                    'Subtype': '/Image',
+                    'DecodeParms': pydyf.Dictionary({
+                        'Predictor': 15,
+                        'Columns': pillow_image.width,
+                    }),
+                    'Width': pillow_image.width,
+                    'Height': pillow_image.height,
+                    'ColorSpace': '/DeviceGray',
+                    'BitsPerComponent': 8,
+                })

-    def get_intrinsic_size(self, image_resolution, font_size):
-        return (
-            self._intrinsic_width / image_resolution,
-            self._intrinsic_height / image_resolution,
-            self._intrinsic_ratio)
+            png_data = self._get_png_data(pillow_image, optimize)
+            self.stream = self.get_stream(png_data)
+
+    def get_intrinsic_size(self, resolution, font_size):
+        return self.width / resolution, self.height / resolution, self.ratio

    def draw(self, stream, concrete_width, concrete_height, image_rendering):
-        if self._intrinsic_width <= 0 or self._intrinsic_height <= 0:
+        if self.width <= 0 or self.height <= 0:
            return

-        image_name = stream.add_image(
-            self._pillow_image, image_rendering, self._optimize_size)
+        image_name = stream.add_image(self, image_rendering)
        stream.transform(
            concrete_width, 0, 0, -concrete_height, 0, concrete_height)
        stream.draw_x_object(image_name)

+    @staticmethod
+    def _get_png_data(pillow_image, optimize):
+        image_file = io.BytesIO()
+        pillow_image.save(image_file, format='PNG', optimize=optimize)
+
+        # Read the PNG header, then discard it because we know it's a PNG. If
+        # this weren't just output from Pillow, we should actually check it.
+        image_file.seek(8)
+
+        png_data = []
+        raw_chunk_length = image_file.read(4)
+        # PNG files consist of a series of chunks.
+        while raw_chunk_length:
+            # Each chunk begins with its data length (four bytes, may be zero),
+            # then its type (four ASCII characters), then the data, then four
+            # bytes of a CRC.
+            chunk_len, = struct.unpack('!I', raw_chunk_length)
+            chunk_type = image_file.read(4)
+            if chunk_type == b'IDAT':
+                png_data.append(image_file.read(chunk_len))
+            else:
+                image_file.seek(chunk_len, io.SEEK_CUR)
+            # We aren't checking the CRC, we assume this is a valid PNG.
+            image_file.seek(4, io.SEEK_CUR)
+            raw_chunk_length = image_file.read(4)
+
+        return b''.join(png_data)
+
+    def get_stream(self, data, alpha=False):
+        if self._cache_path:
+            path = self._cache_path / f'{self.id}{int(alpha)}'
+            path.write_bytes(data)
+            return [LazyImage(path)]
+        else:
+            return [data]
+
+
+class LazyImage:
+    def __init__(self, path):
+        self._path = path
+
+    def __bytes__(self):
+        self._path.read_bytes()
+

 class SVGImage:
    def __init__(self, tree, base_url, url_fetcher, context):
@ -133,38 +213,36 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
                string = result['file_obj'].read()
            mime_type = forced_mime_type or result['mime_type']

-            image = None
-            svg_exceptions = []
-            # Try to rely on given mimetype for SVG
-            if mime_type == 'image/svg+xml':
+        image = None
+        svg_exceptions = []
+        # Try to rely on given mimetype for SVG
+        if mime_type == 'image/svg+xml':
+            try:
+                tree = ElementTree.fromstring(string)
+                image = SVGImage(tree, url, url_fetcher, context)
+            except Exception as svg_exception:
+                svg_exceptions.append(svg_exception)
+        # Try pillow for raster images, or for failing SVG
+        if image is None:
+            try:
+                pillow_image = Image.open(BytesIO(string))
+            except Exception as raster_exception:
+                if mime_type == 'image/svg+xml':
+                    # Tried SVGImage then Pillow for a SVG, abort
+                    raise ImageLoadingError.from_exception(svg_exceptions[0])
                try:
+                    # Last chance, try SVG
                    tree = ElementTree.fromstring(string)
                    image = SVGImage(tree, url, url_fetcher, context)
-                except Exception as svg_exception:
-                    svg_exceptions.append(svg_exception)
-            # Try pillow for raster images, or for failing SVG
-            if image is None:
-                try:
-                    pillow_image = Image.open(BytesIO(string))
-                except Exception as raster_exception:
-                    if mime_type == 'image/svg+xml':
-                        # Tried SVGImage then Pillow for a SVG, abort
-                        raise ImageLoadingError.from_exception(
-                            svg_exceptions[0])
-                    try:
-                        # Last chance, try SVG
-                        tree = ElementTree.fromstring(string)
-                        image = SVGImage(tree, url, url_fetcher, context)
-                    except Exception:
-                        # Tried Pillow then SVGImage for a raster, abort
-                        raise ImageLoadingError.from_exception(
-                            raster_exception)
-                else:
-                    # Store image id to enable cache in Stream.add_image
-                    image_id = md5(url.encode()).hexdigest()
-                    # Keep image format as it is discarded by transposition
-                    pillow_image = rotate_pillow_image(pillow_image, orientation)
-                    image = RasterImage(pillow_image, image_id, optimize_size, url=url, orientation=orientation)
+                except Exception:
+                    # Tried Pillow then SVGImage for a raster, abort
+                    raise ImageLoadingError.from_exception(raster_exception)
+            else:
+                # Store image id to enable cache in Stream.add_image
+                image_id = md5(url.encode()).hexdigest()
+                # Keep image format as it is discarded by transposition
+                pillow_image = rotate_pillow_image(pillow_image, orientation)
+                image = RasterImage(pillow_image, image_id, optimize_size)

    except (URLFetchingError, ImageLoadingError) as exception:
        LOGGER.error('Failed to load image at %r: %s', url, exception)
@ -173,6 +251,28 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
    return image


+def rotate_pillow_image(pillow_image, orientation):
+    """Return a copy of a Pillow image with modified orientation.
+
+    If orientation is not changed, return the same image.
+
+    """
+    image_format = pillow_image.format
+    if orientation == 'from-image':
+        if 'exif' in pillow_image.info:
+            pillow_image = ImageOps.exif_transpose(pillow_image)
+    elif orientation != 'none':
+        angle, flip = orientation
+        if angle > 0:
+            rotation = getattr(Image.Transpose, f'ROTATE_{angle}')
+            pillow_image = pillow_image.transpose(rotation)
+        if flip:
+            pillow_image = pillow_image.transpose(
+                Image.Transpose.FLIP_LEFT_RIGHT)
+    pillow_image.format = image_format
+    return pillow_image
+
+
 def process_color_stops(vector_length, positions):
    """Give color stops positions on the gradient vector.

@ -678,48 +778,3 @@ class RadialGradient(Gradient):
            size_x = 1e7
            size_y = 1e-7
        return size_x, size_y
-
-
-class DelayedPillowImage(pydyf.Object):
-    def __init__(self, pillow_image,
-                 path: Union[str, Path], orientation, optimize_size: Collection[str]):
-        """
-        Memory efficient replacer of PIL Image.
-        Does not keep image in memory. Retreives image bytes on demand.
-        """
-
-        # Those are paramerers to recreate image bytes from file
-        self.path = path
-        self.orientation = orientation
-        self.optimize_size = optimize_size
-
-        # These parameters of original Image object that used somewhere else.
-        self.id = pillow_image.id
-        self.info = pillow_image.info
-        self.mode = pillow_image.mode
-        self.width = pillow_image.width
-        self.height = pillow_image.height
-        self.format = pillow_image.format
-
-    def __repr__(self):
-        return f"<Picture {self.path}>"
-    
-    @property
-    def data(self) -> bytes:
-
-        original_pillow_image = Image.open(self.path)
-        rotated_pillow_image = rotate_pillow_image(original_pillow_image, self.orientation)
-
-
-        if rotated_pillow_image is original_pillow_image:
-            if original_pillow_image.format == 'JPEG' and ('not_jpegs' in self.optimize_size):
-                return Path(self.path).read_bytes()
-
-        optimize = 'images' in self.optimize_size
-        return get_jpeg_bytes(rotated_pillow_image, optimize)
-
-
-def get_jpeg_bytes(pillow_image, optimize: bool):
-    image_file = io.BytesIO()
-    pillow_image.save(image_file, format='JPEG', optimize=optimize)
-    return image_file.getvalue()
--- a/weasyprint/pdf/stream.py
+++ b/weasyprint/pdf/stream.py
@ -1,7 +1,7 @@
 """PDF stream."""

 import io
-import struct
+from copy import deepcopy
 from functools import lru_cache
 from hashlib import md5

@ -10,7 +10,6 @@ from fontTools import subset
 from fontTools.ttLib import TTFont, TTLibError, ttFont
 from fontTools.varLib.mutator import instantiateVariableFont

-from ..images import DelayedPillowImage, get_jpeg_bytes
 from ..logger import LOGGER
 from ..matrix import Matrix
 from ..text.ffi import ffi, harfbuzz, pango, units_to_double
@ -363,109 +362,20 @@ class Stream(pydyf.Stream):
        self._x_objects[group.id] = group
        return group

-    def _get_png_data(self, pillow_image, optimize):
-        image_file = io.BytesIO()
-        pillow_image.save(image_file, format='PNG', optimize=optimize)
-
-        # Read the PNG header, then discard it because we know it's a PNG. If
-        # this weren't just output from Pillow, we should actually check it.
-        image_file.seek(8)
-
-        png_data = b''
-        raw_chunk_length = image_file.read(4)
-        # PNG files consist of a series of chunks.
-        while len(raw_chunk_length) > 0:
-            # Each chunk begins with its data length (four bytes, may be zero),
-            # then its type (four ASCII characters), then the data, then four
-            # bytes of a CRC.
-            chunk_len, = struct.unpack('!I', raw_chunk_length)
-            chunk_type = image_file.read(4)
-            if chunk_type == b'IDAT':
-                png_data += image_file.read(chunk_len)
-            else:
-                image_file.seek(chunk_len, io.SEEK_CUR)
-            # We aren't checking the CRC, we assume this is a valid PNG.
-            image_file.seek(4, io.SEEK_CUR)
-            raw_chunk_length = image_file.read(4)
-
-        return png_data
-
-    def add_image(self, pillow_image, image_rendering, optimize_size):
-        image_name = f'i{pillow_image.id}'
+    def add_image(self, image, image_rendering):
+        image_name = f'i{image.id}{image_rendering}'
        self._x_objects[image_name] = None  # Set by write_pdf
        if image_name in self._images:
            # Reuse image already stored in document
            return image_name

-        if 'transparency' in pillow_image.info:
-            pillow_image = pillow_image.convert('RGBA')
-        elif pillow_image.mode in ('1', 'P', 'I'):
-            pillow_image = pillow_image.convert('RGB')
-
-        if pillow_image.mode in ('RGB', 'RGBA'):
-            color_space = '/DeviceRGB'
-        elif pillow_image.mode in ('L', 'LA'):
-            color_space = '/DeviceGray'
-        elif pillow_image.mode == 'CMYK':
-            color_space = '/DeviceCMYK'
-        else:
-            LOGGER.warning('Unknown image mode: %s', pillow_image.mode)
-            color_space = '/DeviceRGB'
-
        interpolate = 'true' if image_rendering == 'auto' else 'false'
-        extra = pydyf.Dictionary({
-            'Type': '/XObject',
-            'Subtype': '/Image',
-            'Width': pillow_image.width,
-            'Height': pillow_image.height,
-            'ColorSpace': color_space,
-            'BitsPerComponent': 8,
-            'Interpolate': interpolate,
-        })
+        extra = deepcopy(image.extra)
+        extra['Interpolate'] = interpolate
+        if 'SMask' in extra:
+            extra['SMask'].extra['Interpolate'] = interpolate

-        optimize = 'images' in optimize_size
-        if pillow_image.format in ('JPEG', 'MPO'):
-            extra['Filter'] = '/DCTDecode'
-            if isinstance(pillow_image, DelayedPillowImage):
-                stream = [pillow_image]
-            else:
-                stream = [get_jpeg_bytes(pillow_image, optimize)]
-        else:
-            extra['Filter'] = '/FlateDecode'
-            extra['DecodeParms'] = pydyf.Dictionary({
-                # Predictor 15 specifies that we're providing PNG data,
-                # ostensibly using an "optimum predictor", but doesn't actually
-                # matter as long as the predictor value is 10+ according to the
-                # spec. (Other PNG predictor values assert that we're using
-                # specific predictors that we don't want to commit to, but
-                # "optimum" can vary.)
-                'Predictor': 15,
-                'Columns': pillow_image.width,
-            })
-            if pillow_image.mode in ('RGB', 'RGBA'):
-                # Defaults to 1.
-                extra['DecodeParms']['Colors'] = 3
-            if pillow_image.mode in ('RGBA', 'LA'):
-                alpha = pillow_image.getchannel('A')
-                pillow_image = pillow_image.convert(pillow_image.mode[:-1])
-                alpha_data = self._get_png_data(alpha, optimize)
-                extra['SMask'] = pydyf.Stream([alpha_data], extra={
-                    'Filter': '/FlateDecode',
-                    'Type': '/XObject',
-                    'Subtype': '/Image',
-                    'DecodeParms': pydyf.Dictionary({
-                        'Predictor': 15,
-                        'Columns': pillow_image.width,
-                    }),
-                    'Width': pillow_image.width,
-                    'Height': pillow_image.height,
-                    'ColorSpace': '/DeviceGray',
-                    'BitsPerComponent': 8,
-                    'Interpolate': interpolate,
-                    })
-            stream = [self._get_png_data(pillow_image, optimize)]
-
-        xobject = pydyf.Stream(stream, extra=extra)
+        xobject = pydyf.Stream(image.stream, extra=extra)
        self._images[image_name] = xobject
        return image_name

--- a/weasyprint/rotate_fn.py
+++ b/weasyprint/rotate_fn.py
@ -1,24 +0,0 @@
-from PIL import ImageOps, Image
-
-
-def rotate_pillow_image(pillow_image: Image.Image, orientation) -> Image.Image:
-    """
-    Returns either absolute same image if orientation was not changed.
-    or its copy with modified orientation.
-    """
-    image_format = pillow_image.format
-    if orientation == 'from-image':
-        if 'exif' in pillow_image.info:
-            pillow_image = ImageOps.exif_transpose(
-                pillow_image)
-    elif orientation != 'none':
-        angle, flip = orientation
-        if angle > 0:
-            rotation = getattr(
-                Image.Transpose, f'ROTATE_{angle}')
-            pillow_image = pillow_image.transpose(rotation)
-        if flip:
-            pillow_image = pillow_image.transpose(
-                Image.Transpose.FLIP_LEFT_RIGHT)
-    pillow_image.format = image_format
-    return pillow_image