1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-09-11 20:47:56 +03:00

Clean lazy image loading

This commit is contained in:
Guillaume Ayoub 2023-03-05 14:40:26 +01:00
parent a2231ba147
commit 07e43dc4c2
5 changed files with 187 additions and 267 deletions

View File

@ -133,9 +133,7 @@ class HTML:
:param bool presentational_hints:
Whether HTML presentational hints are followed.
:param tuple optimize_size:
Optimize size of generated PDF. Can contain: "fonts";
"images" applies `optimize=True` parameter to image compression;
"not_jpegs" tries to keep JPEG data the same as in the original file.
Optimize size of generated PDF. Can contain "images" and "fonts".
:type font_config: :class:`text.fonts.FontConfiguration`
:param font_config: A font configuration handling ``@font-face`` rules.
:type counter_style: :class:`css.counters.CounterStyle`
@ -183,9 +181,7 @@ class HTML:
:param bool presentational_hints: Whether HTML presentational hints are
followed.
:param tuple optimize_size:
Optimize size of generated PDF. Can contain: "fonts";
"images" applies `optimize=True` parameter to image compression;
"not_jpegs" tries to keep JPEG data the same as in the original file.
Optimize size of generated PDF. Can contain "images" and "fonts".
:type font_config: :class:`text.fonts.FontConfiguration`
:param font_config: A font configuration handling ``@font-face`` rules.
:type counter_style: :class:`css.counters.CounterStyle`

View File

@ -2,10 +2,6 @@
import functools
import io
import os
import shutil
from pathlib import Path
from tempfile import NamedTemporaryFile
from . import CSS
from .anchors import gather_anchors, make_page_bookmark_tree
@ -367,26 +363,13 @@ class Document:
if finisher:
finisher(self, pdf)
if target is None:
output = io.BytesIO()
pdf.write(output, version=pdf.version, identifier=identifier)
return output.getvalue()
if hasattr(target, 'write'):
pdf.write(target, version=pdf.version, identifier=identifier)
return
if target is None:
# TODO: Should we make None value for target parameter deprecated in write_pdf()?
# Returning bytes.
# Avoid target=None if you may run out of RAM: it consumes twice the memory,
# because the document is written to a BytesIO and then copied into bytes.
# For a moment, two copies of the PDF document are held in memory,
# in addition to the pydyf.PDF object itself.
bytes_io = io.BytesIO()
pdf.write(bytes_io, version=pdf.version, identifier=identifier)
return bytes_io.getvalue()
temp_file = NamedTemporaryFile(buffering=8388608, dir=Path(target).parent, delete=False, suffix=".pdf~")
try:
pdf.write(temp_file, version=pdf.version, identifier=identifier)
temp_file.close()
shutil.move(temp_file.name, target)
finally:
if os.path.exists(temp_file.name):
os.remove(temp_file.name)
else:
with open(target, 'wb') as fd:
pdf.write(fd, version=pdf.version, identifier=identifier)

View File

@ -2,21 +2,18 @@
import io
import math
import struct
from hashlib import md5
from io import BytesIO
from itertools import cycle
from math import inf
from pathlib import Path
from typing import Optional, Union, Collection
from urllib.parse import unquote
from xml.etree import ElementTree
from PIL import Image, ImageFile
import pydyf
from PIL import Image, ImageFile, ImageOps
from .layout.percent import percentage
from .logger import LOGGER
from .rotate_fn import rotate_pillow_image
from .svg import SVG
from .urls import URLFetchingError, fetch
@ -38,54 +35,137 @@ class ImageLoadingError(ValueError):
return cls(f'{name}: {value}' if value else name)
RAM_SAVE = True
"""
This is a temporary constant that enables RAM saving when working with images.
When it is True, local images will not be kept in memory.
PROS: Lowers memory consumption.
CONS: Images have to be read twice from disk, which might slightly affect speed.
"""
# TODO: Should RAM_SAVE be introduced as a parameter into render() method?
# It was my temp quick-fix ugly solution to put it as module constant.
class RasterImage:
def __init__(self, pillow_image, image_id, optimize_size,
url: Optional[str] = None, orientation: Optional[str] = None):
def __init__(self, pillow_image, image_id, optimize_size, cache_path=None):
self.id = image_id
self._cache_path = cache_path
pillow_image.id = image_id
if 'transparency' in pillow_image.info:
pillow_image = pillow_image.convert('RGBA')
elif pillow_image.mode in ('1', 'P', 'I'):
pillow_image = pillow_image.convert('RGB')
if RAM_SAVE and url and url.startswith("file://"):
quoted_path = url[7:]
path = unquote(quoted_path)
self._pillow_image = DelayedPillowImage(pillow_image, path, orientation, optimize_size)
self.width = pillow_image.width
self.height = pillow_image.height
self.ratio = (self.width / self.height) if self.height != 0 else inf
if pillow_image.mode in ('RGB', 'RGBA'):
color_space = '/DeviceRGB'
elif pillow_image.mode in ('L', 'LA'):
color_space = '/DeviceGray'
elif pillow_image.mode == 'CMYK':
color_space = '/DeviceCMYK'
else:
self._pillow_image = pillow_image
LOGGER.warning('Unknown image mode: %s', pillow_image.mode)
color_space = '/DeviceRGB'
self._optimize_size = optimize_size
self._intrinsic_width = pillow_image.width
self._intrinsic_height = pillow_image.height
self._intrinsic_ratio = (
self._intrinsic_width / self._intrinsic_height
if self._intrinsic_height != 0 else inf)
self.extra = pydyf.Dictionary({
'Type': '/XObject',
'Subtype': '/Image',
'Width': self.width,
'Height': self.height,
'ColorSpace': color_space,
'BitsPerComponent': 8,
})
optimize = 'images' in optimize_size
if pillow_image.format in ('JPEG', 'MPO'):
self.extra['Filter'] = '/DCTDecode'
image_file = io.BytesIO()
pillow_image.save(image_file, format='JPEG', optimize=optimize)
self.stream = self.get_stream(image_file.getvalue())
else:
self.extra['Filter'] = '/FlateDecode'
self.extra['DecodeParms'] = pydyf.Dictionary({
# Predictor 15 specifies that we're providing PNG data,
# ostensibly using an "optimum predictor", but doesn't actually
# matter as long as the predictor value is 10+ according to the
# spec. (Other PNG predictor values assert that we're using
# specific predictors that we don't want to commit to, but
# "optimum" can vary.)
'Predictor': 15,
'Columns': self.width,
})
if pillow_image.mode in ('RGB', 'RGBA'):
# Defaults to 1.
self.extra['DecodeParms']['Colors'] = 3
if pillow_image.mode in ('RGBA', 'LA'):
alpha = pillow_image.getchannel('A')
pillow_image = pillow_image.convert(pillow_image.mode[:-1])
alpha_data = self._get_png_data(alpha, optimize)
stream = self.get_stream(alpha_data)
self.extra['SMask'] = pydyf.Stream(stream, extra={
'Filter': '/FlateDecode',
'Type': '/XObject',
'Subtype': '/Image',
'DecodeParms': pydyf.Dictionary({
'Predictor': 15,
'Columns': pillow_image.width,
}),
'Width': pillow_image.width,
'Height': pillow_image.height,
'ColorSpace': '/DeviceGray',
'BitsPerComponent': 8,
})
def get_intrinsic_size(self, image_resolution, font_size):
return (
self._intrinsic_width / image_resolution,
self._intrinsic_height / image_resolution,
self._intrinsic_ratio)
png_data = self._get_png_data(pillow_image, optimize)
self.stream = self.get_stream(png_data)
def get_intrinsic_size(self, resolution, font_size):
return self.width / resolution, self.height / resolution, self.ratio
def draw(self, stream, concrete_width, concrete_height, image_rendering):
if self._intrinsic_width <= 0 or self._intrinsic_height <= 0:
if self.width <= 0 or self.height <= 0:
return
image_name = stream.add_image(
self._pillow_image, image_rendering, self._optimize_size)
image_name = stream.add_image(self, image_rendering)
stream.transform(
concrete_width, 0, 0, -concrete_height, 0, concrete_height)
stream.draw_x_object(image_name)
@staticmethod
def _get_png_data(pillow_image, optimize):
image_file = io.BytesIO()
pillow_image.save(image_file, format='PNG', optimize=optimize)
# Read the PNG header, then discard it because we know it's a PNG. If
# this weren't just output from Pillow, we should actually check it.
image_file.seek(8)
png_data = []
raw_chunk_length = image_file.read(4)
# PNG files consist of a series of chunks.
while raw_chunk_length:
# Each chunk begins with its data length (four bytes, may be zero),
# then its type (four ASCII characters), then the data, then four
# bytes of a CRC.
chunk_len, = struct.unpack('!I', raw_chunk_length)
chunk_type = image_file.read(4)
if chunk_type == b'IDAT':
png_data.append(image_file.read(chunk_len))
else:
image_file.seek(chunk_len, io.SEEK_CUR)
# We aren't checking the CRC, we assume this is a valid PNG.
image_file.seek(4, io.SEEK_CUR)
raw_chunk_length = image_file.read(4)
return b''.join(png_data)
def get_stream(self, data, alpha=False):
if self._cache_path:
path = self._cache_path / f'{self.id}{int(alpha)}'
path.write_bytes(data)
return [LazyImage(path)]
else:
return [data]
class LazyImage:
    """Image data stored in a file and read only when bytes are requested.

    :param path:
        Path-like object providing ``read_bytes()`` (for example a
        :class:`pathlib.Path`) pointing to the stored image data.

    """
    def __init__(self, path):
        self._path = path

    def __bytes__(self):
        # The value must be returned: ``bytes(obj)`` raises a TypeError when
        # ``__bytes__`` gives back anything other than a bytes object.
        return self._path.read_bytes()
class SVGImage:
def __init__(self, tree, base_url, url_fetcher, context):
@ -133,38 +213,36 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
string = result['file_obj'].read()
mime_type = forced_mime_type or result['mime_type']
image = None
svg_exceptions = []
# Try to rely on given mimetype for SVG
if mime_type == 'image/svg+xml':
image = None
svg_exceptions = []
# Try to rely on given mimetype for SVG
if mime_type == 'image/svg+xml':
try:
tree = ElementTree.fromstring(string)
image = SVGImage(tree, url, url_fetcher, context)
except Exception as svg_exception:
svg_exceptions.append(svg_exception)
# Try pillow for raster images, or for failing SVG
if image is None:
try:
pillow_image = Image.open(BytesIO(string))
except Exception as raster_exception:
if mime_type == 'image/svg+xml':
# Tried SVGImage then Pillow for a SVG, abort
raise ImageLoadingError.from_exception(svg_exceptions[0])
try:
# Last chance, try SVG
tree = ElementTree.fromstring(string)
image = SVGImage(tree, url, url_fetcher, context)
except Exception as svg_exception:
svg_exceptions.append(svg_exception)
# Try pillow for raster images, or for failing SVG
if image is None:
try:
pillow_image = Image.open(BytesIO(string))
except Exception as raster_exception:
if mime_type == 'image/svg+xml':
# Tried SVGImage then Pillow for a SVG, abort
raise ImageLoadingError.from_exception(
svg_exceptions[0])
try:
# Last chance, try SVG
tree = ElementTree.fromstring(string)
image = SVGImage(tree, url, url_fetcher, context)
except Exception:
# Tried Pillow then SVGImage for a raster, abort
raise ImageLoadingError.from_exception(
raster_exception)
else:
# Store image id to enable cache in Stream.add_image
image_id = md5(url.encode()).hexdigest()
# Keep image format as it is discarded by transposition
pillow_image = rotate_pillow_image(pillow_image, orientation)
image = RasterImage(pillow_image, image_id, optimize_size, url=url, orientation=orientation)
except Exception:
# Tried Pillow then SVGImage for a raster, abort
raise ImageLoadingError.from_exception(raster_exception)
else:
# Store image id to enable cache in Stream.add_image
image_id = md5(url.encode()).hexdigest()
# Keep image format as it is discarded by transposition
pillow_image = rotate_pillow_image(pillow_image, orientation)
image = RasterImage(pillow_image, image_id, optimize_size)
except (URLFetchingError, ImageLoadingError) as exception:
LOGGER.error('Failed to load image at %r: %s', url, exception)
@ -173,6 +251,28 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
return image
def rotate_pillow_image(pillow_image, orientation):
    """Return a Pillow image oriented as requested.

    ``orientation`` is ``'from-image'`` (follow EXIF data when available),
    ``'none'`` (leave the image alone), or an ``(angle, flip)`` pair.

    The very same image object is returned when no transposition is applied,
    a transposed copy is returned otherwise.

    """
    original_format = pillow_image.format
    rotated = pillow_image
    if orientation == 'from-image':
        if 'exif' in rotated.info:
            rotated = ImageOps.exif_transpose(rotated)
    elif orientation != 'none':
        angle, flip = orientation
        transpositions = []
        if angle > 0:
            transpositions.append(
                getattr(Image.Transpose, f'ROTATE_{angle}'))
        if flip:
            transpositions.append(Image.Transpose.FLIP_LEFT_RIGHT)
        for transposition in transpositions:
            rotated = rotated.transpose(transposition)
    # Restore the format, as it is not kept on transposed copies.
    rotated.format = original_format
    return rotated
def process_color_stops(vector_length, positions):
"""Give color stops positions on the gradient vector.
@ -678,48 +778,3 @@ class RadialGradient(Gradient):
size_x = 1e7
size_y = 1e-7
return size_x, size_y
class DelayedPillowImage(pydyf.Object):
    """Memory-efficient stand-in for a Pillow image.

    The image pixels are not kept in memory: only the metadata read by other
    parts of the code is copied, and the image bytes are rebuilt from the
    file each time ``data`` is read.

    """

    def __init__(self, pillow_image,
                 path: Union[str, Path], orientation,
                 optimize_size: Collection[str]):
        """Store what is needed to rebuild the image bytes later.

        :param pillow_image:
            Already opened image, whose ``id``, ``info``, ``mode``,
            ``width``, ``height`` and ``format`` attributes are copied.
        :param path:
            Location of the image file, opened again on each ``data`` access.
        :param orientation:
            Orientation given to ``rotate_pillow_image`` when rebuilding the
            image bytes.
        :param optimize_size:
            Collection of optimization names, checked for ``'images'`` and
            ``'not_jpegs'``.

        """
        # Parameters needed to recreate the image bytes from the file.
        self.path = path
        self.orientation = orientation
        self.optimize_size = optimize_size

        # Attributes of the original Image object that are used elsewhere.
        self.id = pillow_image.id
        self.info = pillow_image.info
        self.mode = pillow_image.mode
        self.width = pillow_image.width
        self.height = pillow_image.height
        self.format = pillow_image.format

    def __repr__(self):
        return f"<Picture {self.path}>"

    @property
    def data(self) -> bytes:
        """Image bytes, rebuilt from the file on each access.

        The file content is returned untouched when the image is a JPEG that
        needs no rotation and ``'not_jpegs'`` is in ``optimize_size``.
        Otherwise, the (possibly rotated) image is encoded as JPEG.

        """
        # Use the image as a context manager so that the underlying file is
        # always closed, including when the raw file bytes are returned
        # without the image data ever being loaded.
        with Image.open(self.path) as original_pillow_image:
            rotated_pillow_image = rotate_pillow_image(
                original_pillow_image, self.orientation)
            if rotated_pillow_image is original_pillow_image:
                if (original_pillow_image.format == 'JPEG' and
                        'not_jpegs' in self.optimize_size):
                    return Path(self.path).read_bytes()
            optimize = 'images' in self.optimize_size
            return get_jpeg_bytes(rotated_pillow_image, optimize)
def get_jpeg_bytes(pillow_image, optimize: bool):
    """Return the bytes of the given image encoded as JPEG.

    ``optimize`` is passed as is to the ``save`` method of the image.

    """
    with io.BytesIO() as jpeg_buffer:
        pillow_image.save(jpeg_buffer, format='JPEG', optimize=optimize)
        return jpeg_buffer.getvalue()

View File

@ -1,7 +1,7 @@
"""PDF stream."""
import io
import struct
from copy import deepcopy
from functools import lru_cache
from hashlib import md5
@ -10,7 +10,6 @@ from fontTools import subset
from fontTools.ttLib import TTFont, TTLibError, ttFont
from fontTools.varLib.mutator import instantiateVariableFont
from ..images import DelayedPillowImage, get_jpeg_bytes
from ..logger import LOGGER
from ..matrix import Matrix
from ..text.ffi import ffi, harfbuzz, pango, units_to_double
@ -363,109 +362,20 @@ class Stream(pydyf.Stream):
self._x_objects[group.id] = group
return group
def _get_png_data(self, pillow_image, optimize):
image_file = io.BytesIO()
pillow_image.save(image_file, format='PNG', optimize=optimize)
# Read the PNG header, then discard it because we know it's a PNG. If
# this weren't just output from Pillow, we should actually check it.
image_file.seek(8)
png_data = b''
raw_chunk_length = image_file.read(4)
# PNG files consist of a series of chunks.
while len(raw_chunk_length) > 0:
# Each chunk begins with its data length (four bytes, may be zero),
# then its type (four ASCII characters), then the data, then four
# bytes of a CRC.
chunk_len, = struct.unpack('!I', raw_chunk_length)
chunk_type = image_file.read(4)
if chunk_type == b'IDAT':
png_data += image_file.read(chunk_len)
else:
image_file.seek(chunk_len, io.SEEK_CUR)
# We aren't checking the CRC, we assume this is a valid PNG.
image_file.seek(4, io.SEEK_CUR)
raw_chunk_length = image_file.read(4)
return png_data
def add_image(self, pillow_image, image_rendering, optimize_size):
image_name = f'i{pillow_image.id}'
def add_image(self, image, image_rendering):
image_name = f'i{image.id}{image_rendering}'
self._x_objects[image_name] = None # Set by write_pdf
if image_name in self._images:
# Reuse image already stored in document
return image_name
if 'transparency' in pillow_image.info:
pillow_image = pillow_image.convert('RGBA')
elif pillow_image.mode in ('1', 'P', 'I'):
pillow_image = pillow_image.convert('RGB')
if pillow_image.mode in ('RGB', 'RGBA'):
color_space = '/DeviceRGB'
elif pillow_image.mode in ('L', 'LA'):
color_space = '/DeviceGray'
elif pillow_image.mode == 'CMYK':
color_space = '/DeviceCMYK'
else:
LOGGER.warning('Unknown image mode: %s', pillow_image.mode)
color_space = '/DeviceRGB'
interpolate = 'true' if image_rendering == 'auto' else 'false'
extra = pydyf.Dictionary({
'Type': '/XObject',
'Subtype': '/Image',
'Width': pillow_image.width,
'Height': pillow_image.height,
'ColorSpace': color_space,
'BitsPerComponent': 8,
'Interpolate': interpolate,
})
extra = deepcopy(image.extra)
extra['Interpolate'] = interpolate
if 'SMask' in extra:
extra['SMask'].extra['Interpolate'] = interpolate
optimize = 'images' in optimize_size
if pillow_image.format in ('JPEG', 'MPO'):
extra['Filter'] = '/DCTDecode'
if isinstance(pillow_image, DelayedPillowImage):
stream = [pillow_image]
else:
stream = [get_jpeg_bytes(pillow_image, optimize)]
else:
extra['Filter'] = '/FlateDecode'
extra['DecodeParms'] = pydyf.Dictionary({
# Predictor 15 specifies that we're providing PNG data,
# ostensibly using an "optimum predictor", but doesn't actually
# matter as long as the predictor value is 10+ according to the
# spec. (Other PNG predictor values assert that we're using
# specific predictors that we don't want to commit to, but
# "optimum" can vary.)
'Predictor': 15,
'Columns': pillow_image.width,
})
if pillow_image.mode in ('RGB', 'RGBA'):
# Defaults to 1.
extra['DecodeParms']['Colors'] = 3
if pillow_image.mode in ('RGBA', 'LA'):
alpha = pillow_image.getchannel('A')
pillow_image = pillow_image.convert(pillow_image.mode[:-1])
alpha_data = self._get_png_data(alpha, optimize)
extra['SMask'] = pydyf.Stream([alpha_data], extra={
'Filter': '/FlateDecode',
'Type': '/XObject',
'Subtype': '/Image',
'DecodeParms': pydyf.Dictionary({
'Predictor': 15,
'Columns': pillow_image.width,
}),
'Width': pillow_image.width,
'Height': pillow_image.height,
'ColorSpace': '/DeviceGray',
'BitsPerComponent': 8,
'Interpolate': interpolate,
})
stream = [self._get_png_data(pillow_image, optimize)]
xobject = pydyf.Stream(stream, extra=extra)
xobject = pydyf.Stream(image.stream, extra=extra)
self._images[image_name] = xobject
return image_name

View File

@ -1,24 +0,0 @@
from PIL import ImageOps, Image
def rotate_pillow_image(pillow_image: Image.Image, orientation) -> Image.Image:
    """Return the image oriented according to ``orientation``.

    The very same image object is returned when the orientation does not
    change, a transposed copy is returned otherwise.

    """
    saved_format = pillow_image.format
    result = pillow_image
    if orientation == 'from-image':
        # Follow EXIF data, only when the image provides some.
        if 'exif' in result.info:
            result = ImageOps.exif_transpose(result)
    elif orientation != 'none':
        angle, flip = orientation
        if angle > 0:
            result = result.transpose(
                getattr(Image.Transpose, f'ROTATE_{angle}'))
        if flip:
            result = result.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
    # Put the format back, as transposed copies do not keep it.
    result.format = saved_format
    return result