1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-09-11 20:47:56 +03:00

Add --cache-folder option to temporarily store images on disk

This commit is contained in:
Guillaume Ayoub 2023-03-05 22:56:41 +01:00
parent 07e43dc4c2
commit 44001c5383
4 changed files with 91 additions and 19 deletions

View File

@ -138,7 +138,11 @@ class HTML:
:param font_config: A font configuration handling ``@font-face`` rules.
:type counter_style: :class:`css.counters.CounterStyle`
:param counter_style: A dictionary storing ``@counter-style`` rules.
:param dict image_cache: A dictionary used to cache images.
:param image_cache:
A dictionary used to cache images, or a folder path where images
are temporarily stored.
:type image_cache:
:obj:`dict`, :obj:`str` or :class:`document.DiskCache`
:param bool forms: Whether PDF forms have to be included.
:returns: A :class:`document.Document` object.
@ -186,7 +190,11 @@ class HTML:
:param font_config: A font configuration handling ``@font-face`` rules.
:type counter_style: :class:`css.counters.CounterStyle`
:param counter_style: A dictionary storing ``@counter-style`` rules.
:param dict image_cache: A dictionary used to cache images.
:param image_cache:
A dictionary used to cache images, or a folder path where images
are temporarily stored.
:type image_cache:
:obj:`dict`, :obj:`str` or :class:`document.DiskCache`
:param bytes identifier: A bytestring used as PDF file identifier.
:param str variant: A PDF variant name.
:param str version: A PDF version number.

View File

@ -94,6 +94,11 @@ def main(argv=None, stdout=None, stdin=None):
multiple times, ``all`` adds all allowed values, ``none`` removes all
previously set values.
.. option:: -c <folder>, --cache-folder <folder>
Store cache on disk instead of memory. The ``folder`` is created if
needed and cleaned after the PDF is generated.
.. option:: -v, --verbose
Show warnings and information messages.
@ -156,6 +161,10 @@ def main(argv=None, stdout=None, stdin=None):
'-O', '--optimize-size', action='append',
help='optimize output size for specified features',
choices=('images', 'fonts', 'all', 'none'), default=['fonts'])
parser.add_argument(
'-c', '--cache-folder',
help='Store cache on disk instead of memory. The ``folder`` is '
'created if needed and cleaned after the PDF is generated.')
parser.add_argument(
'-v', '--verbose', action='store_true',
help='show warnings and information messages')
@ -203,6 +212,7 @@ def main(argv=None, stdout=None, stdin=None):
'version': args.pdf_version,
'forms': args.pdf_forms,
'custom_metadata': args.custom_metadata,
'image_cache': args.cache_folder,
}
# Default to logging to stderr.

View File

@ -2,6 +2,8 @@
import functools
import io
from hashlib import md5
from pathlib import Path
from . import CSS
from .anchors import gather_anchors, make_page_bookmark_tree
@ -158,6 +160,52 @@ class DocumentMetadata:
self.custom = custom or {}
class DiskCache:
    """Dict-like object storing image content on disk.

    Bytestring values are written to files in the cache folder. Other Python
    objects (i.e. RasterImage instances) are still stored in memory, but are
    much more lightweight.

    Keys must be strings, as they are hashed to build the file names.

    :param folder:
        Path of the folder where bytestrings are stored. The folder is created
        if needed, including its missing parents.

    """

    def __init__(self, folder):
        self._path = Path(folder)
        self._path.mkdir(parents=True, exist_ok=True)
        # Values that are not bytestrings, indexed by key.
        self._memory_cache = {}
        # Paths of the files written by this instance, removed in __del__.
        self._disk_paths = set()

    def _path_from_key(self, key):
        """Return the path of the file storing the value of ``key``."""
        return self._path / md5(key.encode()).hexdigest()

    def __getitem__(self, key):
        """Return the value stored for ``key``.

        :raises KeyError: If ``key`` is neither in memory nor on disk.

        """
        if key in self._memory_cache:
            return self._memory_cache[key]
        try:
            return self._path_from_key(key).read_bytes()
        except FileNotFoundError:
            # Follow the mapping protocol instead of leaking a filesystem
            # error, so that the cache behaves like the dict it replaces.
            raise KeyError(key) from None

    def __setitem__(self, key, value):
        """Store ``value`` on disk if it is a bytestring, else in memory."""
        if isinstance(value, bytes):
            path = self._path_from_key(key)
            self._disk_paths.add(path)
            path.write_bytes(value)
            # The memory cache is read first by __getitem__: drop any
            # previous non-bytes value so that it can't shadow the new one.
            self._memory_cache.pop(key, None)
        else:
            self._memory_cache[key] = value

    def __contains__(self, key):
        """Return whether ``key`` is stored in memory or on disk."""
        if key in self._memory_cache:
            return True
        return self._path_from_key(key).exists()

    def __del__(self):
        """Remove the files written by this cache, then the cache folder."""
        try:
            for path in self._disk_paths:
                path.unlink(missing_ok=True)
            # rmdir only succeeds on an empty folder, files that have not
            # been written by this instance are thus never deleted.
            self._path.rmdir()
        except Exception:
            # Silently ignore errors while clearing cache
            pass
class Document:
"""A rendered document ready to be painted in a pydyf stream.
@ -180,7 +228,10 @@ class Document:
target_collector = TargetCollector()
page_rules = []
user_stylesheets = []
image_cache = {} if image_cache is None else image_cache
if image_cache is None:
image_cache = {}
elif not isinstance(image_cache, DiskCache):
image_cache = DiskCache(image_cache)
for css in stylesheets or []:
if not hasattr(css, 'matcher'):
css = CSS(

View File

@ -36,9 +36,9 @@ class ImageLoadingError(ValueError):
class RasterImage:
def __init__(self, pillow_image, image_id, optimize_size, cache_path=None):
def __init__(self, pillow_image, image_id, optimize_size, cache):
self.id = image_id
self._cache_path = cache_path
self._cache = cache
if 'transparency' in pillow_image.info:
pillow_image = pillow_image.convert('RGBA')
@ -92,7 +92,7 @@ class RasterImage:
alpha = pillow_image.getchannel('A')
pillow_image = pillow_image.convert(pillow_image.mode[:-1])
alpha_data = self._get_png_data(alpha, optimize)
stream = self.get_stream(alpha_data)
stream = self.get_stream(alpha_data, alpha=True)
self.extra['SMask'] = pydyf.Stream(stream, extra={
'Filter': '/FlateDecode',
'Type': '/XObject',
@ -151,20 +151,20 @@ class RasterImage:
return b''.join(png_data)
def get_stream(self, data, alpha=False):
if self._cache_path:
path = self._cache_path / f'{self.id}{int(alpha)}'
path.write_bytes(data)
return [LazyImage(path)]
else:
return [data]
key = f'{self.id}{int(alpha)}'
return [LazyImage(self._cache, key, data)]
class LazyImage:
def __init__(self, path):
self._path = path
class LazyImage(pydyf.Object):
def __init__(self, cache, key, data):
super().__init__()
self._key = key
self._cache = cache
cache[key] = data
def __bytes__(self):
self._path.read_bytes()
@property
def data(self):
return self._cache[self._key]
class SVGImage:
@ -240,13 +240,14 @@ def get_image_from_uri(cache, url_fetcher, optimize_size, url,
else:
# Store image id to enable cache in Stream.add_image
image_id = md5(url.encode()).hexdigest()
# Keep image format as it is discarded by transposition
pillow_image = rotate_pillow_image(pillow_image, orientation)
image = RasterImage(pillow_image, image_id, optimize_size)
image = RasterImage(
pillow_image, image_id, optimize_size, cache)
except (URLFetchingError, ImageLoadingError) as exception:
LOGGER.error('Failed to load image at %r: %s', url, exception)
image = None
cache[url] = image
return image
@ -269,6 +270,8 @@ def rotate_pillow_image(pillow_image, orientation):
if flip:
pillow_image = pillow_image.transpose(
Image.Transpose.FLIP_LEFT_RIGHT)
# Keep image format as it is discarded by transposition
pillow_image.format = image_format
return pillow_image