mirror of
https://github.com/Kozea/WeasyPrint.git
synced 2024-10-26 20:57:35 +03:00
424 lines
16 KiB
Python
424 lines
16 KiB
Python
"""The Awesome Document Factory.
|
|
|
|
The public API is what is accessible from this "root" package without
|
|
importing sub-modules.
|
|
|
|
"""
|
|
|
|
import contextlib
|
|
from datetime import datetime
|
|
from os.path import getctime, getmtime
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
|
|
import cssselect2
|
|
import html5lib
|
|
import tinycss2
|
|
|
|
# Package version, exposed under both ``__version__`` and ``VERSION``.
__version__ = '61.1'
VERSION = __version__
|
|
|
|
#: Default values for command-line and Python API options. See
#: :func:`__main__.main` to learn more about specific options for
#: command-line.
#:
#: :param list stylesheets:
#:     An optional list of user stylesheets. The list's elements
#:     are :class:`CSS` objects, filenames, URLs, or file-like
#:     objects. (See :ref:`Stylesheet Origins`.)
#: :param str media_type:
#:     Media type to use for @media.
#: :param list attachments:
#:     A list of additional file attachments for the generated PDF
#:     document or :obj:`None`. The list's elements are
#:     :class:`Attachment` objects, filenames, URLs or file-like objects.
#: :param bytes pdf_identifier:
#:     A bytestring used as PDF file identifier.
#: :param str pdf_variant:
#:     A PDF variant name.
#: :param str pdf_version:
#:     A PDF version number.
#: :param bool pdf_forms:
#:     Whether PDF forms have to be included.
#: :param bool uncompressed_pdf:
#:     Whether PDF content should be left uncompressed.
#: :param bool custom_metadata:
#:     Whether custom HTML metadata should be stored in the generated PDF.
#: :param bool presentational_hints:
#:     Whether HTML presentational hints are followed.
#: :param bool optimize_images:
#:     Whether size of embedded images should be optimized, with no quality
#:     loss.
#: :param int jpeg_quality:
#:     JPEG quality from 0 (worst) to 95 (best).
#: :param int dpi:
#:     Maximum resolution of images embedded in the PDF.
#: :param bool full_fonts:
#:     Whether unmodified font files should be embedded when possible.
#: :param bool hinting:
#:     Whether hinting information should be kept in embedded fonts.
#: :type cache: :obj:`dict`, :class:`pathlib.Path` or :obj:`str`
#: :param cache:
#:     A dictionary used to cache images in memory, or a folder path where
#:     images are temporarily stored.
DEFAULT_OPTIONS = {
    # Inputs applied to the document.
    'stylesheets': None,
    'media_type': 'print',
    'attachments': None,
    # PDF output settings.
    'pdf_identifier': None,
    'pdf_variant': None,
    'pdf_version': None,
    'pdf_forms': None,
    'uncompressed_pdf': False,
    'custom_metadata': False,
    'presentational_hints': False,
    # Images.
    'optimize_images': False,
    'jpeg_quality': None,
    'dpi': None,
    # Fonts.
    'full_fonts': False,
    'hinting': False,
    # Image cache.
    'cache': None,
}
|
|
|
|
# Names exported by ``from <package> import *``.
__all__ = [
    'HTML',
    'CSS',
    'DEFAULT_OPTIONS',
    'Attachment',
    'Document',
    'Page',
    'default_url_fetcher',
    'VERSION',
    '__version__',
]
|
|
|
|
|
|
# Import after setting the version, as the version is used in other modules
|
|
from .urls import ( # noqa isort:skip
|
|
fetch, default_url_fetcher, path2url, ensure_url, url_is_absolute)
|
|
from .logger import LOGGER, PROGRESS_LOGGER # noqa isort:skip
|
|
# Some imports are at the end of the file (after the CSS class)
|
|
# to work around circular imports.
|
|
|
|
|
|
def _find_base_url(html_document, fallback_base_url):
    """Return the base URL for the document.

    The base URL is taken from the first ``<base>`` element that carries an
    ``href`` attribute; ``<base>`` elements without ``href`` (for example
    ones only setting ``target``) are skipped.

    See https://www.w3.org/TR/html5/urls.html#document-base-url

    :param html_document:
        Parsed document tree, an object providing ``iter('base')`` whose
        items provide ``get('href')``.
    :param fallback_base_url:
        URL used to resolve a relative ``href``, and returned as is when no
        usable ``href`` is found.
    :returns:
        The ``href`` joined to ``fallback_base_url``, or
        ``fallback_base_url`` itself.

    """
    for base_element in html_document.iter('base'):
        href = base_element.get('href')
        if href is None:
            # No href on this element: keep looking at following ones.
            continue
        href = href.strip()
        # Only the first element with an href counts, even if it is empty.
        if href:
            return urljoin(fallback_base_url, href)
        return fallback_base_url
    return fallback_base_url
|
|
|
|
|
|
class HTML:
    """HTML document parsed by html5lib.

    You can just create an instance with a positional argument:
    ``doc = HTML(something)``
    The class will try to guess if the input is a filename, an absolute URL,
    or a :term:`file object`.

    Alternatively, use **one** named argument so that no guessing is involved:

    :type filename: str or pathlib.Path
    :param filename:
        A filename, relative to the current directory, or absolute.
    :param str url:
        An absolute, fully qualified URL.
    :type file_obj: :term:`file object`
    :param file_obj:
        Any object with a ``read`` method.
    :param str string:
        A string of HTML source.

    Specifying multiple inputs is an error:
    ``HTML(filename="foo.html", url="localhost://bar.html")``
    will raise a :obj:`TypeError`.

    You can also pass optional named arguments:

    :param str encoding:
        Force the source character encoding.
    :type base_url: str or pathlib.Path
    :param base_url:
        The base used to resolve relative URLs (e.g. in
        ``<img src="../foo.png">``). If not provided, try to use the input
        filename, URL, or ``name`` attribute of
        :term:`file objects <file object>`.
    :type url_fetcher: :term:`callable`
    :param url_fetcher:
        A function or other callable with the same signature as
        :func:`default_url_fetcher` called to fetch external resources such as
        stylesheets and images. (See :ref:`URL Fetchers`.)
    :param str media_type:
        The media type to use for ``@media``. Defaults to ``'print'``.

    **Note:** In some cases like ``HTML(string=foo)`` relative URLs will be
    invalid if ``base_url`` is not provided.

    """

    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, media_type='print'):
        # Label describing the input in the progress log.
        source_label = (
            guess or filename or url or
            getattr(file_obj, 'name', 'HTML string'))
        PROGRESS_LOGGER.info(
            'Step 1 - Fetching and parsing HTML - %s', source_label)

        if isinstance(base_url, Path):
            base_url = str(base_url)

        selected_source = _select_source(
            guess, filename, url, file_obj, string, base_url, url_fetcher)
        with selected_source as (_, source, base_url, protocol_encoding):
            parser_options = {'namespaceHTMLElements': False}
            if not isinstance(source, str):
                # Encoding hints are only passed along for non-str sources.
                if protocol_encoding is not None:
                    parser_options['transport_encoding'] = protocol_encoding
                if encoding is not None:
                    parser_options['override_encoding'] = encoding
            root = html5lib.parse(source, **parser_options)

        self.base_url = _find_base_url(root, base_url)
        self.url_fetcher = url_fetcher
        self.media_type = media_type
        self.wrapper_element = cssselect2.ElementWrapper.from_html_root(
            root, content_language=None)
        self.etree_element = self.wrapper_element.etree_element

    def _ua_stylesheets(self, forms=False):
        """Return user-agent stylesheets, plus the form one if ``forms``."""
        stylesheets = [HTML5_UA_STYLESHEET]
        if forms:
            stylesheets.append(HTML5_UA_FORM_STYLESHEET)
        return stylesheets

    def _ua_counter_style(self):
        """Return a list holding a copy of the user-agent counter style."""
        counter_style = HTML5_UA_COUNTER_STYLE.copy()
        return [counter_style]

    def _ph_stylesheets(self):
        """Return the list of presentational hints stylesheets."""
        return [HTML5_PH_STYLESHEET]

    def render(self, font_config=None, counter_style=None, **options):
        """Lay out and paginate the document, but do not (yet) export it.

        This returns a :class:`document.Document` object which provides
        access to individual pages and various meta-data.
        See :meth:`write_pdf` to get a PDF directly.

        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config:
            A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
        :param counter_style:
            A dictionary storing ``@counter-style`` rules.
        :param options:
            The ``options`` parameter includes by default the
            :data:`DEFAULT_OPTIONS` values.
        :returns: A :class:`document.Document` object.

        """
        # Explicit options take precedence over the defaults.
        merged_options = {**DEFAULT_OPTIONS, **options}
        return Document._render(
            self, font_config, counter_style, merged_options)

    def write_pdf(self, target=None, zoom=1, finisher=None,
                  font_config=None, counter_style=None, **options):
        """Render the document to a PDF file.

        This is a shortcut for calling :meth:`render`, then
        :meth:`Document.write_pdf() <document.Document.write_pdf>`.

        :type target:
            :class:`str`, :class:`pathlib.Path` or :term:`file object`
        :param target:
            A filename where the PDF file is generated, a file object, or
            :obj:`None`.
        :param float zoom:
            The zoom factor in PDF units per CSS units.  **Warning**:
            All CSS units are affected, including physical units like
            ``cm`` and named sizes like ``A4``.  For values other than
            1, the physical CSS units will thus be "wrong".
        :type finisher: :term:`callable`
        :param finisher:
            A finisher function or callable that accepts the document and a
            :class:`pydyf.PDF` object as parameters. Can be passed to perform
            post-processing on the PDF right before the trailer is written.
        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config:
            A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
        :param counter_style:
            A dictionary storing ``@counter-style`` rules.
        :param options:
            The ``options`` parameter includes by default the
            :data:`DEFAULT_OPTIONS` values.
        :returns:
            The PDF as :obj:`bytes` if ``target`` is not provided or
            :obj:`None`, otherwise :obj:`None` (the PDF is written to
            ``target``).

        """
        # The same merged options are used for layout and for PDF export.
        merged_options = {**DEFAULT_OPTIONS, **options}
        document = self.render(font_config, counter_style, **merged_options)
        return document.write_pdf(target, zoom, finisher, **merged_options)
|
|
|
|
|
|
class CSS:
    """CSS stylesheet parsed by tinycss2.

    An instance is created in the same way as :class:`HTML`, with the same
    arguments.

    An additional argument called ``font_config`` must be provided to handle
    ``@font-face`` rules. The same ``text.fonts.FontConfiguration`` object
    must be used for different ``CSS`` objects applied to the same document.

    ``CSS`` objects have no public attributes or methods. They are only meant
    to be used in the :meth:`HTML.write_pdf` and :meth:`HTML.render` methods
    of :class:`HTML` objects.

    """

    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, _check_mime_type=False,
                 media_type='print', font_config=None, counter_style=None,
                 matcher=None, page_rules=None):
        # Include ``guess`` in the label, as done for HTML documents, so that
        # a positional filename or URL is not reported as "CSS string".
        PROGRESS_LOGGER.info(
            'Step 2 - Fetching and parsing CSS - %s',
            guess or filename or url or
            getattr(file_obj, 'name', 'CSS string'))
        result = _select_source(
            guess, filename, url, file_obj, string,
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=_check_mime_type)
        with result as (source_type, source, base_url, protocol_encoding):
            if source_type == 'file_obj':
                # Read while the source is still open.
                source = source.read()
            if isinstance(source, str):
                # unicode, no encoding
                stylesheet = tinycss2.parse_stylesheet(source)
            else:
                stylesheet, encoding = tinycss2.parse_stylesheet_bytes(
                    source, environment_encoding=encoding,
                    protocol_encoding=protocol_encoding)
        self.base_url = base_url
        # A caller-provided matcher and page rules list are filled in place,
        # fresh ones are created otherwise.
        self.matcher = matcher or cssselect2.Matcher()
        self.page_rules = [] if page_rules is None else page_rules
        preprocess_stylesheet(
            media_type, base_url, stylesheet, url_fetcher, self.matcher,
            self.page_rules, font_config, counter_style)
|
|
|
|
|
|
class Attachment:
    """File attachment for a PDF document.

    An instance is created in the same way as :class:`HTML`, except that the
    HTML specific arguments (``encoding`` and ``media_type``) are not
    supported.

    :param str description:
        A description of the attachment to be included in the PDF document.
        May be :obj:`None`.
    :type created: :obj:`datetime.datetime`
    :param created:
        Creation date and time. Default is the creation time reported for
        ``filename`` when it is given, current date and time otherwise.
    :type modified: :obj:`datetime.datetime`
    :param modified:
        Modification date and time. Default is the modification time reported
        for ``filename`` when it is given, current date and time otherwise.
    :param str relationship:
        A string that represents the relationship between the attachment and
        the PDF it is embedded in. Default is 'Unspecified', other common
        values are defined in ISO-32000-2:2020, 7.11.3.

    """

    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, base_url=None, url_fetcher=default_url_fetcher,
                 description=None, created=None, modified=None,
                 relationship='Unspecified'):
        # Only the context manager is stored here, it is not entered yet.
        self.source = _select_source(
            guess, filename, url, file_obj, string, base_url=base_url,
            url_fetcher=url_fetcher)
        self.description = description
        self.relationship = relationship
        self.md5 = None

        # Missing dates come from the file timestamps when an explicit
        # ``filename`` is given, and from the current time otherwise.
        if created is None:
            created = (
                datetime.fromtimestamp(getctime(filename)) if filename
                else datetime.now())
        if modified is None:
            modified = (
                datetime.fromtimestamp(getmtime(filename)) if filename
                else datetime.now())
        self.created = created
        self.modified = modified
|
|
|
|
|
|
@contextlib.contextmanager
def _select_source(guess=None, filename=None, url=None, file_obj=None,
                   string=None, base_url=None, url_fetcher=default_url_fetcher,
                   check_css_mime_type=False):
    """If only one input is given, return it with normalized ``base_url``.

    This is a context manager yielding a
    ``(source_type, source, base_url, protocol_encoding)`` tuple, where
    ``source_type`` is either ``'file_obj'`` or ``'string'``. Files opened
    and resources fetched here stay available inside the ``with`` block only.

    :param guess:
        Input of unknown kind: handled as a file object if it has a ``read``
        attribute, as a filename if it is a :class:`pathlib.Path`, as a URL
        if ``url_is_absolute`` accepts it, and as a filename otherwise.
    :param filename: Path of a file, opened in binary mode.
    :param url: URL fetched with ``url_fetcher``.
    :param file_obj: File-like object, yielded as is.
    :param string: Source content, yielded as is.
    :param base_url:
        Base URL; when missing, it is derived from the filename, the fetched
        URL or the ``name`` of the file object if possible.
    :param url_fetcher: Callable used to fetch ``url``.
    :param bool check_css_mime_type:
        Whether a fetched resource whose MIME type is not ``text/css`` is
        logged as an error and replaced by an empty string.
    :raises TypeError: If zero or several sources are given.

    """
    if base_url is not None:
        base_url = ensure_url(base_url)

    selected_params = [
        param for param in (guess, filename, url, file_obj, string) if
        param is not None]
    if len(selected_params) != 1:
        # Sources can be paths or file objects: convert them to strings so
        # that join() does not fail and hide the intended error message.
        source = ', '.join(map(str, selected_params)) or 'nothing'
        raise TypeError(f'Expected exactly one source, got {source}')
    elif guess is not None:
        if hasattr(guess, 'read'):
            type_ = 'file_obj'
        elif isinstance(guess, Path):
            type_ = 'filename'
        elif url_is_absolute(guess):
            type_ = 'url'
        else:
            type_ = 'filename'
        # Delegate to the branch matching the guessed kind of source.
        guessed_source = _select_source(
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=check_css_mime_type,
            **{type_: guess})
        with guessed_source as result:
            yield result
    elif filename is not None:
        if base_url is None:
            base_url = path2url(filename)
        with open(filename, 'rb') as file_obj:
            yield 'file_obj', file_obj, base_url, None
    elif url is not None:
        with fetch(url_fetcher, url) as result:
            if check_css_mime_type and result['mime_type'] != 'text/css':
                LOGGER.error(
                    'Unsupported stylesheet type %s for %s',
                    result['mime_type'], result['redirected_url'])
                yield 'string', '', base_url, None
            else:
                proto_encoding = result.get('encoding')
                if base_url is None:
                    base_url = result.get('redirected_url', url)
                if 'string' in result:
                    yield 'string', result['string'], base_url, proto_encoding
                else:
                    yield (
                        'file_obj', result['file_obj'], base_url,
                        proto_encoding)
    elif file_obj is not None:
        if base_url is None:
            # filesystem file-like objects have a 'name' attribute.
            name = getattr(file_obj, 'name', None)
            # Some streams have a .name like '<stdin>', not a filename, and
            # files opened from a descriptor have an integer .name: only
            # strings can be used to build a base URL.
            if name and isinstance(name, str) and not name.startswith('<'):
                base_url = ensure_url(name)
        yield 'file_obj', file_obj, base_url, None
    else:
        assert string is not None
        yield 'string', string, base_url, None
|
|
|
|
# Work around circular imports.
|
|
from .css import preprocess_stylesheet # noqa isort:skip
|
|
from .html import ( # noqa isort:skip
|
|
HTML5_UA_COUNTER_STYLE, HTML5_UA_STYLESHEET, HTML5_UA_FORM_STYLESHEET,
|
|
HTML5_PH_STYLESHEET)
|
|
from .document import Document, Page # noqa isort:skip
|