mirror of
https://github.com/Kozea/WeasyPrint.git
synced 2024-09-11 20:47:56 +03:00
6ff1b97f4c
Even if it can be an important feature for some users, the fact that nobody ever complained means that it’s not useful for the majority of users. The option is available but disabled by default.
359 lines
15 KiB
Python
359 lines
15 KiB
Python
"""The Awesome Document Factory.
|
|
|
|
The public API is what is accessible from this "root" packages without
|
|
importing sub-modules.
|
|
|
|
"""
|
|
|
|
import contextlib
|
|
from pathlib import Path
|
|
from urllib.parse import urljoin
|
|
|
|
import cssselect2
|
|
import html5lib
|
|
import tinycss2
|
|
|
|
VERSION = __version__ = '58.1'
|
|
|
|
__all__ = [
|
|
'HTML', 'CSS', 'Attachment', 'Document', 'Page', 'default_url_fetcher',
|
|
'VERSION', '__version__']
|
|
|
|
|
|
# Import after setting the version, as the version is used in other modules
|
|
from .urls import ( # noqa isort:skip
|
|
fetch, default_url_fetcher, path2url, ensure_url, url_is_absolute)
|
|
from .logger import LOGGER, PROGRESS_LOGGER # noqa isort:skip
|
|
# Some imports are at the end of the file (after the CSS class)
|
|
# to work around circular imports.
|
|
|
|
|
|
def _find_base_url(html_document, fallback_base_url):
    """Compute the base URL of a parsed HTML document.

    Only the first ``<base>`` element of the tree is considered. When it
    has a non-blank ``href`` attribute, that value is resolved against
    ``fallback_base_url``; in every other case ``fallback_base_url`` is
    returned unchanged.

    See https://www.w3.org/TR/html5/urls.html#document-base-url

    """
    for base_element in html_document.iter('base'):
        declared_href = base_element.get('href', '').strip()
        if declared_href:
            return urljoin(fallback_base_url, declared_href)
        # Later <base> elements are ignored, even when the first one is
        # unusable.
        break
    return fallback_base_url
|
|
|
|
|
|
class HTML:
    """An HTML document, parsed with html5lib.

    The simplest way to build an instance is to give a single positional
    argument:
      ``doc = HTML(something)``
    The class then guesses whether the input is a filename, an absolute
    URL, or a :term:`file object`.

    To avoid any guessing, give exactly **one** of these named arguments
    instead:

    :type filename: str or pathlib.Path
    :param filename: A filename, absolute or relative to the current
        directory.
    :param str url: An absolute, fully qualified URL.
    :type file_obj: :term:`file object`
    :param file_obj: Any object with a ``read`` method.
    :param str string: A string of HTML source.

    Giving more than one input is an error:
      ``HTML(filename="foo.html", url="localhost://bar.html")``
    raises a :obj:`TypeError`.

    Optional named arguments are accepted too:

    :param str encoding: Force the source character encoding.
    :param str base_url: The base used to resolve relative URLs
        (e.g. in ``<img src="../foo.png">``). When omitted, the input
        filename, URL, or ``name`` attribute of :term:`file objects
        <file object>` is tried instead.
    :type url_fetcher: :term:`function`
    :param url_fetcher: A function or other callable
        with the same signature as :func:`default_url_fetcher`, called to
        fetch external resources such as stylesheets and images.
        (See :ref:`URL Fetchers`.)
    :param str media_type: The media type to use for ``@media``.
        Defaults to ``'print'``. **Note:** In some cases like
        ``HTML(string=foo)`` relative URLs will be invalid if ``base_url``
        is not provided.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, media_type='print'):
        # Label describing the input, only used by the progress logger.
        input_label = (
            guess or filename or url or
            getattr(file_obj, 'name', 'HTML string'))
        PROGRESS_LOGGER.info(
            'Step 1 - Fetching and parsing HTML - %s', input_label)

        selected = _select_source(
            guess, filename, url, file_obj, string, base_url, url_fetcher)
        with selected as (source_type, source, base_url, protocol_encoding):
            parse_options = {'namespaceHTMLElements': False}
            if not isinstance(source, str):
                # Encoding hints are only given to html5lib when the
                # source is not already a text string.
                parse_options['override_encoding'] = encoding
                parse_options['transport_encoding'] = protocol_encoding
            tree = html5lib.parse(source, **parse_options)

        self.base_url = _find_base_url(tree, base_url)
        self.url_fetcher = url_fetcher
        self.media_type = media_type
        self.wrapper_element = cssselect2.ElementWrapper.from_html_root(
            tree, content_language=None)
        self.etree_element = self.wrapper_element.etree_element

    def _ua_stylesheets(self, forms=False):
        """Return the user-agent stylesheets, with form styles if asked."""
        stylesheets = [HTML5_UA_STYLESHEET]
        if forms:
            stylesheets.append(HTML5_UA_FORM_STYLESHEET)
        return stylesheets

    def _ua_counter_style(self):
        """Return a list holding a copy of the user-agent counter styles."""
        counter_style = HTML5_UA_COUNTER_STYLE.copy()
        return [counter_style]

    def _ph_stylesheets(self):
        """Return the presentational hints stylesheets."""
        return [HTML5_PH_STYLESHEET]

    def render(self, stylesheets=None, presentational_hints=False,
               optimize_size=('fonts', 'hinting', 'pdf'), jpeg_quality=None,
               dpi=None, font_config=None, counter_style=None,
               image_cache=None, forms=False):
        """Lay out and paginate the document without exporting it.

        The returned :class:`document.Document` object gives access to the
        individual pages and to various meta-data.
        Use :meth:`write_pdf` to get a PDF directly.

        :param list stylesheets:
            An optional list of user stylesheets. Each element is a
            :class:`CSS` object, a filename, a URL, or a file
            object. (See :ref:`Stylesheet Origins`.)
        :param bool presentational_hints:
            Whether HTML presentational hints are followed.
        :param tuple optimize_size:
            Optimize size of generated PDF. Can contain "images", "fonts",
            "hinting" and "pdf".
        :param int jpeg_quality: JPEG quality between 0 (worst) to 95 (best).
        :param int dpi: Maximum resolution of images embedded in the PDF.
        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
        :param counter_style: A dictionary storing ``@counter-style`` rules.
        :param image_cache:
            A dictionary used to cache images, or a folder path where images
            are temporarily stored.
        :type image_cache: :obj:`dict` or :obj:`str`
        :param bool forms: Whether PDF forms have to be included.
        :returns: A :class:`document.Document` object.

        """
        document = Document._render(
            self, stylesheets, presentational_hints, optimize_size,
            jpeg_quality, dpi, font_config, counter_style, image_cache, forms)
        return document

    def write_pdf(self, target=None, stylesheets=None, zoom=1,
                  attachments=None, finisher=None, presentational_hints=False,
                  optimize_size=('fonts', 'hinting', 'pdf'), jpeg_quality=None,
                  dpi=None, font_config=None, counter_style=None,
                  image_cache=None, identifier=None, variant=None,
                  version=None, forms=False, custom_metadata=False):
        """Render the document and export it as a PDF file.

        Shortcut that calls :meth:`render`, then
        :meth:`Document.write_pdf() <document.Document.write_pdf>` on the
        result.

        :type target:
            :class:`str`, :class:`pathlib.Path` or :term:`file object`
        :param target:
            A filename where the PDF file is generated, a file object, or
            :obj:`None`.
        :param list stylesheets:
            An optional list of user stylesheets. Each element is a
            :class:`CSS` object, a filename, a URL, or a file-like
            object. (See :ref:`Stylesheet Origins`.)
        :param float zoom:
            The zoom factor in PDF units per CSS units.  **Warning**:
            All CSS units are affected, including physical units like
            ``cm`` and named sizes like ``A4``.  For values other than
            1, the physical CSS units will thus be "wrong".
        :param list attachments: A list of additional file attachments for the
            generated PDF document or :obj:`None`. Each element is an
            :class:`Attachment` object, a filename, a URL or a file-like
            object.
        :param finisher: A finisher function, that accepts the document and a
            :class:`pydyf.PDF` object as parameters, can be passed to perform
            post-processing on the PDF right before the trailer is written.
        :param bool presentational_hints: Whether HTML presentational hints are
            followed.
        :param tuple optimize_size:
            Optimize size of generated PDF. Can contain "images", "fonts",
            "hinting" and "pdf".
        :param int jpeg_quality: JPEG quality between 0 (worst) to 95 (best).
        :param int dpi: Maximum resolution of images embedded in the PDF.
        :type font_config: :class:`text.fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`css.counters.CounterStyle`
        :param counter_style: A dictionary storing ``@counter-style`` rules.
        :param image_cache:
            A dictionary used to cache images, or a folder path where images
            are temporarily stored.
        :type image_cache: :obj:`dict` or :obj:`str`
        :param bytes identifier: A bytestring used as PDF file identifier.
        :param str variant: A PDF variant name.
        :param str version: A PDF version number.
        :param bool forms: Whether PDF forms have to be included.
        :param bool custom_metadata: Whether custom HTML metadata should be
            stored in the generated PDF.
        :returns:
            The PDF as :obj:`bytes` if ``target`` is not provided or
            :obj:`None`, otherwise :obj:`None` (the PDF is written to
            ``target``).

        """
        # First lay out the pages, then let the document export itself.
        document = self.render(
            stylesheets, presentational_hints, optimize_size, jpeg_quality,
            dpi, font_config, counter_style, image_cache, forms)
        return document.write_pdf(
            target, zoom, attachments, finisher, identifier, variant,
            version, custom_metadata)
|
|
|
|
|
|
class CSS:
    """CSS stylesheet parsed by tinycss2.

    An instance is created in the same way as :class:`HTML`, with the same
    arguments.

    An additional argument called ``font_config`` must be provided to handle
    ``@font-face`` rules. The same ``text.fonts.FontConfiguration`` object
    must be used for different ``CSS`` objects applied to the same document.

    ``CSS`` objects have no public attributes or methods. They are only meant
    to be used in the :meth:`HTML.write_pdf` and :meth:`HTML.render` methods
    of :class:`HTML` objects.

    The remaining arguments are forwarded to the stylesheet preprocessing
    step:

    :param counter_style: Passed as is to the stylesheet preprocessing.
    :param matcher: A ``cssselect2.Matcher`` to reuse; a new one is created
        when omitted.
    :param page_rules: A list to reuse for page rules; a new empty list is
        created when :obj:`None`.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, _check_mime_type=False,
                 media_type='print', font_config=None, counter_style=None,
                 matcher=None, page_rules=None):
        # Include ``guess`` in the logged label, as HTML.__init__ does, so
        # that ``CSS('style.css')`` is not reported as a "CSS string".
        PROGRESS_LOGGER.info(
            'Step 2 - Fetching and parsing CSS - %s',
            guess or filename or url or
            getattr(file_obj, 'name', 'CSS string'))
        result = _select_source(
            guess, filename, url, file_obj, string,
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=_check_mime_type)
        with result as (source_type, source, base_url, protocol_encoding):
            if source_type == 'string' and not isinstance(source, bytes):
                # Text string: no encoding detection is needed.
                stylesheet = tinycss2.parse_stylesheet(source)
            else:
                if source_type == 'file_obj':
                    source = source.read()
                # The encoding detected by tinycss2 is not used afterwards.
                stylesheet, _ = tinycss2.parse_stylesheet_bytes(
                    source, environment_encoding=encoding,
                    protocol_encoding=protocol_encoding)
        self.base_url = base_url
        self.matcher = matcher or cssselect2.Matcher()
        self.page_rules = [] if page_rules is None else page_rules
        preprocess_stylesheet(
            media_type, base_url, stylesheet, url_fetcher, self.matcher,
            self.page_rules, font_config, counter_style)
|
|
|
|
|
|
class Attachment:
    """A file attached to a PDF document.

    Instances are built like :class:`HTML` ones, but the HTML specific
    arguments (``encoding`` and ``media_type``) are not supported. The
    ``description`` argument can be used to give an optional description.

    :param description: A description of the attachment to be included in the
        PDF document. May be :obj:`None`.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, base_url=None, url_fetcher=default_url_fetcher,
                 description=None):
        self.description = description
        # The source is stored as a context manager that has not been
        # entered yet.
        self.source = _select_source(
            guess=guess, filename=filename, url=url, file_obj=file_obj,
            string=string, base_url=base_url, url_fetcher=url_fetcher)
|
|
|
|
|
|
@contextlib.contextmanager
def _select_source(guess=None, filename=None, url=None, file_obj=None,
                   string=None, base_url=None, url_fetcher=default_url_fetcher,
                   check_css_mime_type=False):
    """If only one input is given, return it with normalized ``base_url``.

    This is a context manager yielding a
    ``(source_type, source, base_url, protocol_encoding)`` tuple, where
    ``source_type`` is either ``'file_obj'`` or ``'string'``.

    :param guess: An input whose kind (file object, filename or absolute
        URL) has to be guessed.
    :param filename: A filename, opened in binary mode while the context
        is active.
    :param url: A URL, fetched with ``url_fetcher``.
    :param file_obj: An object with a ``read`` method, yielded as is.
    :param string: A string, yielded as is.
    :param base_url: The base URL to yield. When :obj:`None`, it is deduced
        from the filename, the (redirected) URL or the ``name`` attribute
        of the file object when possible.
    :param url_fetcher: The callable used to fetch ``url``.
    :param bool check_css_mime_type: When true, a fetched resource whose
        MIME type is not ``text/css`` is reported as an error and replaced
        by an empty string.
    :raises TypeError: If zero or more than one input is given.

    """
    if base_url is not None:
        base_url = ensure_url(base_url)

    candidates = {
        'guess': guess, 'filename': filename, 'url': url,
        'file_obj': file_obj, 'string': string}
    selected_names = [
        param_name for param_name, value in candidates.items()
        if value is not None]
    if len(selected_names) != 1:
        # Report parameter names, not values: values may be paths or file
        # objects (which ``str.join`` rejects) or whole documents.
        source = ', '.join(selected_names) or 'nothing'
        raise TypeError(f'Expected exactly one source, got {source}')
    elif guess is not None:
        if hasattr(guess, 'read'):
            type_ = 'file_obj'
        elif isinstance(guess, Path):
            type_ = 'filename'
        elif url_is_absolute(guess):
            type_ = 'url'
        else:
            type_ = 'filename'
        # Delegate to the branch handling the guessed kind of input.
        result = _select_source(
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=check_css_mime_type,
            **{type_: guess})
        with result as result:
            yield result
    elif filename is not None:
        if base_url is None:
            base_url = path2url(filename)
        with open(filename, 'rb') as file_obj:
            yield 'file_obj', file_obj, base_url, None
    elif url is not None:
        with fetch(url_fetcher, url) as result:
            if check_css_mime_type and result['mime_type'] != 'text/css':
                LOGGER.error(
                    'Unsupported stylesheet type %s for %s',
                    result['mime_type'], result['redirected_url'])
                # Fall back to an empty stylesheet.
                yield 'string', '', base_url, None
            else:
                proto_encoding = result.get('encoding')
                if base_url is None:
                    base_url = result.get('redirected_url', url)
                if 'string' in result:
                    yield 'string', result['string'], base_url, proto_encoding
                else:
                    yield (
                        'file_obj', result['file_obj'], base_url,
                        proto_encoding)
    elif file_obj is not None:
        if base_url is None:
            # filesystem file-like objects have a 'name' attribute.
            name = getattr(file_obj, 'name', None)
            # Some streams have a .name like '<stdin>', not a filename, and
            # some have a non-string name (e.g. an integer file descriptor).
            if name and isinstance(name, str) and not name.startswith('<'):
                base_url = ensure_url(name)
        yield 'file_obj', file_obj, base_url, None
    else:
        assert string is not None
        yield 'string', string, base_url, None
|
|
|
|
# Work around circular imports.
|
|
from .css import preprocess_stylesheet # noqa isort:skip
|
|
from .html import ( # noqa isort:skip
|
|
HTML5_UA_COUNTER_STYLE, HTML5_UA_STYLESHEET, HTML5_UA_FORM_STYLESHEET,
|
|
HTML5_PH_STYLESHEET)
|
|
from .document import Document, Page # noqa isort:skip
|