2012-03-22 02:19:27 +04:00
|
|
|
"""
|
|
|
|
WeasyPrint
|
|
|
|
==========
|
2011-04-28 21:15:30 +04:00
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
WeasyPrint converts web documents to PDF.
|
2011-08-09 14:45:51 +04:00
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
The public API is what is accessible from this "root" package
|
|
|
|
without importing sub-modules.
|
2011-08-19 13:22:31 +04:00
|
|
|
|
|
|
|
"""
|
|
|
|
|
2019-03-01 13:38:03 +03:00
|
|
|
import contextlib
|
2018-08-23 19:20:36 +03:00
|
|
|
import os
|
2018-01-28 02:02:13 +03:00
|
|
|
import sys
|
2018-08-21 17:17:00 +03:00
|
|
|
from pathlib import Path
|
2018-01-28 02:02:13 +03:00
|
|
|
|
2017-06-30 18:54:02 +03:00
|
|
|
import cssselect2
|
2019-03-01 13:38:03 +03:00
|
|
|
import html5lib
|
2017-03-26 12:42:50 +03:00
|
|
|
import tinycss2
|
2016-01-15 14:47:03 +03:00
|
|
|
|
2020-01-02 02:43:06 +03:00
|
|
|
# Fail early, with an explicit message, when imported from Python 2.
if sys.version_info[0] < 3:  # pragma: no cover
    raise RuntimeError(
        'WeasyPrint does not support Python 2.x anymore. '
        'Please use Python 3 or install an older version of WeasyPrint.')

# ROOT is the directory holding the package data (e.g. the VERSION file).
if not hasattr(sys, 'frozen'):
    # Regular installation: data files live next to this module.
    ROOT = Path(os.path.dirname(__file__))
elif hasattr(sys, '_MEIPASS'):  # pragma: no cover
    # Frozen with PyInstaller
    # See https://github.com/Kozea/WeasyPrint/pull/540
    ROOT = Path(sys._MEIPASS) / 'weasyprint'
else:  # pragma: no cover
    # Frozen with something else (py2exe, etc.)
    # See https://github.com/Kozea/WeasyPrint/pull/269
    ROOT = Path(os.path.dirname(sys.executable))

# The version number is stored in a plain text file shipped with the package.
__version__ = ROOT.joinpath('VERSION').read_text().strip()
VERSION = __version__

# Used for 'User-Agent' in HTTP and 'Creator' in PDF
VERSION_STRING = 'WeasyPrint %s (http://weasyprint.org/)' % (VERSION,)

__all__ = [
    'HTML', 'CSS', 'Attachment', 'Document', 'Page',
    'default_url_fetcher', 'VERSION']
|
2012-05-15 15:40:36 +04:00
|
|
|
|
2012-07-13 14:24:55 +04:00
|
|
|
|
2016-01-15 14:47:03 +03:00
|
|
|
# Import after setting the version, as the version is used in other modules
|
2019-03-01 13:38:03 +03:00
|
|
|
from .urls import ( # noqa isort:skip
|
|
|
|
fetch, default_url_fetcher, path2url, ensure_url, url_is_absolute)
|
|
|
|
from .logger import LOGGER, PROGRESS_LOGGER # noqa isort:skip
|
2016-01-15 14:47:03 +03:00
|
|
|
# Some imports are at the end of the file (after the CSS class)
|
2013-02-25 18:21:25 +04:00
|
|
|
# to work around circular imports.
|
2012-07-13 14:24:55 +04:00
|
|
|
|
2012-02-15 21:49:37 +04:00
|
|
|
|
2020-01-02 14:06:58 +03:00
|
|
|
class HTML:
    """An HTML document, parsed by html5lib.

    The simplest way to create an instance is to give a single positional
    argument:
    ``doc = HTML(something)``
    The class then tries to guess whether the input is a filename, an
    absolute URL, or a :term:`file object`.

    To avoid any guessing, give **one** of these named arguments instead:

    :type filename: str or pathlib.Path
    :param filename: A filename, either absolute or relative to the current
        directory.
    :type url: str
    :param url: An absolute, fully qualified URL.
    :type file_obj: :term:`file object`
    :param file_obj: Any object with a ``read`` method.
    :type string: str
    :param string: A string of HTML source.

    Giving more than one input is an error:
    ``HTML(filename="foo.html", url="localhost://bar.html")``
    raises a :obj:`TypeError`.

    These optional named arguments are also accepted:

    :type encoding: str
    :param encoding: Force the source character encoding.
    :type base_url: str
    :param base_url: The base used to resolve relative URLs
        (e.g. in ``<img src="../foo.png">``). When missing, the input
        filename, the URL, or the ``name`` attribute of :term:`file objects
        <file object>` is tried instead.
    :type url_fetcher: function
    :param url_fetcher: A function or other callable with the same signature
        as :func:`default_url_fetcher`, called to fetch external resources
        such as stylesheets and images. (See :ref:`url-fetchers`.)
    :type media_type: str
    :param media_type: The media type to use for ``@media``.
        Defaults to ``'print'``. **Note:** In some cases like
        ``HTML(string=foo)`` relative URLs will be invalid if ``base_url``
        is not provided.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, media_type='print'):
        # Human-readable name of the input, only used for the progress log.
        source_label = (
            guess or filename or url or
            getattr(file_obj, 'name', 'HTML string'))
        PROGRESS_LOGGER.info(
            'Step 1 - Fetching and parsing HTML - %s', source_label)

        selected_source = _select_source(
            guess, filename, url, file_obj, string, base_url, url_fetcher)
        with selected_source as (
                _source_type, source, base_url, protocol_encoding):
            parse_options = {'namespaceHTMLElements': False}
            if not isinstance(source, str):
                # Encoding hints are only given for non-str sources.
                parse_options['override_encoding'] = encoding
                parse_options['transport_encoding'] = protocol_encoding
            tree = html5lib.parse(source, **parse_options)

        self.base_url = find_base_url(tree, base_url)
        self.url_fetcher = url_fetcher
        self.media_type = media_type
        wrapper = cssselect2.ElementWrapper.from_html_root(
            tree, content_language=None)
        self.wrapper_element = wrapper
        self.etree_element = wrapper.etree_element

    def _ua_stylesheets(self):
        """Return the list of user agent stylesheets."""
        return [HTML5_UA_STYLESHEET]

    def _ua_counter_style(self):
        """Return a list holding a copy of the user agent counter styles."""
        return [HTML5_UA_COUNTER_STYLE.copy()]

    def _ph_stylesheets(self):
        """Return the list of presentational hints stylesheets."""
        return [HTML5_PH_STYLESHEET]

    def _get_metadata(self):
        """Return the metadata of the document, as given by
        ``get_html_metadata``."""
        return get_html_metadata(self.wrapper_element, self.base_url)

    def render(self, stylesheets=None, presentational_hints=False,
               font_config=None, counter_style=None):
        """Lay out and paginate the document, without exporting it (yet)
        to PDF or PNG.

        The returned :class:`~document.Document` object gives access to
        individual pages and various meta-data.
        See :meth:`write_pdf` to get a PDF directly.

        .. versionadded:: 0.15

        :type stylesheets: list
        :param stylesheets:
            An optional list of user stylesheets. List elements are
            :class:`CSS` objects, filenames, URLs, or file
            objects. (See :ref:`stylesheet-origins`.)
        :type presentational_hints: bool
        :param presentational_hints: Whether HTML presentational hints are
            followed.
        :type font_config: :class:`~fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`~css.counters.CounterStyle`
        :param counter_style: A dictionary storing ``@counter-style`` rules.
        :returns: A :class:`~document.Document` object.

        """
        document = Document._render(
            self, stylesheets, presentational_hints, font_config,
            counter_style)
        return document

    def write_pdf(self, target=None, stylesheets=None, zoom=1,
                  attachments=None, presentational_hints=False,
                  font_config=None, counter_style=None):
        """Render the document to a PDF file.

        Shortcut for :meth:`render` followed by
        :meth:`Document.write_pdf() <document.Document.write_pdf>`.

        :type target: str, pathlib.Path or file object
        :param target:
            A filename where the PDF file is generated, a file object, or
            :obj:`None`.
        :type stylesheets: list
        :param stylesheets:
            An optional list of user stylesheets. The list's elements
            are :class:`CSS` objects, filenames, URLs, or file-like
            objects. (See :ref:`stylesheet-origins`.)
        :type zoom: float
        :param zoom:
            The zoom factor in PDF units per CSS units.  **Warning**:
            All CSS units are affected, including physical units like
            ``cm`` and named sizes like ``A4``.  For values other than
            1, the physical CSS units will thus be "wrong".
        :type attachments: list
        :param attachments: A list of additional file attachments for the
            generated PDF document or :obj:`None`. The list's elements are
            :class:`Attachment` objects, filenames, URLs or file-like objects.
        :type presentational_hints: bool
        :param presentational_hints: Whether HTML presentational hints are
            followed.
        :type font_config: :class:`~fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`~css.counters.CounterStyle`
        :param counter_style: A dictionary storing ``@counter-style`` rules.
        :returns:
            The PDF as :obj:`bytes` if ``target`` is not provided or
            :obj:`None`, otherwise :obj:`None` (the PDF is written to
            ``target``).

        """
        document = self.render(
            stylesheets, presentational_hints=presentational_hints,
            font_config=font_config, counter_style=counter_style)
        return document.write_pdf(target, zoom, attachments)

    def write_png(self, target=None, stylesheets=None, resolution=96,
                  antialiasing=1, presentational_hints=False, font_config=None,
                  counter_style=None):
        """Paint the pages vertically to a single PNG image.

        Pages get no decoration other than what CSS ``@page`` rules specify.
        The final image is as wide as the widest page. Each page is below
        the previous one, centered horizontally.

        Shortcut for :meth:`render` followed by
        :meth:`Document.write_png() <document.Document.write_png>`.

        :type target: str, pathlib.Path or file object
        :param target:
            A filename where the PNG file is generated, a file object, or
            :obj:`None`.
        :type stylesheets: list
        :param stylesheets:
            An optional list of user stylesheets. The list's elements
            are :class:`CSS` objects, filenames, URLs, or file-like
            objects. (See :ref:`stylesheet-origins`.)
        :type resolution: float
        :param resolution:
            The output resolution in PNG pixels per CSS inch. At 96 dpi
            (the default), PNG pixels match the CSS ``px`` unit.
        :type antialiasing: int
        :param antialiasing:
            The antialiasing subsampling box size. Default is 1 (disabled), can
            be set to 4 for optimal (but slow) antialiasing.
        :type presentational_hints: bool
        :param presentational_hints: Whether HTML presentational hints are
            followed.
        :type font_config: :class:`~fonts.FontConfiguration`
        :param font_config: A font configuration handling ``@font-face`` rules.
        :type counter_style: :class:`~css.counters.CounterStyle`
        :param counter_style: A dictionary storing ``@counter-style`` rules.
        :returns:
            The image as :obj:`bytes` if ``target`` is not provided or
            :obj:`None`, otherwise :obj:`None` (the image is written to
            ``target``.)

        """
        document = self.render(
            stylesheets, presentational_hints=presentational_hints,
            font_config=font_config, counter_style=counter_style)
        return document.write_png(target, resolution, antialiasing)
|
2012-02-15 21:49:37 +04:00
|
|
|
|
|
|
|
|
2020-01-02 14:06:58 +03:00
|
|
|
class CSS:
    """Represents a CSS stylesheet parsed by tinycss2.

    An instance is created in the same way as :class:`HTML`, with the same
    arguments.

    An additional argument called ``font_config`` must be provided to handle
    ``@font-face`` rules. The same ``fonts.FontConfiguration`` object must be
    used for different ``CSS`` objects applied to the same document.

    ``CSS`` objects have no public attributes or methods. They are only meant
    to be used in the :meth:`~HTML.write_pdf`, :meth:`~HTML.write_png` and
    :meth:`~HTML.render` methods of :class:`HTML` objects.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, _check_mime_type=False,
                 media_type='print', font_config=None, counter_style=None,
                 matcher=None, page_rules=None):
        # ``guess`` is part of the label, as in ``HTML.__init__``, so that
        # ``CSS('style.css')`` is not reported as a "CSS string".
        PROGRESS_LOGGER.info(
            'Step 2 - Fetching and parsing CSS - %s',
            guess or filename or url or
            getattr(file_obj, 'name', 'CSS string'))
        result = _select_source(
            guess, filename, url, file_obj, string,
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=_check_mime_type)
        with result as (source_type, source, base_url, protocol_encoding):
            if source_type == 'string' and not isinstance(source, bytes):
                # unicode, no encoding
                stylesheet = tinycss2.parse_stylesheet(source)
            else:
                if source_type == 'file_obj':
                    source = source.read()
                # The second returned item is not needed here.
                stylesheet, _ = tinycss2.parse_stylesheet_bytes(
                    source, environment_encoding=encoding,
                    protocol_encoding=protocol_encoding)
        self.base_url = base_url
        # Test against None, as done for ``page_rules``, so that a
        # caller-provided object is always the one that gets filled in,
        # whatever its truth value.
        self.matcher = cssselect2.Matcher() if matcher is None else matcher
        self.page_rules = [] if page_rules is None else page_rules
        self.fonts = []
        preprocess_stylesheet(
            media_type, base_url, stylesheet, url_fetcher, self.matcher,
            self.page_rules, self.fonts, font_config, counter_style)
|
2012-03-24 16:39:31 +04:00
|
|
|
|
2014-04-27 21:16:14 +04:00
|
|
|
|
2020-01-02 14:06:58 +03:00
|
|
|
class Attachment:
    """A file attached to a PDF document.

    .. versionadded:: 0.22

    Instances are built like :class:`HTML` ones, but the HTML specific
    arguments (``encoding`` and ``media_type``) are not supported. The
    ``description`` argument can be used to give an optional description.

    :param description: A description of the attachment to be included in the
        PDF document. May be :obj:`None`.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, base_url=None, url_fetcher=default_url_fetcher,
                 description=None):
        self.description = description
        # ``_select_source`` is a context manager: nothing is opened or
        # fetched until ``self.source`` is entered.
        self.source = _select_source(
            guess=guess, filename=filename, url=url, file_obj=file_obj,
            string=string, base_url=base_url, url_fetcher=url_fetcher)
|
|
|
|
|
2012-02-15 21:49:37 +04:00
|
|
|
|
2013-06-20 15:58:24 +04:00
|
|
|
@contextlib.contextmanager
def _select_source(guess=None, filename=None, url=None, file_obj=None,
                   string=None, base_url=None, url_fetcher=default_url_fetcher,
                   check_css_mime_type=False):
    """
    Check that only one input is not None, and return it with the
    normalized ``base_url``.

    This is a context manager yielding a
    ``(source_type, source, base_url, protocol_encoding)`` tuple, where
    ``source_type`` is either ``'string'`` or ``'file_obj'``. Files opened
    for ``filename`` and resources fetched for ``url`` are only valid inside
    the ``with`` block.

    :param guess: An input whose kind is guessed: objects with a ``read``
        attribute are file objects, :class:`pathlib.Path` objects are
        filenames, absolute URLs are URLs, anything else is a filename.
    :param filename: A filename, as a string or a :class:`pathlib.Path`.
    :param url: A URL, fetched with ``url_fetcher``.
    :param file_obj: A file object, yielded as is.
    :param string: A string, yielded as is.
    :param base_url: The base URL, normalized with ``ensure_url``. When
        :obj:`None`, it is derived from the selected input when possible.
    :param url_fetcher: The callable used to fetch ``url``.
    :param check_css_mime_type: If true, a fetched resource whose MIME type
        is not ``text/css`` is logged as an error and replaced by an empty
        string.
    :raises TypeError: If zero or more than one input is given.

    """
    if base_url is not None:
        base_url = ensure_url(base_url)

    selected_params = [
        param for param in (guess, filename, url, file_obj, string) if
        param is not None]
    if len(selected_params) != 1:
        # Inputs may be paths, file objects or bytes: convert them to str
        # so that building the message cannot fail by itself.
        raise TypeError('Expected exactly one source, got ' + (
            ', '.join(str(param) for param in selected_params) or 'nothing'
        ))
    elif guess is not None:
        if hasattr(guess, 'read'):
            type_ = 'file_obj'
        elif isinstance(guess, Path):
            type_ = 'filename'
        elif url_is_absolute(guess):
            type_ = 'url'
        else:
            type_ = 'filename'
        # Delegate to the branch handling the guessed kind of input.
        guessed_source = _select_source(
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=check_css_mime_type,
            **{type_: guess})
        with guessed_source as result:
            yield result
    elif filename is not None:
        if isinstance(filename, Path):
            filename = str(filename)
        if base_url is None:
            base_url = path2url(filename)
        with open(filename, 'rb') as file_obj:
            yield 'file_obj', file_obj, base_url, None
    elif url is not None:
        with fetch(url_fetcher, url) as result:
            if check_css_mime_type and result['mime_type'] != 'text/css':
                LOGGER.error(
                    'Unsupported stylesheet type %s for %s',
                    result['mime_type'], result['redirected_url'])
                yield 'string', '', base_url, None
            else:
                proto_encoding = result.get('encoding')
                if base_url is None:
                    base_url = result.get('redirected_url', url)
                if 'string' in result:
                    yield 'string', result['string'], base_url, proto_encoding
                else:
                    yield (
                        'file_obj', result['file_obj'], base_url,
                        proto_encoding)
    elif file_obj is not None:
        if base_url is None:
            # filesystem file-like objects have a 'name' attribute.
            name = getattr(file_obj, 'name', None)
            # The name is not always a string: files opened from a file
            # descriptor have an integer name. Such names are ignored.
            # Some streams have a .name like '<stdin>', not a filename.
            if (name and isinstance(name, str) and
                    not name.startswith('<')):
                base_url = ensure_url(name)
        yield 'file_obj', file_obj, base_url, None
    else:
        assert string is not None
        yield 'string', string, base_url, None
|
2013-02-25 18:21:25 +04:00
|
|
|
|
|
|
|
# Work around circular imports.
|
2019-03-01 13:38:03 +03:00
|
|
|
from .css import preprocess_stylesheet # noqa isort:skip
|
|
|
|
from .html import ( # noqa isort:skip
|
2019-12-24 16:39:40 +03:00
|
|
|
HTML5_UA_COUNTER_STYLE, HTML5_UA_STYLESHEET, HTML5_PH_STYLESHEET,
|
|
|
|
find_base_url, get_html_metadata)
|
2019-03-01 13:38:03 +03:00
|
|
|
from .document import Document, Page # noqa isort:skip
|