"""
|
|
|
|
|
weasyprint.utils
|
|
|
|
|
----------------
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
Various utility functions and classes.
|
2011-08-16 17:11:35 +04:00
|
|
|
|
|
2019-03-04 13:04:06 +03:00
|
|
|
|
:copyright: Copyright 2011-2019 Simon Sapin and contributors, see AUTHORS.
|
2012-03-22 02:19:27 +04:00
|
|
|
|
:license: BSD, see LICENSE for details.
|
2011-08-19 18:53:05 +04:00
|
|
|
|
|
|
|
|
|
"""
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2018-01-14 03:48:17 +03:00
|
|
|
|
import codecs
|
2017-03-25 02:33:36 +03:00
|
|
|
|
import contextlib
|
|
|
|
|
import gzip
|
2014-04-22 02:52:58 +04:00
|
|
|
|
import io
|
2017-03-25 02:33:36 +03:00
|
|
|
|
import os.path
|
2012-05-23 16:43:02 +04:00
|
|
|
|
import re
|
2012-07-27 20:55:19 +04:00
|
|
|
|
import sys
|
2014-04-27 14:00:02 +04:00
|
|
|
|
import traceback
|
2017-03-25 02:33:36 +03:00
|
|
|
|
import zlib
|
2018-01-14 03:48:17 +03:00
|
|
|
|
from base64 import decodebytes
|
|
|
|
|
from gzip import GzipFile
|
2018-01-14 04:09:25 +03:00
|
|
|
|
from urllib.parse import quote, unquote, urljoin, urlsplit
|
|
|
|
|
from urllib.request import Request, pathname2url, urlopen
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-05-15 15:40:36 +04:00
|
|
|
|
from . import VERSION_STRING
|
2017-03-25 02:33:36 +03:00
|
|
|
|
from .logger import LOGGER
|
2012-02-17 21:49:58 +04:00
|
|
|
|
|
2012-07-27 19:19:17 +04:00
|
|
|
|
# See http://stackoverflow.com/a/11687993/1162888
|
2012-06-02 10:04:24 +04:00
|
|
|
|
# Both are needed in Python 3 as the re module does not like to mix
|
2013-04-12 11:32:25 +04:00
|
|
|
|
# http://tools.ietf.org/html/rfc3986#section-3.1
|
|
|
|
|
UNICODE_SCHEME_RE = re.compile('^([a-zA-Z][a-zA-Z0-9.+-]+):')
|
|
|
|
|
BYTES_SCHEME_RE = re.compile(b'^([a-zA-Z][a-zA-Z0-9.+-]+):')
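
# Illustrative examples (comments only, values are hypothetical): a scheme
# must be at least two characters long, so a Windows drive letter such as
# 'C:\foo' does not match.
#
#     UNICODE_SCHEME_RE.match('https://example.org/')   # match, group 'https'
#     UNICODE_SCHEME_RE.match('images/logo.png')        # None (relative path)
#     BYTES_SCHEME_RE.match(b'data:image/png;base64,')  # match, group 'data'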

# getfilesystemencoding() on Linux sometimes reports ASCII or an unknown
# codec; fall back to UTF-8 in that case.
FILESYSTEM_ENCODING = sys.getfilesystemencoding()
try:
    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
        FILESYSTEM_ENCODING = 'utf-8'
except LookupError:
    FILESYSTEM_ENCODING = 'utf-8'


class StreamingGzipFile(GzipFile):
    def __init__(self, fileobj):
        GzipFile.__init__(self, fileobj=fileobj)
        self.fileobj_to_close = fileobj

    def close(self):
        GzipFile.close(self)
        self.fileobj_to_close.close()

    # Inform html5lib to not rely on these:
    seek = tell = None


def iri_to_uri(url):
    """Turn an IRI that can contain any Unicode character into an ASCII-only
    URI that conforms to RFC 3986.
    """
    if url.startswith('data:'):
        # Data URIs can be huge, but don’t need this anyway.
        return url
    # Use UTF-8 as per RFC 3987 (IRI), except for file://
    url = url.encode(FILESYSTEM_ENCODING
                     if url.startswith('file:') else 'utf-8')
    # This is a full URI, not just a component. Only %-encode characters
    # that are not allowed at all in URIs. Everything else is "safe":
    # * Reserved characters: /:?#[]@!$&'()*+,;=
    # * Unreserved characters: ASCII letters, digits and -._~
    #   Of these, only '~' is not in urllib’s "always safe" list.
    # * '%' to avoid double-encoding
    return quote(url, safe=b"/:?#[]@!$&'()*+,;=~%")


def path2url(path):
    """Return the file URL of `path`.

    Accepts 'str' or 'bytes', returns 'str'.

    """
    # Ensure 'str'
    if isinstance(path, bytes):
        path = path.decode(sys.getfilesystemencoding())
    # If a trailing path separator is given, keep it.
    wants_trailing_slash = path.endswith(os.path.sep) or path.endswith('/')
    path = os.path.abspath(path)
    if wants_trailing_slash or os.path.isdir(path):
        # Make sure directory names have a trailing slash.
        # Otherwise relative URIs are resolved from the parent directory.
        path += os.path.sep
        wants_trailing_slash = True
    path = pathname2url(path)
    # On Windows, pathname2url cuts off the trailing slash.
    if wants_trailing_slash and not path.endswith('/'):
        path += '/'
    if path.startswith('///'):
        # On Windows pathname2url(r'C:\foo') is apparently '///C:/foo'.
        # That’s enough slashes already.
        return 'file:' + path
    else:
        return 'file://' + path


def url_is_absolute(url):
    """Return whether `url` starts with a URL scheme (str or bytes)."""
    return bool(
        (UNICODE_SCHEME_RE if isinstance(url, str) else BYTES_SCHEME_RE)
        .match(url))


def get_url_attribute(element, attr_name, base_url, allow_relative=False):
    """Get the URI corresponding to the ``attr_name`` attribute.

    Return ``None`` if:

    * the attribute is empty or missing, or
    * the value is a relative URI but the document has no base URI and
      ``allow_relative`` is ``False``.

    Otherwise return a URI, absolute if possible.

    """
    value = element.get(attr_name, '').strip()
    if value:
        return url_join(
            base_url or '', value, allow_relative, '<%s %s="%s">',
            (element.tag, attr_name, value))


def url_join(base_url, url, allow_relative, context, context_args):
    """Like urllib.urljoin, but log an error if base_url is required but
    missing.
    """
    if url_is_absolute(url):
        return iri_to_uri(url)
    elif base_url:
        return iri_to_uri(urljoin(base_url, url))
    elif allow_relative:
        return iri_to_uri(url)
    else:
        LOGGER.error('Relative URI reference without a base URI: ' + context,
                     *context_args)
        return None


def get_link_attribute(element, attr_name, base_url):
    """Return ('url', ('external', absolute_uri)),
    ('url', ('internal', unquoted_fragment_id)) or None.

    """
    attr_value = element.get(attr_name, '').strip()
    if attr_value.startswith('#') and len(attr_value) > 1:
        # Do not require a base_url when the value is just a fragment.
        return ('url', ('internal', unquote(attr_value[1:])))
    uri = get_url_attribute(element, attr_name, base_url, allow_relative=True)
    if uri:
        if base_url:
            parsed = urlsplit(uri)
            # Compare with fragments removed
            if parsed[:-1] == urlsplit(base_url)[:-1]:
                return ('url', ('internal', unquote(parsed.fragment)))
        return ('url', ('external', uri))


def ensure_url(string):
    """Get a ``scheme://path`` URL from ``string``.

    If ``string`` looks like a URL, return it unchanged. Otherwise assume a
    filename and convert it to a ``file://`` URL.

    """
    return string if url_is_absolute(string) else path2url(string)


def safe_decodebytes(data):
    """Decode base64, padding being optional.

    "From a theoretical point of view, the padding character is not needed,
     since the number of missing bytes can be calculated from the number
     of Base64 digits."

    https://en.wikipedia.org/wiki/Base64#Padding

    :param data: Base64 data as an ASCII byte string
    :returns: The decoded byte string.

    """
    missing_padding = 4 - len(data) % 4
    if missing_padding:
        data += b'=' * missing_padding
    return decodebytes(data)


HTTP_HEADERS = {
    'User-Agent': VERSION_STRING,
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
}


def default_url_fetcher(url, timeout=10, ssl_context=None):
    """Fetch an external resource such as an image or stylesheet.

    Another callable with the same signature can be given as the
    :obj:`url_fetcher` argument to :class:`HTML` or :class:`CSS`.
    (See :ref:`url-fetchers`.)

    :type url: str
    :param url: The URL of the resource to fetch.
    :type timeout: int
    :param timeout: The number of seconds before HTTP requests are dropped.
    :type ssl_context: ssl.SSLContext
    :param ssl_context: An SSL context used for HTTP requests.
    :raises: An exception indicating failure, e.g. :obj:`ValueError` on a
        syntactically invalid URL.
    :returns: A :obj:`dict` with the following keys:

        * One of ``string`` (a :obj:`bytestring <bytes>`) or ``file_obj``
          (a :term:`file object`).
        * Optionally: ``mime_type``, a MIME type extracted e.g. from a
          *Content-Type* header. If not provided, the type is guessed from
          the file extension in the URL.
        * Optionally: ``encoding``, a character encoding extracted e.g. from
          a *charset* parameter in a *Content-Type* header.
        * Optionally: ``redirected_url``, the actual URL of the resource
          if there were e.g. HTTP redirects.
        * Optionally: ``filename``, the filename of the resource. Usually
          derived from the *filename* parameter in a *Content-Disposition*
          header.

        If a ``file_obj`` key is given, it is the caller’s responsibility
        to call ``file_obj.close()``. The default function used internally to
        fetch data in WeasyPrint tries to close the file object after
        retrieving it; but if this URL fetcher is used elsewhere, the file
        object has to be closed manually.

    """
    if UNICODE_SCHEME_RE.match(url):
        # See https://bugs.python.org/issue34702
        if url.startswith('file://'):
            url = url.split('?')[0]

        url = iri_to_uri(url)
        response = urlopen(Request(url, headers=HTTP_HEADERS),
                           timeout=timeout, context=ssl_context)
        response_info = response.info()
        result = dict(redirected_url=response.geturl(),
                      mime_type=response_info.get_content_type(),
                      encoding=response_info.get_param('charset'),
                      filename=response_info.get_filename())
        content_encoding = response_info.get('Content-Encoding')
        if content_encoding == 'gzip':
            if StreamingGzipFile is None:
                result['string'] = gzip.GzipFile(
                    fileobj=io.BytesIO(response.read())).read()
                response.close()
            else:
                result['file_obj'] = StreamingGzipFile(fileobj=response)
        elif content_encoding == 'deflate':
            data = response.read()
            try:
                result['string'] = zlib.decompress(data)
            except zlib.error:
                # Try without zlib header or checksum
                result['string'] = zlib.decompress(data, -15)
        else:
            result['file_obj'] = response
        return result
    else:
        raise ValueError('Not an absolute URI: %r' % url)
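
# For reference, a custom fetcher with the same contract might look like the
# sketch below (the name generate_graph and the 'graph:' scheme are
# hypothetical, for illustration only):
#
#     def my_fetcher(url):
#         if url.startswith('graph:'):
#             return dict(string=generate_graph(url), mime_type='image/png')
#         return default_url_fetcher(url)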


class URLFetchingError(IOError):
    """Some error happened when fetching a URL."""


@contextlib.contextmanager
def fetch(url_fetcher, url):
    """Call a url_fetcher, fill in optional data, and clean up."""
    try:
        result = url_fetcher(url)
    except Exception as exc:
        raise URLFetchingError('%s: %s' % (type(exc).__name__, str(exc)))
    result.setdefault('redirected_url', url)
    result.setdefault('mime_type', None)
    if 'file_obj' in result:
        try:
            yield result
        finally:
            try:
                result['file_obj'].close()
            except Exception:
                # May already be closed or something.
                # This is just cleanup anyway: log but make it non-fatal.
                LOGGER.warning('Error when closing stream for %s:\n%s',
                               url, traceback.format_exc())
    else:
        yield result