2011-06-30 00:34:01 +04:00
|
|
|
|
# coding: utf8
|
2012-03-22 02:19:27 +04:00
|
|
|
|
"""
|
|
|
|
|
weasyprint.utils
|
|
|
|
|
----------------
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
Various utility functions and classes.
|
2011-08-16 17:11:35 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
:copyright: Copyright 2011-2012 Simon Sapin and contributors, see AUTHORS.
|
|
|
|
|
:license: BSD, see LICENSE for details.
|
2011-08-19 18:53:05 +04:00
|
|
|
|
|
|
|
|
|
"""
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-02-17 21:49:58 +04:00
|
|
|
|
from __future__ import division, unicode_literals
|
|
|
|
|
|
2012-05-23 16:43:02 +04:00
|
|
|
|
import re
|
2012-07-27 20:55:19 +04:00
|
|
|
|
import sys
|
2012-07-30 14:01:20 +04:00
|
|
|
|
import codecs
|
2012-05-23 16:43:02 +04:00
|
|
|
|
import os.path
|
2012-07-18 16:31:55 +04:00
|
|
|
|
import mimetypes
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-05-15 15:40:36 +04:00
|
|
|
|
from . import VERSION_STRING
|
2012-05-24 18:06:58 +04:00
|
|
|
|
from .logger import LOGGER
|
2012-02-17 21:49:58 +04:00
|
|
|
|
from .compat import (
|
2012-05-30 22:06:44 +04:00
|
|
|
|
urljoin, urlsplit, quote, unquote, unquote_to_bytes, urlopen_contenttype,
|
2012-07-29 20:38:59 +04:00
|
|
|
|
Request, parse_email, pathname2url, unicode, base64_decode)
|
2012-02-17 21:49:58 +04:00
|
|
|
|
|
|
|
|
|
|
2012-07-29 17:13:45 +04:00
|
|
|
|
# Unlike HTML, CSS and PNG, the SVG MIME type is not guaranteed to be
# a builtin of the mimetypes registry in every Python version, so
# register it explicitly.
if sys.version_info[0] < 3:
    # Python 2's mimetypes module wants native (byte) strings.
    mimetypes.add_type(b'image/svg+xml', b'.svg')
else:
    mimetypes.add_type('image/svg+xml', '.svg')
|
2012-07-29 00:38:44 +04:00
|
|
|
|
|
|
|
|
|
|
2012-07-30 14:01:20 +04:00
|
|
|
|
# getfilesystemencoding() may return None or, on some Linux setups, an
# encoding that codecs cannot actually look up; it may also be plain
# ASCII, which is too restrictive for real-world file names.  Fall back
# to UTF-8 in every one of those cases.
FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8'
try:
    _codec_name = codecs.lookup(FILESYSTEM_ENCODING).name
except LookupError:
    _codec_name = 'ascii'  # unknown codec: handled like the ASCII case below
if _codec_name == 'ascii':
    FILESYSTEM_ENCODING = 'utf-8'
del _codec_name
|
|
|
|
|
|
|
|
|
|
|
2012-07-27 19:19:17 +04:00
|
|
|
|
# See http://stackoverflow.com/a/11687993/1162888
# Both are needed in Python 3 as the re module does not like to mix
# unicode patterns with byte strings (and vice versa).
# Per RFC 3986, scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." );
# the class must therefore be [a-z0-9.+-].  (The previous [a-z0-1.+-]
# only allowed the digits 0 and 1, rejecting valid schemes that
# contain 2-9.)
UNICODE_SCHEME_RE = re.compile('^([a-z][a-z0-9.+-]+):', re.I)
BYTES_SCHEME_RE = re.compile(b'^([a-z][a-z0-9.+-]+):', re.I)
|
2012-06-02 10:04:24 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
|
2012-05-18 11:12:50 +04:00
|
|
|
|
def iri_to_uri(url):
    """Turn an IRI that can contain any Unicode character into an ASCII-only
    URI that conforms to RFC 3986.
    """
    # RFC 3987 (IRI) mandates UTF-8; file:// URLs however must use the
    # filesystem encoding so that the resulting path exists on disk.
    encoding = FILESYSTEM_ENCODING if url.startswith('file:') else 'utf-8'
    # This is a full URI, not just a component, so only %-encode
    # characters that are not allowed in URIs at all.  Everything else
    # is "safe":
    # * Reserved characters: /:?#[]@!$&'()*+,;=
    # * Unreserved characters: ASCII letters, digits and -._~
    #   (of these, only '~' is missing from urllib's "always safe" list)
    # * '%' itself, to avoid double-encoding
    return quote(url.encode(encoding), safe=b"/:?#[]@!$&'()*+,;=~%")
|
2011-12-16 15:19:10 +04:00
|
|
|
|
|
|
|
|
|
|
2012-03-24 16:39:31 +04:00
|
|
|
|
def path2url(path):
    """Return a ``file://`` URL for the local file or directory `path`."""
    path = os.path.abspath(path)
    if os.path.isdir(path):
        # Ensure a trailing slash on directories so that relative URIs
        # resolve inside the directory rather than in its parent.
        path += os.path.sep
    if isinstance(path, unicode):
        path = path.encode(FILESYSTEM_ENCODING)
    path = pathname2url(path)
    # On Windows pathname2url(r'C:\foo') is apparently '///C:/foo',
    # which already carries all the slashes it needs.
    prefix = 'file:' if path.startswith('///') else 'file://'
    return prefix + path
|
2012-03-24 16:39:31 +04:00
|
|
|
|
|
|
|
|
|
|
2012-05-21 20:43:08 +04:00
|
|
|
|
def url_is_absolute(url):
    """Return whether `url` (unicode or byte string) starts with a URI
    scheme, i.e. is an absolute URL rather than a relative reference.
    """
    if isinstance(url, unicode):
        return UNICODE_SCHEME_RE.match(url) is not None
    return BYTES_SCHEME_RE.match(url) is not None
|
2012-05-21 20:43:08 +04:00
|
|
|
|
|
|
|
|
|
|
2012-05-24 18:06:58 +04:00
|
|
|
|
def get_url_attribute(element, attr_name):
    """Get the URI corresponding to the ``attr_name`` attribute.

    Return ``None`` if:

    * the attribute is empty or missing or,
    * the value is a relative URI but the document has no base URI.

    Otherwise, return an absolute URI.

    """
    attr_value = element.get(attr_name, '').strip()
    if not attr_value:
        return None
    if url_is_absolute(attr_value):
        return attr_value
    if element.base_url:
        return urljoin(element.base_url, attr_value)
    # Relative reference with nothing to resolve it against: warn and
    # return None.  (``warning`` instead of the deprecated ``warn`` alias.)
    LOGGER.warning(
        'Relative URI reference without a base URI: '
        '<%s %s="%s"> at line %d',
        element.tag, attr_name, attr_value, element.sourceline)
    return None
|
2011-08-09 14:45:51 +04:00
|
|
|
|
|
2011-08-05 13:16:44 +04:00
|
|
|
|
|
2012-05-30 22:06:44 +04:00
|
|
|
|
def get_link_attribute(element, attr_name):
    """Return ('external', absolute_uri) or
    ('internal', unquoted_fragment_id) or None.

    """
    attr_value = element.get(attr_name, '').strip()
    if attr_value.startswith('#'):
        # A bare fragment is meaningful even without a base_url.
        return 'internal', unquote(attr_value[1:])
    uri = get_url_attribute(element, attr_name)
    if uri and element.base_url:
        parsed = urlsplit(uri)
        # A link back to this document (comparing with fragments
        # stripped) is an internal one.
        if parsed[:-1] == urlsplit(element.base_url)[:-1]:
            return 'internal', unquote(parsed.fragment)
        return 'external', uri
    # NOTE(review): falls through to None both when there is no usable
    # URI and when the URI is absolute but the document has no base_url
    # to compare against — presumably intentional; confirm with callers.
|
|
|
|
|
|
|
|
|
|
|
2011-08-19 18:53:05 +04:00
|
|
|
|
def ensure_url(string):
    """Get a ``scheme://path`` URL from ``string``.

    If ``string`` looks like an URL, return it unchanged.  Otherwise
    assume a filename and convert it to a ``file://`` URL.

    """
    if url_is_absolute(string):
        return string
    return path2url(string)
|
2011-08-16 17:11:35 +04:00
|
|
|
|
|
|
|
|
|
|
2012-07-29 20:38:59 +04:00
|
|
|
|
def safe_base64_decode(data):
    """Decode base64, padding being optional.

    "From a theoretical point of view, the padding character is not needed,
    since the number of missing bytes can be calculated from the number
    of Base64 digits."

    https://en.wikipedia.org/wiki/Base64#Padding

    :param data: Base64 data as an ASCII byte string
    :returns: The decoded byte string.

    """
    # ``-len(data) % 4`` is 0 for already-aligned input.  The previous
    # ``4 - len(data) % 4`` wrongly yielded 4 in that case, appending a
    # full block of useless '=' padding.
    data += b'=' * (-len(data) % 4)
    return base64_decode(data)
|
2012-03-21 19:07:49 +04:00
|
|
|
|
|
|
|
|
|
|
2012-05-23 16:43:02 +04:00
|
|
|
|
def open_data_url(url):
    """Decode URLs with the 'data' scheme.  urllib can handle them
    in Python 2, but that is broken in Python 3.

    Inspired from Python 2.7.2's urllib.py.

    """
    # Syntax of data URLs:
    #   dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    #   mediatype := [ type "/" subtype ] *( ";" parameter )
    #   data      := *urlchar
    #   parameter := attribute "=" value
    header, comma, data = url.partition(',')
    if not comma:
        raise IOError('bad data URL')
    header = header[len('data:'):]
    if not header:
        # No mediatype at all: RFC 2397 defaults.
        mime_type = 'text/plain'
        charset = 'US-ASCII'
        encoding = ''
    else:
        semi = header.rfind(';')
        # A trailing ';base64'-style token has no '=', unlike parameters.
        if semi >= 0 and '=' not in header[semi:]:
            content_type, encoding = header[:semi], header[semi + 1:]
        else:
            content_type, encoding = header, ''
        message = parse_email('Content-type: ' + content_type)
        mime_type = message.get_content_type()
        charset = message.get_content_charset()

    data = unquote_to_bytes(data)
    if encoding == 'base64':
        data = safe_base64_decode(data)

    return dict(string=data, mime_type=mime_type, encoding=charset,
                redirected_url=url)
|
2011-08-19 18:53:05 +04:00
|
|
|
|
|
|
|
|
|
|
2012-07-13 14:24:55 +04:00
|
|
|
|
def default_url_fetcher(url):
    """Fetch an external resource such as an image or stylesheet.

    :param url: The URL of the resource to fetch
    :raises: any exception to indicate failure.  Failures are logged
        as warnings, with the string representation of the exception
        in the message.
    :returns: In case of success, a dict with the following keys:

        * One of ``string`` (a byte string) or ``file_obj``
          (a file-like object)
        * Optionally: ``mime_type``, a MIME type extracted eg. from a
          *Content-Type* header.  If not provided, the type is guessed
          from the file extension in the URL.
        * Optionally: ``encoding``, a character encoding extracted eg.
          from a *charset* parameter in a *Content-Type* header
        * Optionally: ``redirected_url``, the actual URL of the resource
          in case there were eg. HTTP redirects.

    If a ``file_obj`` key is given, it is the caller's responsibility
    to call ``file_obj.close()``.

    """
    if url.startswith('data:'):
        return open_data_url(url)
    if not UNICODE_SCHEME_RE.match(url):
        raise ValueError('Not an absolute URI: %r' % url)
    response, mime_type, charset = urlopen_contenttype(Request(
        iri_to_uri(url), headers={'User-Agent': VERSION_STRING}))
    return dict(file_obj=response, redirected_url=response.geturl(),
                mime_type=mime_type, encoding=charset)
|
2012-07-18 16:31:55 +04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def wrap_url_fetcher(url_fetcher):
    """Decorate an url_fetcher to fill in optional data.

    url_fetcher itself can be None, in which case the default fetcher
    is used.  In a result dict, redirected_url defaults to the original
    URL.  If not provided, mime_type is guessed from the path extension
    in the URL.

    """
    if url_fetcher is None:
        return default_url_fetcher

    def wrapped_fetcher(url):
        # Delegate, then fill in the optional keys the fetcher omitted.
        result = url_fetcher(url)
        result.setdefault('redirected_url', url)
        if 'mime_type' not in result:
            guessed, _ = mimetypes.guess_type(
                urlsplit(result['redirected_url']).path)
            result['mime_type'] = guessed or 'application/octet-stream'
        return result
    return wrapped_fetcher
|