2011-06-30 00:34:01 +04:00
|
|
|
|
# coding: utf8
|
2012-03-22 02:19:27 +04:00
|
|
|
|
"""
|
|
|
|
|
weasyprint.utils
|
|
|
|
|
----------------
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
Various utility functions and classes.
|
2011-08-16 17:11:35 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
:copyright: Copyright 2011-2012 Simon Sapin and contributors, see AUTHORS.
|
|
|
|
|
:license: BSD, see LICENSE for details.
|
2011-08-19 18:53:05 +04:00
|
|
|
|
|
|
|
|
|
"""
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-02-17 21:49:58 +04:00
|
|
|
|
from __future__ import division, unicode_literals
|
|
|
|
|
|
|
|
|
|
import io
|
2012-05-23 16:43:02 +04:00
|
|
|
|
import re
|
2012-02-17 21:49:58 +04:00
|
|
|
|
import base64
|
2012-05-23 16:43:02 +04:00
|
|
|
|
import os.path
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-05-15 15:40:36 +04:00
|
|
|
|
from . import VERSION_STRING
|
2012-05-24 18:06:58 +04:00
|
|
|
|
from .logger import LOGGER
|
2012-02-17 21:49:58 +04:00
|
|
|
|
from .compat import (
|
2012-05-30 22:06:44 +04:00
|
|
|
|
urljoin, urlsplit, quote, unquote, unquote_to_bytes, urlopen_contenttype,
|
2012-06-02 10:04:24 +04:00
|
|
|
|
Request, parse_email, pathname2url, unicode)
|
2012-02-17 21:49:58 +04:00
|
|
|
|
|
|
|
|
|
|
2012-06-02 10:04:24 +04:00
|
|
|
|
# Both are needed in Python 3 as the re module does not like to mix
# Unicode and byte strings: each pattern only matches input of its own type.
# A scheme is a letter followed by any number of letters, digits, "+", "-"
# or "." (RFC 3986, section 3.1); matching is case-insensitive.
UNICODE_SCHEME_RE = re.compile('^([a-z][a-z0-9.+-]*):', re.I)
BYTES_SCHEME_RE = re.compile(b'^([a-z][a-z0-9.+-]*):', re.I)
|
|
|
|
|
|
2012-05-23 16:43:02 +04:00
|
|
|
|
OPENER_BY_SCHEME = {}
|
|
|
|
|
|
|
|
|
|
def register_opener(scheme):
    """Return a decorator that globally registers an opener for ``scheme``.

    The decorated function is stored in ``OPENER_BY_SCHEME`` under the
    ``scheme`` key, replacing any entry already stored for that key, and is
    handed back unchanged so it stays usable under its own name.

    Expected usage::

        from weasyprint.utils import register_opener

        @register_opener('foo')
        def foo_urlopen(url):
            assert url.startswith('foo:')
            # ...
            return fileobj, mimetype, charset

    """
    def register(opener):
        OPENER_BY_SCHEME[scheme] = opener
        return opener

    return register
|
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
|
2012-05-18 11:12:50 +04:00
|
|
|
|
def iri_to_uri(url):
    """Convert an IRI, which may hold any Unicode character, into an
    ASCII-only URI that conforms to RFC 3986.

    """
    # The input is a complete URI rather than a single component, so every
    # character that may legitimately appear in a URI is kept as-is and
    # only the ones that are never allowed get %-encoded. Kept as-is:
    #  * reserved characters: /:?#[]@!$&'()*+,;=
    #  * unreserved characters: ASCII letters, digits and -._~
    #    (all of these but '~' are already in urllib's "always safe" list)
    #  * '%', so that existing escapes are not encoded a second time
    never_quoted = b"/:?#[]@!$&'()*+,;=~%"
    # RFC 3987 (IRI) mandates UTF-8 for the byte representation.
    utf8_url = url.encode('utf8')
    return quote(utf8_url, safe=never_quoted)
|
2011-12-16 15:19:10 +04:00
|
|
|
|
|
|
|
|
|
|
2012-03-24 16:39:31 +04:00
|
|
|
|
def path2url(path):
    """Return a ``file:`` URL built from the absolute form of ``path``."""
    absolute = os.path.abspath(path)
    # Unicode paths are turned into UTF-8 bytes before being URL-quoted;
    # byte-string paths are passed through untouched.
    if isinstance(absolute, unicode):
        quotable = absolute.encode('utf8')
    else:
        quotable = absolute
    # TODO: should this be 'file://' ? Maybe only on Unix?
    return 'file:' + pathname2url(quotable)
|
2012-03-24 16:39:31 +04:00
|
|
|
|
|
|
|
|
|
|
2012-05-21 20:43:08 +04:00
|
|
|
|
def url_is_absolute(url):
    """Tell whether ``url`` starts with a URI scheme followed by a colon.

    Both Unicode and byte strings are accepted: the regular expression of
    the matching string type is selected before testing.

    """
    if isinstance(url, unicode):
        scheme_re = UNICODE_SCHEME_RE
    else:
        scheme_re = BYTES_SCHEME_RE
    return scheme_re.match(url) is not None
|
2012-05-21 20:43:08 +04:00
|
|
|
|
|
|
|
|
|
|
2012-05-24 18:06:58 +04:00
|
|
|
|
def get_url_attribute(element, attr_name):
    """Get the URI corresponding to the ``attr_name`` attribute.

    Return ``None`` if:

    * the attribute is empty or missing or,
    * the value is a relative URI but the document has no base URI
      (a warning is logged in that case).

    Otherwise, return an absolute URI.

    :param element:
        An object exposing ``get(name, default)``, ``base_url``, ``tag``
        and ``sourceline``.
    :param attr_name: Name of the attribute holding the URI reference.

    """
    # Surrounding whitespace is not part of the URI reference.
    attr_value = element.get(attr_name, '').strip()
    if not attr_value:
        return None

    # TODO: support the <base> HTML element, but do not use
    # lxml.html.HtmlElement.make_links_absolute() that changes
    # the tree for content: attr(href)
    if url_is_absolute(attr_value):
        return attr_value
    if element.base_url:
        return urljoin(element.base_url, attr_value)

    # Use warning(): the warn() alias of standard logging.Logger objects is
    # deprecated and no longer exists in Python 3.13.
    LOGGER.warning(
        'Relative URI reference without a base URI: '
        '<%s %s="%s"> at line %d',
        element.tag, attr_name, attr_value, element.sourceline)
    return None
|
2011-08-09 14:45:51 +04:00
|
|
|
|
|
2011-08-05 13:16:44 +04:00
|
|
|
|
|
2012-05-30 22:06:44 +04:00
|
|
|
|
def get_link_attribute(element, attr_name):
    """Classify the link held by the ``attr_name`` attribute.

    Return ``('internal', unquoted_fragment_id)`` for a link into the
    current document, ``('external', absolute_uri)`` for any other link,
    or ``None`` when ``get_url_attribute()`` yields no URI.

    """
    attr_value = element.get(attr_name, '').strip()
    if attr_value.startswith('#'):
        # Do not require a base_url when the value is just a fragment.
        return 'internal', unquote(attr_value[1:])

    uri = get_url_attribute(element, attr_name)
    if uri is None:
        return None

    target = urlsplit(uri)
    document = urlsplit(element.base_url or '')
    # Compare with fragments removed: the fragment is the last component.
    same_document = target[:-1] == document[:-1]
    if same_document:
        return 'internal', unquote(target.fragment)
    return 'external', uri
|
|
|
|
|
|
|
|
|
|
|
2011-08-19 18:53:05 +04:00
|
|
|
|
def ensure_url(string):
    """Get a URL from ``string``.

    A ``string`` that already starts with a URI scheme is returned
    unchanged. Anything else is assumed to be a filename: it is encoded to
    UTF-8 and converted to a ``file:`` URL with ``path2url()``.

    """
    if not url_is_absolute(string):
        return path2url(string.encode('utf8'))
    return string
|
2011-08-16 17:11:35 +04:00
|
|
|
|
|
|
|
|
|
|
2012-03-21 19:07:49 +04:00
|
|
|
|
def decode_base64(data):
    """Decode base64, padding being optional.

    "From a theoretical point of view, the padding character is not needed,
     since the number of missing bytes can be calculated from the number
     of Base64 digits."

    https://en.wikipedia.org/wiki/Base64#Padding

    :param data: Base64 data as an ASCII byte string
    :returns: The decoded byte string.

    """
    # Number of '=' needed to reach a multiple of 4; this is 0 (not 4) when
    # the input is already aligned, so well-formed data is left untouched.
    missing_padding = -len(data) % 4
    if missing_padding:
        data += b'=' * missing_padding
    # base64.decodestring() no longer exists in Python 3.9+; decodebytes()
    # is its replacement there, while Python 2 only has decodestring().
    decode = getattr(base64, 'decodebytes', None) or base64.decodestring
    return decode(data)
|
|
|
|
|
|
|
|
|
|
|
2012-05-23 16:43:02 +04:00
|
|
|
|
@register_opener('data')
def open_data_url(url):
    """Decode URLs with the 'data' scheme. urllib can handle them
    in Python 2, but that is broken in Python 3.

    Inspired from Python 2.7.2’s urllib.py.

    Return a ``(file_like, mime_type, charset)`` tuple where ``file_like``
    is an ``io.BytesIO`` holding the decoded payload. Raise ``IOError`` when
    the URL contains no comma.

    """
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        header, data = url.split(',', 1)
    except ValueError:
        raise IOError('bad data URL')

    media_type = header[5:]  # len('data:') == 5
    encoding = ''
    if not media_type:
        # Defaults used when nothing is given before the comma.
        mime_type, charset = 'text/plain', 'US-ASCII'
    else:
        # A trailing ";token" without "=" is an encoding marker such as
        # ";base64"; with "=" it is an ordinary media type parameter.
        head, separator, tail = media_type.rpartition(';')
        if separator and '=' not in tail:
            content_type, encoding = head, tail
        else:
            content_type = media_type
        # Let the email parser extract the MIME type and charset.
        message = parse_email('Content-type: ' + content_type)
        mime_type = message.get_content_type()
        charset = message.get_content_charset()

    payload = unquote_to_bytes(data)
    if encoding == 'base64':
        payload = decode_base64(payload)

    return io.BytesIO(payload), mime_type, charset
|
2011-08-19 18:53:05 +04:00
|
|
|
|
|
|
|
|
|
|
2011-10-17 17:04:13 +04:00
|
|
|
|
def urlopen(url):
    """Fetch an URL and return ``(file_like, mime_type, charset)``.

    It is the caller’s responsibility to call ``file_like.close()``.

    If an opener is registered in ``OPENER_BY_SCHEME`` for the scheme of
    ``url`` it is called with ``url`` and its result is returned. Otherwise
    the URL is converted with ``iri_to_uri()`` and fetched through
    ``urlopen_contenttype()`` with a ``User-Agent`` header.

    :param url: An absolute URL, as a Unicode string.
    :raises ValueError: If ``url`` does not start with a URI scheme.

    """
    match = UNICODE_SCHEME_RE.match(url)
    if not match:
        raise ValueError('Not an absolute URI: %r' % url)

    scheme = match.group(1)
    # Schemes are case-insensitive (RFC 3986, section 3.1) and the regular
    # expression accepts any case, so fall back to the lowercase spelling
    # when there is no opener under the exact spelling: 'DATA:...' must
    # reach the opener registered for 'data'.
    opener = OPENER_BY_SCHEME.get(scheme) or OPENER_BY_SCHEME.get(
        scheme.lower())
    if opener:
        return opener(url)

    request = Request(iri_to_uri(url), headers={'User-Agent': VERSION_STRING})
    return urlopen_contenttype(request)
|