"""
|
|
|
|
|
weasyprint.utils
|
|
|
|
|
----------------
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-03-22 02:19:27 +04:00
|
|
|
|
Various utility functions and classes.
|
2011-08-16 17:11:35 +04:00
|
|
|
|
|
2019-03-04 13:04:06 +03:00
|
|
|
|
:copyright: Copyright 2011-2019 Simon Sapin and contributors, see AUTHORS.
|
2012-03-22 02:19:27 +04:00
|
|
|
|
:license: BSD, see LICENSE for details.
|
2011-08-19 18:53:05 +04:00
|
|
|
|
|
|
|
|
|
"""
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2018-01-14 03:48:17 +03:00
|
|
|
|
import codecs
|
2017-03-25 02:33:36 +03:00
|
|
|
|
import contextlib
|
|
|
|
|
import gzip
|
2014-04-22 02:52:58 +04:00
|
|
|
|
import io
|
2017-03-25 02:33:36 +03:00
|
|
|
|
import os.path
|
2012-05-23 16:43:02 +04:00
|
|
|
|
import re
|
2012-07-27 20:55:19 +04:00
|
|
|
|
import sys
|
2014-04-27 14:00:02 +04:00
|
|
|
|
import traceback
|
2017-03-25 02:33:36 +03:00
|
|
|
|
import zlib
|
2018-01-14 03:48:17 +03:00
|
|
|
|
from base64 import decodebytes
|
|
|
|
|
from gzip import GzipFile
|
2018-01-14 04:09:25 +03:00
|
|
|
|
from urllib.parse import quote, unquote, urljoin, urlsplit
|
|
|
|
|
from urllib.request import Request, pathname2url, urlopen
|
2011-06-30 00:34:01 +04:00
|
|
|
|
|
2012-05-15 15:40:36 +04:00
|
|
|
|
from . import VERSION_STRING
|
2017-03-25 02:33:36 +03:00
|
|
|
|
from .logger import LOGGER
|
2012-02-17 21:49:58 +04:00
|
|
|
|
|
2012-07-27 19:19:17 +04:00
|
|
|
|
# See http://stackoverflow.com/a/11687993/1162888
|
2012-06-02 10:04:24 +04:00
|
|
|
|
# Both are needed in Python 3 as the re module does not like to mix
|
2013-04-12 11:32:25 +04:00
|
|
|
|
# http://tools.ietf.org/html/rfc3986#section-3.1
|
|
|
|
|
UNICODE_SCHEME_RE = re.compile('^([a-zA-Z][a-zA-Z0-9.+-]+):')
|
|
|
|
|
BYTES_SCHEME_RE = re.compile(b'^([a-zA-Z][a-zA-Z0-9.+-]+):')
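
# Illustrative examples (comments only, values are hypothetical): a scheme
# must be at least two characters long, so a Windows drive letter such as
# 'C:\foo' does not match.
#
#     UNICODE_SCHEME_RE.match('https://example.org/')   # match, group 'https'
#     UNICODE_SCHEME_RE.match('images/logo.png')        # None (relative path)
#     BYTES_SCHEME_RE.match(b'data:image/png;base64,')  # match, group 'data'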

# getfilesystemencoding() on Linux sometimes reports ASCII or an unknown
# codec; fall back to UTF-8 in that case.
FILESYSTEM_ENCODING = sys.getfilesystemencoding()
try:
    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
        FILESYSTEM_ENCODING = 'utf-8'
except LookupError:
    FILESYSTEM_ENCODING = 'utf-8'


class StreamingGzipFile(GzipFile):
    def __init__(self, fileobj):
        GzipFile.__init__(self, fileobj=fileobj)
        self.fileobj_to_close = fileobj

    def close(self):
        GzipFile.close(self)
        self.fileobj_to_close.close()

    # Inform html5lib to not rely on these:
    seek = tell = None


def iri_to_uri(url):
    """Turn an IRI that can contain any Unicode character into an ASCII-only
    URI that conforms to RFC 3986.
    """
    if url.startswith('data:'):
        # Data URIs can be huge, but don’t need this anyway.
        return url
    # Use UTF-8 as per RFC 3987 (IRI), except for file://
    url = url.encode(FILESYSTEM_ENCODING
                     if url.startswith('file:') else 'utf-8')
    # This is a full URI, not just a component. Only %-encode characters
    # that are not allowed at all in URIs. Everything else is "safe":
    # * Reserved characters: /:?#[]@!$&'()*+,;=
    # * Unreserved characters: ASCII letters, digits and -._~
    #   Of these, only '~' is not in urllib’s "always safe" list.
    # * '%' to avoid double-encoding
    return quote(url, safe=b"/:?#[]@!$&'()*+,;=~%")


def path2url(path):
    """Return the file URL of `path`.

    Accepts 'str' or 'bytes', returns 'str'.

    """
    # Ensure 'str'
    if isinstance(path, bytes):
        path = path.decode(sys.getfilesystemencoding())
    # If a trailing path separator is given, keep it.
    wants_trailing_slash = path.endswith(os.path.sep) or path.endswith('/')
    path = os.path.abspath(path)
    if wants_trailing_slash or os.path.isdir(path):
        # Make sure directory names have a trailing slash.
        # Otherwise relative URIs are resolved from the parent directory.
        path += os.path.sep
        wants_trailing_slash = True
    path = pathname2url(path)
    # On Windows, pathname2url cuts off the trailing slash.
    if wants_trailing_slash and not path.endswith('/'):
        path += '/'
    if path.startswith('///'):
        # On Windows pathname2url(r'C:\foo') is apparently '///C:/foo'.
        # That’s enough slashes already.
        return 'file:' + path
    else:
        return 'file://' + path


def url_is_absolute(url):
    """Return whether `url` starts with a URL scheme (str or bytes)."""
    return bool(
        (UNICODE_SCHEME_RE if isinstance(url, str) else BYTES_SCHEME_RE)
        .match(url))


def get_url_attribute(element, attr_name, base_url, allow_relative=False):
    """Get the URI corresponding to the ``attr_name`` attribute.

    Return ``None`` if:

    * the attribute is empty or missing, or
    * the value is a relative URI but the document has no base URI and
      ``allow_relative`` is ``False``.

    Otherwise return a URI, absolute if possible.

    """
    value = element.get(attr_name, '').strip()
    if value:
        return url_join(
            base_url or '', value, allow_relative, '<%s %s="%s">',
            (element.tag, attr_name, value))


def url_join(base_url, url, allow_relative, context, context_args):
    """Like urllib.urljoin, but log an error if base_url is required but
    missing.
    """
    if url_is_absolute(url):
        return iri_to_uri(url)
    elif base_url:
        return iri_to_uri(urljoin(base_url, url))
    elif allow_relative:
        return iri_to_uri(url)
    else:
        LOGGER.error('Relative URI reference without a base URI: ' + context,
                     *context_args)
        return None


def get_link_attribute(element, attr_name, base_url):
    """Return ('url', ('external', absolute_uri)),
    ('url', ('internal', unquoted_fragment_id)) or None.

    """
    attr_value = element.get(attr_name, '').strip()
    if attr_value.startswith('#') and len(attr_value) > 1:
        # Do not require a base_url when the value is just a fragment.
        return ('url', ('internal', unquote(attr_value[1:])))
    uri = get_url_attribute(element, attr_name, base_url, allow_relative=True)
    if uri:
        if base_url:
            parsed = urlsplit(uri)
            # Compare with fragments removed
            if parsed[:-1] == urlsplit(base_url)[:-1]:
                return ('url', ('internal', unquote(parsed.fragment)))
        return ('url', ('external', uri))


def ensure_url(string):
    """Get a ``scheme://path`` URL from ``string``.

    If ``string`` looks like a URL, return it unchanged. Otherwise assume a
    filename and convert it to a ``file://`` URL.

    """
    return string if url_is_absolute(string) else path2url(string)


def safe_decodebytes(data):
    """Decode base64, padding being optional.

    "From a theoretical point of view, the padding character is not needed,
     since the number of missing bytes can be calculated from the number
     of Base64 digits."

    https://en.wikipedia.org/wiki/Base64#Padding

    :param data: Base64 data as an ASCII byte string
    :returns: The decoded byte string.

    """
    missing_padding = 4 - len(data) % 4
    if missing_padding:
        data += b'=' * missing_padding
    return decodebytes(data)


HTTP_HEADERS = {
    'User-Agent': VERSION_STRING,
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
}


def default_url_fetcher(url, timeout=10, ssl_context=None):
    """Fetch an external resource such as an image or stylesheet.

    Another callable with the same signature can be given as the
    :obj:`url_fetcher` argument to :class:`HTML` or :class:`CSS`.
    (See :ref:`url-fetchers`.)

    :type url: str
    :param url: The URL of the resource to fetch.
    :type timeout: int
    :param timeout: The number of seconds before HTTP requests are dropped.
    :type ssl_context: ssl.SSLContext
    :param ssl_context: An SSL context used for HTTP requests.
    :raises: An exception indicating failure, e.g. :obj:`ValueError` on a
        syntactically invalid URL.
    :returns: A :obj:`dict` with the following keys:

        * One of ``string`` (a :obj:`bytestring <bytes>`) or ``file_obj``
          (a :term:`file object`).
        * Optionally: ``mime_type``, a MIME type extracted e.g. from a
          *Content-Type* header. If not provided, the type is guessed from
          the file extension in the URL.
        * Optionally: ``encoding``, a character encoding extracted e.g. from
          a *charset* parameter in a *Content-Type* header.
        * Optionally: ``redirected_url``, the actual URL of the resource
          if there were e.g. HTTP redirects.
        * Optionally: ``filename``, the filename of the resource. Usually
          derived from the *filename* parameter in a *Content-Disposition*
          header.

        If a ``file_obj`` key is given, it is the caller’s responsibility
        to call ``file_obj.close()``. The default function used internally to
        fetch data in WeasyPrint tries to close the file object after
        retrieving it; but if this URL fetcher is used elsewhere, the file
        object has to be closed manually.

    """
    if UNICODE_SCHEME_RE.match(url):
        # See https://bugs.python.org/issue34702
        if url.startswith('file://'):
            url = url.split('?')[0]

        url = iri_to_uri(url)
        response = urlopen(Request(url, headers=HTTP_HEADERS),
                           timeout=timeout, context=ssl_context)
        response_info = response.info()
        result = dict(redirected_url=response.geturl(),
                      mime_type=response_info.get_content_type(),
                      encoding=response_info.get_param('charset'),
                      filename=response_info.get_filename())
        content_encoding = response_info.get('Content-Encoding')
        if content_encoding == 'gzip':
            if StreamingGzipFile is None:
                result['string'] = gzip.GzipFile(
                    fileobj=io.BytesIO(response.read())).read()
                response.close()
            else:
                result['file_obj'] = StreamingGzipFile(fileobj=response)
        elif content_encoding == 'deflate':
            data = response.read()
            try:
                result['string'] = zlib.decompress(data)
            except zlib.error:
                # Try without zlib header or checksum
                result['string'] = zlib.decompress(data, -15)
        else:
            result['file_obj'] = response
        return result
    else:
        raise ValueError('Not an absolute URI: %r' % url)
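
# For reference, a custom fetcher with the same contract might look like the
# sketch below (the name generate_graph and the 'graph:' scheme are
# hypothetical, for illustration only):
#
#     def my_fetcher(url):
#         if url.startswith('graph:'):
#             return dict(string=generate_graph(url), mime_type='image/png')
#         return default_url_fetcher(url)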


class URLFetchingError(IOError):
    """Some error happened when fetching a URL."""


@contextlib.contextmanager
def fetch(url_fetcher, url):
    """Call a url_fetcher, fill in optional data, and clean up."""
    try:
        result = url_fetcher(url)
    except Exception as exc:
        raise URLFetchingError('%s: %s' % (type(exc).__name__, str(exc)))
    result.setdefault('redirected_url', url)
    result.setdefault('mime_type', None)
    if 'file_obj' in result:
        try:
            yield result
        finally:
            try:
                result['file_obj'].close()
            except Exception:
                # May already be closed or something.
                # This is just cleanup anyway: log but make it non-fatal.
                LOGGER.warning('Error when closing stream for %s:\n%s',
                               url, traceback.format_exc())
    else:
        yield result