"""Various utility functions and classes for URL management."""

import codecs
import contextlib
import os.path
import re
import sys
import traceback
import zlib
from gzip import GzipFile
from pathlib import Path
from urllib.parse import quote, unquote, urljoin, urlsplit
from urllib.request import Request, pathname2url, urlopen

from . import __version__
from .logger import LOGGER

# See https://stackoverflow.com/a/11687993/1162888
# Both are needed in Python 3, as the re module does not like to mix
# str and bytes patterns.
# https://datatracker.ietf.org/doc/html/rfc3986#section-3.1
UNICODE_SCHEME_RE = re.compile('^([a-zA-Z][a-zA-Z0-9.+-]+):')
BYTES_SCHEME_RE = re.compile(b'^([a-zA-Z][a-zA-Z0-9.+-]+):')

# getfilesystemencoding() on Linux sometimes wrongly reports ASCII…
FILESYSTEM_ENCODING = sys.getfilesystemencoding()
try:  # pragma: no cover
    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
        FILESYSTEM_ENCODING = 'utf-8'
except LookupError:  # pragma: no cover
    FILESYSTEM_ENCODING = 'utf-8'

HTTP_HEADERS = {
    'User-Agent': f'WeasyPrint {__version__}',
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
}


class StreamingGzipFile(GzipFile):
    """A GzipFile wrapper that also closes the wrapped response stream."""
    def __init__(self, fileobj):
        GzipFile.__init__(self, fileobj=fileobj)
        self.fileobj_to_close = fileobj

    def close(self):
        GzipFile.close(self)
        self.fileobj_to_close.close()

    def seekable(self):
        # The underlying HTTP response stream cannot seek.
        return False


def iri_to_uri(url):
    """Turn a Unicode IRI into an ASCII-only URI that conforms to RFC 3986."""
    if url.startswith('data:'):
        # Data URIs can be huge, but don't need this anyway.
        return url
    # Use UTF-8 as per RFC 3987 (IRI), except for file://
    url = url.encode(
        FILESYSTEM_ENCODING if url.startswith('file:') else 'utf-8')
    # This is a full URI, not just a component. Only %-encode characters
    # that are not allowed at all in URIs. Everything else is "safe":
    # * Reserved characters: /:?#[]@!$&'()*+,;=
    # * Unreserved characters: ASCII letters, digits and -._~
    #   Of these, only '~' is not in urllib's "always safe" list.
    # * '%' to avoid double-encoding
    return quote(url, safe=b"/:?#[]@!$&'()*+,;=~%")
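
# A minimal behaviour sketch, doctest-style (the URL is assumed for
# illustration):
#     >>> iri_to_uri('https://example.com/écran?q=café')
#     'https://example.com/%C3%A9cran?q=caf%C3%A9'
# Reserved characters such as '/', '?' and '=' pass through unescaped.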


def path2url(path):
    """Return the file URL of `path`.

    Accepts 'str', 'bytes' or 'Path', returns 'str'.

    """
    # Ensure 'str'
    if isinstance(path, Path):
        path = str(path)
    elif isinstance(path, bytes):
        path = path.decode(FILESYSTEM_ENCODING)
    # If a trailing path.sep is given, keep it
    wants_trailing_slash = path.endswith(os.path.sep) or path.endswith('/')
    path = os.path.abspath(path)
    if wants_trailing_slash or os.path.isdir(path):
        # Make sure directory names have a trailing slash.
        # Otherwise relative URIs are resolved from the parent directory.
        path += os.path.sep
        wants_trailing_slash = True
    path = pathname2url(path)
    # On Windows, pathname2url cuts off the trailing slash.
    if wants_trailing_slash and not path.endswith('/'):
        path += '/'  # pragma: no cover
    if path.startswith('///'):
        # On Windows, pathname2url(r'C:\foo') is apparently '///C:/foo',
        # which is enough slashes already.
        return f'file:{path}'  # pragma: no cover
    else:
        return f'file://{path}'
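
# A minimal behaviour sketch on a POSIX system (paths assumed for
# illustration):
#     >>> path2url('/tmp/report.html')
#     'file:///tmp/report.html'
#     >>> path2url(Path('/tmp'))  # directories get a trailing slash
#     'file:///tmp/'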


def url_is_absolute(url):
    """Return whether a URL (bytes or string) is absolute."""
    scheme = UNICODE_SCHEME_RE if isinstance(url, str) else BYTES_SCHEME_RE
    return bool(scheme.match(url))
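
# A minimal behaviour sketch (values assumed for illustration):
#     >>> url_is_absolute('https://example.com/')
#     True
#     >>> url_is_absolute(b'images/logo.png')
#     False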


def get_url_attribute(element, attr_name, base_url, allow_relative=False):
    """Get the URI corresponding to the ``attr_name`` attribute.

    Return ``None`` if:

    * the attribute is empty or missing, or
    * the value is a relative URI but the document has no base URI and
      ``allow_relative`` is ``False``.

    Otherwise return a URI, absolute if possible.

    """
    value = element.get(attr_name, '').strip()
    if value:
        return url_join(
            base_url or '', value, allow_relative, '<%s %s="%s">',
            (element.tag, attr_name, value))
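
# A minimal behaviour sketch (element and base URL assumed for illustration):
#     >>> from xml.etree.ElementTree import fromstring
#     >>> img = fromstring('<img src="logo.png"/>')
#     >>> get_url_attribute(img, 'src', 'https://example.com/page/')
#     'https://example.com/page/logo.png'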


def url_join(base_url, url, allow_relative, context, context_args):
    """Like urllib.urljoin, but warn if base_url is required but missing."""
    if url_is_absolute(url):
        return iri_to_uri(url)
    elif base_url:
        return iri_to_uri(urljoin(base_url, url))
    elif allow_relative:
        return iri_to_uri(url)
    else:
        LOGGER.error(
            f'Relative URI reference without a base URI: {context}',
            *context_args)
        return None
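
# A minimal behaviour sketch (values assumed for illustration):
#     >>> url_join('https://example.com/a/', '../style.css', False, '%s', ())
#     'https://example.com/style.css'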


def get_link_attribute(element, attr_name, base_url):
    """Get the URL value of an element attribute.

    Return ``('url', ('external', absolute_uri))``, ``('url', ('internal',
    unquoted_fragment_id))``, or ``None``.

    """
    attr_value = element.get(attr_name, '').strip()
    if attr_value.startswith('#') and len(attr_value) > 1:
        # Do not require a base_url when the value is just a fragment.
        return ('url', ('internal', unquote(attr_value[1:])))
    uri = get_url_attribute(element, attr_name, base_url, allow_relative=True)
    if uri:
        if base_url:
            try:
                parsed = urlsplit(uri)
            except ValueError:
                LOGGER.warning('Malformed URL: %s', uri)
            else:
                try:
                    parsed_base = urlsplit(base_url)
                except ValueError:
                    LOGGER.warning('Malformed base URL: %s', base_url)
                else:
                    # Compare with fragments removed
                    if parsed.fragment and parsed[:-1] == parsed_base[:-1]:
                        return (
                            'url', ('internal', unquote(parsed.fragment)))
        return ('url', ('external', uri))
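
# A minimal behaviour sketch (element assumed for illustration):
#     >>> from xml.etree.ElementTree import fromstring
#     >>> link = fromstring('<a href="#intro">Intro</a>')
#     >>> get_link_attribute(link, 'href', base_url=None)
#     ('url', ('internal', 'intro'))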


def ensure_url(string):
    """Get a ``scheme://path`` URL from ``string``.

    If ``string`` looks like a URL, return it unchanged. Otherwise assume a
    filename and convert it to a ``file://`` URL.

    """
    return string if url_is_absolute(string) else path2url(string)
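
# A minimal behaviour sketch (the file URL depends on the current directory,
# so its middle is elided here):
#     >>> ensure_url('https://example.com/')
#     'https://example.com/'
#     >>> ensure_url('report.html')
#     'file:///.../report.html'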


def default_url_fetcher(url, timeout=10, ssl_context=None):
    """Fetch an external resource such as an image or stylesheet.

    Another callable with the same signature can be given as the
    ``url_fetcher`` argument to :class:`HTML` or :class:`CSS`.
    (See :ref:`URL Fetchers`.)

    :param str url:
        The URL of the resource to fetch.
    :param int timeout:
        The number of seconds before HTTP requests are dropped.
    :param ssl.SSLContext ssl_context:
        An SSL context used for HTTP requests.
    :raises: An exception indicating failure, e.g. :obj:`ValueError` on a
        syntactically invalid URL.
    :returns: A :obj:`dict` with the following keys:

        * One of ``string`` (a :obj:`bytestring <bytes>`) or ``file_obj``
          (a :term:`file object`).
        * Optionally: ``mime_type``, a MIME type extracted e.g. from a
          *Content-Type* header. If not provided, the type is guessed from
          the file extension in the URL.
        * Optionally: ``encoding``, a character encoding extracted e.g. from
          a *charset* parameter in a *Content-Type* header.
        * Optionally: ``redirected_url``, the actual URL of the resource
          if there were e.g. HTTP redirects.
        * Optionally: ``filename``, the filename of the resource. Usually
          derived from the *filename* parameter in a *Content-Disposition*
          header.

        If a ``file_obj`` key is given, it is the caller's responsibility
        to call ``file_obj.close()``. The default function used internally to
        fetch data in WeasyPrint tries to close the file object after
        retrieving; but if this URL fetcher is used elsewhere, the file
        object has to be closed manually.

    """
    if UNICODE_SCHEME_RE.match(url):
        # See https://bugs.python.org/issue34702
        if url.startswith('file://'):
            url = url.split('?')[0]
        url = iri_to_uri(url)
        response = urlopen(
            Request(url, headers=HTTP_HEADERS), timeout=timeout,
            context=ssl_context)
        response_info = response.info()
        result = {
            'redirected_url': response.geturl(),
            'mime_type': response_info.get_content_type(),
            'encoding': response_info.get_param('charset'),
            'filename': response_info.get_filename(),
        }
        content_encoding = response_info.get('Content-Encoding')
        if content_encoding == 'gzip':
            result['file_obj'] = StreamingGzipFile(fileobj=response)
        elif content_encoding == 'deflate':
            data = response.read()
            try:
                result['string'] = zlib.decompress(data)
            except zlib.error:
                # Try without the zlib header or checksum
                result['string'] = zlib.decompress(data, -15)
        else:
            result['file_obj'] = response
        return result
    else:  # pragma: no cover
        raise ValueError(f'Not an absolute URI: {url!r}')
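
# A minimal sketch of a custom fetcher with the same signature. The
# 'weasy:logo' URL, the inlined SVG and the output filename are assumptions
# for illustration, not part of WeasyPrint's API:
#
#     from weasyprint import HTML, default_url_fetcher
#
#     def inline_logo_fetcher(url, timeout=10, ssl_context=None):
#         # Serve one in-memory resource, delegate everything else.
#         if url == 'weasy:logo':
#             return {
#                 'string': b'<svg xmlns="http://www.w3.org/2000/svg"/>',
#                 'mime_type': 'image/svg+xml'}
#         return default_url_fetcher(url, timeout, ssl_context)
#
#     HTML(string='<img src="weasy:logo">',
#          url_fetcher=inline_logo_fetcher).write_pdf('out.pdf')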


class URLFetchingError(IOError):
    """Some error happened when fetching a URL."""


@contextlib.contextmanager
def fetch(url_fetcher, url):
    """Call a url_fetcher, fill in optional data, and clean up."""
    try:
        result = url_fetcher(url)
    except Exception as exception:
        raise URLFetchingError(f'{type(exception).__name__}: {exception}')
    result.setdefault('redirected_url', url)
    result.setdefault('mime_type', None)
    if 'file_obj' in result:
        try:
            yield result
        finally:
            try:
                result['file_obj'].close()
            except Exception:  # pragma: no cover
                # May already be closed or something.
                # This is just cleanup anyway: log but make it non-fatal.
                LOGGER.warning(
                    'Error when closing stream for %s:\n%s',
                    url, traceback.format_exc())
    else:
        yield result
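
# A minimal usage sketch (the URL is assumed for illustration); the context
# manager closes any 'file_obj' on exit, so read inside the block:
#     with fetch(default_url_fetcher, 'https://example.com/a.css') as result:
#         if 'string' in result:
#             body = result['string']
#         else:
#             body = result['file_obj'].read()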