WeasyPrint/weasyprint/urls.py

# coding: utf-8
"""
    weasyprint.urls
    ---------------

    Various utility functions and classes for URL handling.

    :copyright: Copyright 2011-2014 Simon Sapin and contributors, see AUTHORS.
    :license: BSD, see LICENSE for details.

"""

from __future__ import division, unicode_literals

import io
import re
import sys
import codecs
import os.path
import mimetypes
import contextlib
import gzip
import zlib
import traceback

from . import VERSION_STRING
from .logger import LOGGER
from .compat import (
    urljoin, urlsplit, quote, unquote, unquote_to_bytes, urlopen,
    urllib_get_content_type, urllib_get_charset, urllib_get_filename, Request,
    parse_email, pathname2url, unicode, base64_decode, StreamingGzipFile)


# Unlike HTML, CSS and PNG, the SVG MIME type is not always built in
# in some Python versions and is therefore not reliable: register it
# explicitly when this module is imported.
if sys.version_info[0] >= 3:
    mimetypes.add_type('image/svg+xml', '.svg')
else:
    # Native strings required. This module enables unicode_literals, so
    # the b'' prefix is what yields a native (byte) string on Python 2.
    mimetypes.add_type(b'image/svg+xml', b'.svg')


# Encoding used for file paths and file: URLs.
# The value reported by sys.getfilesystemencoding() is not always usable,
# so fall back to UTF-8 when it is empty, when it resolves to the ASCII
# codec, or when it names a codec that Python does not know.
FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8'
try:
    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
        FILESYSTEM_ENCODING = 'utf-8'
except LookupError:
    FILESYSTEM_ENCODING = 'utf-8'


# URI scheme syntax: http://tools.ietf.org/html/rfc3986#section-3.1
# See also http://stackoverflow.com/a/11687993/1162888
# At least two characters are required in the scheme so that Windows
# filenames like C:\foo\bar.html are not considered as URLs.
# Both a Unicode and a bytes pattern are needed in Python 3, as the re
# module does not like to mix Unicode patterns with byte strings
# (or the reverse).
UNICODE_SCHEME_RE = re.compile('^([a-zA-Z][a-zA-Z0-9.+-]+):')
BYTES_SCHEME_RE = re.compile(b'^([a-zA-Z][a-zA-Z0-9.+-]+):')


def iri_to_uri(url):
    """Convert an IRI into an ASCII-only URI conforming to RFC 3986.

    ``data:`` URLs are returned untouched. Any other URL is encoded to
    bytes and every byte that is not allowed in a URI is %-encoded.

    :param url: The IRI, as a Unicode string.
    :returns: The corresponding URI.

    """
    if url.startswith('data:'):
        # Data URIs can be huge, and do not need any of this anyway.
        return url

    # RFC 3987 (IRI) mandates UTF-8; file:// URLs are the exception and
    # use the filesystem encoding instead.
    if url.startswith('file:'):
        codec = FILESYSTEM_ENCODING
    else:
        codec = 'utf-8'
    encoded = url.encode(codec)

    # This is a whole URI rather than a single component, so only the
    # characters that are never allowed in a URI get %-encoded.
    # Considered "safe":
    # * Reserved characters: /:?#[]@!$&'()*+,;=
    # * Unreserved characters: ASCII letters, digits and -._~
    #   (of these, only '~' is missing from urllib's "always safe" list)
    # * '%', so that existing escapes are not encoded a second time
    return quote(encoded, safe=b"/:?#[]@!$&'()*+,;=~%")


def path2url(path):
    """Build the ``file:`` URL that designates the filesystem ``path``.

    The path is made absolute first. Directories get a trailing separator.

    :param path: A filename, absolute or relative.
    :returns: The ``file:`` URL for that path.

    """
    absolute = os.path.abspath(path)
    if os.path.isdir(absolute):
        # Directory names need a trailing slash, or relative URIs would
        # be resolved from the parent directory.
        absolute += os.path.sep
    if isinstance(absolute, unicode):
        absolute = absolute.encode(FILESYSTEM_ENCODING)
    url_path = pathname2url(absolute)
    # On Windows pathname2url(r'C:\foo') is apparently '///C:/foo':
    # that is enough slashes already.
    scheme_prefix = 'file:' if url_path.startswith('///') else 'file://'
    return scheme_prefix + url_path


def url_is_absolute(url):
    """Tell whether ``url`` starts with a URI scheme.

    :param url: A Unicode or byte string.
    :returns: ``True`` if the string starts with ``scheme:``.

    """
    # Pick the pattern whose type matches the type of the input.
    if isinstance(url, unicode):
        scheme_re = UNICODE_SCHEME_RE
    else:
        scheme_re = BYTES_SCHEME_RE
    return scheme_re.match(url) is not None


def element_base_url(element):
    """Return the URL of the lxml document that ``element`` belongs to.

    This gives the same value as the ``HtmlElement.base_url`` property,
    without requiring the element to be an ``HtmlElement``.

    """
    tree = element.getroottree()
    return tree.docinfo.URL


def get_url_attribute(element, attr_name):
    """Resolve the ``attr_name`` attribute of ``element`` to an absolute URI.

    ``None`` is returned when:

    * the attribute is missing or empty (after stripping whitespace) or,
    * its value is a relative URI and the document has no base URI.

    In every other case the result is an absolute URI.

    """
    value = element.get(attr_name, '').strip()
    if not value:
        return None
    # The trailing arguments are only used to format the warning that
    # url_join() logs when a base URI is needed but missing.
    return url_join(
        element_base_url(element), value, '<%s %s="%s"> at line %s',
        element.tag, attr_name, value, element.sourceline)


def url_join(base_url, url, context, *args):
    """Join ``url`` to ``base_url`` like ``urljoin`` and return a URI.

    An absolute ``url`` does not need ``base_url``. When ``url`` is
    relative and ``base_url`` is missing, a warning is logged and ``None``
    is returned.

    :param base_url: The base URL, possibly empty or ``None``.
    :param url: The URL reference to resolve.
    :param context: A %-format string, appended to the warning message,
        that describes where the URL comes from.
    :param args: The values for the placeholders of ``context``.

    """
    if url_is_absolute(url):
        resolved = url
    elif base_url:
        resolved = urljoin(base_url, url)
    else:
        LOGGER.warning('Relative URI reference without a base URI: ' + context,
                       *args)
        return None
    return iri_to_uri(resolved)


def get_link_attribute(element, attr_name):
    """Classify the link found in the ``attr_name`` attribute.

    :returns: ``('internal', unquoted_fragment_id)`` for a link into the
        same document, ``('external', absolute_uri)`` for any other link,
        or ``None`` when there is no usable URI.

    """
    attr_value = element.get(attr_name, '').strip()
    is_bare_fragment = len(attr_value) > 1 and attr_value.startswith('#')
    if is_bare_fragment:
        # A value that is only a fragment does not require a base_url.
        return 'internal', unquote(attr_value[1:])

    uri = get_url_attribute(element, attr_name)
    if not uri:
        return None

    document_url = element_base_url(element)
    if document_url:
        target = urlsplit(uri)
        # Everything but the fragment is the same: the link points into
        # the current document.
        if target[:-1] == urlsplit(document_url)[:-1]:
            return 'internal', unquote(target.fragment)
    return 'external', uri


def ensure_url(string):
    """Turn ``string`` into a ``scheme://path`` URL.

    A string that already looks like a URL is returned unchanged. Anything
    else is assumed to be a filename and converted to a ``file://`` URL.

    """
    if url_is_absolute(string):
        return string
    return path2url(string)


def safe_base64_decode(data):
    """Decode base64, padding being optional.

    "From a theoretical point of view, the padding character is not needed,
     since the number of missing bytes can be calculated from the number
     of Base64 digits."

    https://en.wikipedia.org/wiki/Base64#Padding

    :param data: Base64 data as an ASCII byte string
    :returns: The decoded byte string.

    """
    # Number of '=' needed to reach a multiple of 4. It is 0 (not 4) for
    # input that is already aligned, so correctly padded or empty data is
    # passed to the decoder unchanged instead of getting a bogus '===='.
    missing_padding = -len(data) % 4
    if missing_padding:
        data += b'=' * missing_padding
    return base64_decode(data)


def open_data_url(url):
    """Decode a URL that uses the 'data' scheme.

    urllib can handle these URLs in Python 2, but that is broken in
    Python 3. Inspired from urllib.py in Python 2.7.2.

    :param url: The complete ``data:`` URL.
    :raises IOError: if the URL has no comma separating header and data.
    :returns: A dict with the keys ``string`` (the decoded bytes),
        ``mime_type``, ``encoding`` (the charset) and ``redirected_url``
        (the URL itself).

    """
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        prefix, payload = url.split(',', 1)
    except ValueError:
        raise IOError('bad data URL')

    media_header = prefix[5:]  # len('data:') == 5
    transfer_encoding = ''
    if not media_header:
        mime_type = 'text/plain'
        charset = 'US-ASCII'
    else:
        last_semicolon = media_header.rfind(';')
        # A last ';'-separated item without '=' is not a parameter: it is
        # the encoding marker (eg. "base64").
        if last_semicolon >= 0 and '=' not in media_header[last_semicolon:]:
            content_type = media_header[:last_semicolon]
            transfer_encoding = media_header[last_semicolon + 1:]
        else:
            content_type = media_header
        parsed_header = parse_email('Content-type: ' + content_type)
        mime_type = parsed_header.get_content_type()
        charset = parsed_header.get_content_charset()

    content = unquote_to_bytes(payload)
    if transfer_encoding == 'base64':
        content = safe_base64_decode(content)

    return dict(string=content, mime_type=mime_type, encoding=charset,
                redirected_url=url)


# Headers sent with every request made by default_url_fetcher().
# The content encodings advertised here are the ones that
# default_url_fetcher() knows how to decode.
HTTP_HEADERS = {
    'User-Agent': VERSION_STRING,
    'Accept-Encoding': 'gzip, deflate',
}


def _read_and_close(response):
    """Return the whole body of ``response`` and always close it."""
    try:
        return response.read()
    finally:
        response.close()


def default_url_fetcher(url):
    """Fetch an external resource such as an image or stylesheet.

    Another callable with the same signature can be given as the
    :obj:`url_fetcher` argument to :class:`HTML` or :class:`CSS`.
    (See :ref:`url-fetchers`.)

    :type url: Unicode string
    :param url: The URL of the resource to fetch
    :raises: any exception to indicate failure. Failures are logged
        as warnings, with the string representation of the exception
        in the message.
    :returns: In case of success, a dict with the following keys:

        * One of ``string`` (a byte string) or ``file_obj``
          (a file-like object)
        * Optionally: ``mime_type``, a MIME type extracted eg. from a
          *Content-Type* header. If not provided, the type is guessed from the
          file extension in the URL.
        * Optionally: ``encoding``, a character encoding extracted eg. from a
          *charset* parameter in a *Content-Type* header
        * Optionally: ``redirected_url``, the actual URL of the resource
          in case there were eg. HTTP redirects.
        * Optionally: ``filename``, the filename of the resource. Usually
          derived from the *filename* parameter in a *Content-Disposition*
          header

        If a ``file_obj`` key is given, it is the caller's responsibility
        to call ``file_obj.close()``. When ``string`` is returned instead,
        the underlying response has already been closed.

    """
    if url.lower().startswith('data:'):
        return open_data_url(url)
    elif UNICODE_SCHEME_RE.match(url):
        url = iri_to_uri(url)
        response = urlopen(Request(url, headers=HTTP_HEADERS))
        result = dict(redirected_url=response.geturl(),
                      mime_type=urllib_get_content_type(response),
                      encoding=urllib_get_charset(response),
                      filename=urllib_get_filename(response))
        content_encoding = response.info().get('Content-Encoding')
        if content_encoding == 'gzip':
            if StreamingGzipFile is None:
                # No streaming decompression available: decompress the
                # whole body in memory. The response is no longer needed
                # once it is read, so it is closed here.
                compressed = _read_and_close(response)
                result['string'] = gzip.GzipFile(
                    fileobj=io.BytesIO(compressed)).read()
            else:
                result['file_obj'] = StreamingGzipFile(fileobj=response)
        elif content_encoding == 'deflate':
            # The caller only closes ``file_obj``: a response that is
            # turned into ``string`` has to be closed here.
            data = _read_and_close(response)
            try:
                result['string'] = zlib.decompress(data)
            except zlib.error:
                # Try without zlib header or checksum
                result['string'] = zlib.decompress(data, -15)
        else:
            result['file_obj'] = response
        return result
    else:
        raise ValueError('Not an absolute URI: %r' % url)


class URLFetchingError(IOError):
    """Raised by :func:`fetch` when the URL fetcher fails for any reason."""


@contextlib.contextmanager
def fetch(url_fetcher, url):
    """Call ``url_fetcher`` on ``url``, fill in optional data, and clean up.

    Any exception raised by the fetcher is turned into a
    :class:`URLFetchingError`. The yielded dict always has the
    ``redirected_url`` and ``mime_type`` keys. When it has a ``file_obj``
    key, that object is closed when the ``with`` block ends.

    """
    try:
        result = url_fetcher(url)
    except Exception as error:
        kind = type(error).__name__
        detail = str(error)
        if detail:
            message = '%s: %s' % (kind, detail)
        else:
            message = kind
        raise URLFetchingError(message)

    result.setdefault('redirected_url', url)
    result.setdefault('mime_type', None)

    if 'file_obj' not in result:
        # Nothing to clean up afterwards.
        yield result
        return

    try:
        yield result
    finally:
        try:
            result['file_obj'].close()
        except Exception:
            # May already be closed or something.
            # This is just cleanup anyway: log but make it non-fatal.
            LOGGER.warning('Error when closing stream for %s:\n%s',
                           url, traceback.format_exc())
-												Replace utf8 with utf-8 for gettext compatibility

											
										
										
											2015-11-25 10:38:01 +03:00
+								# coding: utf-8
-												Switch the licence to BSD and rewrite module docstrings/headers

											
										
										
											2012-03-22 02:19:27 +04:00
+								"""
 								    weasyprint.utils
 								    ----------------
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
-												Switch the licence to BSD and rewrite module docstrings/headers

											
										
										
											2012-03-22 02:19:27 +04:00
+								    Various utility functions and classes.
-												Add support for data: URL scheme stylesheets.

											
										
										
											2011-08-16 17:11:35 +04:00
-.

											
										
										
											2014-01-10 18:27:02 +04:00
+								    :copyright: Copyright 2011-2014 Simon Sapin and contributors, see AUTHORS.
-												Switch the licence to BSD and rewrite module docstrings/headers

											
										
										
											2012-03-22 02:19:27 +04:00
+								    :license: BSD, see LICENSE for details.
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
 								"""
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								from __future__ import division, unicode_literals
-												Fix #86: Support gzip and deflate encoding in HTTP responses

											
										
										
											2014-04-22 02:52:58 +04:00
+								import io
-												Make urlopen() extensible

There is a global dict mapping URI schemes to opener functions,
and a decorator to add such a function.

Expected usage:

from weasyprint.urls import register_opener
@register_opener('foo')
def git_urlopen(url):
    url = urlparse.urlsplit(url)
    assert url.scheme == 'foo'
    # ...
    return fileobj, mimetype, charset

											
										
										
											2012-05-23 16:43:02 +04:00
+								import re
-												Use the filesystem encoding for file:// URLs. This might help with #891

											
										
										
											2012-07-27 20:55:19 +04:00
+								import sys
-												Work around getfilesystemencoding stupidity on Linux.

											
										
										
											2012-07-30 14:01:20 +04:00
+								import codecs
-												Make urlopen() extensible

There is a global dict mapping URI schemes to opener functions,
and a decorator to add such a function.

Expected usage:

from weasyprint.urls import register_opener
@register_opener('foo')
def git_urlopen(url):
    url = urlparse.urlsplit(url)
    assert url.scheme == 'foo'
    # ...
    return fileobj, mimetype, charset

											
										
										
											2012-05-23 16:43:02 +04:00
+								import os.path
-												Make mime_type optional in custom URL fetchers

											
										
										
											2012-07-18 16:31:55 +04:00
+								import mimetypes
-												URL fetching: enforce closing sockets/files with a context manager.

											
										
										
											2013-06-20 15:58:24 +04:00
+								import contextlib
-												Fix #86: Support gzip and deflate encoding in HTTP responses

											
										
										
											2014-04-22 02:52:58 +04:00
+								import gzip
 								import zlib
-												Correctly close StreamingGzipFile, and log this kind of error

… rather than silence them. Logs make tests fail.

											
										
										
											2014-04-27 14:00:02 +04:00
+								import traceback
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
-												Have the PDF post-process run on Python3 (links are broken)

											
										
										
											2012-05-15 15:40:36 +04:00
+								from . import VERSION_STRING
-												Warn for relative URI references without a base URI.

											
										
										
											2012-05-24 18:06:58 +04:00
+								from .logger import LOGGER
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								from .compat import (
-												Replaced `urlopen_contenttype` with `urllib_get_content_type`, `urllib_get_charset` and `urllib_get_filename`.

											
										
										
											2014-04-23 18:24:14 +04:00
+								    urljoin, urlsplit, quote, unquote, unquote_to_bytes, urlopen,
 								    urllib_get_content_type, urllib_get_charset, urllib_get_filename, Request,
-												Merge branch 'pdf-attachments' from PR #177

											
										
										
											2014-04-27 20:56:02 +04:00
+								    parse_email, pathname2url, unicode, base64_decode, StreamingGzipFile)
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
-												Fix MIME type registering: always use native strings.

											
										
										
											2012-07-29 17:13:45 +04:00
+								# Unlinke HTML, CSS and PNG, the SVG MIME type is not always builtin
 								# in some Python version and therefore not reliable.
 								if sys.version_info[0] >= 3:
 								    mimetypes.add_type('image/svg+xml', '.svg')
 								else:
 								    # Native strings required.
 								    mimetypes.add_type(b'image/svg+xml', b'.svg')
-												Force the MIME types for the file extensions we care about.

											
										
										
											2012-07-29 00:38:44 +04:00
-												Work around getfilesystemencoding stupidity on Linux.

											
										
										
											2012-07-30 14:01:20 +04:00
+								# getfilesystemencoding() on Linux is sometimes stupid...
 								FILESYSTEM_ENCODING = sys.getfilesystemencoding() or 'utf-8'
 								try:
 								    if codecs.lookup(FILESYSTEM_ENCODING).name == 'ascii':
 								        FILESYSTEM_ENCODING = 'utf-8'
 								except LookupError:
 								    FILESYSTEM_ENCODING = 'utf-8'
-												Require at least 2 characters in URL schemes.

This makes sure that Windows filenames like C:\foo\bar.html
are not considered as URLs.

											
										
										
											2012-07-27 19:19:17 +04:00
+								# See http://stackoverflow.com/a/11687993/1162888
-												Accept both unicode and bytes filenames.

											
										
										
											2012-06-02 10:04:24 +04:00
+								# Both are needed in Python 3 as the re module does not like to mix
-												Fix the URL scheme regexp.

Not sure why I had [0-1] there.
											
										
										
											2013-04-12 11:32:25 +04:00
+								# http://tools.ietf.org/html/rfc3986#section-3.1
 								UNICODE_SCHEME_RE = re.compile('^([a-zA-Z][a-zA-Z0-9.+-]+):')
 								BYTES_SCHEME_RE = re.compile(b'^([a-zA-Z][a-zA-Z0-9.+-]+):')
-												Accept both unicode and bytes filenames.

											
										
										
											2012-06-02 10:04:24 +04:00
-												Switch the licence to BSD and rewrite module docstrings/headers

											
										
										
											2012-03-22 02:19:27 +04:00
-												Fix URI encoding per RFC 3986 and 3987.

											
										
										
											2012-05-18 11:12:50 +04:00
+								def iri_to_uri(url):
-												Fix typo

											
										
										
											2015-12-16 15:50:04 +03:00
+								    """Turn an IRI that can contain any Unicode character into an ASCII-only
-												Fix URI encoding per RFC 3986 and 3987.

											
										
										
											2012-05-18 11:12:50 +04:00
+								    URI that conforms to RFC 3986.
-												Have the PDF post-process run on Python3 (links are broken)

											
										
										
											2012-05-15 15:40:36 +04:00
+								    """
-												Escape non-ASCII characters in hyperlinks.

											
										
										
											2012-10-04 21:12:34 +04:00
+								    if url.startswith('data:'):
 								        # Data URIs can be huge, but don’t need this anyway.
 								        return url
-												Use the filesystem encoding for file:// URLs. This might help with #891

											
										
										
											2012-07-27 20:55:19 +04:00
+								    # Use UTF-8 as per RFC 3987 (IRI), except for file://
-												Work around getfilesystemencoding stupidity on Linux.

											
										
										
											2012-07-30 14:01:20 +04:00
+								    url = url.encode(FILESYSTEM_ENCODING
 								                     if url.startswith('file:') else 'utf-8')
-												Fix URI encoding per RFC 3986 and 3987.

											
										
										
											2012-05-18 11:12:50 +04:00
+								    # This is a full URI, not just a component. Only %-encode characters
 								    # that are not allowed at all in URIs. Everthing else is "safe":
 								    # * Reserved characters: /:?#[]@!$&'()*+,;=
 								    # * Unreserved characters: ASCII letters, digits and -._~
 								    #   Of these, only '~' is not in urllib’s "always safe" list.
 								    # * '%' to avoid double-encoding
-												Fix URL quoting on Python 2.6

											
										
										
											2012-05-18 19:54:10 +04:00
+								    return quote(url, safe=b"/:?#[]@!$&'()*+,;=~%")
-												Add and test more warnings.

											
										
										
											2011-12-16 15:19:10 +04:00
-												Switch from cssutils to tinycss as the CSS parser

											
										
										
											2012-03-24 16:39:31 +04:00
+								def path2url(path):
 								    """Return file URL of `path`"""
-												Accept both unicode and bytes filenames.

											
										
										
											2012-06-02 10:04:24 +04:00
+								    path = os.path.abspath(path)
-												Fix base_url set to a directory name.

With base_url='/a/b', a relative URL 'c' was resolved to '/a/c'.
Now it is resolved to '/a/b/c' if /a/b is a directory, '/a/c'
otherwise. This is most likely the expected behavior.

											
										
										
											2012-08-02 19:19:34 +04:00
+								    if os.path.isdir(path):
 								        # Make sure directory names have a trailing slash.
 								        # Otherwise relative URIs are resolved from the parent directory.
 								        path += os.path.sep
-												Accept both unicode and bytes filenames.

											
										
										
											2012-06-02 10:04:24 +04:00
+								    if isinstance(path, unicode):
-												Work around getfilesystemencoding stupidity on Linux.

											
										
										
											2012-07-30 14:01:20 +04:00
+								        path = path.encode(FILESYSTEM_ENCODING)
-												Gotta learn to run *all* tests before pushing.

											
										
										
											2012-07-27 20:08:05 +04:00
+								    path = pathname2url(path)
 								    if path.startswith('///'):
 								        # On Windows pathname2url(r'C:\foo') is apparently '///C:/foo'
 								        # That enough slashes already.
 								        return 'file:' + path
 								    else:
 								        return 'file://' + path
-												Switch from cssutils to tinycss as the CSS parser

											
										
										
											2012-03-24 16:39:31 +04:00
-												Test hyperlinks, fix internal hyperlink parsing.

											
										
										
											2012-05-21 20:43:08 +04:00
+								def url_is_absolute(url):
-												Accept both unicode and bytes filenames.

											
										
										
											2012-06-02 10:04:24 +04:00
+								    return bool(
 								        (UNICODE_SCHEME_RE if isinstance(url, unicode) else BYTES_SCHEME_RE)
 								        .match(url))
-												Test hyperlinks, fix internal hyperlink parsing.

											
										
										
											2012-05-21 20:43:08 +04:00
-												Do not require HtmlElement.

* Do not use element.base_url which only exists in lxml.html.HtmlElement
* Use lxml.etree.HtmlParser instead of lxml.html

This is one step toward using the html5lib parser, but see
https://github.com/Kozea/WeasyPrint/pull/12

											
										
										
											2012-11-06 16:55:39 +04:00
+								def element_base_url(element):
 								    """Return the URL associated with a lxml document.
 								    This is the same as the HtmlElement.base_url property, but dont’t want
 								    to require HtmlElement.
 								    """
 								    return element.getroottree().docinfo.URL
-												Warn for relative URI references without a base URI.

											
										
										
											2012-05-24 18:06:58 +04:00
+								def get_url_attribute(element, attr_name):
 								    """Get the URI corresponding to the ``attr_name`` attribute.
-												Test hyperlinks, fix internal hyperlink parsing.

											
										
										
											2012-05-21 20:43:08 +04:00
 								    Return ``None`` if:
 								    * the attribute is empty or missing or,
 								    * the value is a relative URI but the document has no base URI.
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
-												Test hyperlinks, fix internal hyperlink parsing.

											
										
										
											2012-05-21 20:43:08 +04:00
+								    Otherwise, return an absolute URI.
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
+								    """
-												Warn (not crash) on missing base_url for @import

											
										
										
											2012-09-25 18:01:12 +04:00
+								    value = element.get(attr_name, '').strip()
 								    if value:
-												Do not require HtmlElement.

* Do not use element.base_url which only exists in lxml.html.HtmlElement
* Use lxml.etree.HtmlParser instead of lxml.html

This is one step toward using the html5lib parser, but see
https://github.com/Kozea/WeasyPrint/pull/12

											
										
										
											2012-11-06 16:55:39 +04:00
+								        return url_join(element_base_url(element), value,
 								                        '<%s %s="%s"> at line %s', element.tag, attr_name,
 								                        value, element.sourceline)
-												Warn (not crash) on missing base_url for @import

											
										
										
											2012-09-25 18:01:12 +04:00
 								def url_join(base_url, url, context, *args):
 								    """Like urllib.urljoin, but issue a warning and return None if base_url
 								    is required but missing.
 								    """
 								    if url_is_absolute(url):
-												Escape non-ASCII characters in hyperlinks.

											
										
										
											2012-10-04 21:12:34 +04:00
+								        return iri_to_uri(url)
-												Warn (not crash) on missing base_url for @import

											
										
										
											2012-09-25 18:01:12 +04:00
+								    elif base_url:
-												Escape non-ASCII characters in hyperlinks.

											
										
										
											2012-10-04 21:12:34 +04:00
+								        return iri_to_uri(urljoin(base_url, url))
-												Warn (not crash) on missing base_url for @import

											
										
										
											2012-09-25 18:01:12 +04:00
+								    else:
-												Be careful logging.warn is deprecated

											
										
										
											2013-08-19 16:38:09 +04:00
+								        LOGGER.warning('Relative URI reference without a base URI: ' + context,
-												Fix code formatting. (Clean flake8!)

											
										
										
											2014-04-27 15:29:55 +04:00
+								                       *args)
-												Warn (not crash) on missing base_url for @import

											
										
										
											2012-09-25 18:01:12 +04:00
+								        return None
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
-												move urljoining to function

											
										
										
											2011-08-05 13:16:44 +04:00
-												Allow internal links without a base URI

... only for links in HTML attributes: <a href="#foo">

TODO: same for links in CSS: -weasy-link: url(#foo)

											
										
										
											2012-05-30 22:06:44 +04:00
+								def get_link_attribute(element, attr_name):
 								    """Return ('external', absolute_uri) or
 								    ('internal', unquoted_fragment_id) or None.
 								    """
 								    attr_value = element.get(attr_name, '').strip()
-												Allow absolute URLs without a base URL. Fix #42.

											
										
										
											2013-03-13 15:12:12 +04:00
+								    if attr_value.startswith('#') and len(attr_value) > 1:
-												Allow internal links without a base URI

... only for links in HTML attributes: <a href="#foo">

TODO: same for links in CSS: -weasy-link: url(#foo)

											
										
										
											2012-05-30 22:06:44 +04:00
+								        # Do not require a base_url when the value is just a fragment.
 								        return 'internal', unquote(attr_value[1:])
-												Allow absolute URLs without a base URL. Fix #42.

											
										
										
											2013-03-13 15:12:12 +04:00
+								    uri = get_url_attribute(element, attr_name)
 								    if uri:
-												Do not require HtmlElement.

* Do not use element.base_url which only exists in lxml.html.HtmlElement
* Use lxml.etree.HtmlParser instead of lxml.html

This is one step toward using the html5lib parser, but see
https://github.com/Kozea/WeasyPrint/pull/12

											
										
										
											2012-11-06 16:55:39 +04:00
+								        document_url = element_base_url(element)
-												Allow absolute URLs without a base URL. Fix #42.

											
										
										
											2013-03-13 15:12:12 +04:00
+								        if document_url:
-												Allow internal links without a base URI

... only for links in HTML attributes: <a href="#foo">

TODO: same for links in CSS: -weasy-link: url(#foo)

											
										
										
											2012-05-30 22:06:44 +04:00
+								            parsed = urlsplit(uri)
 								            # Compare with fragments removed
-												Do not require HtmlElement.

* Do not use element.base_url which only exists in lxml.html.HtmlElement
* Use lxml.etree.HtmlParser instead of lxml.html

This is one step toward using the html5lib parser, but see
https://github.com/Kozea/WeasyPrint/pull/12

											
										
										
											2012-11-06 16:55:39 +04:00
+								            if parsed[:-1] == urlsplit(document_url)[:-1]:
-												Allow internal links without a base URI

... only for links in HTML attributes: <a href="#foo">

TODO: same for links in CSS: -weasy-link: url(#foo)

											
										
										
											2012-05-30 22:06:44 +04:00
+								                return 'internal', unquote(parsed.fragment)
-												Allow absolute URLs without a base URL. Fix #42.

											
										
										
											2013-03-13 15:12:12 +04:00
+								        return 'external', uri
-												Allow internal links without a base URI

... only for links in HTML attributes: <a href="#foo">

TODO: same for links in CSS: -weasy-link: url(#foo)

											
										
										
											2012-05-30 22:06:44 +04:00
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
+								def ensure_url(string):
 								    """Get a ``scheme://path`` URL from ``string``.
 								    If ``string`` looks like an URL, return it unchanged. Otherwise assume a
 								    filename and convert it to a ``file://`` URL.
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
+								    """
-												Use the filesystem encoding for file:// URLs. This might help with #891

											
										
										
											2012-07-27 20:55:19 +04:00
+								    return string if url_is_absolute(string) else path2url(string)
-												Add support for data: URL scheme stylesheets.

											
										
										
											2011-08-16 17:11:35 +04:00
-												Remove a deprecation warning.

											
										
										
											2012-07-29 20:38:59 +04:00
+								def safe_base64_decode(data):
-												Fix base64 URLs without padding

											
										
										
											2012-03-21 19:07:49 +04:00
+								    """Decode base64, padding being optional.
 								    "From a theoretical point of view, the padding character is not needed,
 								     since the number of missing bytes can be calculated from the number
 								     of Base64 digits."
 								    https://en.wikipedia.org/wiki/Base64#Padding
 								    :param data: Base64 data as an ASCII byte string
 								    :returns: The decoded byte string.
 								    """
 								    missing_padding = 4 - len(data) % 4
 								    if missing_padding:
-												Do not require HtmlElement.

* Do not use element.base_url which only exists in lxml.html.HtmlElement
* Use lxml.etree.HtmlParser instead of lxml.html

This is one step toward using the html5lib parser, but see
https://github.com/Kozea/WeasyPrint/pull/12

											
										
										
											2012-11-06 16:55:39 +04:00
+								        data += b'=' * missing_padding
-												Remove a deprecation warning.

											
										
										
											2012-07-29 20:38:59 +04:00
+								    return base64_decode(data)
-												Fix base64 URLs without padding

											
										
										
											2012-03-21 19:07:49 +04:00
-												Make urlopen() extensible

There is a global dict mapping URI schemes to opener functions,
and a decorator to add such a function.

Expected usage:

from weasyprint.urls import register_opener
@register_opener('foo')
def git_urlopen(url):
    url = urlparse.urlsplit(url)
    assert url.scheme == 'foo'
    # ...
    return fileobj, mimetype, charset

											
										
										
											2012-05-23 16:43:02 +04:00
def open_data_url(url):
    """Decode URLs with the 'data' scheme. urllib can handle them
    in Python 2, but that is broken in Python 3.

    Inspired from Python 2.7.2’s urllib.py.

    :param url: The full ``data:`` URL, including the scheme.
    :raises: :class:`IOError` if the URL has no ``,`` separating the
        header from the data.
    :returns: A dict with the keys ``string`` (the decoded bytes),
        ``mime_type``, ``encoding`` (the charset) and ``redirected_url``
        (the URL itself).

    """
    # syntax of data URLs:
    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
    # mediatype := [ type "/" subtype ] *( ";" parameter )
    # data      := *urlchar
    # parameter := attribute "=" value
    try:
        header, data = url.split(',', 1)
    except ValueError:
        raise IOError('bad data URL')
    header = header[5:]  # len('data:') == 5
    if header:
        # A trailing ";token" without "=" is the transfer encoding marker
        # (eg. ";base64"); anything with "=" is a media type parameter.
        semi = header.rfind(';')
        if semi >= 0 and '=' not in header[semi:]:
            content_type = header[:semi]
            encoding = header[semi + 1:]
        else:
            content_type = header
            encoding = ''
        # Let the email parser split the media type from its parameters.
        message = parse_email('Content-type: ' + content_type)
        mime_type = message.get_content_type()
        charset = message.get_content_charset()
    else:
        mime_type = 'text/plain'
        charset = 'US-ASCII'
        encoding = ''

    data = unquote_to_bytes(data)
    # The scheme itself is matched case-insensitively by the caller;
    # do the same for the "base64" marker so ";BASE64" is decoded too.
    if encoding.lower() == 'base64':
        data = safe_base64_decode(data)

    return dict(string=data, mime_type=mime_type, encoding=charset,
                redirected_url=url)
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
-												Fix #86: Support gzip and deflate encoding in HTTP responses

											
										
										
											2014-04-22 02:52:58 +04:00
# Request headers passed to ``Request(...)`` by ``default_url_fetcher``.
HTTP_HEADERS = {
    'User-Agent': VERSION_STRING,
    # Both encodings advertised here are decoded by ``default_url_fetcher``
    # according to the response's Content-Encoding header.
    'Accept-Encoding': 'gzip, deflate',
}
-												Fix code formatting. (Clean flake8!)

											
										
										
											2014-04-27 15:29:55 +04:00
-												Add ``url_fetcher`` to the public API

											
										
										
											2012-07-13 14:24:55 +04:00
def default_url_fetcher(url):
    """Fetch an external resource such as an image or stylesheet.

    Another callable with the same signature can be given as the
    :obj:`url_fetcher` argument to :class:`HTML` or :class:`CSS`.
    (See :ref:`url-fetchers`.)

    :type url: Unicode string
    :param url: The URL of the resource to fetch
    :raises: any exception to indicate failure. Failures are logged
        as warnings, with the string representation of the exception
        in the message.
    :returns: In case of success, a dict with the following keys:

        * One of ``string`` (a byte string) or ``file_obj``
          (a file-like object)
        * Optionally: ``mime_type``, a MIME type extracted eg. from a
          *Content-Type* header. If not provided, the type is guessed from the
          file extension in the URL.
        * Optionally: ``encoding``, a character encoding extracted eg. from a
          *charset* parameter in a *Content-Type* header
        * Optionally: ``redirected_url``, the actual URL of the resource
          in case there were eg. HTTP redirects.
        * Optionally: ``filename``, the filename of the resource. Usually
          derived from the *filename* parameter in a *Content-Disposition*
          header

        If a ``file_obj`` key is given, it is the caller’s responsibility
        to call ``file_obj.close()``.

    """
    if url.lower().startswith('data:'):
        return open_data_url(url)
    elif UNICODE_SCHEME_RE.match(url):
        url = iri_to_uri(url)
        response = urlopen(Request(url, headers=HTTP_HEADERS))
        result = dict(redirected_url=response.geturl(),
                      mime_type=urllib_get_content_type(response),
                      encoding=urllib_get_charset(response),
                      filename=urllib_get_filename(response))
        content_encoding = response.info().get('Content-Encoding')
        if content_encoding == 'gzip':
            if StreamingGzipFile is None:
                # No streaming decompression available: decompress in
                # memory. The response is fully consumed here and is not
                # handed to the caller, so close it even on failure.
                try:
                    result['string'] = gzip.GzipFile(
                        fileobj=io.BytesIO(response.read())).read()
                finally:
                    response.close()
            else:
                result['file_obj'] = StreamingGzipFile(fileobj=response)
        elif content_encoding == 'deflate':
            # Same as above: the body is read entirely and the response
            # never reaches the caller, so it must be closed here.
            try:
                data = response.read()
            finally:
                response.close()
            try:
                result['string'] = zlib.decompress(data)
            except zlib.error:
                # Try without zlib header or checksum
                result['string'] = zlib.decompress(data, -15)
        else:
            result['file_obj'] = response
        return result
    else:
        raise ValueError('Not an absolute URI: %r' % url)
-												Make mime_type optional in custom URL fetchers

											
										
										
											2012-07-18 16:31:55 +04:00
-												Image loading: only swallow exception related to fetching or decoding.

											
										
										
											2013-06-21 00:32:28 +04:00
class URLFetchingError(IOError):
    """Some error happened when fetching an URL.

    Raised by :func:`fetch` in place of whatever exception the URL fetcher
    raised; the message holds the original exception's type name and,
    when not empty, its string representation.

    """
-												URL fetching: enforce closing sockets/files with a context manager.

											
										
										
											2013-06-20 15:58:24 +04:00
@contextlib.contextmanager
def fetch(url_fetcher, url):
    """Call an url_fetcher, fill in optional data, and clean up.

    To be used as a context manager: the value bound by ``with`` is the
    dict returned by ``url_fetcher(url)``, completed with a
    ``redirected_url`` key (defaulting to ``url``) and a ``mime_type`` key
    (defaulting to ``None``). If the dict has a ``file_obj`` key, that
    object is closed when the ``with`` block exits.

    :param url_fetcher: A callable taking an URL and returning a dict.
    :param url: The URL passed to ``url_fetcher``.
    :raises: :class:`URLFetchingError` if ``url_fetcher`` raises any
        exception.

    """
    try:
        result = url_fetcher(url)
    except Exception as exc:
        # Keep the original exception's type name in the message, as the
        # exception itself is replaced.
        name = type(exc).__name__
        value = str(exc)
        raise URLFetchingError('%s: %s' % (name, value) if value else name)
    result.setdefault('redirected_url', url)
    result.setdefault('mime_type', None)
    if 'file_obj' in result:
        try:
            yield result
        finally:
            try:
                result['file_obj'].close()
            except Exception:
                # May already be closed or something.
                # This is just cleanup anyway: log but make it non-fatal.
                LOGGER.warning('Error when closing stream for %s:\n%s',
                               url, traceback.format_exc())
    else:
        yield result