WeasyPrint/weasyprint/utils.py

# coding: utf8
"""
    weasyprint.utils
    ----------------

    Various utility functions and classes.

    :copyright: Copyright 2011-2012 Simon Sapin and contributors, see AUTHORS.
    :license: BSD, see LICENSE for details.

"""

from __future__ import division, unicode_literals

import io
import base64

from cssutils.helper import path2url

from . import VERSION
from .logger import LOGGER
from .compat import (
    urljoin, urlparse, unquote_to_bytes, urlopen_contenttype, Request,
    parse_email)


# TODO: Most of this module is URL-related. Rename it to weasyprint.urls?


# Value of the ``User-Agent`` header that ``urlopen()`` sends with every
# request that is not a ``data:`` URL. It embeds the package ``VERSION``.
# NOTE(review): presumably set because some servers reject urllib's default
# user agent -- confirm against the project history.
HTTP_USER_AGENT = 'WeasyPrint/%s http://weasyprint.org/' % VERSION


def get_url_attribute(element, key):
    """Return the absolute URL held by the ``key`` attribute of ``element``.

    The attribute value is stripped of surrounding whitespace and then
    resolved against ``element.base_url``, so relative URLs come back
    absolute.

    :param element: An object with a ``get(key)`` method and a ``base_url``
        attribute.
    :param key: Name of the attribute to read.
    :returns: The joined URL, or ``None`` when the attribute is missing,
        empty, or contains only whitespace.

    """
    raw_value = element.get(key)
    if not raw_value:
        return None
    stripped = raw_value.strip()
    if not stripped:
        # Whitespace-only values are treated like a missing attribute.
        return None
    return urljoin(element.base_url, stripped)


def ensure_url(string):
    """Turn ``string`` into a ``scheme://path`` URL.

    A string that already parses with a non-empty URL scheme is returned
    as is. Anything else is treated as a filename: it is encoded as UTF-8
    and handed to ``path2url`` to build the URL.

    :param string: A URL or a filesystem path, as a text string.
    :returns: ``string`` itself, or the result of ``path2url``.

    """
    has_scheme = bool(urlparse(string).scheme)
    if not has_scheme:
        return path2url(string.encode('utf8'))
    return string


def decode_base64(data):
    """Decode base64, padding being optional.

    "From a theoretical point of view, the padding character is not needed,
     since the number of missing bytes can be calculated from the number
     of Base64 digits."

    https://en.wikipedia.org/wiki/Base64#Padding

    :param data: Base64 data as an ASCII byte string
    :returns: The decoded byte string.
    :raises: Whatever ``base64.b64decode`` raises for data that cannot be
        decoded even after padding.

    """
    # Number of '=' needed to reach a multiple of 4. ``-len % 4`` is 0 for
    # input that is already aligned, so such input is passed through
    # untouched instead of getting a spurious '====' appended.
    missing_padding = -len(data) % 4
    if missing_padding:
        data += b'=' * missing_padding
    # b64decode exists on both Python 2 and 3; base64.decodestring was
    # removed in Python 3.9.
    return base64.b64decode(data)


def parse_data_url(url):
    """Decode a ``data:`` URL into its payload and content-type details.

    urllib can handle these URLs in Python 2, but that is broken in
    Python 3. Inspired by urllib.py from Python 2.7.2.

    Syntax of data URLs::

        dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        mediatype := [ type "/" subtype ] *( ";" parameter )
        data      := *urlchar
        parameter := attribute "=" value

    :param url: The full URL, including the leading ``data:``.
    :returns: A ``(file_like, mime_type, charset)`` tuple where
        ``file_like`` is an ``io.BytesIO`` wrapping the decoded payload.
    :raises IOError: If the URL contains no comma separating the header
        from the data.

    """
    head, comma, payload = url.partition(',')
    if not comma:
        raise IOError('bad data URL')

    media_type = head[5:]  # drop the leading 'data:' (5 characters)
    transfer_encoding = ''
    if not media_type:
        # Nothing between 'data:' and the comma: fall back to defaults.
        mime_type, charset = 'text/plain', 'US-ASCII'
    else:
        last_semi = media_type.rfind(';')
        # A trailing ';token' with no '=' is an encoding marker (such as
        # ';base64'), not a media type parameter.
        if last_semi >= 0 and '=' not in media_type[last_semi:]:
            transfer_encoding = media_type[last_semi + 1:]
            media_type = media_type[:last_semi]
        # Let the email parser extract the MIME type and charset parameter.
        message = parse_email('Content-type: ' + media_type)
        mime_type = message.get_content_type()
        charset = message.get_content_charset()

    raw_bytes = unquote_to_bytes(payload)
    if transfer_encoding == 'base64':
        raw_bytes = decode_base64(raw_bytes)

    return io.BytesIO(raw_bytes), mime_type, charset


def urlopen(url):
    """Fetch ``url`` and return ``(file_like, mime_type, charset)``.

    URLs starting with ``data:`` are decoded locally by
    ``parse_data_url``. Every other URL is opened through
    ``urlopen_contenttype`` with the ``HTTP_USER_AGENT`` header set.

    It is the caller's responsibility to call ``file_like.close()``.
    """
    if not url.startswith('data:'):
        request = Request(url, headers={'User-Agent': HTTP_USER_AGENT})
        return urlopen_contenttype(request)
    return parse_data_url(url)


def urllib_fetcher(url):
    """URL fetcher for cssutils.

    Opens ``url`` with this module's ``urlopen`` (which also understands
    ``data:`` URLs) and returns the stylesheet bytes together with the
    charset reported for them.

    :param url: URL of the stylesheet to fetch.
    :returns: A ``(charset, content)`` tuple, or ``None`` (after logging a
        warning) when the resource is not served as ``text/css``.

    """
    file_like, mime_type, charset = urlopen(url)
    # Always release the underlying resource, including when the MIME type
    # is rejected or reading fails.
    try:
        if mime_type != 'text/css':
            LOGGER.warning(
                'Expected `text/css` for stylsheet at %s, got `%s`',
                url, mime_type)
            return None
        content = file_like.read()
    finally:
        file_like.close()
    return charset, content


class cached_property(object):
    """Decorator turning a method into a lazily computed, cached attribute.

    On first access through an instance the wrapped function is called
    with that instance and its result is stored in the instance
    ``__dict__`` under the function's name. Later accesses return the
    stored value without calling the function again.

    Stolen from Werkzeug:
    https://github.com/mitsuhiko/werkzeug/blob/7b8d887d33/werkzeug/utils.py#L28

    """

    def __init__(self, func):
        self.func = func
        # Mirror the wrapped function's metadata on the descriptor.
        for attribute in ('__name__', '__module__', '__doc__'):
            setattr(self, attribute, getattr(func, attribute))

    def __get__(self, obj, type=None):
        # Accessed on the class rather than an instance: expose the
        # descriptor itself.
        if obj is None:
            return self
        cache = obj.__dict__
        key = self.__name__
        try:
            return cache[key]
        except KeyError:
            # First access on this instance: compute once and remember.
            result = cache[key] = self.func(obj)
            return result
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
+								# coding: utf8
-												Switch the licence to BSD and rewrite module docstrings/headers

											
										
										
											2012-03-22 02:19:27 +04:00
+								"""
 								    weasyprint.utils
 								    ----------------
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
-												Switch the licence to BSD and rewrite module docstrings/headers

											
										
										
											2012-03-22 02:19:27 +04:00
+								    Various utility functions and classes.
-												Add support for data: URL scheme stylesheets.

											
										
										
											2011-08-16 17:11:35 +04:00
-												Switch the licence to BSD and rewrite module docstrings/headers

											
										
										
											2012-03-22 02:19:27 +04:00
+								    :copyright: Copyright 2011-2012 Simon Sapin and contributors, see AUTHORS.
 								    :license: BSD, see LICENSE for details.
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
 								"""
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								from __future__ import division, unicode_literals
 								import io
 								import base64
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
+								from cssutils.helper import path2url
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
+								from . import VERSION
-												Rename weasyprint.logging to avoid conflicts the stdlib module.

											
										
										
											2012-02-22 20:12:40 +04:00
+								from .logger import LOGGER
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								from .compat import (
 								    urljoin, urlparse, unquote_to_bytes, urlopen_contenttype, Request,
 								    parse_email)
-												Switch the licence to BSD and rewrite module docstrings/headers

											
										
										
											2012-03-22 02:19:27 +04:00
+								# TODO: Most of this module is URL-related. Rename it to weasyprint.urls?
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								HTTP_USER_AGENT = 'WeasyPrint/%s http://weasyprint.org/' % VERSION
-												Add and test more warnings.

											
										
										
											2011-12-16 15:19:10 +04:00
-												move urljoining to function

											
										
										
											2011-08-05 13:16:44 +04:00
+								def get_url_attribute(element, key):
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
+								    """Get the URL corresponding to the ``key`` attribute of ``element``.
 								    The retrieved URL is absolute, even if the URL in the element is relative.
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
+								    """
-												Handle alt-text on images.

											
										
										
											2011-08-25 19:29:16 +04:00
+								    attr_value = element.get(key)
-												Add and test more warnings.

											
										
										
											2011-12-16 15:19:10 +04:00
+								    if attr_value:
-												Be more defensive.

											
										
										
											2012-02-27 19:48:27 +04:00
+								        attr_value = attr_value.strip()
 								        if attr_value:
 								            return urljoin(element.base_url, attr_value)
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
-												move urljoining to function

											
										
										
											2011-08-05 13:16:44 +04:00
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
+								def ensure_url(string):
 								    """Get a ``scheme://path`` URL from ``string``.
 								    If ``string`` looks like an URL, return it unchanged. Otherwise assume a
 								    filename and convert it to a ``file://`` URL.
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
+								    """
-												Add tests and fixes for non-ASCII URLs

											
										
										
											2012-01-16 16:12:27 +04:00
+								    if urlparse(string).scheme:
 								        return string
 								    else:
 								        return path2url(string.encode('utf8'))
-												Add support for data: URL scheme stylesheets.

											
										
										
											2011-08-16 17:11:35 +04:00
-												Fix base64 URLs without padding

											
										
										
											2012-03-21 19:07:49 +04:00
+								def decode_base64(data):
 								    """Decode base64, padding being optional.
 								    "From a theoretical point of view, the padding character is not needed,
 								     since the number of missing bytes can be calculated from the number
 								     of Base64 digits."
 								    https://en.wikipedia.org/wiki/Base64#Padding
 								    :param data: Base64 data as an ASCII byte string
 								    :returns: The decoded byte string.
 								    """
 								    missing_padding = 4 - len(data) % 4
 								    if missing_padding:
 								        data += b'='* missing_padding
 								    return base64.decodestring(data)
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								def parse_data_url(url):
 								    """Decode URLs with the 'data' stream. urllib can handle them
 								    in Python 2, but that is broken in Python 3.
 								    Inspired from the Python 2.7.2’s urllib.py.
 								    """
 								    # syntax of data URLs:
 								    # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
 								    # mediatype := [ type "/" subtype ] *( ";" parameter )
 								    # data      := *urlchar
 								    # parameter := attribute "=" value
 								    try:
 								        header, data = url.split(',', 1)
 								    except ValueError:
 								        raise IOError('bad data URL')
 								    header = header[5:]  # len('data:') == 5
 								    if header:
 								        semi = header.rfind(';')
 								        if semi >= 0 and '=' not in header[semi:]:
 								            content_type = header[:semi]
 								            encoding = header[semi+1:]
 								        else:
 								            content_type = header
 								            encoding = ''
 								        message = parse_email('Content-type: ' + content_type)
 								        mime_type = message.get_content_type()
 								        charset = message.get_content_charset()
 								    else:
 								        mime_type = 'text/plain'
 								        charset = 'US-ASCII'
 								        encoding = ''
-												Fix base64 URLs without padding

											
										
										
											2012-03-21 19:07:49 +04:00
+								    data = unquote_to_bytes(data)
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								    if encoding == 'base64':
-												Fix base64 URLs without padding

											
										
										
											2012-03-21 19:07:49 +04:00
+								        data = decode_base64(data)
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
 								    return io.BytesIO(data), mime_type, charset
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
+								def urlopen(url):
 								    """Fetch an URL and return ``(file_like, mime_type, charset)``.
-												Refactor image loading.

											
										
										
											2011-12-08 19:31:03 +04:00
 								    It is the caller’s responsability to call ``file_like.close()``.
-												Add support for data: URL scheme stylesheets.

											
										
										
											2011-08-16 17:11:35 +04:00
+								    """
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								    if url.startswith('data:'):
 								        return parse_data_url(url)
-												Partial Python 3 compatibility.

											
										
										
											2011-10-10 18:39:41 +04:00
+								    else:
-												Python 3 compat. All tests pass with the same code base!

											
										
										
											2012-02-17 21:49:58 +04:00
+								        return urlopen_contenttype(Request(url,
 								            headers={'User-Agent': HTTP_USER_AGENT}))
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
 								def urllib_fetcher(url):
 								    """URL fetcher for cssutils.
 								    This fetcher is based on urllib instead of urllib2, since urllib has
 								    support for the "data" URL scheme.
 								    """
 								    file_like, mime_type, charset = urlopen(url)
 								    if mime_type != 'text/css':
-												Add and test more warnings.

											
										
										
											2011-12-16 15:19:10 +04:00
+								        LOGGER.warn('Expected `text/css` for stylsheet at %s, got `%s`',
 								                    url, mime_type)
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
+								        return None
-												Refactor image loading.

											
										
										
											2011-12-08 19:31:03 +04:00
+								    content = file_like.read()
 								    file_like.close()
 								    return charset, content
-												Cleanups

											
										
										
											2012-02-22 18:52:49 +04:00
 								class cached_property(object):
 								    """A decorator that converts a function into a lazy property. The
 								    function wrapped is called the first time to retrieve the result
 								    and then that calculated result is used the next time you access
 								    the value.
 								    Stolen from Werkzeug:
 								    https://github.com/mitsuhiko/werkzeug/blob/7b8d887d33/werkzeug/utils.py#L28
 								    """
 								    def __init__(self, func):
 								        self.__name__ = func.__name__
 								        self.__module__ = func.__module__
 								        self.__doc__ = func.__doc__
 								        self.func = func
 								    def __get__(self, obj, type=None):
 								        if obj is None:
 								            return self
 								        missing = object()
 								        value = obj.__dict__.get(self.__name__, missing)
 								        if value is missing:
 								            value = self.func(obj)
 								            obj.__dict__[self.__name__] = value
 								        return value