WeasyPrint/weasy/utils.py

# coding: utf8

#  WeasyPrint converts web documents (HTML, CSS, ...) to PDF.
#  Copyright (C) 2011  Simon Sapin
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Affero General Public License as
#  published by the Free Software Foundation, either version 3 of the
#  License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Affero General Public License for more details.
#
#  You should have received a copy of the GNU Affero General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.


"""
Various utils.

"""

import urllib
from urlparse import urljoin, urlparse

from cssutils.helper import path2url

from . import VERSION


def get_url_attribute(element, key):
    """Return the ``key`` attribute of ``element`` as an absolute URL.

    The attribute value is stripped of surrounding whitespace and resolved
    against ``element.base_url``, so relative references come back absolute.
    Return ``None`` when the element has no ``key`` attribute.

    """
    raw_url = element.get(key)
    if raw_url is not None:
        return urljoin(element.base_url, raw_url.strip())
    return None


def ensure_url(string):
    """Get a ``scheme://path`` URL from ``string``.

    If ``string`` looks like an URL, return it unchanged. Otherwise assume a
    filename and convert it to a ``file://`` URL.

    A one-character "scheme" is not considered an URL scheme: it is what
    ``urlparse`` reports for a Windows drive letter, as in
    ``C:/dir/file.html``, so such strings are handled as filenames.

    """
    # Require at least two characters so that drive letters are not mistaken
    # for URL schemes; no URL scheme in actual use is a single letter.
    if len(urlparse(string).scheme) > 1:
        return string
    return path2url(string)


class URLopener(urllib.FancyURLopener):
    """``urllib.FancyURLopener`` that identifies itself as WeasyPrint."""
    # User-Agent: urllib openers take it from the ``version`` class
    # attribute. A custom value is set because some servers (reportedly
    # Wikimedia) appear to reject the default urllib user agent.
    version = 'WeasyPrint/%s http://weasyprint.org/' % VERSION


def urlopen(url):
    """Fetch an URL and return ``(file_like, mime_type, charset)``.

    ``file_like`` is the ``fp`` attribute of the response returned by
    ``URLopener().open(url)``. ``mime_type`` and ``charset`` are read from
    the response headers.

    It is the caller's responsibility to call ``file_like.close()``.
    """
    response = URLopener().open(url)
    headers = response.info()
    # The headers object has a different API depending on the Python version:
    # ``get_content_type`` / ``get_param`` on Python 3,
    # ``gettype`` / ``getparam`` on Python 2.
    mime_type = (
        headers.get_content_type()
        if hasattr(headers, 'get_content_type')
        else headers.gettype())
    charset = (
        headers.get_param('charset')
        if hasattr(headers, 'get_param')
        else headers.getparam('charset'))
    return response.fp, mime_type, charset


def urllib_fetcher(url):
    """URL fetcher for cssutils.

    This fetcher is based on urllib instead of urllib2, since urllib has
    support for the "data" URL scheme.

    Return ``(charset, content)`` for a ``text/css`` response, or ``None``
    when the response has any other MIME type. The fetched file is closed in
    every case, including when the response is rejected or reading fails.

    """
    file_like, mime_type, charset = urlopen(url)
    try:
        if mime_type != 'text/css':
            # TODO: add a warning
            return None
        content = file_like.read()
    finally:
        # Also runs on the early return above, so the rejected response
        # does not leak its file object.
        file_like.close()
    return charset, content
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
+								# coding: utf8
 								#  WeasyPrint converts web documents (HTML, CSS, ...) to PDF.
 								#  Copyright (C) 2011  Simon Sapin
 								#
 								#  This program is free software: you can redistribute it and/or modify
 								#  it under the terms of the GNU Affero General Public License as
 								#  published by the Free Software Foundation, either version 3 of the
 								#  License, or (at your option) any later version.
 								#
 								#  This program is distributed in the hope that it will be useful,
 								#  but WITHOUT ANY WARRANTY; without even the implied warranty of
 								#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 								#  GNU Affero General Public License for more details.
 								#
 								#  You should have received a copy of the GNU Affero General Public License
 								#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-												Add support for data: URL scheme stylesheets.

											
										
										
											2011-08-16 17:11:35 +04:00
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
+								"""
 								Various utils.
 								"""
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
+								import urllib
 								from urlparse import urljoin, urlparse
-												Add the framework for layout computations.

											
										
										
											2011-06-30 00:34:01 +04:00
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
+								from cssutils.helper import path2url
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
+								from . import VERSION
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
-												move urljoining to function

											
										
										
											2011-08-05 13:16:44 +04:00
+								def get_url_attribute(element, key):
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
+								    """Get the URL corresponding to the ``key`` attribute of ``element``.
 								    The retrieved URL is absolute, even if the URL in the element is relative.
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
+								    """
-												Handle alt-text on images.

											
										
										
											2011-08-25 19:29:16 +04:00
+								    attr_value = element.get(key)
 								    if attr_value is None:
 								        return None
 								    return urljoin(element.base_url, attr_value.strip())
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
-												move urljoining to function

											
										
										
											2011-08-05 13:16:44 +04:00
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
+								def ensure_url(string):
 								    """Get a ``scheme://path`` URL from ``string``.
 								    If ``string`` looks like an URL, return it unchanged. Otherwise assume a
 								    filename and convert it to a ``file://`` URL.
-												Add a weasyprint.py script.

											
										
										
											2011-08-09 14:45:51 +04:00
+								    """
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
+								    return string if urlparse(string).scheme else path2url(string)
-												Add support for data: URL scheme stylesheets.

											
										
										
											2011-08-16 17:11:35 +04:00
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
+								class URLopener(urllib.FancyURLopener):
 								    # User-Agent
 								    version = 'WeasyPrint/%s http://weasyprint.org/' % VERSION
-												Clean weasy/utils

											
										
										
											2011-08-19 18:53:05 +04:00
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
+								def urlopen(url):
 								    """Fetch an URL and return ``(file_like, mime_type, charset)``.
-												Refactor image loading.

											
										
										
											2011-12-08 19:31:03 +04:00
 								    It is the caller’s responsability to call ``file_like.close()``.
-												Add support for data: URL scheme stylesheets.

											
										
										
											2011-08-16 17:11:35 +04:00
+								    """
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
+								    file_like = URLopener().open(url)
 								    info = file_like.info()
-												Partial Python 3 compatibility.

											
										
										
											2011-10-10 18:39:41 +04:00
+								    if hasattr(info, 'get_content_type'):
 								        # Python 3
 								        mime_type = info.get_content_type()
 								    else:
 								        # Python 2
 								        mime_type = info.gettype()
 								    if hasattr(info, 'get_param'):
 								        # Python 3
 								        charset = info.get_param('charset')
 								    else:
 								        # Python 2
 								        charset = info.getparam('charset')
-												Refactor image loading.

											
										
										
											2011-12-08 19:31:03 +04:00
+								    return file_like.fp, mime_type, charset
-												Re-factor urllib usage and add a custom User-Agent.

It seems that Wikimedia bans the urllib user agent.

											
										
										
											2011-10-17 17:04:13 +04:00
 								def urllib_fetcher(url):
 								    """URL fetcher for cssutils.
 								    This fetcher is based on urllib instead of urllib2, since urllib has
 								    support for the "data" URL scheme.
 								    """
 								    file_like, mime_type, charset = urlopen(url)
 								    if mime_type != 'text/css':
 								        # TODO: add a warning
 								        return None
-												Refactor image loading.

											
										
										
											2011-12-08 19:31:03 +04:00
+								    content = file_like.read()
 								    file_like.close()
 								    return charset, content