WeasyPrint/weasyprint/css/__init__.py

# coding: utf8
"""
    weasyprint.css
    --------------

    This module takes care of steps 3 and 4 of “CSS 2.1 processing model”:
    Retrieve stylesheets associated with a document and annotate every element
    with a value for every CSS property.

    http://www.w3.org/TR/CSS21/intro.html#processing-model

    This module does this in more than two steps. The
    :func:`get_all_computed_styles` function does everything, but it itself
    calls a function for each step:

    ``find_stylesheets``
        Find and parse all author stylesheets in a document.

    ``preprocess_stylesheet``
        Resolve @media and @import rules.

    ``match_selectors``
        Find elements in a document that match a selector list.

    ``find_style_attributes``
        Find and parse all `style` HTML attributes.

    ``preprocess_declarations``
        Remove ignored properties and expand shorthands.

    ``add_declaration``
        Take applicable properties and only keep those with highest weight.

    ``set_computed_styles``
        Handle initial values, inheritance and computed values for one element.


    :copyright: Copyright 2011-2012 Simon Sapin and contributors, see AUTHORS.
    :license: BSD, see LICENSE for details.

"""

from __future__ import division, unicode_literals

import re

import tinycss
from tinycss.selectors3 import STYLE_ATTRIBUTE_SPECIFICITY
from lxml import cssselect

from . import properties
from . import validation
from . import computed_values
from ..utils import get_url_attribute
from ..logger import LOGGER
from ..compat import iteritems, urljoin
from .. import CSS


# Module-wide tinycss parser, built with the ``with_selectors3`` and
# ``with_page3`` options switched on. It is used in this module to parse
# the content of ``style`` attributes.
PARSER = tinycss.make_parser(with_selectors3=True, with_page3=True)


# Pseudo-classes and pseudo-elements are the same to lxml.cssselect.parse().
# List the identifiers for all CSS3 pseudo elements here to distinguish them.
PSEUDO_ELEMENTS = ('before', 'after', 'first-line', 'first-letter')

# Selectors for @page rules can have a pseudo-class, one of :first, :left
# or :right. This maps pseudo-classes to lists of "page types" selected.
# The ``None`` entry doubles as the list of every known page type.
PAGE_PSEUDOCLASS_TARGETS = {
    'first': ['first_left_page', 'first_right_page'],
    'left': ['left_page', 'first_left_page'],
    'right': ['right_page', 'first_right_page'],
    # no pseudo-class: all pages
    None: ['left_page', 'right_page', 'first_left_page', 'first_right_page'],
}

# A test function for property names (written with underscores): it returns
# a truthy match object if the given property has an initial value that is
# not always the same when computed, and ``None`` otherwise.
# The names matched are ``display`` and ``border_*_width`` / ``border_*_color``.
RE_INITIAL_NOT_COMPUTED = re.compile(
    '^(display|border_[a-z]+_(width|color))$').match


class StyleDict(object):
    """A mapping (dict-like) that allows attribute access to values.

    Allow eg. ``style.font_size`` instead of ``style['font_size']``.

    :param data: optional mapping or iterable of pairs; it is copied into
                 this object's own storage.
    :param parent: if given, should be a mapping. Values missing from this
                   dict will be looked up in the parent dict. Setting a value
                   in this dict masks any value in the parent.

    """
    def __init__(self, data=None, parent=None):
        if data is None:
            data = {}
        else:
            data = dict(data)
        if parent is None:
            parent = {}
        # work around our own __setattr__
        object.__setattr__(self, '_storage', data)
        object.__setattr__(self, '_parent', parent)

    def __getitem__(self, key):
        """Return the value for ``key``, falling back to the parent.

        Raise whatever the parent raises (``KeyError`` for a dict) when the
        key is found nowhere.
        """
        storage = self._storage
        if key in storage:
            return storage[key]
        else:
            return self._parent[key]

    def __setitem__(self, key, value):
        """Store ``value`` in this object only; the parent is never touched."""
        self._storage[key] = value

    def updated_copy(self, other):
        """Return a :meth:`copy` with the items of ``other`` set on it."""
        copy = self.copy()
        copy._storage.update(other)
        return copy

    def __contains__(self, key):
        return key in self._parent or key in self._storage

    def __getattr__(self, key):
        """Attribute-style read access to the stored values.

        Python only calls this when the normal attribute lookup failed.
        Unknown property names still raise ``KeyError`` rather than
        ``AttributeError``, as they always did.

        Special ``__dunder__`` names and the two storage slots are *not*
        forwarded to the mapping: they raise ``AttributeError`` so that
        ``hasattr()`` and the copy/pickle protocols, which probe for optional
        special methods, behave normally. Forwarding them used to raise
        ``KeyError`` or, on an instance created without ``__init__``, recurse
        without end while looking up ``_storage``.
        """
        if key in ('_storage', '_parent') or (
                key[:2] == '__' and key[-2:] == '__'):
            raise AttributeError(key)
        return self[key]

    __setattr__ = __setitem__

    def copy(self):
        """Copy the ``StyleDict``.

        Return a new, empty StyleDict that looks up missing values in this
        one (or directly in this one's parent when this one holds no value
        of its own). This is a cheap "copy-on-write". Modifications in the
        copy will not affect the original, but modifications in the original
        *may* affect the copy.

        The ``anonymous`` flag is carried over to the copy.

        """
        if self._storage:
            parent = self
        else:
            parent = self._parent
        style = type(self)(parent=parent)
        if self.anonymous:
            # Bypass __setattr__ so that this is a real attribute
            # and not a stored value.
            object.__setattr__(style, 'anonymous', True)
        return style

    def inherit_from(self):
        """Return a new StyleDict with inherited properties from this one.

        Non-inherited properties get their initial values.
        This is the styles for an anonymous box: the result is flagged
        with ``anonymous = True``.
        """
        style = computed_from_cascaded(cascaded={}, parent_style=self,
            # Only used by non-inherited properties. eg `content: attr(href)`
            element=None)
        object.__setattr__(style, 'anonymous', True)
        return style

    # Default values, may be overriden on instances
    anonymous = False


def find_stylesheets(document, medium):
    """Yield a ``CSS`` object for each author stylesheet of ``document``.

    Both ``<style>`` elements and ``<link rel=stylesheet>`` elements are
    considered, as long as their type is ``text/css`` and their ``media``
    attribute applies to ``medium``. Stylesheets come out in DOM order.

    """
    for node in document.dom.iter():
        tag = node.tag
        if tag not in ('style', 'link'):
            continue

        # Only keep 'type/subtype' from 'type/subtype ; param1; param2'.
        declared_type = node.get('type', 'text/css')
        if declared_type.split(';', 1)[0].strip() != 'text/css':
            continue

        # A missing or blank media attribute means 'all'.
        media_attr = node.get('media', '').strip() or 'all'
        media_types = [part.strip() for part in media_attr.split(',')]
        if not evaluate_media_query(media_types, medium):
            continue

        if tag == 'style':
            # The stylesheet is the text directly inside <style>: its own
            # text plus the tail of each child, but not the children's
            # content.
            pieces = [node.text or '']
            pieces.extend(child.tail or '' for child in node)
            # lxml should give us either unicode or ASCII-only bytestrings,
            # so we don't need `encoding` here.
            yield CSS(string=''.join(pieces), base_url=node.base_url)
        elif node.get('href'):
            # Only <link> elements can get here.
            rel_tokens = node.get('rel', '').split()
            if 'alternate' in rel_tokens or 'stylesheet' not in rel_tokens:
                continue
            sheet = CSS(url=get_url_attribute(node, 'href'),
                        _check_mime_type=True)
            if sheet.mime_type != 'text/css':
                LOGGER.warn('Unsupported stylesheet type: %s',
                            sheet.mime_type)
            else:
                yield sheet


def find_style_attributes(document):
    """Yield ``element, declarations, base_url`` for styled elements.

    Every element of the document with a non-empty ``style`` attribute gets
    that attribute parsed; parse errors are logged as warnings.
    """
    parse_style_attr = PARSER.parse_style_attr
    for element in document.dom.iter():
        style_attribute = element.get('style')
        if not style_attribute:
            continue
        declarations, errors = parse_style_attr(style_attribute)
        for error in errors:
            LOGGER.warn(error)
        yield element, declarations, element.base_url


def evaluate_media_query(query_list, medium):
    """Tell whether ``query_list`` applies to the given ``medium``.

    :param query_list: a container of media type strings
    :param medium: a media type string (for now)
    :returns: ``True`` if the list contains ``'all'`` or ``medium``.

    """
    # TODO: actual support for media queries, not just media types
    return any(wanted in query_list for wanted in ('all', medium))


def declaration_precedence(origin, importance):
    """Rank a declaration by origin and importance.

    The returned integers are only meaningful relative to one another:
    a greater number wins.

    ``origin`` is one of the strings ``'author'``, ``'user'`` and
    ``'user agent'``; ``importance`` is truthy for ``!important``
    declarations.

    """
    # See http://www.w3.org/TR/CSS21/cascade.html#cascading-order
    if origin == 'user agent':
        # Importance is not taken into account for this origin.
        return 1
    if not importance:
        if origin == 'user':
            return 2
        if origin == 'author':
            return 3
    elif origin == 'author':
        return 4
    # Only important user declarations are left.
    assert origin == 'user'
    return 5


def add_declaration(cascaded_styles, prop_name, prop_values, weight, element,
                    pseudo_type=None):
    """Record a property value for an element or pseudo-element.

    ``cascaded_styles`` maps ``(element, pseudo_type)`` to a dict of
    ``property name -> (values, weight)`` and is updated in place.
    A value already recorded is kept only when its weight is strictly
    greater than ``weight``; on equal weights the new value wins.

    """
    declared = cascaded_styles.setdefault((element, pseudo_type), {})
    if prop_name in declared:
        _old_values, old_weight = declared[prop_name]
        if old_weight is not None and not old_weight <= weight:
            # The existing declaration outweighs this one.
            return
    declared[prop_name] = prop_values, weight


def set_computed_styles(cascaded_styles, computed_styles,
                        element, parent, pseudo_type=None):
    """Compute the style of one (pseudo-)element and store it.

    The cascaded declarations found for ``(element, pseudo_type)`` in
    ``cascaded_styles`` (none at all is fine) are turned into computed
    values, inheriting from the already computed style of ``parent``.
    The result is stored in ``computed_styles`` under the same key.

    ``parent`` is ``None`` when there is nothing to inherit from; otherwise
    its style must already be in ``computed_styles``.

    """
    parent_style = None if parent is None else computed_styles[parent, None]
    key = (element, pseudo_type)
    computed_styles[key] = computed_from_cascaded(
        element, cascaded_styles.get(key, {}), parent_style, pseudo_type)


def computed_from_cascaded(element, cascaded, parent_style, pseudo_type=None):
    """Build the computed style from cascaded values and the parent style.

    ``cascaded`` maps property names to ``(value, weight)`` pairs.
    ``parent_style`` holds the computed values of the parent, or is ``None``
    when there is no parent to inherit from (the root element).

    """
    if parent_style is not None and not cascaded:
        # Fast path for anonymous boxes:
        # no cascaded style, only implicitly initial or inherited values.
        computed = StyleDict(parent=properties.INITIAL_VALUES)
        for name in properties.INHERITED:
            computed[name] = parent_style[name]
        # border-*-style is none, so border-width computes to zero.
        # Other than that, properties that would need computing are
        # border-*-color, but they do not apply.
        for side in ('top', 'bottom', 'left', 'right'):
            computed['border_%s_width' % side] = 0
        return computed

    # General case: resolve 'initial' and 'inherit' for every known property.
    specified = StyleDict()
    computed = StyleDict()
    for name, initial in iteritems(properties.INITIAL_VALUES):
        if name in cascaded:
            value, _weight = cascaded[name]
            keyword = value
        elif name in properties.INHERITED:
            keyword = 'inherit'
        else:
            keyword = 'initial'

        if parent_style is None and keyword == 'inherit':
            # On the root element, 'inherit' from initial values
            keyword = 'initial'

        if keyword == 'initial':
            value = initial
            # Most initial values are the same as when computed.
            already_computed = not RE_INITIAL_NOT_COMPUTED(name)
        elif keyword == 'inherit':
            # Values in parent_style are already computed.
            value = parent_style[name]
            already_computed = True
        else:
            already_computed = False

        if already_computed:
            computed[name] = value

        assert value is not None
        specified[name] = value

    return computed_values.compute(
        element, pseudo_type, specified, computed, parent_style)


class PageSelector(object):
    """Stand-in for :class:`tinycss.selectors3.Selector` used by @page rules.

    It exposes the same ``specificity``, ``pseudo_element`` and ``match``
    attributes, but ``match`` ignores the document and always gives back
    the ``matched`` object passed to the constructor.

    """

    def __init__(self, specificity, pseudo_element, matched):
        def match(_document):
            return matched

        self.specificity = specificity
        self.pseudo_element = pseudo_element
        self.match = match


def preprocess_stylesheet(medium, base_url, rules):
    """Do the part of stylesheet processing that needs no document.

    Yield ``rule, selector_list, declarations`` tuples for the rules that
    apply to ``medium``. Rules left without any valid declaration are
    dropped, ``@media`` and ``@import`` are resolved, and ``@page`` rules
    (with their margin rules) get :class:`PageSelector` objects as selectors.

    """
    for rule in rules:
        at_keyword = rule.at_keyword

        if not at_keyword:
            # Normal ruleset
            declarations = list(preprocess_declarations(
                base_url, rule.declarations))
            if declarations:
                yield rule, rule.selector_list, declarations
            continue

        if at_keyword in ('@import', '@media'):
            if not evaluate_media_query(rule.media, medium):
                continue
            if at_keyword == '@import':
                nested = CSS(url=urljoin(base_url, rule.uri)).rules
            else:
                nested = preprocess_stylesheet(medium, base_url, rule.rules)
            for result in nested:
                yield result
            continue

        if at_keyword != '@page':
            # Other at-rules are ignored.
            continue

        page_name, pseudo_class = rule.selector
        page_types = PAGE_PSEUDOCLASS_TARGETS[pseudo_class]
        # TODO: support named pages (see CSS3 Paged Media)
        if page_name is not None:
            suffix = ':' + pseudo_class if pseudo_class else ''
            LOGGER.warn('Named pages are not supported yet, the whole '
                        '@page %s rule was ignored.', page_name + suffix)
            continue
        specificity = rule.specificity

        # The page itself comes first (no pseudo-element), then each of
        # its margin rules with the margin at-keyword as pseudo-element.
        candidates = [(rule, None)]
        candidates.extend(
            (margin_rule, margin_rule.at_keyword)
            for margin_rule in rule.at_rules)
        for source_rule, pseudo_element in candidates:
            declarations = list(preprocess_declarations(
                base_url, source_rule.declarations))
            if declarations:
                selector_list = [PageSelector(
                    specificity, pseudo_element, page_types)]
                yield source_rule, selector_list, declarations


def preprocess_declarations(base_url, declarations):
    """Validate declarations, expand shorthands and drop overridden ones.

    Yield ``(long_name, values, priority)`` tuples, with underscores
    instead of hyphens in property names. Whatever
    ``validation.validate_and_expand`` does not yield is left out.

    The declarations are walked backwards so that, for a given property,
    only the last valid declaration of each importance is kept: earlier
    ones of the same importance are skipped without being validated.
    Importance is tracked separately because a later normal declaration
    must not hide an earlier ``!important`` one. In
    ``color: red !important; color: blue`` both are yielded and the cascade
    picks the important one. Nothing overrides an important declaration
    that comes later in the same block.

    When a property is yielded twice, the normal value always comes first
    and the important one second.

    """
    seen_normal = set()
    seen_important = set()
    for declaration in reversed(declarations):
        name = declaration.name.replace('-', '_')
        priority = declaration.priority
        if name in seen_important or (not priority and name in seen_normal):
            # This only helps on non-shorthands, but still
            continue
        for long_name, values in validation.validate_and_expand(
                base_url, name, declaration.value):
            if long_name in seen_important:
                # A later !important declaration wins over anything.
                continue
            if priority:
                seen_important.add(long_name)
            elif long_name in seen_normal:
                continue
            else:
                seen_normal.add(long_name)
            yield long_name, values, priority


def get_all_computed_styles(document, medium,
                            user_stylesheets=None, ua_stylesheets=None):
    """Return the computed styles of everything in ``document``.

    This goes all the way from finding the author stylesheets of the HTML
    document to applying them, together with the given user and user agent
    stylesheets, for the given ``medium``.

    The result is a dict of
    ``(DOM element or page type, pseudo-element type) -> StyleDict``.

    """
    # Cascaded styles, before inheritance and computation.
    # keys: (element, pseudo_element_type)
    #    element: a lxml element object or a page type string for @page
    #        styles
    #    pseudo_element_type: a string such as 'after' or a margin
    #        at-keyword (for @page), or None for normal elements
    # values: dicts of
    #     keys: property name as a string
    #     values: (values, weight)
    #         values: a PropertyValue-like object
    #         weight: values with a greater weight take precedence, see
    #             http://www.w3.org/TR/CSS21/cascade.html#cascading-order
    cascaded_styles = {}

    # Order here is not important ('origin' is).
    # Use this order for a regression test
    stylesheets_by_origin = (
        ('user agent', ua_stylesheets or []),
        ('author', list(find_stylesheets(document, medium))),
        ('user', user_stylesheets or []),
    )
    for origin, sheets in stylesheets_by_origin:
        for sheet in sheets:
            for _rule, selector_list, declarations in sheet.rules:
                for selector in selector_list:
                    specificity = selector.specificity
                    pseudo_type = selector.pseudo_element
                    for element in selector.match(document.dom):
                        for name, values, importance in declarations:
                            weight = (
                                declaration_precedence(origin, importance),
                                specificity)
                            add_declaration(
                                cascaded_styles, name, values, weight,
                                element, pseudo_type)

    # Style attributes count as author declarations with their own
    # specificity.
    for element, declarations, base_url in find_style_attributes(document):
        for name, values, importance in preprocess_declarations(
                base_url, declarations):
            weight = (
                declaration_precedence('author', importance),
                STYLE_ATTRIBUTE_SPECIFICITY)
            add_declaration(cascaded_styles, name, values, weight, element)

    # keys: (element, pseudo_element_type), like cascaded_styles
    # values: StyleDict objects:
    #     keys: property name as a string
    #     values: a PropertyValue-like object
    computed_styles = {}

    # Step 1: "real" elements, *in tree order* so that a parent always has
    # its computed style before its children need to inherit from it.
    # Every element is visited, with or without cascaded style.
    for element in document.dom.iter():
        set_computed_styles(
            cascaded_styles, computed_styles, element,
            parent=element.getparent())

    # Step 2: @page. Every possible page type is visited, with or without
    # cascaded style.
    # @page inherits from the root element:
    # http://lists.w3.org/Archives/Public/www-style/2012Jan/1164.html
    for page_type in PAGE_PSEUDOCLASS_TARGETS[None]:
        set_computed_styles(
            cascaded_styles, computed_styles, page_type,
            parent=document.dom)

    # Step 3: pseudo-elements, in any order. They inherit from their
    # associated element, which is why they come last. This is a separate
    # pass because the (element, pseudo_type) keys of cascaded_styles give
    # no easy way to list the pseudo-elements of a given element.
    # Pseudo-elements without cascaded style might as well not exist and
    # are left out.
    for key in cascaded_styles:
        element, pseudo_type = key
        if not pseudo_type:
            continue
        set_computed_styles(
            cascaded_styles, computed_styles, element,
            pseudo_type=pseudo_type,
            # The pseudo-element inherits from the element.
            parent=element)

    return computed_styles