1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-10-05 00:21:15 +03:00
WeasyPrint/weasyprint/html.py

226 lines
6.9 KiB
Python
Raw Normal View History

2011-05-25 17:54:46 +04:00
# coding: utf8
# WeasyPrint converts web documents (HTML, CSS, ...) to PDF.
# Copyright (C) 2011 Simon Sapin
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Specific handling for some HTML elements, especially replaced elements.

Replaced elements (e.g. <img> elements) are rendered externally and behave
as an atomic opaque box in CSS. They may or may not have intrinsic dimensions.

"""
from __future__ import division, unicode_literals
2011-08-10 16:51:18 +04:00
from .formatting_structure import boxes
from .utils import get_url_attribute
from .compat import xrange
2011-08-10 16:51:18 +04:00
2011-05-25 17:54:46 +04:00
# Maps HTML tag names to handler functions. Each handler is called as
# ``handler(document, element, box)`` and returns a (possibly empty) list
# of boxes. Entries are added by the ``handler`` decorator.
HTML_HANDLERS = {}
2011-08-20 20:02:04 +04:00
2011-08-25 19:29:16 +04:00
def handle_element(document, element, box):
    """Handle HTML elements that need special care.

    Look up the handler registered for the tag of ``box`` and let it decide
    which boxes stand for ``element``. When no handler is registered, the
    box is kept unchanged.

    :param document: passed through to the handler.
    :param element: the HTML element ``box`` was generated for.
    :param box: the box generated for ``element``.
    :returns: a (possibly empty) list of boxes.

    """
    # Use one key for both the membership test and the lookup. Testing
    # ``box.element_tag`` but indexing with ``element.tag`` raises KeyError
    # (or picks an unrelated handler) whenever the two differ.
    html_handler = HTML_HANDLERS.get(box.element_tag)
    if html_handler is None:
        return [box]
    return html_handler(document, element, box)
2011-08-20 20:02:04 +04:00
def handler(tag):
    """Build a decorator that registers a function as the ``tag`` handler.

    The decorated function is stored in ``HTML_HANDLERS`` under ``tag`` and
    returned unchanged, so several registrations can be stacked.

    """
    def register(function):
        """Store ``function`` in ``HTML_HANDLERS`` and hand it back."""
        HTML_HANDLERS[tag] = function
        return function

    return register
2011-08-20 20:02:04 +04:00
def is_block_level(box):
    """Tell whether ``box`` is supposed to be block-level.

    The decision is based on ``box.style.display`` only.

    :returns: ``True`` for block-level display values, ``False`` for
        inline-level ones.
    :raises ValueError: if the display value is neither.

    """
    display = box.style.display
    if display in ('block', 'list-item', 'table'):
        return True
    if display in ('inline', 'inline-table', 'inline-block'):
        return False
    raise ValueError('Unsupported display: ' + display)
2011-12-08 21:11:32 +04:00
def make_replaced_box(element, box, image):
    """Build the replaced box that displays ``image`` for ``element``.

    The box class is the block-level or the inline-level replaced box,
    depending on the ``display`` value in the style of ``box``.

    """
    box_class = (
        boxes.BlockReplacedBox if is_block_level(box)
        else boxes.InlineReplacedBox)
    return box_class(element.tag, element.sourceline, box.style, image)
def make_text_box(element, box, text):
    """Build a box holding ``text`` in place of ``element``.

    The text box is wrapped in a block box when ``box`` is block-level,
    and in an inline box otherwise.

    """
    # The text box is built first, before the display value is checked.
    inner = boxes.TextBox(
        element.tag, element.sourceline, box.style.inherit_from(), text)
    wrapper_class = boxes.BlockBox if is_block_level(box) else boxes.InlineBox
    return wrapper_class(element.tag, element.sourceline, box.style, [inner])
2011-08-25 19:29:16 +04:00
@handler('img')
def handle_img(document, element, box):
    """Handle ``<img>`` elements: give back the image or its alt-text.

    See: http://www.w3.org/TR/html5/embedded-content-1.html#the-img-element

    :returns: a list with one replaced box when the image could be loaded,
        a list with one text box when only a non-empty ``alt`` is usable,
        and an empty list otherwise.

    """
    src = get_url_attribute(element, 'src')
    alt = element.get('alt')

    if not src:
        # Nothing to load: only the alt-text can represent the element.
        return [make_text_box(element, box, alt)] if alt else []

    image = document.get_image_from_uri(src)
    if image is not None:
        return [make_replaced_box(element, box, image)]

    # Invalid image, use the alt-text.
    if alt:
        return [make_text_box(element, box, alt)]
    if alt == '':
        # The element represents nothing
        return []
    assert alt is None
    # TODO: find some indicator that an image is missing.
    # For now, just remove the image.
    return []
2011-05-25 17:54:46 +04:00
2012-02-29 20:38:30 +04:00
@handler('embed')
def handle_embed(document, element, box):
    """Handle ``<embed>`` elements: give back an image box or nothing.

    See: http://www.w3.org/TR/html5/the-iframe-element.html#the-embed-element

    :returns: a list with one replaced box when ``src`` leads to an image,
        an empty list otherwise.

    """
    src = get_url_attribute(element, 'src')
    declared_type = element.get('type', '').strip()
    if not src:
        return []
    image = document.get_image_from_uri(src, declared_type)
    if image is None:
        # No fallback.
        return []
    return [make_replaced_box(element, box, image)]
@handler('object')
def handle_object(document, element, box):
    """Handle ``<object>`` elements: an image box or the fallback content.

    See: http://www.w3.org/TR/html5/the-iframe-element.html#the-object-element

    :returns: a list with one replaced box when ``data`` leads to an image,
        otherwise a list holding ``box`` itself.

    """
    data = get_url_attribute(element, 'data')
    declared_type = element.get('type', '').strip()
    image = None
    if data:
        image = document.get_image_from_uri(data, declared_type)
    if image is None:
        # The element's children are the fallback.
        return [box]
    return [make_replaced_box(element, box, image)]
def integer_attribute(element, box, name, minimum=1):
    """Copy the integer attribute ``name`` from ``element`` onto ``box``.

    The attribute is set on ``box`` only when it is present, parses as an
    integer with ``int()`` and is at least ``minimum``. In every other case
    ``box`` is left untouched.

    """
    raw = element.get(name, '').strip()
    if not raw:
        return
    try:
        number = int(raw)
    except ValueError:
        # Not an integer: ignore the attribute.
        return
    if number >= minimum:
        setattr(box, name, number)
@handler('colgroup')
def handle_colgroup(_document, element, box):
    """Handle the ``span`` attribute of ``<colgroup>`` elements.

    Only applies when ``box`` is a ``TableColumnGroupBox``:

    * with at least one ``<col>`` child, ``box.span`` is set to ``None``;
    * otherwise the ``span`` attribute is read and ``box.children`` is
      filled with that many anonymous column boxes.

    :returns: a list holding ``box`` itself.

    """
    if isinstance(box, boxes.TableColumnGroupBox):
        if any(child.tag == 'col' for child in element):
            box.span = None  # sum of the children's spans
        else:
            integer_attribute(element, box, 'span')
            # Materialize the children. A generator expression can only be
            # iterated once and supports neither ``len()`` nor indexing, so
            # any second traversal would see a group without columns.
            box.children = tuple(
                boxes.TableColumnBox.anonymous_from(box, [])
                for _i in xrange(box.span))
    return [box]
@handler('col')
def handle_col(_document, element, box):
    """Handle the ``span`` attribute of ``<col>`` elements.

    :returns: ``box.span`` copies of ``box`` when it is a table column box
        whose span is greater than one, otherwise a list holding ``box``.

    """
    if not isinstance(box, boxes.TableColumnBox):
        return [box]
    integer_attribute(element, box, 'span')
    span = box.span
    if span > 1:
        # Generate multiple boxes
        # http://lists.w3.org/Archives/Public/www-style/2011Nov/0293.html
        return [box.copy() for _ in xrange(span)]
    return [box]
@handler('th')
@handler('td')
def handle_td(_document, element, box):
    """Handle the ``colspan`` and ``rowspan`` attributes of table cells.

    :returns: a list holding ``box`` itself.

    """
    if not isinstance(box, boxes.TableCellBox):
        return [box]
    # HTML 4.01 gives special meaning to colspan=0
    # http://www.w3.org/TR/html401/struct/tables.html#adef-rowspan
    # but HTML 5 removed it
    # http://www.w3.org/TR/html5/tabular-data.html#attr-tdth-colspan
    # rowspan=0 is still there though.
    integer_attribute(element, box, 'colspan')
    integer_attribute(element, box, 'rowspan', minimum=0)
    return [box]