1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-10-05 00:21:15 +03:00
WeasyPrint/weasyprint/document.py

1030 lines
41 KiB
Python
Raw Normal View History

2012-10-02 20:59:02 +04:00
"""
weasyprint.document
-------------------
"""
import collections
2017-03-25 02:33:36 +03:00
import functools
2020-04-19 17:49:37 +03:00
import hashlib
2012-10-02 20:59:02 +04:00
import io
import math
import shutil
2020-04-19 17:49:37 +03:00
import zlib
from os.path import basename
from urllib.parse import unquote, urlsplit
2012-10-02 20:59:02 +04:00
import pydyf
from weasyprint.layout import LayoutContext
2012-10-02 20:59:02 +04:00
2020-04-19 17:49:37 +03:00
from . import Attachment, CSS, __version__
2012-10-02 20:59:02 +04:00
from .css import get_all_computed_styles
2019-12-24 17:56:24 +03:00
from .css.counters import CounterStyle
2018-03-28 01:34:34 +03:00
from .css.targets import TargetCollector
2017-03-25 02:33:36 +03:00
from .draw import draw_page, stacked
from .fonts import FontConfiguration
2012-10-04 13:35:25 +04:00
from .formatting_structure import boxes
2012-10-02 20:59:02 +04:00
from .formatting_structure.build import build_formatting_structure
from .html import W3C_DATE_RE
2018-01-07 03:46:39 +03:00
from .images import get_image_from_uri as original_get_image_from_uri
2012-10-02 20:59:02 +04:00
from .layout import layout_document
from .layout.percentages import percentage
from .logger import LOGGER, PROGRESS_LOGGER
2020-05-08 01:11:19 +03:00
from .text import ffi, pango
2020-04-19 17:49:37 +03:00
from .urls import URLFetchingError
def _w3c_date_to_pdf(string, attr_name):
    """Transform a W3C date string into the PDF date field order.

    :param string:
        The date string to convert, or :obj:`None`.
    :param attr_name:
        Name of the metadata attribute the date comes from. Only used in the
        warning logged when ``string`` cannot be parsed.
    :returns:
        The concatenated date fields (year first), followed by a timezone
        suffix when an hour is present, or :obj:`None` if ``string`` is
        :obj:`None` or does not match ``W3C_DATE_RE``.

    """
    if string is None:
        return None
    match = W3C_DATE_RE.match(string)
    if match is None:
        LOGGER.warning('Invalid %s date: %r', attr_name, string)
        return None
    groups = match.groupdict()
    pdf_date = ''
    found = False
    # Walk from the least to the most significant field, prepending each one.
    for key in ('second', 'minute', 'hour', 'day', 'month', 'year'):
        if groups[key]:
            found = True
            pdf_date = groups[key] + pdf_date
        elif found:
            # A missing field above a present one is padded: '01' for day
            # and month, '00' for the others.
            pdf_date = '%02i' % (key in ('day', 'month')) + pdf_date
    if groups['hour']:
        assert groups['minute']
        if groups['tz_hour']:
            tz_hour = groups['tz_hour']
            assert tz_hour.startswith(('+', '-'))
            assert groups['tz_minute']
            # Take the sign from the text instead of from the integer value:
            # int('-00') is 0, which would be formatted as '+00' and flip
            # the direction of offsets such as '-00:30'.
            pdf_date += "%s%02i'%02i" % (
                tz_hour[0], abs(int(tz_hour)), int(groups['tz_minute']))
        else:
            pdf_date += 'Z'
    return pdf_date
2012-10-02 20:59:02 +04:00
2020-05-08 01:31:50 +03:00
class Font:
    """Font used in the PDF, with the set of glyphs drawn with it.

    Glyph-dependent values (``bbox``, ``font_bbox``, ``cap_height``,
    ``first_char``, ``last_char`` and ``widths``) stay :obj:`None` until
    :meth:`compute_glyphs_values` is called.

    """
    def __init__(self, file_content, pango_font, glyph_item):
        """Store the font data and the glyphs of a first glyph item.

        :param file_content:
            Content of the font file; it is stored and hashed.
        :param pango_font:
            Pango font handle used to query metrics and glyph extents.
        :param glyph_item:
            Pango glyph item whose glyphs are registered as used.

        """
        pango_metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
        font_description = ffi.gc(
            pango.pango_font_description_copy(
                pango.pango_font_describe(pango_font)),
            pango.pango_font_description_free)
        pango.pango_font_description_set_absolute_size(
            font_description, pango.pango_units_from_double(1))
        font_family = ffi.string(pango.pango_font_description_get_family(
            font_description))
        glyph_string = glyph_item.glyphs
        num_glyphs = glyph_string.num_glyphs

        self.hash = hash(file_content)
        self.file_content = file_content
        self.pango_font = pango_font
        self.glyph_item = glyph_item
        # When the font will be a font subset, the font name will have to be
        # like '/XXXXXX+font_family'
        self.name = b'/' + font_family.replace(b' ', b'')
        self.family = font_family
        self.flags = 4
        # The bounding box is exposed under both names: ``font_bbox`` was
        # initialised here while ``bbox`` was the only one ever populated,
        # so each name failed for one kind of reader.
        self.font_bbox = None
        self.bbox = None
        self.italic_angle = 0
        self.ascent = pango.pango_font_metrics_get_ascent(pango_metrics)
        self.descent = pango.pango_font_metrics_get_descent(pango_metrics)
        self.cap_height = None
        self.stemv = 80
        self.stemh = 80
        self.glyphs = {glyph_string.glyphs[x].glyph for x in range(num_glyphs)}
        self.first_char = None
        self.last_char = None
        self.widths = None

    def add_glyphs(self, glyph_item):
        """Register the glyphs of another glyph item as used by this font."""
        glyph_string = glyph_item.glyphs
        num_glyphs = glyph_string.num_glyphs
        self.glyphs |= {
            glyph_string.glyphs[x].glyph for x in range(num_glyphs)}

    def compute_glyphs_values(self):
        """Compute bounding box, glyph range and widths of the used glyphs.

        ``widths`` has one entry per glyph id between the smallest and the
        largest used glyph id; ids in that range that are not used keep a
        width of 0.

        """
        first_char = min(self.glyphs)
        last_char = max(self.glyphs)
        font_bbox = [0, 0, 0, 0]
        widths = [0] * (last_char - first_char + 1)
        ink_rect = ffi.new('PangoRectangle *')
        logical_rect = ffi.new('PangoRectangle *')
        for glyph in self.glyphs:
            pango.pango_font_get_glyph_extents(
                self.pango_font, glyph, ink_rect, logical_rect)
            # The ink rectangle's vertical coordinates are negated before
            # being merged into the box.
            x1, y1, x2, y2 = (
                ink_rect.x, -ink_rect.y - ink_rect.height,
                ink_rect.x + ink_rect.width, -ink_rect.y)
            if x1 < font_bbox[0]:
                font_bbox[0] = x1
            if y1 < font_bbox[1]:
                font_bbox[1] = y1
            if x2 > font_bbox[2]:
                font_bbox[2] = x2
            if y2 > font_bbox[3]:
                font_bbox[3] = y2
            widths[glyph - first_char] = (
                pango.pango_units_to_double(logical_rect.width) * 1000)
        ffi.release(ink_rect)
        ffi.release(logical_rect)
        self.bbox = self.font_bbox = font_bbox
        self.cap_height = font_bbox[1]
        self.first_char = first_char
        self.last_char = last_char
        self.widths = widths
class Context(pydyf.Stream):
    """PDF stream object with context storing alpha states and fonts."""

    def __init__(self, alpha_states, *args, **kwargs):
        """Create the stream.

        :param alpha_states:
            Mapping filled by :meth:`set_alpha` with one graphics state
            dictionary per alpha state used in the stream.

        Other arguments are passed to :class:`pydyf.Stream`.

        """
        super().__init__(*args, **kwargs)
        self._alpha_states = alpha_states
        self._fonts = {}

    def set_alpha(self, alpha, stroke=False):
        """Set the stroke (``CA``) or fill (``ca``) opacity.

        :param alpha:
            Opacity value stored in the graphics state dictionary.
        :param stroke:
            Whether the opacity applies to stroking instead of filling.

        """
        # Stroke and fill opacities are different graphics state entries, so
        # they need different state names. Keying on the alpha value alone
        # made a stroke reuse the fill state (and vice versa) whenever the
        # same alpha had already been registered for the other kind.
        key = f'A{alpha}' if stroke else f'a{alpha}'
        if key not in self._alpha_states:
            self._alpha_states[key] = pydyf.Dictionary(
                {'CA' if stroke else 'ca': alpha})
        self.set_state(key)

    def add_font(self, font, pango_font, glyph_item):
        """Register the glyphs of ``glyph_item`` for ``font``.

        A :class:`Font` is created the first time a font with this hash is
        seen, later calls only add glyphs to it.

        :returns:
            The :class:`Font` object tracking this font.

        """
        font_hash = hash(font)
        if font_hash not in self._fonts:
            self._fonts[font_hash] = Font(font, pango_font, glyph_item)
        else:
            self._fonts[font_hash].add_glyphs(glyph_item)
        return self._fonts[font_hash]
2020-05-06 08:42:45 +03:00
2012-10-02 20:59:02 +04:00
2020-04-19 17:49:37 +03:00
# Node of a bookmark tree: a label, its destination, the child nodes and the
# open/closed state of the node.
BookmarkSubtree = collections.namedtuple(
    'BookmarkSubtree', 'label destination children state')
2020-04-18 23:12:25 +03:00
2020-04-19 17:49:37 +03:00
def _write_pdf_attachment(pdf, attachment, url_fetcher):
    """Write an attachment to the PDF stream.

    :param pdf:
        PDF object the embedded file stream and the file specification are
        added to.
    :param attachment:
        An :class:`Attachment`, a ``(url, description)`` tuple, or any other
        value, which is then passed as ``guess`` to :class:`Attachment`.
    :param url_fetcher:
        URL fetcher given to the :class:`Attachment` objects created here.
    :return:
        the attachment PDF dictionary, or :obj:`None` if the attachment
        could not be fetched (an error is logged in that case).

    """
    # Attachments from document links like <link> or <a> can only be URLs.
    # They're passed in as tuples
    url = ''
    if isinstance(attachment, tuple):
        url, description = attachment
        attachment = Attachment(
            url=url, url_fetcher=url_fetcher, description=description)
    elif not isinstance(attachment, Attachment):
        attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

    try:
        with attachment.source as (source_type, source, url, _):
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            uncompressed_length = 0
            # Compressed chunks are collected and joined once: appending to
            # an immutable bytes object copies everything read so far on
            # each iteration.
            chunks = []
            md5 = hashlib.md5()
            compress = zlib.compressobj()
            for data in iter(lambda: source.read(4096), b''):
                uncompressed_length += len(data)
                md5.update(data)
                chunks.append(compress.compress(data))
            chunks.append(compress.flush(zlib.Z_FINISH))
            stream = b''.join(chunks)
            file_extra = pydyf.Dictionary({
                'Type': '/EmbeddedFile',
                'Filter': '/FlateDecode',
                'Params': pydyf.Dictionary({
                    'CheckSum': f'<{md5.hexdigest()}>',
                    'Size': uncompressed_length,
                })
            })
            file_stream = pydyf.Stream([stream], file_extra)
            pdf.add_object(file_stream)
    except URLFetchingError as exception:
        LOGGER.error('Failed to load attachment: %s', exception)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename.
    filename = basename(unquote(urlsplit(url).path)) or 'attachment.bin'
    filespec = pydyf.Dictionary({
        'Type': '/Filespec',
        'F': pydyf.String(),
        'UF': pydyf.String(filename),
        'EF': pydyf.Dictionary({'F': file_stream.reference}),
        'Desc': pydyf.String(attachment.description or ''),
    })
    pdf.add_object(filespec)
    return filespec
2020-04-19 17:49:37 +03:00
def create_bookmarks(bookmarks, pdf, parent=None):
    """Add the outline objects of a bookmark tree level to the PDF.

    :param bookmarks:
        Sequence of ``(title, (page, x, y), children, state)`` items, where
        ``children`` is a sequence of the same shape.
    :param pdf:
        PDF object the outline dictionaries are added to.
    :param parent:
        Outline dictionary of the parent bookmark, or :obj:`None` for the
        top level.
    :returns:
        A ``(outlines, count)`` tuple: the outline dictionaries created for
        this level, and the number of bookmarks of this level plus those of
        all descendants that are not below a ``'closed'`` bookmark.

    """
    total = len(bookmarks)
    siblings = []
    for title, (page, x, y), children, state in bookmarks:
        # NOTE(review): 'Kids' presumably holds three entries per page
        # reference, hence the index multiplied by 3 -- confirm with pydyf.
        page_reference = pdf.objects[pdf.pages['Kids'][page * 3]].reference
        outline = pydyf.Dictionary({
            'Title': pydyf.String(title),
            'Dest': pydyf.Array((page_reference, '/XYZ', x, y, 0)),
        })
        pdf.add_object(outline)
        sub_outlines, sub_count = create_bookmarks(
            children, pdf, parent=outline)
        if state == 'closed':
            # A negative count marks the bookmark as closed, and its
            # descendants are not counted in the ancestors.
            outline['Count'] = sub_count * -1
        else:
            outline['Count'] = sub_count
            total += sub_count
        if siblings:
            previous = siblings[-1]
            outline['Prev'] = previous.reference
            previous['Next'] = outline.reference
        if sub_outlines:
            outline['First'] = sub_outlines[0].reference
            outline['Last'] = sub_outlines[-1].reference
        if parent is not None:
            outline['Parent'] = parent.reference
        siblings.append(outline)
    return siblings, total
2020-04-19 17:49:37 +03:00
def add_hyperlinks(links, anchors, matrix, pdf, page, names):
    """Include hyperlinks and named destinations in the current PDF page.

    :param links:
        Iterable of ``(link_type, target, rectangle)`` tuples. Only
        ``'internal'`` and ``'external'`` links get an annotation.
    :param anchors:
        Iterable of ``(anchor_name, x, y)`` tuples.
    :param matrix:
        Object whose ``transform_point`` maps the link and anchor
        coordinates before they are written.
    :param pdf:
        PDF object the annotations are added to.
    :param page:
        PDF page dictionary; its ``'Annots'`` entry is replaced.
    :param names:
        List extended with a name and a destination array for each anchor.

    """
    page['Annots'] = pydyf.Array()
    for link_type, link_target, rectangle in links:
        # The rectangle is read as two points: its first two and its last
        # two values.
        x1, y1 = matrix.transform_point(*rectangle[:2])
        x2, y2 = matrix.transform_point(*rectangle[2:])
        if link_type not in ('internal', 'external'):
            continue
        annotation = pydyf.Dictionary({
            'Type': '/Annot',
            'Subtype': '/Link',
            'Rect': pydyf.Array([x1, y1, x2, y2]),
            'BS': pydyf.Dictionary({'W': 0}),
        })
        if link_type == 'internal':
            annotation['Dest'] = pydyf.String(link_target)
        else:
            annotation['A'] = pydyf.Dictionary({
                'Type': '/Action',
                'S': '/URI',
                'URI': pydyf.String(link_target),
            })
        pdf.add_object(annotation)
        page['Annots'].append(annotation.reference)

    for anchor_name, anchor_x, anchor_y in anchors:
        anchor_x, anchor_y = matrix.transform_point(anchor_x, anchor_y)
        names.append(pydyf.String(anchor_name))
        names.append(pydyf.Array(
            [page.reference, '/XYZ', anchor_x, anchor_y, 0]))
def rectangle_aabb(matrix, pos_x, pos_y, width, height):
    """Apply a transformation matrix to an axis-aligned rectangle.

    The four corners of the rectangle are transformed with
    ``matrix.transform_point``.

    Return the axis-aligned bounding box of the transformed corners as
    ``(x, y, width, height)``.

    """
    right = pos_x + width
    bottom = pos_y + height
    corners = [
        matrix.transform_point(corner_x, corner_y)
        for corner_x, corner_y in (
            (pos_x, pos_y), (right, pos_y), (pos_x, bottom), (right, bottom))]
    xs = [corner[0] for corner in corners]
    ys = [corner[1] for corner in corners]
    box_x, box_y = min(xs), min(ys)
    return box_x, box_y, max(xs) - box_x, max(ys) - box_y
def resolve_links(pages):
    """Resolve internal hyperlinks.

    Links to a missing anchor are removed and an error is logged.

    If multiple anchors have the same name, the first one is used.

    :param pages:
        A sequence of pages with ``anchors`` and ``links`` attributes like
        those of :class:`Page`. It is iterated twice.
    :returns:
        A generator yielding one ``(page_links, page_anchors)`` tuple per
        page. ``page_links`` is a list like :attr:`Page.links` without the
        internal links whose anchor is missing. ``page_anchors`` is a list
        of ``(anchor_name, x, y)`` tuples for the anchors first defined in
        this page.

    """
    known_anchors = set()
    paged_anchors = []
    for page in pages:
        page_anchors = []
        for anchor_name, (point_x, point_y) in page.anchors.items():
            if anchor_name not in known_anchors:
                page_anchors.append((anchor_name, point_x, point_y))
                known_anchors.add(anchor_name)
        paged_anchors.append(page_anchors)

    # Pair each page with its anchors instead of popping from the front of
    # the list, which shifts all remaining items for each page.
    for page, page_anchors in zip(pages, paged_anchors):
        page_links = []
        for link in page.links:
            link_type, anchor_name, rectangle = link
            if link_type == 'internal':
                if anchor_name not in known_anchors:
                    LOGGER.error(
                        'No anchor #%s for internal URI reference',
                        anchor_name)
                else:
                    page_links.append((link_type, anchor_name, rectangle))
            else:
                # External link
                page_links.append(link)
        yield page_links, page_anchors
2020-04-19 17:49:37 +03:00
class Matrix(list):
    """Transformation matrix stored as a list of rows.

    Built either from the six coefficients ``a`` to ``f``, giving the rows
    ``[a, b, 0]``, ``[c, d, 0]`` and ``[e, f, 1]``, or from an explicit list
    of rows given as ``matrix``.

    """
    def __init__(self, a=1, b=0, c=0, d=1, e=0, f=0, matrix=None):
        rows = matrix
        if rows is None:
            rows = [[a, b, 0], [c, d, 0], [e, f, 1]]
        super().__init__(rows)

    def __matmul__(self, other):
        """Return the matrix product ``self @ other`` as a new matrix."""
        assert len(self[0]) == len(other) == len(other[0]) == 3
        product = []
        for row_index in range(len(self)):
            product.append([
                sum(self[row_index][k] * other[k][column] for k in range(3))
                for column in range(3)])
        return Matrix(matrix=product)

    @property
    def determinant(self):
        """Determinant of the 3x3 matrix."""
        assert len(self) == len(self[0]) == 3
        top, middle, bottom = self[0], self[1], self[2]
        return (
            top[0] * (middle[1] * bottom[2] - middle[2] * bottom[1]) -
            middle[0] * (top[1] * bottom[2] - top[2] * bottom[1]) +
            bottom[0] * (top[1] * middle[2] - top[2] * middle[1]))

    def transform_point(self, x, y):
        """Return the point ``(x, y)`` transformed by the matrix."""
        row_vector = Matrix(matrix=[[x, y, 1]])
        return (row_vector @ self)[0][:2]
class Page:
    """Represents a single rendered page.

    .. versionadded:: 0.15

    Should be obtained from :attr:`Document.pages` but not
    instantiated directly.

    """
    def __init__(self, page_box):
        #: The page width, including margins, in CSS pixels.
        self.width = page_box.margin_width()

        #: The page height, including margins, in CSS pixels.
        self.height = page_box.margin_height()

        #: The page bleed widths as a :obj:`dict` with ``'top'``, ``'right'``,
        #: ``'bottom'`` and ``'left'`` as keys, and values in CSS pixels.
        self.bleed = {
            side: page_box.style['bleed_%s' % side].value
            for side in ('top', 'right', 'bottom', 'left')}

        #: The :obj:`list` of ``(bookmark_level, bookmark_label, target,
        #: state)`` :obj:`tuples <tuple>`. ``bookmark_level`` and
        #: ``bookmark_label`` are based on the CSS properties of the same
        #: names. ``target`` is an ``(x, y)`` point in CSS pixels from the
        #: top-left of the page. ``state`` is the value of the
        #: ``bookmark_state`` style property.
        self.bookmarks = []

        #: The :obj:`list` of ``(link_type, target, rectangle)`` :obj:`tuples
        #: <tuple>`. A ``rectangle`` is ``(x1, y1, x2, y2)``, two opposite
        #: corners in CSS pixels from the top-left of the page.
        #: ``link_type`` is one of three strings:
        #:
        #: * ``'external'``: ``target`` is an absolute URL
        #: * ``'internal'``: ``target`` is an anchor name (see
        #:   :attr:`Page.anchors`).
        #:   The anchor might be defined in another page,
        #:   in multiple pages (in which case the first occurrence is used),
        #:   or not at all.
        #: * ``'attachment'``: ``target`` is an absolute URL and points
        #:   to a resource to attach to the document.
        self.links = []

        #: The :obj:`dict` mapping each anchor name to its target, an
        #: ``(x, y)`` point in CSS pixels from the top-left of the page.
        self.anchors = {}

        self._gather_links_and_bookmarks(page_box)
        self._page_box = page_box

    def _gather_links_and_bookmarks(self, box, matrix=None):
        """Collect links, bookmarks and anchors of ``box`` and its children.

        :param box:
            The box to inspect; its children are visited recursively.
        :param matrix:
            Transformation accumulated from the ancestors of ``box``, or
            :obj:`None` when no ancestor is transformed.

        """
        # Get box transformation matrix.
        # "Transforms apply to block-level and atomic inline-level elements,
        #  but do not apply to elements which may be split into
        #  multiple inline-level boxes."
        # http://www.w3.org/TR/css3-2d-transforms/#introduction
        if box.style['transform'] and not isinstance(box, boxes.InlineBox):
            border_width = box.border_width()
            border_height = box.border_height()
            origin_x, origin_y = box.style['transform_origin']
            offset_x = percentage(origin_x, border_width)
            offset_y = percentage(origin_y, border_height)
            origin_x = box.border_box_x() + offset_x
            origin_y = box.border_box_y() + offset_y

            # The box's own transform is built in a separate name so that
            # the matrix inherited from the ancestors is not lost before
            # the two are combined below.
            transform = Matrix(e=origin_x, f=origin_y)
            for name, args in box.style['transform']:
                a, b, c, d, e, f = 1, 0, 0, 1, 0, 0
                if name == 'scale':
                    a, d = args
                elif name == 'rotate':
                    a = d = math.cos(args)
                    b = math.sin(args)
                    c = -b
                elif name == 'translate':
                    e = percentage(args[0], border_width)
                    f = percentage(args[1], border_height)
                elif name == 'skew':
                    b, c = math.tan(args[1]), math.tan(args[0])
                else:
                    assert name == 'matrix'
                    a, b, c, d, e, f = args
                transform = Matrix(a, b, c, d, e, f) @ transform
            box.transformation_matrix = (
                Matrix(e=-origin_x, f=-origin_y) @ transform)
            if matrix:
                matrix = box.transformation_matrix @ matrix
            else:
                matrix = box.transformation_matrix

        bookmark_label = box.bookmark_label
        if box.style['bookmark_level'] == 'none':
            bookmark_level = None
        else:
            bookmark_level = box.style['bookmark_level']
        state = box.style['bookmark_state']
        link = box.style['link']
        anchor_name = box.style['anchor']
        has_bookmark = bookmark_label and bookmark_level
        # 'link' is inherited but redundant on text boxes
        has_link = link and not isinstance(box, boxes.TextBox)
        # In case of duplicate IDs, only the first is an anchor.
        has_anchor = anchor_name and anchor_name not in self.anchors
        is_attachment = hasattr(box, 'is_attachment') and box.is_attachment

        if has_bookmark or has_link or has_anchor:
            pos_x, pos_y, width, height = box.hit_area()
            if has_link:
                token_type, link = link
                assert token_type == 'url'
                link_type, target = link
                assert isinstance(target, str)
                if link_type == 'external' and is_attachment:
                    link_type = 'attachment'
                if matrix:
                    # rectangle_aabb takes and returns (x, y, width,
                    # height); convert its result to the two-corner form
                    # used by the untransformed case.
                    box_x, box_y, box_width, box_height = rectangle_aabb(
                        matrix, pos_x, pos_y, width, height)
                    rectangle = (
                        box_x, box_y,
                        box_x + box_width, box_y + box_height)
                else:
                    rectangle = (
                        pos_x, pos_y, pos_x + width, pos_y + height)
                self.links.append((link_type, target, rectangle))
            if matrix and (has_bookmark or has_anchor):
                pos_x, pos_y = matrix.transform_point(pos_x, pos_y)
            if has_bookmark:
                self.bookmarks.append(
                    (bookmark_level, bookmark_label, (pos_x, pos_y), state))
            if has_anchor:
                self.anchors[anchor_name] = pos_x, pos_y

        for child in box.all_children():
            # Descendants of a transformed box are transformed too.
            self._gather_links_and_bookmarks(child, matrix)

    def paint(self, context, left_x=0, top_y=0, scale=1, clip=False):
        """Paint the page into the PDF file.

        :type context: :class:`pdf.Context`
        :param context:
            A context object.
        :type left_x: float
        :param left_x:
            X coordinate of the left of the page, in PDF points.
        :type top_y: float
        :param top_y:
            Y coordinate of the top of the page, in PDF points.
        :type scale: float
        :param scale:
            Zoom scale.
        :type clip: bool
        :param clip:
            Whether to clip/cut content outside the page. If false or
            not provided, content can overflow.

        """
        with stacked(context):
            # Make (0, 0) the top-left corner, and make user units CSS pixels:
            context.transform(scale, 0, 0, scale, left_x, top_y)
            if clip:
                width = self.width
                height = self.height
                context.rectangle(0, 0, width, height)
                context.clip()
            draw_page(self._page_box, context)
2012-10-02 20:59:02 +04:00
class DocumentMetadata:
    """Meta-information belonging to a whole :class:`Document`.

    .. versionadded:: 0.20

    New attributes may be added in future versions of WeasyPrint.

    """
    def __init__(self, title=None, authors=None, description=None,
                 keywords=None, generator=None, created=None, modified=None,
                 attachments=None):
        # Single-valued fields are stored as given.

        #: The title of the document, as a string or :obj:`None`.
        #: Extracted from the ``<title>`` element in HTML
        #: and written to the ``/Title`` info field in PDF.
        self.title = title

        #: The description of the document, as a string or :obj:`None`.
        #: Extracted from the ``<meta name=description>`` element in HTML
        #: and written to the ``/Subject`` info field in PDF.
        self.description = description

        #: The name of one of the software packages
        #: used to generate the document, as a string or :obj:`None`.
        #: Extracted from the ``<meta name=generator>`` element in HTML
        #: and written to the ``/Creator`` info field in PDF.
        self.generator = generator

        #: The creation date of the document, as a string or :obj:`None`.
        #: Dates are in one of the six formats specified in
        #: `W3C's profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Extracted from the ``<meta name=dcterms.created>`` element in HTML
        #: and written to the ``/CreationDate`` info field in PDF.
        self.created = created

        #: The modification date of the document, as a string or :obj:`None`.
        #: Dates are in one of the six formats specified in
        #: `W3C's profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Extracted from the ``<meta name=dcterms.modified>`` element in HTML
        #: and written to the ``/ModDate`` info field in PDF.
        self.modified = modified

        # List-valued fields fall back to a new empty list when the given
        # value is false.

        #: The authors of the document, as a list of strings.
        #: (Defaults to the empty list.)
        #: Extracted from the ``<meta name=author>`` elements in HTML
        #: and written to the ``/Author`` info field in PDF.
        self.authors = authors if authors else []

        #: Keywords associated with the document, as a list of strings.
        #: (Defaults to the empty list.)
        #: Extracted from ``<meta name=keywords>`` elements in HTML
        #: and written to the ``/Keywords`` info field in PDF.
        self.keywords = keywords if keywords else []

        #: File attachments, as a list of tuples of URL and a description or
        #: :obj:`None`. (Defaults to the empty list.)
        #: Extracted from the ``<link rel=attachment>`` elements in HTML
        #: and written to the ``/EmbeddedFiles`` dictionary in PDF.
        #:
        #: .. versionadded:: 0.22
        self.attachments = attachments if attachments else []
class Document:
"""A rendered document ready to be painted on a cairo surface.
2012-10-04 13:35:25 +04:00
Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`, but
can also be instantiated directly with a list of :class:`pages <Page>`, a
set of :class:`metadata <DocumentMetadata>`, a :func:`url_fetcher
<weasyprint.default_url_fetcher>` function, and a :class:`font_config
<weasyprint.fonts.FontConfiguration>`.
2012-10-04 13:35:25 +04:00
"""
2012-10-02 20:59:02 +04:00
@classmethod
def _build_layout_context(cls, html, stylesheets,
                          presentational_hints=False, font_config=None,
                          counter_style=None):
    """Compute the styles of ``html`` and build the layout context.

    :param html:
        The HTML document; its ``media_type`` and ``url_fetcher`` are used.
    :param stylesheets:
        An iterable of user stylesheets, or :obj:`None`. Items without a
        ``matcher`` attribute are given to :class:`CSS` as ``guess``.
    :param presentational_hints:
        Passed to ``get_all_computed_styles``.
    :param font_config:
        Font configuration, a new one is created if :obj:`None`.
    :param counter_style:
        Counter style, a new one is created if :obj:`None`.
    :returns:
        A :class:`LayoutContext` object.

    """
    font_config = FontConfiguration() if font_config is None else font_config
    counter_style = (
        CounterStyle() if counter_style is None else counter_style)
    target_collector = TargetCollector()
    page_rules = []

    user_stylesheets = []
    for stylesheet in stylesheets or []:
        if hasattr(stylesheet, 'matcher'):
            user_stylesheets.append(stylesheet)
            continue
        user_stylesheets.append(CSS(
            guess=stylesheet, media_type=html.media_type,
            font_config=font_config, counter_style=counter_style))

    style_for = get_all_computed_styles(
        html, user_stylesheets, presentational_hints, font_config,
        counter_style, page_rules, target_collector)
    # The dictionary bound here is shared by all the calls made through
    # this layout context.
    get_image_from_uri = functools.partial(
        original_get_image_from_uri, {}, html.url_fetcher)
    PROGRESS_LOGGER.info('Step 4 - Creating formatting structure')
    return LayoutContext(
        style_for, get_image_from_uri, font_config, counter_style,
        target_collector)
@classmethod
def _render(cls, html, stylesheets, presentational_hints=False,
            font_config=None, counter_style=None):
    """Lay out ``html`` and return the rendered document.

    :param html:
        The HTML document to render.
    :param stylesheets:
        User stylesheets, passed to :meth:`_build_layout_context`.
    :param presentational_hints:
        Passed to :meth:`_build_layout_context`.
    :param font_config:
        Font configuration, a new one is created if :obj:`None`.
    :param counter_style:
        Counter style, a new one is created if :obj:`None`.
    :returns:
        An instance of ``cls`` with one :class:`Page` per laid out page box
        and the metadata of ``html``.

    """
    # The defaults are resolved here and not only in _build_layout_context
    # because both objects are also used below.
    font_config = FontConfiguration() if font_config is None else font_config
    counter_style = (
        CounterStyle() if counter_style is None else counter_style)

    context = cls._build_layout_context(
        html, stylesheets, presentational_hints, font_config,
        counter_style)
    root_box = build_formatting_structure(
        html.etree_element, context.style_for, context.get_image_from_uri,
        html.base_url, context.target_collector, counter_style)
    page_boxes = layout_document(html, root_box, context)
    pages = [Page(page_box) for page_box in page_boxes]
    metadata = DocumentMetadata(**html._get_metadata())
    return cls(pages, metadata, html.url_fetcher, font_config)
2012-10-02 20:59:02 +04:00
def __init__(self, pages, metadata, url_fetcher, font_config):
    # Keep a reference to font_config to avoid its garbage collection until
    # rendering is destroyed. This is needed as font_config.__del__ removes
    # fonts that may be used when rendering
    self._font_config = font_config

    #: A function or other callable with the same signature as
    #: :func:`default_url_fetcher` called to fetch external resources such
    #: as stylesheets and images. (See :ref:`url-fetchers`.)
    self.url_fetcher = url_fetcher

    #: A :class:`DocumentMetadata` object.
    #: Contains information that does not belong to a specific page
    #: but to the whole document.
    self.metadata = metadata

    #: A list of :class:`Page` objects.
    self.pages = pages
2012-10-02 20:59:02 +04:00
def copy(self, pages='all'):
    """Take a subset of the pages.

    .. versionadded:: 0.15

    :type pages: :term:`iterable`
    :param pages:
        An iterable of :class:`Page` objects from :attr:`pages`. The
        default, ``'all'``, keeps all the pages of this document.
    :return:
        A new :class:`Document` object, sharing the metadata, URL fetcher
        and font configuration of this one.

    Examples:

    Write two PDF files for odd-numbered and even-numbered pages::

        # Python lists count from 0 but pages are numbered from 1.
        # [::2] is a slice of even list indexes but odd-numbered pages.
        document.copy(document.pages[::2]).write_pdf('odd_pages.pdf')
        document.copy(document.pages[1::2]).write_pdf('even_pages.pdf')

    Write each page to a numbered PNG file::

        for i, page in enumerate(document.pages):
            document.copy([page]).write_png('page_%s.png' % i)

    Combine multiple documents into one PDF file,
    using metadata from the first::

        all_pages = [p for doc in documents for p in doc.pages]
        documents[0].copy(all_pages).write_pdf('combined.pdf')

    """
    if pages == 'all':
        pages = self.pages
    elif not isinstance(pages, list):
        # Any other iterable is turned into a list, which is why a single
        # page has to be wrapped in a list by the caller.
        pages = list(pages)
    return type(self)(
        pages, self.metadata, self.url_fetcher, self._font_config)
2012-10-02 20:59:02 +04:00
def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None):
    """Paint the pages in a PDF file, with meta-data.

    Besides the painted pages, the generated PDF contains hyperlinks
    and named destinations, file-attachment annotations,
    bookmarks/outlines, document information, embedded files and
    embedded fonts.

    :type target: str, pathlib.Path or file object
    :param target:
        A filename where the PDF file is generated, a file object, or
        :obj:`None`.
    :type zoom: float
    :param zoom:
        The zoom factor in PDF units per CSS units.  **Warning**:
        All CSS units are affected, including physical units like
        ``cm`` and named sizes like ``A4``.  For values other than
        1, the physical CSS units will thus be "wrong".
    :type attachments: list
    :param attachments: A list of additional file attachments for the
        generated PDF document or :obj:`None`. The list's elements are
        :class:`Attachment` objects, filenames, URLs or file-like objects.
    :param finisher: A finisher function, that accepts the document and a
        ``pydyf.PDF`` object as parameters, can be passed to perform
        post-processing on the PDF right before it is written.
    :returns:
        The PDF as :obj:`bytes` if ``target`` is not provided or
        :obj:`None`, otherwise :obj:`None` (the PDF is written to
        ``target``).

    """
    # 0.75 = 72 PDF point per inch / 96 CSS pixel per inch
    scale = zoom * 0.75

    PROGRESS_LOGGER.info('Step 6 - Creating PDF')

    pdf = pydyf.PDF()
    alpha_states = pydyf.Dictionary()
    pdf.add_object(alpha_states)
    # A single resources dictionary is shared by all the pages.
    resources = pydyf.Dictionary({'ExtGState': alpha_states.reference})
    pdf.add_object(resources)
    pdf_names = pydyf.Array()
    pdf.catalog['Names'] = pydyf.Dictionary(
        {'Dests': pydyf.Dictionary({'Names': pdf_names})})

    # Links and anchors
    paged_links_and_anchors = list(resolve_links(self.pages))
    attachment_links = [
        [link for link in page_links if link[0] == 'attachment']
        for page_links, page_anchors in paged_links_and_anchors]

    # Annotations
    annot_files = {}
    # A single link can be split in multiple regions. We don't want to
    # embed a file multiple times of course, so keep a reference to every
    # embedded URL and reuse the object number.
    for page_links in attachment_links:
        for link_type, annot_target, rectangle in page_links:
            # The lookup key must be the link's own target, not the
            # ``target`` output parameter of this method.
            already_embedded = annot_target in annot_files
            if link_type == 'attachment' and not already_embedded:
                # TODO: Use the title attribute as description. The comment
                # above about multiple regions won't always be correct,
                # because two links might have the same href, but different
                # titles.
                annot_files[annot_target] = _write_pdf_attachment(
                    pdf, (annot_target, None), self.url_fetcher)

    # Bookmarks
    root = []
    # At one point in the document, for each "output" depth, how much
    # to add to get the source level (CSS values of bookmark-level).
    # E.g. with <h1> then <h3>, skipped_levels == [0, 1]
    # 1 means that <h3> has depth 3 - 1 = 2 in the output.
    skipped_levels = []
    last_by_depth = [root]
    previous_level = 0

    for page_number, (page, links_and_anchors, page_links) in enumerate(
            zip(self.pages, paged_links_and_anchors, attachment_links)):
        # Draw from the top-left corner
        matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)

        # Links and anchors
        links, anchors = links_and_anchors

        page_width = scale * (
            page.width + page.bleed['left'] + page.bleed['right'])
        page_height = scale * (
            page.height + page.bleed['top'] + page.bleed['bottom'])
        left = -scale * page.bleed['left']
        top = -scale * page.bleed['top']
        right = left + page_width
        bottom = top + page_height

        stream = Context(alpha_states)
        stream.transform(1, 0, 0, -1, 0, page.height * scale)
        page.paint(stream, scale=scale)
        pdf.add_object(stream)

        pdf_page = pydyf.Dictionary({
            'Type': '/Page',
            'Parent': pdf.pages.reference,
            'MediaBox': pydyf.Array([left, top, right, bottom]),
            'Contents': stream.reference,
            'Resources': resources.reference,
            'Annots': pydyf.Array(),
        })
        pdf.add_page(pdf_page)

        add_hyperlinks(links, anchors, matrix, pdf, pdf_page, pdf_names)

        # Bleed
        # Convert with the same factor as the MediaBox (zoom included) so
        # that the TrimBox matches the CSS page box for any zoom value.
        bleed = {key: value * scale for key, value in page.bleed.items()}

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']

        # Arbitrarily set PDF BleedBox between CSS bleed box (MediaBox) and
        # CSS page box (TrimBox) at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])

        pdf_page['TrimBox'] = pydyf.Array([
            trim_left, trim_top, trim_right, trim_bottom])
        pdf_page['BleedBox'] = pydyf.Array([
            bleed_left, bleed_top, bleed_right, bleed_bottom])

        # Annotations
        # TODO: splitting a link into multiple independent rectangular
        # annotations works well for pure links, but rather mediocre for
        # other annotations and fails completely for transformed (CSS) or
        # complex link shapes (area). It would be better to use /AP for all
        # links and coalesce link shapes that originate from the same HTML
        # link. This would give a feeling similar to what browsers do with
        # links that span multiple lines.
        for link_type, annot_target, rectangle in page_links:
            annot_file = annot_files[annot_target]
            # Attachments that could not be written are stored as None
            # and get no annotation.
            if link_type == 'attachment' and annot_file is not None:
                rectangle = (
                    *matrix.transform_point(*rectangle[:2]),
                    *matrix.transform_point(*rectangle[2:]))
                annot = pydyf.Dictionary({
                    'Type': '/Annot',
                    'Rect': pydyf.Array(rectangle),
                    'Subtype': '/FileAttachment',
                    'T': pydyf.String(),
                    'FS': annot_file.reference,
                    'AP': pydyf.Dictionary({'N': pydyf.Stream([], {
                        'Type': '/XObject',
                        'Subtype': '/Form',
                        'BBox': pydyf.Array(rectangle),
                        'Length': 0,
                    })})
                })
                pdf.add_object(annot)
                pdf_page['Annots'].append(annot.reference)

        # Bookmarks
        for level, label, (point_x, point_y), state in page.bookmarks:
            if level > previous_level:
                # Example: if the previous bookmark is a <h2>, the next
                # depth "should" be for <h3>. If now we get a <h6> we're
                # skipping two levels: append 6 - 3 - 1 = 2
                skipped_levels.append(level - previous_level - 1)
            else:
                temp = level
                while temp < previous_level:
                    temp += 1 + skipped_levels.pop()
                if temp > previous_level:
                    # We removed too many "skips", add some back:
                    skipped_levels.append(temp - previous_level - 1)

            previous_level = level
            depth = level - sum(skipped_levels)
            assert depth == len(skipped_levels)
            assert depth >= 1

            children = []
            point_x, point_y = matrix.transform_point(point_x, point_y)
            subtree = BookmarkSubtree(
                label, (page_number, point_x, point_y), children, state)
            last_by_depth[depth - 1].append(subtree)
            del last_by_depth[depth:]
            last_by_depth.append(children)

    outlines, count = create_bookmarks(root, pdf)
    if outlines:
        pdf.catalog['Outlines'] = pydyf.Dictionary({
            'Count': count,
            'First': outlines[0].reference,
            'Last': outlines[-1].reference,
        })

    PROGRESS_LOGGER.info('Step 7 - Adding PDF metadata')

    # PDF information
    if self.metadata.title:
        pdf.info['Title'] = pydyf.String(self.metadata.title)
    if self.metadata.authors:
        pdf.info['Author'] = pydyf.String(
            ', '.join(self.metadata.authors))
    if self.metadata.description:
        pdf.info['Subject'] = pydyf.String(self.metadata.description)
    if self.metadata.keywords:
        pdf.info['Keywords'] = pydyf.String(
            ', '.join(self.metadata.keywords))
    if self.metadata.generator:
        pdf.info['Creator'] = pydyf.String(self.metadata.generator)
    pdf.info['Producer'] = pydyf.String(f'WeasyPrint {__version__}')
    if self.metadata.created:
        pdf.info['CreationDate'] = pydyf.String(
            _w3c_date_to_pdf(self.metadata.created, 'created'))
    if self.metadata.modified:
        pdf.info['ModDate'] = pydyf.String(
            _w3c_date_to_pdf(self.metadata.modified, 'modified'))

    # Embedded files
    attachments = self.metadata.attachments + (attachments or [])
    pdf_attachments = []
    for attachment in attachments:
        pdf_attachment = _write_pdf_attachment(
            pdf, attachment, self.url_fetcher)
        if pdf_attachment is not None:
            pdf_attachments.append(pdf_attachment)
    if pdf_attachments:
        content = pydyf.Dictionary({'Names': pydyf.Array()})
        for i, pdf_attachment in enumerate(pdf_attachments):
            content['Names'].append(pydyf.String(f'attachment{i}'))
            content['Names'].append(pdf_attachment.reference)
        pdf.add_object(content)
        pdf.catalog['Names']['EmbeddedFiles'] = content.reference

    # Embedded fonts
    # NOTE(review): ``stream`` is the Context of the last painted page.
    # This presumably relies on ``_fonts`` being shared between contexts
    # and on the document having at least one page -- confirm.
    resources['Font'] = pydyf.Dictionary()
    for font_hash, font in stream._fonts.items():
        # One-shot compression: a bare ``compressobj().compress()`` without
        # ``flush()`` may keep data in internal buffers and truncate the
        # stream.
        compressed = zlib.compress(font.file_content)
        font_extra = pydyf.Dictionary({
            'Filter': '/FlateDecode',
            'Length1': len(font.file_content),
        })
        font_stream = pydyf.Stream([compressed], font_extra)
        pdf.add_object(font_stream)

        font.compute_glyphs_values()
        subfont_dictionary = pydyf.Dictionary({
            'Type': '/Font',
            'Subtype': '/CIDFontType2',
            'BaseFont': font.name,
            'CIDSystemInfo': pydyf.Dictionary({
                'Registry': pydyf.String('Adobe'),
                'Ordering': pydyf.String('Identity'),
                'Supplement': 0,
            }),
            'W': pydyf.Array([font.first_char, pydyf.Array(font.widths)]),
            'FontDescriptor': pydyf.Dictionary({
                'FontName': font.name,
                'FontFamily': pydyf.String(font.family),
                'Flags': 32,
                'FontBBox': pydyf.Array(font.bbox),
                'ItalicAngle': font.italic_angle,
                'Ascent': font.ascent,
                'Descent': font.descent,
                'CapHeight': font.cap_height,
                'StemV': font.stemv,
                'StemH': font.stemh,
                'FontFile': font_stream.reference,
            }),
        })
        pdf.add_object(subfont_dictionary)
        font_dictionary = pydyf.Dictionary({
            'Type': '/Font',
            'Subtype': '/Type0',
            'BaseFont': font.name,
            'Encoding': '/Identity-H',
            'DescendantFonts': pydyf.Array([subfont_dictionary.reference]),
        })
        pdf.add_object(font_dictionary)
        resources['Font'][str(font_hash)] = font_dictionary.reference

    if finisher:
        finisher(self, pdf)

    # The PDF is first written to memory, then returned or copied to
    # ``target``.
    file_obj = io.BytesIO()
    pdf.write(file_obj)

    if target is None:
        return file_obj.getvalue()
    else:
        file_obj.seek(0)
        if hasattr(target, 'write'):
            shutil.copyfileobj(file_obj, target)
        else:
            with open(target, 'wb') as fd:
                shutil.copyfileobj(file_obj, fd)
2012-12-29 04:00:30 +04:00
def write_png(self, target=None, resolution=96):
    """Paint the pages vertically to a single PNG image.

    This method is not implemented yet and currently always raises
    :exc:`NotImplementedError`. The intended contract is described below.

    There is no decoration around pages other than those specified in CSS
    with ``@page`` rules. The final image is as wide as the widest page.
    Each page is below the previous one, centered horizontally.

    :param target:
        A filename, file-like object, or :obj:`None`.
    :type resolution: float
    :param resolution:
        The output resolution in PNG pixels per CSS inch. At 96 dpi
        (the default), PNG pixels match the CSS ``px`` unit.
    :returns:
        A ``(png_bytes, png_width, png_height)`` tuple. ``png_bytes`` is a
        byte string if ``target`` is :obj:`None`, otherwise :obj:`None`
        (the image is written to ``target``). ``png_width`` and
        ``png_height`` are the size of the final image, in PNG pixels.
    :raises NotImplementedError: always, until PNG output is written.

    """
    # TODO: write this
    raise NotImplementedError