2012-10-02 20:59:02 +04:00
|
|
|
|
"""
|
|
|
|
|
weasyprint.document
|
|
|
|
|
-------------------
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
2019-05-24 00:55:56 +03:00
|
|
|
|
import collections
|
2017-03-25 02:33:36 +03:00
|
|
|
|
import functools
|
2020-04-19 17:49:37 +03:00
|
|
|
|
import hashlib
|
2012-10-02 20:59:02 +04:00
|
|
|
|
import io
|
|
|
|
|
import math
|
|
|
|
|
import shutil
|
2020-04-19 17:49:37 +03:00
|
|
|
|
import zlib
|
|
|
|
|
from os.path import basename
|
|
|
|
|
from urllib.parse import unquote, urlsplit
|
2012-10-02 20:59:02 +04:00
|
|
|
|
|
2020-04-18 01:19:35 +03:00
|
|
|
|
import pydyf
|
2019-07-23 08:12:08 +03:00
|
|
|
|
from weasyprint.layout import LayoutContext
|
2012-10-02 20:59:02 +04:00
|
|
|
|
|
2020-04-19 17:49:37 +03:00
|
|
|
|
from . import Attachment, CSS, __version__
|
2012-10-02 20:59:02 +04:00
|
|
|
|
from .css import get_all_computed_styles
|
2019-12-24 17:56:24 +03:00
|
|
|
|
from .css.counters import CounterStyle
|
2018-03-28 01:34:34 +03:00
|
|
|
|
from .css.targets import TargetCollector
|
2017-03-25 02:33:36 +03:00
|
|
|
|
from .draw import draw_page, stacked
|
2016-10-27 18:36:24 +03:00
|
|
|
|
from .fonts import FontConfiguration
|
2012-10-04 13:35:25 +04:00
|
|
|
|
from .formatting_structure import boxes
|
2012-10-02 20:59:02 +04:00
|
|
|
|
from .formatting_structure.build import build_formatting_structure
|
2018-08-06 18:38:02 +03:00
|
|
|
|
from .html import W3C_DATE_RE
|
2018-01-07 03:46:39 +03:00
|
|
|
|
from .images import get_image_from_uri as original_get_image_from_uri
|
2012-10-02 20:59:02 +04:00
|
|
|
|
from .layout import layout_document
|
2019-06-02 19:06:25 +03:00
|
|
|
|
from .layout.percentages import percentage
|
2019-01-04 01:02:44 +03:00
|
|
|
|
from .logger import LOGGER, PROGRESS_LOGGER
|
2020-05-08 01:11:19 +03:00
|
|
|
|
from .text import ffi, pango
|
2020-04-19 17:49:37 +03:00
|
|
|
|
from .urls import URLFetchingError
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _w3c_date_to_pdf(string, attr_name):
    """Transform a W3C-profile ISO 8601 date string into PDF date format.

    Return ``None`` when *string* is ``None`` or does not match
    ``W3C_DATE_RE`` (a warning naming *attr_name* is logged in the
    latter case).
    """
    if string is None:
        return None
    match = W3C_DATE_RE.match(string)
    if match is None:
        LOGGER.warning('Invalid %s date: %r', attr_name, string)
        return None
    parts = match.groupdict()
    pdf_date = ''
    seen_smaller_unit = False
    # Build the date from the smallest unit up.  A unit that is missing
    # while a smaller one was present gets its default value: '01' for
    # day and month, '00' for time units.
    for unit in ('second', 'minute', 'hour', 'day', 'month', 'year'):
        value = parts[unit]
        if value:
            seen_smaller_unit = True
            pdf_date = value + pdf_date
        elif seen_smaller_unit:
            default = '01' if unit in ('day', 'month') else '00'
            pdf_date = default + pdf_date
    if parts['hour']:
        assert parts['minute']
        tz_hour = parts['tz_hour']
        if tz_hour:
            # Explicit timezone offset, e.g. +04'00.
            assert tz_hour.startswith(('+', '-'))
            assert parts['tz_minute']
            pdf_date += "%+03i'%02i" % (
                int(tz_hour), int(parts['tz_minute']))
        else:
            pdf_date += 'Z'
    return pdf_date
|
2012-10-02 20:59:02 +04:00
|
|
|
|
|
2020-04-18 01:19:35 +03:00
|
|
|
|
|
2020-05-08 01:31:50 +03:00
|
|
|
|
class Font:
    """PDF font descriptor built from a Pango font.

    Keeps the raw font file content, the metrics needed for the PDF font
    descriptor, and the set of glyph IDs actually used by the document
    (so that the font can later be embedded and possibly subsetted).
    """
    def __init__(self, file_content, pango_font, glyph_item):
        """Create the descriptor.

        :param file_content: raw font file content (bytes).
        :param pango_font: a Pango font (cffi pointer).
        :param glyph_item: a Pango glyph item whose glyphs are the first
            glyphs known to be used with this font.
        """
        pango_metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
        font_description = pango.pango_font_describe(pango_font)
        font_family = ffi.string(pango.pango_font_description_get_family(
            font_description))
        glyph_string = glyph_item.glyphs
        num_glyphs = glyph_string.num_glyphs

        self.file_content = file_content
        self.pango_font = pango_font
        self.glyph_item = glyph_item
        # When the font will be a font subset, the font name will have to be
        # like '/XXXXXX+font_family'
        self.name = b'/' + font_family.replace(b' ', b'')
        self.family = font_family
        # Font descriptor /Flags value; 4 marks a symbolic font.
        self.flags = 4
        self.font_bbox = None
        self.italic_angle = 0
        self.ascent = pango.pango_font_metrics_get_ascent(pango_metrics)
        self.descent = pango.pango_font_metrics_get_descent(pango_metrics)
        self.cap_height = None
        # Default stem widths used in the font descriptor.
        self.stemv = 80
        self.stemh = 80
        # Set of glyph IDs used so far with this font.
        self.glyphs = {glyph_string.glyphs[x].glyph for x in range(num_glyphs)}
        self.size = pango.pango_units_to_double(
            pango.pango_font_description_get_size(font_description))

    def add_glyphs(self, glyph_item):
        """Record the glyphs of *glyph_item* as used by this font."""
        glyph_string = glyph_item.glyphs
        num_glyphs = glyph_string.num_glyphs
        # self.glyphs is a set: use the in-place union operator |=
        # (the previous += raised TypeError on sets).
        self.glyphs |= {
            glyph_string.glyphs[x].glyph for x in range(num_glyphs)}

    def compute_glyphs_values(self):
        """Compute the font bounding box from the used glyphs' ink extents."""
        font_bbox = [0, 0, 0, 0]
        ink_rect = ffi.new('PangoRectangle *')

        for glyph in self.glyphs:
            pango.pango_font_get_glyph_extents(
                self.pango_font, glyph, ink_rect, ffi.NULL)
            # Pango's y axis grows downwards; negate to get bbox corners
            # with y growing upwards.
            x1, y1, x2, y2 = (
                ink_rect.x, -ink_rect.y - ink_rect.height,
                ink_rect.x + ink_rect.width, -ink_rect.y)
            if x1 < font_bbox[0]:
                font_bbox[0] = x1
            if y1 < font_bbox[1]:
                font_bbox[1] = y1
            if x2 > font_bbox[2]:
                font_bbox[2] = x2
            if y2 > font_bbox[3]:
                font_bbox[3] = y2

        ffi.release(ink_rect)
        self.bbox = [value / self.size for value in font_bbox]
        # NOTE(review): this stores the lowest y of the bbox; a cap height
        # is usually the top of capital letters — confirm this placeholder.
        self.cap_height = font_bbox[1]
|
2020-05-07 20:33:54 +03:00
|
|
|
|
|
|
|
|
|
|
2020-04-18 01:19:35 +03:00
|
|
|
|
class Context(pydyf.Stream):
    """PDF stream object with context storing alpha states."""

    def __init__(self, alpha_states, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._alpha_states = alpha_states
        # Fonts used by this stream, keyed by the font content's hash.
        self._fonts = {}

    def set_alpha(self, alpha, stroke=False):
        """Register *alpha* as a graphic state if needed and activate it."""
        if alpha not in self._alpha_states:
            key = 'CA' if stroke else 'ca'
            self._alpha_states[alpha] = pydyf.Dictionary({key: alpha})
        self.set_state(alpha)

    def add_font(self, font, pango_font, glyph_item):
        """Record glyph usage for *font* and return the font's hash key."""
        font_hash = hash(font)
        known_font = self._fonts.get(font_hash)
        if known_font is None:
            self._fonts[font_hash] = Font(font, pango_font, glyph_item)
        else:
            known_font.add_glyphs(glyph_item)
        return font_hash
|
2020-05-06 08:42:45 +03:00
|
|
|
|
|
2012-10-02 20:59:02 +04:00
|
|
|
|
|
2020-04-19 17:49:37 +03:00
|
|
|
|
# Node of the PDF outline tree: a label, a destination triple
# (page_number, x, y), a list of child subtrees, and an open/closed state.
BookmarkSubtree = collections.namedtuple(
    'BookmarkSubtree', 'label destination children state')
|
2020-04-18 23:12:25 +03:00
|
|
|
|
|
|
|
|
|
|
2020-04-19 17:49:37 +03:00
|
|
|
|
def _write_pdf_attachment(pdf, attachment, url_fetcher):
    """Write an attachment to the PDF stream.

    :param pdf: the PDF document object (``add_object`` is used).
    :param attachment: an :class:`Attachment`, a ``(url, description)``
        tuple, or anything :class:`Attachment` accepts as ``guess``.
    :param url_fetcher: URL fetcher used when *attachment* is not already
        an :class:`Attachment`.
    :return:
        the attachment PDF dictionary, or :obj:`None` if fetching failed.

    """
    # Attachments from document links like <link> or <a> can only be URLs.
    # They're passed in as tuples
    url = ''
    if isinstance(attachment, tuple):
        url, description = attachment
        attachment = Attachment(
            url=url, url_fetcher=url_fetcher, description=description)
    elif not isinstance(attachment, Attachment):
        attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

    try:
        with attachment.source as (source_type, source, url, _):
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            # Compress the content on the fly in 4 KiB chunks while
            # tracking the uncompressed size and the MD5 checksum, both
            # stored in the embedded file's /Params dictionary.
            uncompressed_length = 0
            stream = b''
            md5 = hashlib.md5()
            compress = zlib.compressobj()
            for data in iter(lambda: source.read(4096), b''):
                uncompressed_length += len(data)
                md5.update(data)
                compressed = compress.compress(data)
                stream += compressed
            compressed = compress.flush(zlib.Z_FINISH)
            stream += compressed
            file_extra = pydyf.Dictionary({
                'Type': '/EmbeddedFile',
                'Filter': '/FlateDecode',
                'Params': pydyf.Dictionary({
                    'CheckSum': f'<{md5.hexdigest()}>',
                    'Size': uncompressed_length,
                })
            })
            file_stream = pydyf.Stream([stream], file_extra)
            pdf.add_object(file_stream)

    except URLFetchingError as exception:
        # Best effort: a failed fetch drops the attachment with an error log.
        LOGGER.error('Failed to load attachment: %s', exception)
        return

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename.
    filename = basename(unquote(urlsplit(url).path)) or 'attachment.bin'

    attachment = pydyf.Dictionary({
        'Type': '/Filespec',
        # NOTE(review): /F is written empty and only /UF (the Unicode
        # file name) is filled in — confirm targeted viewers read /UF.
        'F': pydyf.String(),
        'UF': pydyf.String(filename),
        'EF': pydyf.Dictionary({'F': file_stream.reference}),
        'Desc': pydyf.String(attachment.description or ''),
    })
    pdf.add_object(attachment)
    return attachment
|
2020-04-19 17:49:37 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_bookmarks(bookmarks, pdf, parent=None):
    """Create PDF outline (bookmark) dictionaries, recursively.

    :param bookmarks: a list of ``(label, (page, x, y), children, state)``
        tuples (see :class:`BookmarkSubtree`).
    :param pdf: the PDF document object (``pages``, ``objects`` and
        ``add_object`` are used).
    :param parent: the outline dictionary of the parent bookmark, or
        :obj:`None` at the top level.
    :return: an ``(outlines, count)`` tuple: the sibling outline
        dictionaries and the number of visible outline items.

    """
    count = len(bookmarks)
    outlines = []
    for title, (page, x, y), children, state in bookmarks:
        destination = pydyf.Array((
            # NOTE(review): each page reference seems to occupy 3 slots in
            # the /Kids array, hence the ``page * 3`` index — confirm with
            # the pydyf page-tree layout.
            pdf.objects[pdf.pages['Kids'][page * 3]].reference,
            '/XYZ', x, y, 0))
        outline = pydyf.Dictionary({
            'Title': pydyf.String(title), 'Dest': destination})
        pdf.add_object(outline)
        children_outlines, children_count = create_bookmarks(
            children, pdf, parent=outline)
        outline['Count'] = children_count
        # A negative /Count marks the outline item as closed by default;
        # only open children contribute to the visible item count.
        if state == 'closed':
            outline['Count'] *= -1
        else:
            count += children_count
        # Chain siblings together as a doubly-linked list.
        if outlines:
            outline['Prev'] = outlines[-1].reference
            outlines[-1]['Next'] = outline.reference
        if children_outlines:
            outline['First'] = children_outlines[0].reference
            outline['Last'] = children_outlines[-1].reference
        if parent is not None:
            outline['Parent'] = parent.reference
        outlines.append(outline)
    return outlines, count
|
|
|
|
|
|
|
|
|
|
|
2020-04-19 17:49:37 +03:00
|
|
|
|
def add_hyperlinks(links, anchors, matrix, pdf, page, names):
    """Include hyperlinks in current PDF page.

    :param links: ``(link_type, target, rectangle)`` tuples for this page.
    :param anchors: ``(anchor_name, x, y)`` tuples for this page.
    :param matrix: a :class:`Matrix` mapping page coordinates to PDF
        coordinates.
    :param pdf: the PDF document object annotations are added to.
    :param page: the page dictionary that receives the /Annots array.
    :param names: flat list of alternating anchor names and destination
        arrays, appended to in place (used later for the PDF name tree).

    """
    page['Annots'] = pydyf.Array()
    for link in links:
        link_type, link_target, rectangle = link
        # Rectangles are (x1, y1, x2, y2): transform both corners.
        x1, y1 = matrix.transform_point(*rectangle[:2])
        x2, y2 = matrix.transform_point(*rectangle[2:])
        if link_type in ('internal', 'external'):
            annot = pydyf.Dictionary({
                'Type': '/Annot',
                'Subtype': '/Link',
                'Rect': pydyf.Array([x1, y1, x2, y2]),
                # Border style with zero width: no visible link border.
                'BS': pydyf.Dictionary({'W': 0}),
            })
            if link_type == 'internal':
                # Named destination, resolved through the name tree.
                annot['Dest'] = pydyf.String(link_target)
            else:
                # External links use a URI action.
                annot['A'] = pydyf.Dictionary({
                    'Type': '/Action',
                    'S': '/URI',
                    'URI': pydyf.String(link_target),
                })
            pdf.add_object(annot)
            page['Annots'].append(annot.reference)

    for anchor in anchors:
        anchor_name, x, y = anchor
        x, y = matrix.transform_point(x, y)
        names.append(pydyf.String(anchor_name))
        names.append(pydyf.Array([page.reference, '/XYZ', x, y, 0]))
|
2012-10-06 13:26:55 +04:00
|
|
|
|
|
|
|
|
|
|
2012-10-07 00:09:17 +04:00
|
|
|
|
def rectangle_aabb(matrix, pos_x, pos_y, width, height):
    """Apply a transformation matrix to an axis-aligned rectangle.

    Return its axis-aligned bounding box as ``(x, y, width, height)``.

    """
    # NOTE(review): at least one caller passes corner coordinates
    # (x2, y2) as the width/height arguments — confirm the intended
    # parameter convention.
    transform = matrix.transform_point
    corners = (
        transform(pos_x, pos_y),
        transform(pos_x + width, pos_y),
        transform(pos_x, pos_y + height),
        transform(pos_x + width, pos_y + height))
    xs = [point[0] for point in corners]
    ys = [point[1] for point in corners]
    box_x1, box_y1 = min(xs), min(ys)
    box_x2, box_y2 = max(xs), max(ys)
    return box_x1, box_y1, box_x2 - box_x1, box_y2 - box_y1
|
|
|
|
|
|
|
|
|
|
|
2020-04-19 19:26:49 +03:00
|
|
|
|
def resolve_links(pages):
    """Resolve internal hyperlinks.

    Links to a missing anchor are removed, with an error logged.

    If multiple anchors have the same name, the first one is used.

    :returns:
        A generator yielding lists (one per page) like :attr:`Page.links`,
        except that internal links whose anchor is missing are dropped.
        Each yielded value is a ``(page_links, page_anchors)`` pair, the
        anchors being ``(name, x, y)`` tuples in CSS pixels from the
        top-left of the page.

    """
    seen_anchors = set()
    paged_anchors = []
    # First pass: collect, for each page, the anchors whose names appear
    # for the first time in the document.
    for page in pages:
        page_anchors = []
        for name, (point_x, point_y) in page.anchors.items():
            if name not in seen_anchors:
                page_anchors.append((name, point_x, point_y))
                seen_anchors.add(name)
        paged_anchors.append(page_anchors)
    # Second pass: keep external links untouched, keep internal links
    # whose anchor exists somewhere, drop the rest with an error.
    for page_number, page in enumerate(pages):
        kept_links = []
        for link in page.links:
            link_type, anchor_name, rectangle = link
            if link_type != 'internal':
                # External link, kept as-is.
                kept_links.append(link)
            elif anchor_name in seen_anchors:
                kept_links.append((link_type, anchor_name, rectangle))
            else:
                LOGGER.error(
                    'No anchor #%s for internal URI reference', anchor_name)
        yield kept_links, paged_anchors[page_number]
|
|
|
|
|
|
|
|
|
|
|
2020-04-19 17:49:37 +03:00
|
|
|
|
class Matrix(list):
    """3×3 transformation matrix stored as a list of rows.

    The affine coefficients map to ``[[a, b, 0], [c, d, 0], [e, f, 1]]``;
    points are row vectors multiplied on the left.
    """

    def __init__(self, a=1, b=0, c=0, d=1, e=0, f=0, matrix=None):
        if matrix is None:
            matrix = [[a, b, 0], [c, d, 0], [e, f, 1]]
        super().__init__(matrix)

    def __matmul__(self, other):
        """Matrix product; *other* must be 3×3 and self must have 3 columns."""
        assert len(self[0]) == len(other) == len(other[0]) == 3
        rows = []
        for row in self:
            rows.append([
                sum(row[k] * other[k][j] for k in range(3))
                for j in range(3)])
        return Matrix(matrix=rows)

    @property
    def determinant(self):
        """Determinant of the (3×3) matrix, by first-column expansion."""
        assert len(self) == len(self[0]) == 3
        minor_0 = self[1][1] * self[2][2] - self[1][2] * self[2][1]
        minor_1 = self[0][1] * self[2][2] - self[0][2] * self[2][1]
        minor_2 = self[0][1] * self[1][2] - self[0][2] * self[1][1]
        return (
            self[0][0] * minor_0 -
            self[1][0] * minor_1 +
            self[2][0] * minor_2)

    def transform_point(self, x, y):
        """Return ``[x', y']``, the image of the point ``(x, y)``."""
        return (Matrix(matrix=[[x, y, 1]]) @ self)[0][:2]
|
2018-08-06 18:38:02 +03:00
|
|
|
|
|
|
|
|
|
|
2020-01-02 14:06:58 +03:00
|
|
|
|
class Page:
    """Represents a single rendered page.

    .. versionadded:: 0.15

    Should be obtained from :attr:`Document.pages` but not
    instantiated directly.

    """
    def __init__(self, page_box):
        """Extract size, bookmarks, links and anchors from *page_box*."""
        #: The page width, including margins, in CSS pixels.
        self.width = page_box.margin_width()

        #: The page height, including margins, in CSS pixels.
        self.height = page_box.margin_height()

        #: The page bleed widths as a :obj:`dict` with ``'top'``, ``'right'``,
        #: ``'bottom'`` and ``'left'`` as keys, and values in CSS pixels.
        self.bleed = {
            side: page_box.style['bleed_%s' % side].value
            for side in ('top', 'right', 'bottom', 'left')}

        #: The :obj:`list` of ``(bookmark_level, bookmark_label, target)``
        #: :obj:`tuples <tuple>`. ``bookmark_level`` and ``bookmark_label``
        #: are respectively an :obj:`int` and a :obj:`string <str>`, based on
        #: the CSS properties of the same names. ``target`` is an ``(x, y)``
        #: point in CSS pixels from the top-left of the page.
        self.bookmarks = []

        #: The :obj:`list` of ``(link_type, target, rectangle)`` :obj:`tuples
        #: <tuple>`. A ``rectangle`` is in CSS pixels from the top-left of
        #: the page (see the NOTE in ``_gather_links_and_bookmarks`` about
        #: its exact format). ``link_type`` is one of three strings:
        #:
        #: * ``'external'``: ``target`` is an absolute URL
        #: * ``'internal'``: ``target`` is an anchor name (see
        #:   :attr:`Page.anchors`).
        #:   The anchor might be defined in another page,
        #:   in multiple pages (in which case the first occurence is used),
        #:   or not at all.
        #: * ``'attachment'``: ``target`` is an absolute URL and points
        #:   to a resource to attach to the document.
        self.links = []

        #: The :obj:`dict` mapping each anchor name to its target, an
        #: ``(x, y)`` point in CSS pixels from the top-left of the page.
        self.anchors = {}

        self._gather_links_and_bookmarks(page_box)
        self._page_box = page_box

    def _gather_links_and_bookmarks(self, box, matrix=None):
        """Recursively collect links, bookmarks and anchors from *box*."""
        # Get box transformation matrix.
        # "Transforms apply to block-level and atomic inline-level elements,
        # but do not apply to elements which may be split into
        # multiple inline-level boxes."
        # http://www.w3.org/TR/css3-2d-transforms/#introduction
        if box.style['transform'] and not isinstance(box, boxes.InlineBox):
            border_width = box.border_width()
            border_height = box.border_height()
            # Resolve the transform origin (possibly a percentage)
            # against the border box.
            origin_x, origin_y = box.style['transform_origin']
            offset_x = percentage(origin_x, border_width)
            offset_y = percentage(origin_y, border_height)
            origin_x = box.border_box_x() + offset_x
            origin_y = box.border_box_y() + offset_y

            # Accumulate the CSS transform functions around the origin.
            matrix = Matrix(e=origin_x, f=origin_y)
            for name, args in box.style['transform']:
                a, b, c, d, e, f = 1, 0, 0, 1, 0, 0
                if name == 'scale':
                    a, d = args
                elif name == 'rotate':
                    a = d = math.cos(args)
                    b = math.sin(args)
                    c = -b
                elif name == 'translate':
                    e = percentage(args[0], border_width)
                    f = percentage(args[1], border_height)
                elif name == 'skew':
                    b, c = math.tan(args[1]), math.tan(args[0])
                else:
                    assert name == 'matrix'
                    a, b, c, d, e, f = args
                matrix = Matrix(a, b, c, d, e, f) @ matrix
            box.transformation_matrix = (
                Matrix(e=-origin_x, f=-origin_y) @ matrix)
            # NOTE(review): the loop above rebound the local ``matrix``,
            # so this condition is always true here and the caller-supplied
            # ``matrix`` parameter has already been discarded — confirm
            # whether the parent matrix was meant to be combined instead.
            if matrix:
                matrix = box.transformation_matrix @ matrix
            else:
                matrix = box.transformation_matrix

        bookmark_label = box.bookmark_label
        if box.style['bookmark_level'] == 'none':
            bookmark_level = None
        else:
            bookmark_level = box.style['bookmark_level']
        state = box.style['bookmark_state']
        link = box.style['link']
        anchor_name = box.style['anchor']
        has_bookmark = bookmark_label and bookmark_level
        # 'link' is inherited but redundant on text boxes
        has_link = link and not isinstance(box, boxes.TextBox)
        # In case of duplicate IDs, only the first is an anchor.
        has_anchor = anchor_name and anchor_name not in self.anchors
        is_attachment = hasattr(box, 'is_attachment') and box.is_attachment

        if has_bookmark or has_link or has_anchor:
            pos_x, pos_y, width, height = box.hit_area()
            if has_link:
                token_type, link = link
                assert token_type == 'url'
                link_type, target = link
                assert isinstance(target, str)
                if link_type == 'external' and is_attachment:
                    link_type = 'attachment'
                if matrix:
                    # NOTE(review): rectangle_aabb's parameters are named
                    # (width, height) but corner coordinates are passed
                    # here, and its result is a width/height rectangle
                    # while the branch below stores corner coordinates —
                    # confirm which rectangle format consumers expect.
                    link = (link_type, target, rectangle_aabb(
                        matrix, pos_x, pos_y, pos_x + width, pos_y + height))
                else:
                    link = (link_type, target, (
                        pos_x, pos_y, pos_x + width, pos_y + height))
                self.links.append(link)
            if matrix and (has_bookmark or has_anchor):
                pos_x, pos_y = matrix.transform_point(pos_x, pos_y)
            if has_bookmark:
                self.bookmarks.append(
                    (bookmark_level, bookmark_label, (pos_x, pos_y), state))
            if has_anchor:
                self.anchors[anchor_name] = pos_x, pos_y

        for child in box.all_children():
            # NOTE(review): ``matrix`` is not forwarded to children, so a
            # box's transform does not apply to the positions gathered for
            # its descendants — confirm this is intended.
            self._gather_links_and_bookmarks(child)

    def paint(self, context, left_x=0, top_y=0, scale=1, clip=False):
        """Paint the page into the PDF file.

        :type context: :class:`pdf.Context`
        :param context:
            A context object.
        :type left_x: float
        :param left_x:
            X coordinate of the left of the page, in PDF points.
        :type top_y: float
        :param top_y:
            Y coordinate of the top of the page, in PDF points.
        :type scale: float
        :param scale:
            Zoom scale.
        :type clip: bool
        :param clip:
            Whether to clip/cut content outside the page. If false or
            not provided, content can overflow.

        """
        with stacked(context):
            # Make (0, 0) the top-left corner, and make user units CSS pixels:
            context.transform(scale, 0, 0, scale, left_x, top_y)
            if clip:
                width = self.width
                height = self.height
                context.rectangle(0, 0, width, height)
                context.clip()
            draw_page(self._page_box, context)
|
2012-10-02 20:59:02 +04:00
|
|
|
|
|
|
|
|
|
|
2020-01-02 14:06:58 +03:00
|
|
|
|
class DocumentMetadata:
    """Meta-information belonging to a whole :class:`Document`.

    .. versionadded:: 0.20

    New attributes may be added in future versions of WeasyPrint.

    """
    def __init__(self, title=None, authors=None, description=None,
                 keywords=None, generator=None, created=None, modified=None,
                 attachments=None):
        #: Document title (:obj:`str` or :obj:`None`).
        #: Extracted from the HTML ``<title>`` element and written to the
        #: ``/Title`` info field in PDF.
        self.title = title
        #: Document authors, a :obj:`list` of strings (empty by default).
        #: Extracted from ``<meta name=author>`` elements in HTML and
        #: written to the ``/Author`` info field in PDF.
        self.authors = authors if authors else []
        #: Document description (:obj:`str` or :obj:`None`).
        #: Extracted from the ``<meta name=description>`` element in HTML
        #: and written to the ``/Subject`` info field in PDF.
        self.description = description
        #: Keywords, a :obj:`list` of strings (empty by default).
        #: Extracted from ``<meta name=keywords>`` elements in HTML and
        #: written to the ``/Keywords`` info field in PDF.
        self.keywords = keywords if keywords else []
        #: Name of one of the software packages used to generate the
        #: document (:obj:`str` or :obj:`None`).
        #: Extracted from the ``<meta name=generator>`` element in HTML
        #: and written to the ``/Creator`` info field in PDF.
        self.generator = generator
        #: Creation date (:obj:`str` or :obj:`None`), in one of the six
        #: formats specified in `W3C's profile of ISO 8601
        #: <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Extracted from the ``<meta name=dcterms.created>`` element in
        #: HTML and written to the ``/CreationDate`` info field in PDF.
        self.created = created
        #: Modification date (:obj:`str` or :obj:`None`), same formats as
        #: :attr:`created`.
        #: Extracted from the ``<meta name=dcterms.modified>`` element in
        #: HTML and written to the ``/ModDate`` info field in PDF.
        self.modified = modified
        #: File attachments, a :obj:`list` of ``(URL, description)``
        #: tuples where the description may be :obj:`None` (empty by
        #: default).
        #: Extracted from ``<link rel=attachment>`` elements in HTML and
        #: written to the ``/EmbeddedFiles`` dictionary in PDF.
        #:
        #: .. versionadded:: 0.22
        self.attachments = attachments if attachments else []
|
2013-07-14 15:08:02 +04:00
|
|
|
|
|
|
|
|
|
|
2020-01-02 14:06:58 +03:00
|
|
|
|
class Document:
|
2019-02-22 13:34:46 +03:00
|
|
|
|
"""A rendered document ready to be painted on a cairo surface.
|
2012-10-04 13:35:25 +04:00
|
|
|
|
|
2019-02-22 13:34:46 +03:00
|
|
|
|
Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`, but
|
|
|
|
|
can also be instantiated directly with a list of :class:`pages <Page>`, a
|
|
|
|
|
set of :class:`metadata <DocumentMetadata>`, a :func:`url_fetcher
|
|
|
|
|
<weasyprint.default_url_fetcher>` function, and a :class:`font_config
|
|
|
|
|
<weasyprint.fonts.FontConfiguration>`.
|
2012-10-04 13:35:25 +04:00
|
|
|
|
|
|
|
|
|
"""
|
2019-07-23 08:12:08 +03:00
|
|
|
|
|
2012-10-02 20:59:02 +04:00
|
|
|
|
    @classmethod
    def _build_layout_context(cls, html, stylesheets,
                              presentational_hints=False, font_config=None,
                              counter_style=None):
        """Build a :class:`LayoutContext` ready to lay out *html*.

        :param html: the parsed HTML document object.
        :param stylesheets: an iterable of extra user stylesheets
            (:class:`CSS` objects or anything :class:`CSS` accepts as
            ``guess``), or :obj:`None`.
        :param presentational_hints: whether HTML presentational hints
            are followed.
        :param font_config: the :class:`FontConfiguration` to use; a new
            one is created when :obj:`None`.
        :param counter_style: the :class:`CounterStyle` to use; a new one
            is created when :obj:`None`.
        :return: a :class:`LayoutContext` instance.
        """
        if font_config is None:
            font_config = FontConfiguration()
        if counter_style is None:
            counter_style = CounterStyle()
        target_collector = TargetCollector()
        page_rules = []
        user_stylesheets = []
        for css in stylesheets or []:
            # Anything without a ``matcher`` attribute is not a CSS object
            # yet and goes through the CSS constructor's guessing logic.
            if not hasattr(css, 'matcher'):
                css = CSS(
                    guess=css, media_type=html.media_type,
                    font_config=font_config, counter_style=counter_style)
            user_stylesheets.append(css)
        style_for = get_all_computed_styles(
            html, user_stylesheets, presentational_hints, font_config,
            counter_style, page_rules, target_collector)
        # The bound {} is presumably a per-rendering image cache shared by
        # all calls — confirm against images.get_image_from_uri.
        get_image_from_uri = functools.partial(
            original_get_image_from_uri, {}, html.url_fetcher)
        PROGRESS_LOGGER.info('Step 4 - Creating formatting structure')
        context = LayoutContext(
            style_for, get_image_from_uri, font_config, counter_style,
            target_collector)
        return context
|
|
|
|
|
|
|
|
|
|
    @classmethod
    def _render(cls, html, stylesheets, presentational_hints=False,
                font_config=None, counter_style=None):
        """Lay out *html* and build the rendered document.

        :param html: the parsed HTML document object.
        :param stylesheets: extra user stylesheets, as accepted by
            :meth:`_build_layout_context`.
        :param presentational_hints: whether HTML presentational hints
            are followed.
        :param font_config: the :class:`FontConfiguration` to use; a new
            one is created when :obj:`None`.
        :param counter_style: the :class:`CounterStyle` to use; a new one
            is created when :obj:`None`.
        :return: a ``cls`` instance holding one :class:`Page` per
            laid-out page box.
        """
        if font_config is None:
            font_config = FontConfiguration()

        if counter_style is None:
            counter_style = CounterStyle()

        context = cls._build_layout_context(
            html, stylesheets, presentational_hints, font_config,
            counter_style)

        # Build the box tree from the element tree, then lay it out.
        root_box = build_formatting_structure(
            html.etree_element, context.style_for, context.get_image_from_uri,
            html.base_url, context.target_collector, counter_style)

        page_boxes = layout_document(html, root_box, context)
        rendering = cls(
            [Page(page_box) for page_box in page_boxes],
            DocumentMetadata(**html._get_metadata()),
            html.url_fetcher, font_config)
        return rendering
|
2012-10-02 20:59:02 +04:00
|
|
|
|
|
2018-01-28 17:45:39 +03:00
|
|
|
|
def __init__(self, pages, metadata, url_fetcher, font_config):
    """Store the rendered pages together with document-wide state."""
    #: A list of :class:`Page` objects.
    self.pages = pages
    #: A :class:`DocumentMetadata` object.
    #: Holds information that belongs to the whole document rather than
    #: to any single page.
    self.metadata = metadata
    #: A function or other callable with the same signature as
    #: :func:`default_url_fetcher` called to fetch external resources such
    #: as stylesheets and images. (See :ref:`url-fetchers`.)
    self.url_fetcher = url_fetcher
    # Keep the font configuration alive for as long as this rendering
    # exists: font_config.__del__ removes fonts that may still be needed
    # while rendering, so it must not be garbage-collected early.
    self._font_config = font_config
|
2012-10-02 20:59:02 +04:00
|
|
|
|
|
|
|
|
|
def copy(self, pages='all'):
    """Take a subset of the pages.

    .. versionadded:: 0.15

    :type pages: :term:`iterable`
    :param pages:
        An iterable of :class:`Page` objects from :attr:`pages`.
    :return:
        A new :class:`Document` object.

    Examples:

    Write two PDF files for odd-numbered and even-numbered pages::

        # Python lists count from 0 but pages are numbered from 1.
        # [::2] is a slice of even list indexes but odd-numbered pages.
        document.copy(document.pages[::2]).write_pdf('odd_pages.pdf')
        document.copy(document.pages[1::2]).write_pdf('even_pages.pdf')

    Write each page to a numbered PNG file::

        for i, page in enumerate(document.pages):
            document.copy(page).write_png('page_%s.png' % i)

    Combine multiple documents into one PDF file,
    using metadata from the first::

        all_pages = [p for doc in documents for p in doc.pages]
        documents[0].copy(all_pages).write_pdf('combined.pdf')

    """
    if pages == 'all':
        # Keep every page of this document.
        pages = self.pages
    elif not isinstance(pages, list):
        # Materialize arbitrary iterables so the new document owns a list.
        pages = list(pages)
    # Share metadata, fetcher and font configuration with the original.
    return type(self)(
        pages, self.metadata, self.url_fetcher, self._font_config)
|
2012-10-02 20:59:02 +04:00
|
|
|
|
|
2020-04-09 02:46:11 +03:00
|
|
|
|
def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None):
    """Paint the pages in a PDF file, with metadata.

    The generated PDF includes document metadata, bookmarks/outlines,
    hyperlinks, attachments and embedded fonts.

    :type target: str, pathlib.Path or file object
    :param target:
        A filename where the PDF file is generated, a file object, or
        :obj:`None`.
    :type zoom: float
    :param zoom:
        The zoom factor in PDF units per CSS units. **Warning**:
        All CSS units are affected, including physical units like
        ``cm`` and named sizes like ``A4``. For values other than
        1, the physical CSS units will thus be "wrong".
    :type attachments: list
    :param attachments: A list of additional file attachments for the
        generated PDF document or :obj:`None`. The list's elements are
        :class:`Attachment` objects, filenames, URLs or file-like objects.
    :param finisher: A finisher function, that accepts the document and a
        ``pydyf.PDF`` object as parameters, can be passed to perform
        post-processing on the PDF right before the trailer is written.
    :returns:
        The PDF as :obj:`bytes` if ``target`` is not provided or
        :obj:`None`, otherwise :obj:`None` (the PDF is written to
        ``target``).

    """
    # 0.75 = 72 PDF point per inch / 96 CSS pixel per inch
    scale = zoom * 0.75

    PROGRESS_LOGGER.info('Step 6 - Creating PDF')

    pdf = pydyf.PDF()
    alpha_states = pydyf.Dictionary()
    pdf.add_object(alpha_states)
    resources = pydyf.Dictionary({'ExtGState': alpha_states.reference})
    pdf.add_object(resources)
    pdf_names = pydyf.Array()
    pdf.catalog['Names'] = pydyf.Dictionary(
        {'Dests': pydyf.Dictionary({'Names': pdf_names})})

    # Links and anchors
    paged_links_and_anchors = list(resolve_links(self.pages))
    attachment_links = [
        [link for link in page_links if link[0] == 'attachment']
        for page_links, page_anchors in paged_links_and_anchors]

    # Annotations
    annot_files = {}
    # A single link can be split in multiple regions. We don't want to
    # embed a file multiple times of course, so keep a reference to every
    # embedded URL and reuse the object number.
    for page_links in attachment_links:
        for link_type, annot_target, rectangle in page_links:
            # BUG FIX: the dedup test previously checked ``target`` (the
            # output file argument) instead of ``annot_target``, so every
            # attachment link re-embedded its file.
            if link_type == 'attachment' and annot_target not in annot_files:
                # TODO: Use the title attribute as description. The comment
                # above about multiple regions won't always be correct,
                # because two links might have the same href, but different
                # titles.
                annot_files[annot_target] = _write_pdf_attachment(
                    pdf, (annot_target, None), self.url_fetcher)

    # Bookmarks
    root = []
    # At one point in the document, for each "output" depth, how much
    # to add to get the source level (CSS values of bookmark-level).
    # E.g. with <h1> then <h3>, level_shifts == [0, 1]
    # 1 means that <h3> has depth 3 - 1 = 2 in the output.
    skipped_levels = []
    last_by_depth = [root]
    previous_level = 0

    for page_number, (page, links_and_anchors, page_links) in enumerate(
            zip(self.pages, paged_links_and_anchors, attachment_links)):
        # Draw from the top-left corner
        matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)

        # Links and anchors
        links, anchors = links_and_anchors

        # Page geometry in PDF units, including CSS bleed areas.
        page_width = scale * (
            page.width + page.bleed['left'] + page.bleed['right'])
        page_height = scale * (
            page.height + page.bleed['top'] + page.bleed['bottom'])
        left = -scale * page.bleed['left']
        top = -scale * page.bleed['top']
        right = left + page_width
        bottom = top + page_height

        # Paint the page into a content stream, flipping the y axis so we
        # can draw from the top-left corner.
        stream = Context(alpha_states)
        stream.transform(1, 0, 0, -1, 0, page.height * scale)
        page.paint(stream, scale=scale)
        pdf.add_object(stream)

        pdf_page = pydyf.Dictionary({
            'Type': '/Page',
            'Parent': pdf.pages.reference,
            'MediaBox': pydyf.Array([left, top, right, bottom]),
            'Contents': stream.reference,
            'Resources': resources.reference,
            'Annots': pydyf.Array(),
        })
        pdf.add_page(pdf_page)

        add_hyperlinks(links, anchors, matrix, pdf, pdf_page, pdf_names)

        # Bleed
        bleed = {key: value * 0.75 for key, value in page.bleed.items()}

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']

        # Arbitrarily set PDF BleedBox between CSS bleed box (MediaBox) and
        # CSS page box (TrimBox) at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])

        pdf_page['TrimBox'] = pydyf.Array([
            trim_left, trim_top, trim_right, trim_bottom])
        pdf_page['BleedBox'] = pydyf.Array([
            bleed_left, bleed_top, bleed_right, bleed_bottom])

        # Annotations
        # TODO: splitting a link into multiple independent rectangular
        # annotations works well for pure links, but rather mediocre for
        # other annotations and fails completely for transformed (CSS) or
        # complex link shapes (area). It would be better to use /AP for all
        # links and coalesce link shapes that originate from the same HTML
        # link. This would give a feeling similar to what browsers do with
        # links that span multiple lines.
        for link_type, annot_target, rectangle in page_links:
            annot_file = annot_files[annot_target]
            if link_type == 'attachment' and annot_file is not None:
                rectangle = (
                    *matrix.transform_point(*rectangle[:2]),
                    *matrix.transform_point(*rectangle[2:]))
                annot = pydyf.Dictionary({
                    'Type': '/Annot',
                    'Rect': pydyf.Array(rectangle),
                    'Subtype': '/FileAttachment',
                    'T': pydyf.String(),
                    'FS': annot_file.reference,
                    'AP': pydyf.Dictionary({'N': pydyf.Stream([], {
                        'Type': '/XObject',
                        'Subtype': '/Form',
                        'BBox': pydyf.Array(rectangle),
                        'Length': 0,
                    })})
                })
                pdf.add_object(annot)
                pdf_page['Annots'].append(annot.reference)

        # Bookmarks
        for level, label, (point_x, point_y), state in page.bookmarks:
            if level > previous_level:
                # Example: if the previous bookmark is a <h2>, the next
                # depth "should" be for <h3>. If now we get a <h6> we’re
                # skipping two levels: append 6 - 3 - 1 = 2
                skipped_levels.append(level - previous_level - 1)
            else:
                temp = level
                while temp < previous_level:
                    temp += 1 + skipped_levels.pop()
                if temp > previous_level:
                    # We remove too many "skips", add some back:
                    skipped_levels.append(temp - previous_level - 1)

            previous_level = level
            depth = level - sum(skipped_levels)
            assert depth == len(skipped_levels)
            assert depth >= 1

            children = []
            point_x, point_y = matrix.transform_point(point_x, point_y)
            subtree = BookmarkSubtree(
                label, (page_number, point_x, point_y), children, state)
            last_by_depth[depth - 1].append(subtree)
            del last_by_depth[depth:]
            last_by_depth.append(children)

    outlines, count = create_bookmarks(root, pdf)
    if outlines:
        pdf.catalog['Outlines'] = pydyf.Dictionary({
            'Count': count,
            'First': outlines[0].reference,
            'Last': outlines[-1].reference,
        })

    PROGRESS_LOGGER.info('Step 7 - Adding PDF metadata')

    # PDF information
    if self.metadata.title:
        pdf.info['Title'] = pydyf.String(self.metadata.title)
    if self.metadata.authors:
        pdf.info['Author'] = pydyf.String(
            ', '.join(self.metadata.authors))
    if self.metadata.description:
        pdf.info['Subject'] = pydyf.String(self.metadata.description)
    if self.metadata.keywords:
        pdf.info['Keywords'] = pydyf.String(
            ', '.join(self.metadata.keywords))
    if self.metadata.generator:
        pdf.info['Creator'] = pydyf.String(self.metadata.generator)
    pdf.info['Producer'] = pydyf.String(f'WeasyPrint {__version__}')
    if self.metadata.created:
        pdf.info['CreationDate'] = pydyf.String(
            _w3c_date_to_pdf(self.metadata.created, 'created'))
    if self.metadata.modified:
        pdf.info['ModDate'] = pydyf.String(
            _w3c_date_to_pdf(self.metadata.modified, 'modified'))

    # Embedded files
    attachments = self.metadata.attachments + (attachments or [])
    pdf_attachments = []
    for attachment in attachments:
        pdf_attachment = _write_pdf_attachment(
            pdf, attachment, self.url_fetcher)
        if pdf_attachment is not None:
            pdf_attachments.append(pdf_attachment)
    if pdf_attachments:
        content = pydyf.Dictionary({'Names': pydyf.Array()})
        for i, pdf_attachment in enumerate(pdf_attachments):
            content['Names'].append(pydyf.String(f'attachment{i}'))
            content['Names'].append(pdf_attachment.reference)
        pdf.add_object(content)
        pdf.catalog['Names']['EmbeddedFiles'] = content.reference

    # Embedded fonts
    # NOTE(review): this reads ``stream._fonts`` after the page loop, i.e.
    # from the *last* page's Context — presumably the font registry is
    # shared between Context instances; confirm. Also raises NameError for
    # a document with zero pages.
    resources['Font'] = pydyf.Dictionary()
    for font_hash, font in stream._fonts.items():
        # BUG FIX: ``zlib.compressobj().compress(data)`` without a final
        # ``flush()`` drops the buffered tail of the compressed stream;
        # use the one-shot ``zlib.compress`` instead.
        compressed = zlib.compress(font.file_content)
        font_extra = pydyf.Dictionary({
            'Filter': '/FlateDecode',
            # Length1 is the uncompressed length of the font file.
            'Length1': len(font.file_content),
        })
        font_stream = pydyf.Stream([compressed], font_extra)
        pdf.add_object(font_stream)

        font.compute_glyphs_values()
        font_dictionary = pydyf.Dictionary({
            'Type': '/Font',
            'Subtype': '/TrueType',
            'BaseFont': font.family,
            'FirstChar': 32,
            'LastChar': 99,
            'Encoding': '/WinAnsiEncoding',
            'Widths': pydyf.Array((99 - 32 + 1) * [1000]),
            'FontDescriptor': pydyf.Dictionary({
                'FontName': pydyf.String(font.name),
                'FontFamily': font.family,
                'Flags': 32,
                'FontBBox': pydyf.Array(font.bbox),
                'ItalicAngle': font.italic_angle,
                'Ascent': font.ascent,
                'Descent': font.descent,
                'CapHeight': font.cap_height,
                'StemV': font.stemv,
                'StemH': font.stemh,
                'FontFile': font_stream.reference,
            })
        })
        pdf.add_object(font_dictionary)
        resources['Font'][str(font_hash)] = font_dictionary.reference

    # Let the caller post-process the PDF before it is serialized.
    if finisher:
        finisher(self, pdf)

    file_obj = io.BytesIO()
    pdf.write(file_obj)

    if target is None:
        return file_obj.getvalue()
    else:
        file_obj.seek(0)
        if hasattr(target, 'write'):
            shutil.copyfileobj(file_obj, target)
        else:
            with open(target, 'wb') as fd:
                shutil.copyfileobj(file_obj, fd)
|
|
|
|
|
|
2012-12-29 04:00:30 +04:00
|
|
|
|
def write_png(self, target=None, resolution=96):
    """Paint the pages vertically to a single PNG image.

    There is no decoration around pages other than those specified in CSS
    with ``@page`` rules. The final image is as wide as the widest page.
    Each page is below the previous one, centered horizontally.

    :param target:
        A filename, file-like object, or :obj:`None`.
    :type resolution: float
    :param resolution:
        The output resolution in PNG pixels per CSS inch. At 96 dpi
        (the default), PNG pixels match the CSS ``px`` unit.
    :returns:
        A ``(png_bytes, png_width, png_height)`` tuple. ``png_bytes`` is a
        byte string if ``target`` is :obj:`None`, otherwise :obj:`None`
        (the image is written to ``target``). ``png_width`` and
        ``png_height`` are the size of the final image, in PNG pixels.

    """
    # TODO: write this
    raise NotImplementedError
|