1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-10-05 00:21:15 +03:00
WeasyPrint/weasyprint/document.py

1348 lines
53 KiB
Python
Raw Normal View History

2012-10-02 20:59:02 +04:00
"""
weasyprint.document
-------------------
"""
import collections
2017-03-25 02:33:36 +03:00
import functools
2020-04-19 17:49:37 +03:00
import hashlib
2012-10-02 20:59:02 +04:00
import io
import math
import shutil
2020-04-19 17:49:37 +03:00
import zlib
from os.path import basename
from subprocess import run
2020-04-19 17:49:37 +03:00
from urllib.parse import unquote, urlsplit
2012-10-02 20:59:02 +04:00
import pydyf
2020-05-29 20:43:56 +03:00
from fontTools import subset
from fontTools.ttLib import TTFont, TTLibError
2020-05-13 02:02:43 +03:00
from PIL import Image
from weasyprint.layout import LayoutContext
2012-10-02 20:59:02 +04:00
2020-05-13 00:54:42 +03:00
from . import CSS, Attachment, __version__
2012-10-02 20:59:02 +04:00
from .css import get_all_computed_styles
2019-12-24 17:56:24 +03:00
from .css.counters import CounterStyle
2018-03-28 01:34:34 +03:00
from .css.targets import TargetCollector
2017-03-25 02:33:36 +03:00
from .draw import draw_page, stacked
from .fonts import FontConfiguration
2012-10-04 13:35:25 +04:00
from .formatting_structure import boxes
2012-10-02 20:59:02 +04:00
from .formatting_structure.build import build_formatting_structure
2020-05-30 02:11:30 +03:00
from .html import W3C_DATE_RE, get_html_metadata
2018-01-07 03:46:39 +03:00
from .images import get_image_from_uri as original_get_image_from_uri
2012-10-02 20:59:02 +04:00
from .layout import layout_document
from .layout.percentages import percentage
from .logger import LOGGER, PROGRESS_LOGGER
2020-05-08 01:11:19 +03:00
from .text import ffi, pango
2020-04-19 17:49:37 +03:00
from .urls import URLFetchingError
def _w3c_date_to_pdf(string, attr_name):
    """Transform a W3C date string into the PDF date format.

    :param string:
        The date to convert, or :obj:`None`.
    :param attr_name:
        Name of the metadata attribute, only used in the warning message.
    :return:
        The PDF date string, or :obj:`None` when ``string`` is :obj:`None`
        or does not match ``W3C_DATE_RE`` (a warning is logged in the
        latter case).

    """
    if string is None:
        return None
    match = W3C_DATE_RE.match(string)
    if match is None:
        LOGGER.warning(f'Invalid {attr_name} date: {string!r}')
        return None
    groups = match.groupdict()

    # Collect the fields from the least to the most significant one. As soon
    # as one field is known (or when a time is given), the missing fields get
    # a default value: 01 for days and months, 00 for everything else.
    fields = []
    found = groups['hour']
    for key in ('second', 'minute', 'hour', 'day', 'month', 'year'):
        value = groups[key]
        if value:
            found = True
            fields.append(value)
        elif found:
            fields.append('01' if key in ('day', 'month') else '00')
    pdf_date = ''.join(reversed(fields))

    # A timezone is only written when a time is given.
    if not groups['hour']:
        return pdf_date
    assert groups['minute']
    if not groups['tz_hour']:
        return pdf_date + 'Z'
    assert groups['tz_hour'].startswith(('+', '-'))
    assert groups['tz_minute']
    tz_hour = int(groups['tz_hour'])
    tz_minute = int(groups['tz_minute'])
    return f"{pdf_date}{tz_hour:+03d}'{tz_minute:02d}"
2012-10-02 20:59:02 +04:00
2020-05-08 01:31:50 +03:00
class Font:
    """Information about a font used in the document.

    :param file_content:
        The bytes of the font file, hashed to build the font name prefix.
    :param pango_font:
        The Pango font the family name and metrics are read from.

    """
    def __init__(self, file_content, pango_font):
        metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
        description = pango.pango_font_describe(pango_font)
        family = ffi.string(
            pango.pango_font_description_get_family(description))
        size = pango.pango_font_description_get_size(description)
        digest = hashlib.sha256(file_content).digest()

        self.file_content = file_content
        # Six uppercase ASCII letters derived from the file content hash.
        self.hash = ''.join(chr(65 + byte % 26) for byte in digest[:6])
        # PDF name: "/" + hash + "+" + family name without spaces.
        self.name = b''.join((
            b'/', self.hash.encode('ascii'), b'+', family.replace(b' ', b'')))
        self.family = family
        self.flags = 4
        self.italic_angle = 0
        # Ascent and descent are expressed relative to the font size, scaled
        # by 1000. The descent is stored as a negated value.
        ascent = pango.pango_font_metrics_get_ascent(metrics)
        self.ascent = int(ascent / size * 1000)
        descent = pango.pango_font_metrics_get_descent(metrics)
        self.descent = -int(descent / size * 1000)
        self.stemv = 80
        self.stemh = 80
        self.bbox = [0, 0, 0, 0]
        self.widths = {}
        self.cmap = {}
class Context(pydyf.Stream):
    """PDF stream object with context storing alpha states.

    The context keeps track of the last color, alpha and font set in the
    stream, so that setting the same value twice in a row does not emit a
    redundant operator. It also gives access to the resource dictionaries
    (alpha states, XObjects, patterns and shadings) the stream relies on.

    """
    def __init__(self, document, page_rectangle, alpha_states, x_objects,
                 patterns, shadings, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.compress = True
        self.page_rectangle = page_rectangle
        self._document = document
        self._alpha_states = alpha_states
        self._x_objects = x_objects
        self._patterns = patterns
        self._shadings = shadings
        self._current_color = self._current_color_stroke = None
        self._current_alpha = self._current_alpha_stroke = None
        self._current_font = self._current_font_size = None
        self._old_font = self._old_font_size = None

        # These objects are used in text.show_first_line
        self.length = ffi.new('unsigned int *')
        self.ink_rect = ffi.new('PangoRectangle *')
        self.logical_rect = ffi.new('PangoRectangle *')

    def pop_state(self):
        """Restore the graphics state and forget the cached values."""
        super().pop_state()
        self._current_color = self._current_color_stroke = None
        self._current_alpha = self._current_alpha_stroke = None
        self._current_font = None

    def begin_text(self):
        """Begin a text object, merging it with the previous one if possible.

        When the last operator of the stream is ``ET``, it is removed and the
        font of the text object that has just been closed is restored,
        instead of beginning a new text object.

        """
        # The stream may still be empty, don't index it blindly.
        if self.stream and self.stream[-1] == b'ET':
            self._current_font = self._old_font
            self.stream.pop()
        else:
            super().begin_text()

    def end_text(self):
        """End the text object, remembering its font for ``begin_text``."""
        self._old_font, self._current_font = self._current_font, None
        super().end_text()

    def set_color_rgb(self, r, g, b, stroke=False):
        """Set the fill (or stroke) color, unless it is already set."""
        color = (r, g, b)
        if stroke:
            if color == self._current_color_stroke:
                return
            self._current_color_stroke = color
        else:
            if color == self._current_color:
                return
            self._current_color = color
        super().set_color_rgb(r, g, b, stroke)

    def set_font_size(self, font, size):
        """Set the font and its size, unless they are already set."""
        if (font, size) == self._current_font:
            return
        self._current_font = (font, size)
        super().set_font_size(font, size)

    def set_alpha(self, alpha, stroke=False):
        """Set the alpha value, unless it is already set.

        :param alpha:
            The alpha value.
        :param stroke:
            If true, set the stroke alpha (``CA``). If false, set the fill
            alpha (``ca``). If :obj:`None`, set both.

        """
        # Stroke and fill alpha states are stored under different names.
        # Sharing a single entry per alpha value would reuse a fill-only
        # state when the same value is later requested for strokes (and the
        # other way around), applying the alpha to the wrong operations.
        if stroke or stroke is None:
            if alpha != self._current_alpha_stroke:
                self._current_alpha_stroke = alpha
                self._set_alpha_state(f'A{alpha}', 'CA', alpha)
        if not stroke:
            if alpha != self._current_alpha:
                self._current_alpha = alpha
                self._set_alpha_state(f'a{alpha}', 'ca', alpha)

    def _set_alpha_state(self, name, key, alpha):
        """Use the alpha state called ``name``, creating it if needed."""
        if name not in self._alpha_states:
            self._alpha_states[name] = pydyf.Dictionary({key: alpha})
        self.set_state(name)

    def add_font(self, font_hash, font_content, pango_font):
        """Store a new :class:`Font` in the document and return it."""
        self._document.fonts[font_hash] = Font(font_content, pango_font)
        return self._document.fonts[font_hash]

    def get_fonts(self):
        """Return the fonts dictionary of the document."""
        return self._document.fonts

    def sub_context(self, *args, **kwargs):
        """Create a new context sharing the resources of this one."""
        return Context(
            self._document, self.page_rectangle, self._alpha_states,
            self._x_objects, self._patterns, self._shadings, *args, **kwargs)

    def push_group(self, bounding_box):
        """Create a transparency group stored in the XObjects.

        The group is a form XObject with its own resource dictionaries. It
        keeps a reference to this context, returned by :meth:`pop_group`.

        """
        alpha_states = pydyf.Dictionary()
        x_objects = pydyf.Dictionary()
        patterns = pydyf.Dictionary()
        shadings = pydyf.Dictionary()
        resources = pydyf.Dictionary({
            'ExtGState': alpha_states,
            'XObject': x_objects,
            'Pattern': patterns,
            'Shading': shadings,
        })
        extra = pydyf.Dictionary({
            'Type': '/XObject',
            'Subtype': '/Form',
            'BBox': pydyf.Array(bounding_box),
            'Resources': resources,
            'Group': pydyf.Dictionary({
                'Type': '/Group',
                'S': '/Transparency',
                'I': 'true',
                'CS': '/DeviceRGB',
            }),
        })
        group = Context(
            self._document, self.page_rectangle, alpha_states, x_objects,
            patterns, shadings, extra=extra)
        group.id = f'x{len(self._x_objects)}'
        group._parent = self
        self._x_objects[group.id] = group
        return group

    def pop_group(self):
        """Return the context this group has been pushed from."""
        return self._parent

    def add_image(self, pillow_image, image_rendering, optimize_image):
        """Store an image in the XObjects and return its name.

        :param pillow_image:
            The Pillow image to store.
        :param image_rendering:
            The image is interpolated only if this value is ``'auto'``.
        :param optimize_image:
            Value of the ``optimize`` parameter given to Pillow when saving.

        """
        image_format = pillow_image.format

        # Bilevel and palette images are converted before the color space is
        # chosen, so that the color space always matches the stored data.
        if pillow_image.mode in ('1', 'P'):
            pillow_image = pillow_image.convert('RGB')
        image_mode = pillow_image.mode

        if image_mode in ('RGB', 'RGBA'):
            color_space = '/DeviceRGB'
        elif image_mode == 'L':
            color_space = '/DeviceGray'
        elif image_mode == 'CMYK':
            color_space = '/DeviceCMYK'
        else:
            # Don't fail on other modes, fall back to RGB with alpha.
            LOGGER.warning('Unknown image mode: %s', image_mode)
            pillow_image = pillow_image.convert('RGBA')
            image_mode = 'RGBA'
            color_space = '/DeviceRGB'

        interpolate = 'true' if image_rendering == 'auto' else 'false'
        extra = pydyf.Dictionary({
            'Type': '/XObject',
            'Subtype': '/Image',
            'Width': pillow_image.width,
            'Height': pillow_image.height,
            'ColorSpace': color_space,
            'BitsPerComponent': 8,
            'Interpolate': interpolate,
        })

        image_file = io.BytesIO()
        if image_format == 'JPEG':
            extra['Filter'] = '/DCTDecode'
            pillow_image.save(
                image_file, format='JPEG', optimize=optimize_image)
        else:
            extra['Filter'] = '/JPXDecode'
            if image_mode == 'RGBA':
                # The alpha channel is stored in a separate grayscale image
                # used as a soft mask.
                alpha = pillow_image.getchannel('A')
                pillow_image = pillow_image.convert('RGB')
                alpha_file = io.BytesIO()
                alpha.save(
                    alpha_file, format='JPEG2000', optimize=optimize_image,
                    num_resolutions=1)
                extra['SMask'] = pydyf.Stream([alpha_file.getvalue()], extra={
                    'Filter': '/JPXDecode',
                    'Type': '/XObject',
                    'Subtype': '/Image',
                    'Width': pillow_image.width,
                    'Height': pillow_image.height,
                    'ColorSpace': '/DeviceGray',
                    'BitsPerComponent': 8,
                    'Interpolate': interpolate,
                })
            # Set number of resolutions to 1 because of
            # https://github.com/uclouvain/openjpeg/issues/215
            pillow_image.save(
                image_file, format='JPEG2000', optimize=optimize_image,
                num_resolutions=1)
        stream = [image_file.getvalue()]

        xobject = pydyf.Stream(stream, extra=extra)
        image_name = f'Im{len(self._x_objects)}'
        self._x_objects[image_name] = xobject
        return image_name

    def add_pattern(self, x, y, width, height, repeat_width, repeat_height):
        """Create a tiling pattern stored in the patterns and return it.

        The pattern is a new context with its own resource dictionaries.

        """
        alpha_states = pydyf.Dictionary()
        x_objects = pydyf.Dictionary()
        patterns = pydyf.Dictionary()
        shadings = pydyf.Dictionary()
        resources = pydyf.Dictionary({
            'ExtGState': alpha_states,
            'XObject': x_objects,
            'Pattern': patterns,
            'Shading': shadings,
        })
        matrix = (1, 0, 0, -1, x, self.page_rectangle[3] - y)
        extra = pydyf.Dictionary({
            'PatternType': 1,
            'BBox': pydyf.Array([0, 0, width, height]),
            'XStep': repeat_width,
            'YStep': repeat_height,
            'TilingType': 1,
            'PaintType': 1,
            'Matrix': pydyf.Array(0.75 * i for i in matrix),
            'Resources': resources,
        })
        pattern = Context(
            self._document, self.page_rectangle, alpha_states, x_objects,
            patterns, shadings, extra=extra)
        pattern.id = f'p{len(self._patterns)}'
        self._patterns[pattern.id] = pattern
        return pattern

    def add_shading(self):
        """Create an empty shading dictionary stored in the shadings."""
        shading = pydyf.Dictionary()
        shading.id = f's{len(self._shadings)}'
        self._shadings[shading.id] = shading
        return shading
2012-10-02 20:59:02 +04:00
2020-04-19 17:49:37 +03:00
# Node of the bookmark tree: its label, its destination, its children nodes
# and its state.
BookmarkSubtree = collections.namedtuple(
    'BookmarkSubtree', 'label destination children state')
2020-04-18 23:12:25 +03:00
2020-04-19 17:49:37 +03:00
def _write_pdf_attachment(pdf, attachment, url_fetcher):
    """Write an attachment to the PDF stream.

    :param pdf:
        The PDF object the embedded file and its file specification are
        added to.
    :param attachment:
        An :class:`Attachment`, a ``(url, description)`` tuple, or any other
        value given as ``guess`` to :class:`Attachment`.
    :param url_fetcher:
        The URL fetcher given to :class:`Attachment` objects created here.
    :return:
        the attachment PDF dictionary, or :obj:`None` if the attachment
        could not be fetched.

    """
    # Attachments from document links like <link> or <a> can only be URLs.
    # They're passed in as tuples
    url = ''
    if isinstance(attachment, tuple):
        url, description = attachment
        attachment = Attachment(
            url=url, url_fetcher=url_fetcher, description=description)
    elif not isinstance(attachment, Attachment):
        attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

    try:
        with attachment.source as (source_type, source, url, _):
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            uncompressed_length = 0
            # Compressed chunks are joined once at the end, concatenating
            # bytes in the loop would copy the whole stream for each chunk.
            chunks = []
            md5 = hashlib.md5()
            compress = zlib.compressobj()
            for data in iter(lambda: source.read(4096), b''):
                uncompressed_length += len(data)
                md5.update(data)
                chunks.append(compress.compress(data))
            chunks.append(compress.flush(zlib.Z_FINISH))
            stream = b''.join(chunks)
            file_extra = pydyf.Dictionary({
                'Type': '/EmbeddedFile',
                'Filter': '/FlateDecode',
                'Params': pydyf.Dictionary({
                    'CheckSum': f'<{md5.hexdigest()}>',
                    'Size': uncompressed_length,
                })
            })
            file_stream = pydyf.Stream([stream], file_extra)
            pdf.add_object(file_stream)
    except URLFetchingError as exception:
        LOGGER.error('Failed to load attachment: %s', exception)
        return

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename.
    filename = ''
    if url:
        path = urlsplit(url).path
        if path:
            filename = basename(unquote(path))
    # The base name is empty when there is no path or when the path ends
    # with a slash, use a generic name in these cases.
    if not filename:
        filename = 'attachment.bin'

    attachment = pydyf.Dictionary({
        'Type': '/Filespec',
        'F': pydyf.String(),
        'UF': pydyf.String(filename),
        'EF': pydyf.Dictionary({'F': file_stream.reference}),
        'Desc': pydyf.String(attachment.description or ''),
    })
    pdf.add_object(attachment)
    return attachment
2020-04-19 17:49:37 +03:00
def create_bookmarks(bookmarks, pdf, parent=None):
    """Create the PDF outline dictionaries of a list of bookmarks.

    :param bookmarks:
        An iterable of ``(title, (page, x, y), children, state)`` tuples,
        ``children`` being a list of bookmarks with the same structure.
    :param pdf:
        The PDF object the outlines are added to.
    :param parent:
        The outline dictionary of the parent bookmark, or :obj:`None`.
    :return:
        An ``(outlines, count)`` tuple: the list of outlines created for
        ``bookmarks``, and the number of bookmarks including the descendants
        of the bookmarks that are not closed.

    """
    count = len(bookmarks)
    outlines = []
    for title, (page, x, y), children, state in bookmarks:
        # NOTE(review): "page * 3" presumably skips the 3 items stored for
        # each page reference in the Kids array, confirm against pydyf.
        page_reference = pdf.objects[pdf.pages['Kids'][page * 3]].reference
        outline = pydyf.Dictionary({
            'Title': pydyf.String(title),
            'Dest': pydyf.Array((page_reference, '/XYZ', x, y, 0)),
        })
        pdf.add_object(outline)
        children_outlines, children_count = create_bookmarks(
            children, pdf, parent=outline)

        # Closed bookmarks get a negative count, and their descendants are
        # not included in the count of their ancestors.
        if state == 'closed':
            outline['Count'] = -children_count
        else:
            outline['Count'] = children_count
            count += children_count

        if outlines:
            previous = outlines[-1]
            outline['Prev'] = previous.reference
            previous['Next'] = outline.reference
        if children_outlines:
            first_child, last_child = children_outlines[0], children_outlines[-1]
            outline['First'] = first_child.reference
            outline['Last'] = last_child.reference
        if parent is not None:
            outline['Parent'] = parent.reference
        outlines.append(outline)
    return outlines, count
2020-04-19 17:49:37 +03:00
def add_hyperlinks(links, anchors, matrix, pdf, page, names):
    """Include hyperlinks in current PDF page.

    :param links:
        An iterable of ``(link_type, link_target, rectangle)`` tuples. Only
        ``'internal'`` and ``'external'`` links get an annotation.
    :param anchors:
        An iterable of ``(anchor_name, x, y)`` tuples.
    :param matrix:
        The matrix used to transform the coordinates of links and anchors.
    :param pdf:
        The PDF object the annotations are added to.
    :param page:
        The PDF page dictionary whose ``Annots`` array gets the annotations.
    :param names:
        The list where the name and the destination of each anchor are
        appended.

    """
    for link_type, link_target, rectangle in links:
        x1, y1 = matrix.transform_point(*rectangle[:2])
        x2, y2 = matrix.transform_point(*rectangle[2:])
        if link_type not in ('internal', 'external'):
            continue
        annot = pydyf.Dictionary({
            'Type': '/Annot',
            'Subtype': '/Link',
            'Rect': pydyf.Array([x1, y1, x2, y2]),
            'BS': pydyf.Dictionary({'W': 0}),
        })
        if link_type == 'internal':
            # Internal links use the name of their anchor as destination.
            annot['Dest'] = pydyf.String(link_target)
        else:
            annot['A'] = pydyf.Dictionary({
                'Type': '/Action',
                'S': '/URI',
                'URI': pydyf.String(link_target),
            })
        pdf.add_object(annot)
        if 'Annots' not in page:
            page['Annots'] = pydyf.Array()
        page['Annots'].append(annot.reference)

    for anchor_name, x, y in anchors:
        x, y = matrix.transform_point(x, y)
        destination = pydyf.Array([page.reference, '/XYZ', x, y, 0])
        names.append(pydyf.String(anchor_name))
        names.append(destination)
def rectangle_aabb(matrix, pos_x, pos_y, width, height):
    """Apply a transformation matrix to an axis-aligned rectangle.

    The four corners of the rectangle are transformed with
    ``matrix.transform_point``.

    Return the axis-aligned bounding box of the transformed corners as
    ``(x1, y1, x2, y2)``.

    """
    transform_point = matrix.transform_point
    right, bottom = pos_x + width, pos_y + height
    corners = [
        transform_point(pos_x, pos_y),
        transform_point(right, pos_y),
        transform_point(pos_x, bottom),
        transform_point(right, bottom),
    ]
    xs = [x for x, _ in corners]
    ys = [y for _, y in corners]
    return min(xs), min(ys), max(xs), max(ys)
def resolve_links(pages):
    """Resolve internal hyperlinks.

    Links to a missing anchor are removed and an error is logged.

    If multiple anchors have the same name, the first one is used.

    :param pages:
        The list of pages, each page providing ``links`` and ``anchors``
        attributes like :attr:`Page.links` and :attr:`Page.anchors`.
    :returns:
        A generator yielding one ``(links, anchors)`` tuple per page.
        ``links`` is a list like :attr:`Page.links`, without the internal
        links whose anchor is missing. ``anchors`` is the list of
        ``(anchor_name, x, y)`` tuples of the anchors defined for the first
        time in this page.

    """
    known_anchors = set()
    paged_anchors = []
    for page in pages:
        page_anchors = []
        for anchor_name, (point_x, point_y) in page.anchors.items():
            # Only the first anchor is kept when names are duplicated.
            if anchor_name not in known_anchors:
                known_anchors.add(anchor_name)
                page_anchors.append((anchor_name, point_x, point_y))
        paged_anchors.append(page_anchors)

    for page, page_anchors in zip(pages, paged_anchors):
        page_links = []
        for link in page.links:
            link_type, anchor_name, rectangle = link
            if link_type != 'internal':
                # External link
                page_links.append(link)
            elif anchor_name in known_anchors:
                page_links.append((link_type, anchor_name, rectangle))
            else:
                LOGGER.error(
                    'No anchor #%s for internal URI reference',
                    anchor_name)
        yield page_links, page_anchors
2020-04-19 17:49:37 +03:00
class Matrix(list):
    """Matrix stored as a list of rows.

    Without ``matrix``, the 3×3 matrix of the affine transformation defined
    by ``a``, ``b``, ``c``, ``d``, ``e`` and ``f`` is created, the identity
    by default. With ``matrix``, the given rows are used as is.

    """
    def __init__(self, a=1, b=0, c=0, d=1, e=0, f=0, matrix=None):
        rows = [[a, b, 0], [c, d, 0], [e, f, 1]] if matrix is None else matrix
        super().__init__(rows)

    def __matmul__(self, other):
        """Return the product of this matrix by the 3×3 ``other`` matrix."""
        assert len(self[0]) == len(other) == len(other[0]) == 3
        product = []
        for row in self:
            product.append([
                sum(row[k] * other[k][j] for k in range(3))
                for j in range(3)])
        return Matrix(matrix=product)

    @property
    def determinant(self):
        """Determinant of the 3×3 matrix."""
        assert len(self) == len(self[0]) == 3
        minor_0 = self[1][1] * self[2][2] - self[1][2] * self[2][1]
        minor_1 = self[0][1] * self[2][2] - self[0][2] * self[2][1]
        minor_2 = self[0][1] * self[1][2] - self[0][2] * self[1][1]
        return (
            self[0][0] * minor_0 - self[1][0] * minor_1 + self[2][0] * minor_2)

    def transform_point(self, x, y):
        """Return the transformed ``x, y`` point as a list of 2 values."""
        point = Matrix(matrix=[[x, y, 1]])
        return (point @ self)[0][:2]
class Page:
    """Represents a single rendered page.

    .. versionadded:: 0.15

    Should be obtained from :attr:`Document.pages` but not
    instantiated directly.

    """
    def __init__(self, page_box):
        #: The page width, including margins, in CSS pixels.
        self.width = page_box.margin_width()

        #: The page height, including margins, in CSS pixels.
        self.height = page_box.margin_height()

        #: The page bleed widths as a :obj:`dict` with ``'top'``, ``'right'``,
        #: ``'bottom'`` and ``'left'`` as keys, and values in CSS pixels.
        self.bleed = {
            side: page_box.style[f'bleed_{side}'].value
            for side in ('top', 'right', 'bottom', 'left')}

        #: The :obj:`list` of ``(bookmark_level, bookmark_label, target,
        #: state)`` :obj:`tuples <tuple>`. ``bookmark_level``,
        #: ``bookmark_label`` and ``state`` are based on the CSS properties
        #: ``bookmark-level``, ``bookmark-label`` and ``bookmark-state``.
        #: ``target`` is an ``(x, y)`` point in CSS pixels from the top-left
        #: of the page.
        self.bookmarks = []

        #: The :obj:`list` of ``(link_type, target, rectangle)`` :obj:`tuples
        #: <tuple>`. A ``rectangle`` is ``(x1, y1, x2, y2)``, in CSS
        #: pixels from the top-left of the page. ``link_type`` is one of three
        #: strings:
        #:
        #: * ``'external'``: ``target`` is an absolute URL
        #: * ``'internal'``: ``target`` is an anchor name (see
        #:   :attr:`Page.anchors`).
        #:   The anchor might be defined in another page,
        #:   in multiple pages (in which case the first occurrence is used),
        #:   or not at all.
        #: * ``'attachment'``: ``target`` is an absolute URL and points
        #:   to a resource to attach to the document.
        self.links = []

        #: The :obj:`dict` mapping each anchor name to its target, an
        #: ``(x, y)`` point in CSS pixels from the top-left of the page.
        self.anchors = {}

        self._gather_links_and_bookmarks(page_box)
        self._page_box = page_box

    def _gather_links_and_bookmarks(self, box, parent_matrix=None):
        """Collect the links, bookmarks and anchors of a box tree.

        The links, bookmarks and anchors of ``box`` and of its descendants
        are stored in :attr:`links`, :attr:`bookmarks` and :attr:`anchors`.
        Their coordinates are transformed by the matrices of the transformed
        boxes they are in.

        :param box:
            The box to gather information from. Its
            ``transformation_matrix`` attribute is set if it is transformed.
        :param parent_matrix:
            The matrix of the transformations applied by the ancestors of
            ``box``, or :obj:`None` if there is none.

        """
        # Get box transformation matrix.
        # "Transforms apply to block-level and atomic inline-level elements,
        #  but do not apply to elements which may be split into
        #  multiple inline-level boxes."
        # http://www.w3.org/TR/css3-2d-transforms/#introduction
        if box.style['transform'] and not isinstance(box, boxes.InlineBox):
            border_width = box.border_width()
            border_height = box.border_height()
            origin_x, origin_y = box.style['transform_origin']
            offset_x = percentage(origin_x, border_width)
            offset_y = percentage(origin_y, border_height)
            origin_x = box.border_box_x() + offset_x
            origin_y = box.border_box_y() + offset_y

            # Start from the translation to the transform origin, apply each
            # transform function, then translate back.
            matrix = Matrix(e=origin_x, f=origin_y)
            for name, args in box.style['transform']:
                a, b, c, d, e, f = 1, 0, 0, 1, 0, 0
                if name == 'scale':
                    a, d = args
                elif name == 'rotate':
                    a = d = math.cos(args)
                    b = math.sin(args)
                    c = -b
                elif name == 'translate':
                    e = percentage(args[0], border_width)
                    f = percentage(args[1], border_height)
                elif name == 'skew':
                    b, c = math.tan(args[1]), math.tan(args[0])
                else:
                    assert name == 'matrix'
                    a, b, c, d, e, f = args
                matrix = Matrix(a, b, c, d, e, f) @ matrix
            box.transformation_matrix = (
                Matrix(e=-origin_x, f=-origin_y) @ matrix)
            # Combine the matrix of the box with the one of its ancestors.
            if parent_matrix:
                matrix = box.transformation_matrix @ parent_matrix
            else:
                matrix = box.transformation_matrix
        else:
            matrix = parent_matrix

        bookmark_label = box.bookmark_label
        if box.style['bookmark_level'] == 'none':
            bookmark_level = None
        else:
            bookmark_level = box.style['bookmark_level']
        state = box.style['bookmark_state']
        link = box.style['link']
        anchor_name = box.style['anchor']
        has_bookmark = bookmark_label and bookmark_level
        # 'link' is inherited but redundant on text boxes
        has_link = link and not isinstance(box, (boxes.TextBox, boxes.LineBox))
        # In case of duplicate IDs, only the first is an anchor.
        has_anchor = anchor_name and anchor_name not in self.anchors
        is_attachment = hasattr(box, 'is_attachment') and box.is_attachment

        if has_bookmark or has_link or has_anchor:
            pos_x, pos_y, width, height = box.hit_area()
            if has_link:
                token_type, link = link
                assert token_type == 'url'
                link_type, target = link
                assert isinstance(target, str)
                if link_type == 'external' and is_attachment:
                    link_type = 'attachment'
                # The rectangle of the link is stored as (x1, y1, x2, y2).
                if matrix:
                    link = (link_type, target, rectangle_aabb(
                        matrix, pos_x, pos_y, width, height))
                else:
                    link = (link_type, target, (
                        pos_x, pos_y, pos_x + width, pos_y + height))
                self.links.append(link)
            if matrix and (has_bookmark or has_anchor):
                pos_x, pos_y = matrix.transform_point(pos_x, pos_y)
            if has_bookmark:
                self.bookmarks.append(
                    (bookmark_level, bookmark_label, (pos_x, pos_y), state))
            if has_anchor:
                self.anchors[anchor_name] = pos_x, pos_y

        for child in box.all_children():
            self._gather_links_and_bookmarks(child, matrix)

    def paint(self, context, left_x=0, top_y=0, scale=1, clip=False):
        """Paint the page into the PDF file.

        :type context: :class:`pdf.Context`
        :param context:
            A context object.
        :type left_x: float
        :param left_x:
            X coordinate of the left of the page, in PDF points.
        :type top_y: float
        :param top_y:
            Y coordinate of the top of the page, in PDF points.
        :type scale: float
        :param scale:
            Zoom scale.
        :type clip: bool
        :param clip:
            Whether to clip/cut content outside the page. If false or
            not provided, content can overflow.

        """
        with stacked(context):
            # Make (0, 0) the top-left corner, and make user units CSS pixels:
            context.transform(scale, 0, 0, scale, left_x, top_y)
            if clip:
                width = self.width
                height = self.height
                context.rectangle(0, 0, width, height)
                context.clip()
            draw_page(self._page_box, context)
2012-10-02 20:59:02 +04:00
class DocumentMetadata:
    """Meta-information belonging to a whole :class:`Document`.

    .. versionadded:: 0.20

    New attributes may be added in future versions of WeasyPrint.

    """
    def __init__(self, title=None, authors=None, description=None,
                 keywords=None, generator=None, created=None, modified=None,
                 attachments=None):
        #: Title of the document, a string or :obj:`None`.
        #: Taken from the ``<title>`` element in HTML,
        #: stored in the ``/Title`` info field in PDF.
        self.title = title

        #: Authors of the document, a list of strings
        #: (an empty list by default).
        #: Taken from the ``<meta name=author>`` elements in HTML,
        #: stored in the ``/Author`` info field in PDF.
        self.authors = authors if authors else []

        #: Description of the document, a string or :obj:`None`.
        #: Taken from the ``<meta name=description>`` element in HTML,
        #: stored in the ``/Subject`` info field in PDF.
        self.description = description

        #: Keywords associated with the document, a list of strings
        #: (an empty list by default).
        #: Taken from the ``<meta name=keywords>`` elements in HTML,
        #: stored in the ``/Keywords`` info field in PDF.
        self.keywords = keywords if keywords else []

        #: Name of one of the software packages used to generate the
        #: document, a string or :obj:`None`.
        #: Taken from the ``<meta name=generator>`` element in HTML,
        #: stored in the ``/Creator`` info field in PDF.
        self.generator = generator

        #: Creation date of the document, a string or :obj:`None`.
        #: Dates follow one of the six formats specified in
        #: `W3C's profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Taken from the ``<meta name=dcterms.created>`` element in HTML,
        #: stored in the ``/CreationDate`` info field in PDF.
        self.created = created

        #: Modification date of the document, a string or :obj:`None`.
        #: Dates follow one of the six formats specified in
        #: `W3C's profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Taken from the ``<meta name=dcterms.modified>`` element in HTML,
        #: stored in the ``/ModDate`` info field in PDF.
        self.modified = modified

        #: File attachments, a list of tuples of URL and a description or
        #: :obj:`None` (an empty list by default).
        #: Taken from the ``<link rel=attachment>`` elements in HTML,
        #: stored in the ``/EmbeddedFiles`` dictionary in PDF.
        #:
        #: .. versionadded:: 0.22
        self.attachments = attachments if attachments else []
class Document:
"""A rendered document ready to be painted on a cairo surface.
2012-10-04 13:35:25 +04:00
Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`, but
can also be instantiated directly with a list of :class:`pages <Page>`, a
set of :class:`metadata <DocumentMetadata>`, a :func:`url_fetcher
<weasyprint.default_url_fetcher>` function, and a :class:`font_config
<weasyprint.fonts.FontConfiguration>`.
2012-10-04 13:35:25 +04:00
"""
2012-10-02 20:59:02 +04:00
@classmethod
def _build_layout_context(cls, html, stylesheets,
                          presentational_hints=False,
                          optimize_images=False, font_config=None,
                          counter_style=None, image_cache=None):
    """Compute the styles of ``html`` and return a layout context.

    ``stylesheets`` items that have no ``matcher`` attribute are given as
    ``guess`` to :class:`CSS`. Missing ``font_config``, ``counter_style``
    and ``image_cache`` are replaced by new empty objects.

    """
    font_config = FontConfiguration() if font_config is None else font_config
    if counter_style is None:
        counter_style = CounterStyle()
    target_collector = TargetCollector()
    page_rules = []
    if image_cache is None:
        image_cache = {}

    user_stylesheets = [
        css if hasattr(css, 'matcher') else CSS(
            guess=css, media_type=html.media_type,
            font_config=font_config, counter_style=counter_style)
        for css in stylesheets or []]
    style_for = get_all_computed_styles(
        html, user_stylesheets, presentational_hints, font_config,
        counter_style, page_rules, target_collector)
    # Images are fetched through the given cache and the fetcher of the
    # document.
    get_image_from_uri = functools.partial(
        original_get_image_from_uri, image_cache, html.url_fetcher,
        optimize_images)
    PROGRESS_LOGGER.info('Step 4 - Creating formatting structure')
    return LayoutContext(
        style_for, get_image_from_uri, font_config, counter_style,
        target_collector)
@classmethod
def _render(cls, html, stylesheets, presentational_hints=False,
            optimize_images=False, font_config=None, counter_style=None,
            image_cache=None):
    """Lay out ``html`` and return the rendered document.

    The returned object is an instance of ``cls`` created with one
    :class:`Page` per page box, the metadata of ``html``, its URL fetcher
    and the font configuration.

    """
    font_config = FontConfiguration() if font_config is None else font_config
    counter_style = CounterStyle() if counter_style is None else counter_style

    context = cls._build_layout_context(
        html, stylesheets, presentational_hints, optimize_images,
        font_config, counter_style, image_cache)
    root_box = build_formatting_structure(
        html.etree_element, context.style_for, context.get_image_from_uri,
        html.base_url, context.target_collector, counter_style)
    pages = [
        Page(page_box)
        for page_box in layout_document(html, root_box, context)]
    metadata = DocumentMetadata(**get_html_metadata(html))
    return cls(pages, metadata, html.url_fetcher, font_config)
2012-10-02 20:59:02 +04:00
def _use_references(self, pdf, resources):
# XObjects
for key, x_object in resources.get('XObject', {}).items():
pdf.add_object(x_object)
resources['XObject'][key] = x_object.reference
if 'Resources' in x_object.extra:
self._use_references(pdf, x_object.extra['Resources'])
pdf.add_object(x_object.extra['Resources'])
x_object.extra['Resources'] = (
x_object.extra['Resources'].reference)
# Patterns
for key, pattern in resources.get('Pattern', {}).items():
pdf.add_object(pattern)
resources['Pattern'][key] = pattern.reference
if 'Resources' in pattern.extra:
self._use_references(pdf, pattern.extra['Resources'])
pdf.add_object(pattern.extra['Resources'])
pattern.extra['Resources'] = (
pattern.extra['Resources'].reference)
# Shadings
for key, shading in resources.get('Shading', {}).items():
pdf.add_object(shading)
2020-06-08 17:34:28 +03:00
resources['Shading'][key] = shading.reference
def __init__(self, pages, metadata, url_fetcher, font_config):
2012-10-02 20:59:02 +04:00
#: A list of :class:`Page` objects.
self.pages = pages
#: A :class:`DocumentMetadata` object.
#: Contains information that does not belong to a specific page
#: but to the whole document.
self.metadata = metadata
#: A function or other callable with the same signature as
#: :func:`default_url_fetcher` called to fetch external resources such
#: as stylesheets and images. (See :ref:`url-fetchers`.)
self.url_fetcher = url_fetcher
2020-05-31 02:20:38 +03:00
#: A :obj:`dict` of fonts used by the document. Keys are hashes used to
#: identify fonts, values are :class:`Font` objects.
self.fonts = {}
# Keep a reference to font_config to avoid its garbage collection until
# rendering is destroyed. This is needed as font_config.__del__ removes
# fonts that may be used when rendering
self._font_config = font_config
2012-10-02 20:59:02 +04:00
def copy(self, pages='all'):
    """Take a subset of the pages.

    .. versionadded:: 0.15

    :type pages: :term:`iterable`
    :param pages:
        An iterable of :class:`Page` objects from :attr:`pages`, or the
        string ``'all'`` (the default) to keep every page.
    :return:
        A new :class:`Document` object sharing this document's metadata,
        URL fetcher and font configuration.

    Examples:

    Write two PDF files for odd-numbered and even-numbered pages::

        # Python lists count from 0 but pages are numbered from 1.
        # [::2] is a slice of even list indexes but odd-numbered pages.
        document.copy(document.pages[::2]).write_pdf('odd_pages.pdf')
        document.copy(document.pages[1::2]).write_pdf('even_pages.pdf')

    Write each page to a numbered PNG file::

        for i, page in enumerate(document.pages):
            # ``pages`` must be an iterable, hence the one-item list.
            document.copy([page]).write_png(f'page_{i}.png')

    Combine multiple documents into one PDF file,
    using metadata from the first::

        all_pages = [p for doc in documents for p in doc.pages]
        documents[0].copy(all_pages).write_pdf('combined.pdf')

    """
    if pages == 'all':
        pages = self.pages
    elif not isinstance(pages, list):
        # Materialize any other iterable (generator, tuple, ...) as a list
        pages = list(pages)
    return type(self)(
        pages, self.metadata, self.url_fetcher, self._font_config)
2012-10-02 20:59:02 +04:00
def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None):
    """Paint the pages in a PDF file, with metadata.

    The PDF is built with ``pydyf`` and includes hyperlinks and anchors,
    bookmarks/outlines, document information, attachments and the fonts
    used by the document.

    :type target: str, pathlib.Path or file object
    :param target:
        A filename where the PDF file is generated, a file object, or
        :obj:`None`.
    :type zoom: float
    :param zoom:
        The zoom factor in PDF units per CSS units.  **Warning**:
        All CSS units are affected, including physical units like
        ``cm`` and named sizes like ``A4``.  For values other than
        1, the physical CSS units will thus be "wrong".
    :type attachments: list
    :param attachments: A list of additional file attachments for the
        generated PDF document or :obj:`None`. The list's elements are
        :class:`Attachment` objects, filenames, URLs or file-like objects.
    :param finisher: A finisher function, that accepts the document and a
        ``pydyf.PDF`` object as parameters, can be passed to perform
        post-processing on the PDF right before the trailer is written.
    :returns:
        The PDF as :obj:`bytes` if ``target`` is not provided or
        :obj:`None`, otherwise :obj:`None` (the PDF is written to
        ``target``).

    """
    # 0.75 = 72 PDF point per inch / 96 CSS pixel per inch
    scale = zoom * 0.75

    PROGRESS_LOGGER.info('Step 6 - Creating PDF')

    pdf = pydyf.PDF()
    alpha_states = pydyf.Dictionary()
    x_objects = pydyf.Dictionary()
    patterns = pydyf.Dictionary()
    shadings = pydyf.Dictionary()
    # A single resource dictionary is shared by all the pages
    resources = pydyf.Dictionary({
        'ExtGState': alpha_states,
        'XObject': x_objects,
        'Pattern': patterns,
        'Shading': shadings,
    })
    pdf.add_object(resources)
    pdf_names = pydyf.Array()

    # Links and anchors
    page_links_and_anchors = list(resolve_links(self.pages))
    attachment_links = [
        [link for link in page_links if link[0] == 'attachment']
        for page_links, page_anchors in page_links_and_anchors]

    # Annotations
    annot_files = {}
    # A single link can be split in multiple regions. We don't want to
    # embed a file multiple times of course, so keep a reference to every
    # embedded URL and reuse the object number.
    for page_links in attachment_links:
        for link_type, annot_target, rectangle in page_links:
            # The lookup key is the link's own target, not the ``target``
            # output parameter of this method.
            if link_type == 'attachment' and annot_target not in annot_files:
                # TODO: Use the title attribute as description. The comment
                # above about multiple regions won't always be correct,
                # because two links might have the same href, but different
                # titles.
                annot_files[annot_target] = _write_pdf_attachment(
                    pdf, (annot_target, None), self.url_fetcher)

    # Bookmarks
    root = []
    # At one point in the document, for each "output" depth, how much
    # to add to get the source level (CSS values of bookmark-level).
    # E.g. with <h1> then <h3>, skipped_levels == [0, 1]
    # 1 means that <h3> has depth 3 - 1 = 2 in the output.
    skipped_levels = []
    last_by_depth = [root]
    previous_level = 0

    for page_number, (page, links_and_anchors, page_links) in enumerate(
            zip(self.pages, page_links_and_anchors, attachment_links)):
        # Draw from the top-left corner
        matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)

        # Links and anchors
        links, anchors = links_and_anchors

        page_width = scale * (
            page.width + page.bleed['left'] + page.bleed['right'])
        page_height = scale * (
            page.height + page.bleed['top'] + page.bleed['bottom'])
        left = -scale * page.bleed['left']
        top = -scale * page.bleed['top']
        right = left + page_width
        bottom = top + page_height

        page_rectangle = (
            left / scale, top / scale, right / scale, bottom / scale)
        stream = Context(
            self, page_rectangle, alpha_states, x_objects, patterns,
            shadings)
        stream.transform(1, 0, 0, -1, 0, page.height * scale)
        page.paint(stream, scale=scale)
        pdf.add_object(stream)

        pdf_page = pydyf.Dictionary({
            'Type': '/Page',
            'Parent': pdf.pages.reference,
            'MediaBox': pydyf.Array([left, top, right, bottom]),
            'Contents': stream.reference,
            'Resources': resources.reference,
        })
        pdf.add_page(pdf_page)

        add_hyperlinks(links, anchors, matrix, pdf, pdf_page, pdf_names)

        # Bleed
        # Use the same scale as the MediaBox above, so that the TrimBox
        # matches the CSS page box whatever the zoom factor is.
        bleed = {key: value * scale for key, value in page.bleed.items()}

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']

        # Arbitrarily set PDF BleedBox between CSS bleed box (MediaBox) and
        # CSS page box (TrimBox) at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])

        pdf_page['TrimBox'] = pydyf.Array([
            trim_left, trim_top, trim_right, trim_bottom])
        pdf_page['BleedBox'] = pydyf.Array([
            bleed_left, bleed_top, bleed_right, bleed_bottom])

        # Annotations
        # TODO: splitting a link into multiple independent rectangular
        # annotations works well for pure links, but rather mediocre for
        # other annotations and fails completely for transformed (CSS) or
        # complex link shapes (area). It would be better to use /AP for all
        # links and coalesce link shapes that originate from the same HTML
        # link. This would give a feeling similar to what browsers do with
        # links that span multiple lines.
        for link_type, annot_target, rectangle in page_links:
            annot_file = annot_files[annot_target]
            if link_type == 'attachment' and annot_file is not None:
                rectangle = (
                    *matrix.transform_point(*rectangle[:2]),
                    *matrix.transform_point(*rectangle[2:]))
                annot = pydyf.Dictionary({
                    'Type': '/Annot',
                    'Rect': pydyf.Array(rectangle),
                    'Subtype': '/FileAttachment',
                    'T': pydyf.String(),
                    'FS': annot_file.reference,
                    'AP': pydyf.Dictionary({'N': pydyf.Stream([], {
                        'Type': '/XObject',
                        'Subtype': '/Form',
                        'BBox': pydyf.Array(rectangle),
                        'Length': 0,
                    })})
                })
                pdf.add_object(annot)
                if 'Annots' not in pdf_page:
                    pdf_page['Annots'] = pydyf.Array()
                pdf_page['Annots'].append(annot.reference)

        # Bookmarks
        for level, label, (point_x, point_y), state in page.bookmarks:
            if level > previous_level:
                # Example: if the previous bookmark is a <h2>, the next
                # depth "should" be for <h3>. If now we get a <h6> we're
                # skipping two levels: append 6 - 3 - 1 = 2
                skipped_levels.append(level - previous_level - 1)
            else:
                temp = level
                while temp < previous_level:
                    temp += 1 + skipped_levels.pop()
                if temp > previous_level:
                    # We remove too many "skips", add some back:
                    skipped_levels.append(temp - previous_level - 1)

            previous_level = level
            depth = level - sum(skipped_levels)
            assert depth == len(skipped_levels)
            assert depth >= 1

            children = []
            point_x, point_y = matrix.transform_point(point_x, point_y)
            subtree = BookmarkSubtree(
                label, (page_number, point_x, point_y), children, state)
            last_by_depth[depth - 1].append(subtree)
            del last_by_depth[depth:]
            last_by_depth.append(children)

    # Outlines
    outlines, count = create_bookmarks(root, pdf)
    if outlines:
        outlines_dictionary = pydyf.Dictionary({
            'Count': count,
            'First': outlines[0].reference,
            'Last': outlines[-1].reference,
        })
        pdf.add_object(outlines_dictionary)
        for outline in outlines:
            outline['Parent'] = outlines_dictionary.reference
        pdf.catalog['Outlines'] = outlines_dictionary.reference

    PROGRESS_LOGGER.info('Step 7 - Adding PDF metadata')

    # PDF information
    if self.metadata.title:
        pdf.info['Title'] = pydyf.String(self.metadata.title)
    if self.metadata.authors:
        pdf.info['Author'] = pydyf.String(
            ', '.join(self.metadata.authors))
    if self.metadata.description:
        pdf.info['Subject'] = pydyf.String(self.metadata.description)
    if self.metadata.keywords:
        pdf.info['Keywords'] = pydyf.String(
            ', '.join(self.metadata.keywords))
    if self.metadata.generator:
        pdf.info['Creator'] = pydyf.String(self.metadata.generator)
    pdf.info['Producer'] = pydyf.String(f'WeasyPrint {__version__}')
    if self.metadata.created:
        pdf.info['CreationDate'] = pydyf.String(
            _w3c_date_to_pdf(self.metadata.created, 'created'))
    if self.metadata.modified:
        pdf.info['ModDate'] = pydyf.String(
            _w3c_date_to_pdf(self.metadata.modified, 'modified'))

    # Embedded files
    attachments = self.metadata.attachments + (attachments or [])
    pdf_attachments = []
    for attachment in attachments:
        pdf_attachment = _write_pdf_attachment(
            pdf, attachment, self.url_fetcher)
        if pdf_attachment is not None:
            pdf_attachments.append(pdf_attachment)
    if pdf_attachments:
        content = pydyf.Dictionary({'Names': pydyf.Array()})
        for i, pdf_attachment in enumerate(pdf_attachments):
            content['Names'].append(pydyf.String(f'attachment{i}'))
            content['Names'].append(pdf_attachment.reference)
        pdf.add_object(content)
        if 'Names' not in pdf.catalog:
            pdf.catalog['Names'] = pydyf.Dictionary()
        pdf.catalog['Names']['EmbeddedFiles'] = content.reference

    # Embedded fonts
    resources['Font'] = pydyf.Dictionary()
    for font in self.fonts.values():
        # Optimize font: keep only the glyphs listed in font.cmap, without
        # renumbering them (retain_gids)
        try:
            full_font = io.BytesIO(font.file_content)
            optimized_font = io.BytesIO()
            ttfont = TTFont(full_font)
            options = subset.Options(
                retain_gids=True, passthrough_tables=True)
            subsetter = subset.Subsetter(options)
            subsetter.populate(gids=font.cmap)
            subsetter.subset(ttfont)
            ttfont.save(optimized_font)
            content = optimized_font.getvalue()
        except TTLibError:
            # Fall back to embedding the original font file as is
            content = font.file_content

        # Include font
        font_type = 'otf' if content[:4] == b'OTTO' else 'ttf'
        if font_type == 'otf':
            font_extra = pydyf.Dictionary({'Subtype': '/OpenType'})
        else:
            font_extra = pydyf.Dictionary({'Length1': len(content)})
        font_stream = pydyf.Stream([content], font_extra, compress=True)
        pdf.add_object(font_stream)

        # Build the /W array: each run of consecutive glyph ids is stored
        # as its first id followed by the array of the run's widths
        widths = pydyf.Array()
        for i in sorted(font.widths):
            if i - 1 not in font.widths:
                widths.append(i)
                current_widths = pydyf.Array()
                widths.append(current_widths)
            current_widths.append(font.widths[i])

        subfont_dictionary = pydyf.Dictionary({
            'Type': '/Font',
            'Subtype': f'/CIDFontType{"0" if font_type == "otf" else "2"}',
            'BaseFont': font.name,
            'CIDSystemInfo': pydyf.Dictionary({
                'Registry': pydyf.String('Adobe'),
                'Ordering': pydyf.String('Identity'),
                'Supplement': 0,
            }),
            'W': widths,
            'FontDescriptor': pydyf.Dictionary({
                'Type': '/FontDescriptor',
                'FontName': font.name,
                'FontFamily': pydyf.String(font.family),
                'Flags': 32,
                'FontBBox': pydyf.Array(font.bbox),
                'ItalicAngle': font.italic_angle,
                'Ascent': font.ascent,
                'Descent': font.descent,
                'CapHeight': font.bbox[3],
                'StemV': font.stemv,
                'StemH': font.stemh,
                (f'FontFile{"3" if font_type == "otf" else "2"}'):
                    font_stream.reference,
            }),
        })
        if font_type == 'otf':
            subfont_dictionary['FontDescriptor']['Subtype'] = '/OpenType'
        pdf.add_object(subfont_dictionary)

        # ToUnicode CMap, mapping each glyph id to its text
        to_unicode = pydyf.Stream([
            b'/CIDInit /ProcSet findresource begin',
            b'12 dict begin',
            b'begincmap',
            b'/CIDSystemInfo',
            b'<< /Registry (Adobe)',
            b'/Ordering (UCS)',
            b'/Supplement 0',
            b'>> def',
            b'/CMapName /Adobe-Identity-UCS def',
            b'/CMapType 2 def',
            b'1 begincodespacerange',
            b'<0000> <ffff>',
            b'endcodespacerange',
            f'{len(font.cmap)} beginbfchar'.encode('ascii')])
        for glyph, text in font.cmap.items():
            unicode_codepoints = ''.join(
                f'{letter.encode("utf-16-be").hex()}' for letter in text)
            to_unicode.stream.append(
                f'<{glyph:04x}> <{unicode_codepoints}>'.encode('ascii'))
        to_unicode.stream.extend([
            b'endbfchar',
            b'endcmap',
            b'CMapName currentdict /CMap defineresource pop',
            b'end',
            b'end'])
        pdf.add_object(to_unicode)

        font_dictionary = pydyf.Dictionary({
            'Type': '/Font',
            'Subtype': '/Type0',
            'BaseFont': font.name,
            'Encoding': '/Identity-H',
            'DescendantFonts': pydyf.Array([subfont_dictionary.reference]),
            'ToUnicode': to_unicode.reference,
        })
        pdf.add_object(font_dictionary)
        resources['Font'][font.hash] = font_dictionary.reference

    self._use_references(pdf, resources)

    # Anchors
    if pdf_names:
        # Extend the existing name dictionary instead of replacing it, so
        # that the EmbeddedFiles entry set above is kept.
        if 'Names' not in pdf.catalog:
            pdf.catalog['Names'] = pydyf.Dictionary()
        pdf.catalog['Names']['Dests'] = pydyf.Dictionary(
            {'Names': pdf_names})

    if finisher:
        finisher(self, pdf)

    file_obj = io.BytesIO()
    pdf.write(file_obj)

    if target is None:
        return file_obj.getvalue()
    else:
        file_obj.seek(0)
        if hasattr(target, 'write'):
            shutil.copyfileobj(file_obj, target)
        else:
            with open(target, 'wb') as fd:
                shutil.copyfileobj(file_obj, fd)
2020-05-18 02:29:37 +03:00
def write_png(self, target=None, resolution=96, antialiasing=1):
    """Paint the pages vertically to a single PNG image.

    There is no decoration around pages other than those specified in CSS
    with ``@page`` rules. The final image is as wide as the widest page.
    Each page is below the previous one, centered horizontally.

    The document is first written as PDF, then rasterized by running the
    ``gs`` (Ghostscript) executable.

    :param target:
        A filename, file-like object, or :obj:`None`.
    :type resolution: float
    :param resolution:
        The output resolution in PNG pixels per CSS inch. At 96 dpi
        (the default), PNG pixels match the CSS ``px`` unit.
    :type antialiasing: int
    :param antialiasing:
        The antialiasing subsampling box size. Default is 1 (disabled), can
        be set to 4 for optimal (but slow) antialiasing.
    :returns:
        The image as :obj:`bytes` if ``target`` is not provided or
        :obj:`None`, otherwise :obj:`None` (the image is written to
        ``target``).

    """
    # TODO: don't crash if Ghostscript can't be found
    # TODO: fix that for Windows
    arguments = [
        'gs', '-q', '-sstdout=%stderr', '-dNOPAUSE', '-dSAFER',
        f'-dTextAlphaBits={antialiasing}',
        f'-dGraphicsAlphaBits={antialiasing}', '-sDEVICE=png16m',
        f'-r{resolution}', '-sOutputFile=-', '-']
    process = run(arguments, input=self.write_pdf(), capture_output=True)
    pngs = process.stdout
    magic_number = b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a'

    # TODO: use a different way to find PNG files in stream
    if pngs.count(magic_number) == 1:
        # Only one image in the output: use it as is
        png_bytes = pngs
    else:
        # The output is a concatenation of PNG files: split it on the PNG
        # signature and stack the images vertically
        images = [
            Image.open(io.BytesIO(magic_number + chunk))
            for chunk in pngs[8:].split(magic_number)]

        width = max(image.width for image in images)
        height = sum(image.height for image in images)
        output_image = Image.new('RGBA', (width, height))
        top = 0
        for image in images:
            # Center each page horizontally
            output_image.paste(image, ((width - image.width) // 2, top))
            top += image.height
        buffer = io.BytesIO()
        output_image.save(buffer, format='png')
        png_bytes = buffer.getvalue()

    if target is None:
        return png_bytes

    if hasattr(target, 'write'):
        target.write(png_bytes)
    else:
        with open(target, 'wb') as fd:
            fd.write(png_bytes)