mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-10-05 08:27:22 +03:00

365 lines
14 KiB
Raw Normal View History

"""
Post-process the PDF files created by cairo and add metadata such as
hyperlinks and bookmarks.

:copyright: Copyright 2011-2014 Simon Sapin and contributors, see AUTHORS.
:license: BSD, see LICENSE for details.

"""
import hashlib
2014-04-04 14:32:21 +04:00
import io
import mimetypes
import zlib
2018-01-14 03:48:17 +03:00
from urllib.parse import unquote
2012-05-19 16:50:38 +04:00
2012-12-29 04:00:30 +04:00
import cairocffi as cairo
from pdfrw import PdfArray, PdfDict, PdfName, PdfReader, PdfString, PdfWriter
from . import VERSION_STRING, Attachment
from .html import W3C_DATE_RE
from .logger import LOGGER
2017-03-25 02:33:36 +03:00
from .urls import URLFetchingError, iri_to_uri, urlsplit
def convert_bookmarks_units(bookmarks, matrices):
    """Apply each page's transformation matrix to a bookmark tree.

    :param bookmarks:
        A list of ``(label, (page, x, y), children)`` tuples, where
        ``children`` is a nested list of the same shape.
    :param matrices:
        A sequence indexed by ``page`` whose items provide
        ``transform_point(x, y)`` returning the converted ``(x, y)``.
    :return:
        A new list with the same shape as ``bookmarks`` in which every
        target point has been transformed. The input is not modified.

    """
    converted_bookmarks = []
    for label, (page, x, y), children in bookmarks:
        x, y = matrices[page].transform_point(x, y)
        converted_bookmarks.append(
            (label, (page, x, y), convert_bookmarks_units(children, matrices)))
    return converted_bookmarks
def prepare_metadata(document, scale, pages):
    """Change metadata into data structures closer to the PDF objects.

    In particular, convert from WeasyPrint units (CSS pixels from
    the top-left corner) to PDF units (points from the bottom-left corner.)

    :param document:
        Object providing ``pages`` (each with a ``height``),
        ``resolve_links()`` and ``make_bookmark_tree()``.
    :param scale:
        PDF points per CSS pixels.
        Defaults to 0.75, but is affected by the `zoom` chosen by the caller.
    :param pages:
        The PDF page objects, indexed by page number; the ``indirect``
        attribute of a page is used as the target of internal links.
    :return:
        A ``(bookmarks, links)`` tuple. ``links`` holds one list per page of
        ``(link_type, target, (x0, y0, x1, y1))`` tuples.

    """
    # X and width unchanged;  Y = page_height - Y;  height = -height
    matrices = [
        cairo.Matrix(xx=scale, yy=-scale, y0=page.height * scale)
        for page in document.pages]

    links = []
    for page_links, matrix in zip(document.resolve_links(), matrices):
        new_page_links = []
        for link_type, target, rectangle in page_links:
            if link_type == 'internal':
                # The target point lives on another page, so it must be
                # converted with that page's matrix, not the current one.
                target_page, target_x, target_y = target
                target = (
                    (pages[target_page].indirect,) +
                    matrices[target_page].transform_point(target_x, target_y))
            rect_x, rect_y, width, height = rectangle
            rect_x, rect_y = matrix.transform_point(rect_x, rect_y)
            width, height = matrix.transform_distance(width, height)
            # x, y, w, h => x0, y0, x1, y1
            rectangle = rect_x, rect_y, rect_x + width, rect_y + height
            new_page_links.append((link_type, target, rectangle))
        # Keep one entry per page, even when empty, so that callers can zip
        # the result with the list of pages.
        links.append(new_page_links)

    bookmarks = convert_bookmarks_units(
        document.make_bookmark_tree(), matrices)
    return bookmarks, links
def _create_compressed_file_object(source):
    """
    Create a file like object as ``/EmbeddedFile`` compressing it with deflate.

    :param source:
        A binary file-like object. It is read in chunks of 4096 bytes until
        ``read`` returns ``b''``.
    :return:
        the object representing the compressed file stream object
    """
    md5 = hashlib.md5()
    compress = zlib.compressobj()

    pdf_file_object = PdfDict(
        Type=PdfName('EmbeddedFile'), Filter=PdfName('FlateDecode'))

    size = 0
    compressed_chunks = []
    for data in iter(lambda: source.read(4096), b''):
        size += len(data)
        # The checksum and the size describe the uncompressed data.
        md5.update(data)
        compressed_chunks.append(compress.compress(data))
    compressed_chunks.append(compress.flush(zlib.Z_FINISH))

    # pdfrw needs Latin-1-decoded unicode strings in object.stream
    pdf_file_object.stream = b''.join(compressed_chunks).decode('latin-1')
    pdf_file_object.Params = PdfDict(
        CheckSum=PdfString('<{}>'.format(md5.hexdigest())), Size=size)
    return pdf_file_object
def _get_filename_from_result(url, result):
    """
    Derive a filename from a fetched resource. This is either the filename
    returned by the URL fetcher, the last URL path component or a synthetic
    name if the URL has no path.

    :param url:
        The URL of the resource, or a false value when there is none.
    :param result:
        A mapping read for its optional ``filename`` and ``mime_type`` keys,
        or a false value when there is none.
    :return:
        The filename, as a string.
    """
    # A given filename will always take precedence
    if result:
        filename = result.get('filename')
        if filename:
            return filename

    # The URL path likely contains a filename, which is a good second guess
    if url:
        split = urlsplit(url)
        if split.scheme != 'data':
            filename = split.path.split('/')[-1]
            if filename:
                return unquote(filename)

    # The URL lacks a path altogether. Use a synthetic name.

    # Using guess_extension is a great idea, but sadly the extension is
    # probably random, depending on the alignment of the stars, which car
    # you're driving and which software has been installed on your machine.
    #
    # Unfortunately this isn't even idempotent on one machine, because the
    # extension can depend on PYTHONHASHSEED if mimetypes has multiple
    # extensions to offer
    mime_type = result.get('mime_type') if result else None
    if mime_type == 'text/plain':
        # text/plain has a plethora of extensions - all garbage
        extension = '.txt'
    elif mime_type:
        extension = mimetypes.guess_extension(mime_type) or '.bin'
    else:
        # No result, or a result without a MIME type: guess_extension()
        # cannot be called with None.
        extension = '.bin'
    return 'attachment' + extension
def _create_pdf_attachment(attachment, url_fetcher):
    """
    Create an attachment to the PDF stream

    :param attachment:
        Either an ``Attachment``, a ``(url, description)`` tuple, or any
        other value, which is passed to ``Attachment`` as ``guess``.
    :param url_fetcher:
        Forwarded to ``Attachment`` when one has to be created.
    :return:
        the object representing the ``/Filespec`` object or :obj:`None` if the
        attachment couldn't be read.
    """
    try:
        # Attachments from document links like <link> or <a> can only be URLs.
        # They're passed in as tuples
        if isinstance(attachment, tuple):
            url, description = attachment
            attachment = Attachment(
                url=url, url_fetcher=url_fetcher, description=description)
        elif not isinstance(attachment, Attachment):
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (source_type, source, url, _):
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            pdf_file_object = _create_compressed_file_object(source)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    return PdfDict(
        Type=PdfName('Filespec'), F=PdfString.encode(''),
        UF=PdfString.encode(_get_filename_from_result(url, None)),
        # Without /EF the file specification does not reference the embedded
        # stream and the compressed data built above would be dropped.
        EF=PdfDict(F=pdf_file_object),
        Desc=PdfString.encode(attachment.description or ''))
def create_bookmarks(bookmarks, pages, parent=None):
    """Build the PDF outline objects for a bookmark tree.

    :param bookmarks:
        A list of ``(label, target, children)`` tuples where ``target`` is
        ``(page, x, y)`` and ``children`` is a nested list of the same shape.
    :param pages:
        The PDF page objects, indexed by ``page``.
    :param parent:
        The outline object the created items belong to, or :obj:`None` for
        top-level items.
    :return:
        A ``(bookmark_objects, count)`` tuple: the objects created for this
        level, and the number of items at this level and below.

    """
    count = len(bookmarks)
    bookmark_objects = []
    for label, target, children in bookmarks:
        # An /XYZ destination starts with the target page. The page
        # reference is built the same way as for internal links.
        # NOTE(review): the page entry was missing from this copy of the
        # file; confirm against upstream.
        destination = (
            pages[target[0]].indirect,
            PdfName('XYZ'), target[1], target[2], 0)
        bookmark_object = PdfDict(
            Title=PdfString.encode(label), A=PdfDict(
                Type=PdfName('Action'), S=PdfName('GoTo'),
                D=PdfArray(destination)))
        bookmark_object.indirect = True
        children_objects, children_count = create_bookmarks(
            children, pages, parent=bookmark_object)
        bookmark_object.Count = 1 + children_count
        if bookmark_objects:
            # Chain siblings together in both directions.
            bookmark_object.Prev = bookmark_objects[-1]
            bookmark_objects[-1].Next = bookmark_object
        if children_objects:
            bookmark_object.First = children_objects[0]
            bookmark_object.Last = children_objects[-1]
        if parent is not None:
            bookmark_object.Parent = parent
        count += children_count
        bookmark_objects.append(bookmark_object)
    return bookmark_objects, count
def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """Add PDF metadata to the PDF held in a seekable file-like object.

    The PDF is read from ``fileobj``, completed with bookmarks, attachments,
    link annotations, document information and page boxes, then written back
    to ``fileobj``.

    :param document:
        Passed to :func:`prepare_metadata`; its ``pages`` are also read for
        their ``bleed`` mapping (``left``, ``top``, ``right``, ``bottom``).
    :param fileobj:
        A seekable file-like object containing the PDF to post-process.
    :param scale:
        PDF points per CSS pixels, passed to :func:`prepare_metadata`.
    :param metadata:
        Object read for its ``attachments``, ``title``, ``description``,
        ``generator``, ``authors``, ``keywords``, ``created`` and
        ``modified`` attributes.
    :param attachments:
        Additional attachments appended to ``metadata.attachments``,
        or :obj:`None`.
    :param url_fetcher:
        Forwarded to :func:`_create_pdf_attachment`.

    """
    # NOTE(review): rewinding before reading and before writing, and
    # truncating afterwards, is not visible in this copy of the file. It is
    # added on the assumption that PdfReader reads from the current position
    # and that PdfWriter emits a complete document rather than an increment;
    # confirm against the caller and upstream.
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        bookmark_objects, count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(
            Type=PdfName('Outlines'), Count=count,
            First=bookmark_objects[0], Last=bookmark_objects[-1])

    attachments = metadata.attachments + (attachments or [])
    if attachments:
        embedded_files = []
        for attachment in attachments:
            attachment_object = _create_pdf_attachment(attachment, url_fetcher)
            if attachment_object is not None:
                # The /Names array of a name tree alternates keys and values.
                # NOTE(review): the key used here was lost in this copy of
                # the file; confirm against upstream.
                embedded_files.append(PdfString.encode('attachment'))
                embedded_files.append(attachment_object)
        if embedded_files:
            trailer.Root.Names = PdfDict(
                EmbeddedFiles=PdfDict(Names=PdfArray(embedded_files)))

    # A single link can be split in multiple regions. We don't want to embed
    # a file multiple times of course, so keep a reference to every embedded
    # URL and reuse the object number.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, rectangle in page_links:
            if link_type == 'attachment' and target not in annot_files:
                # TODO: use the title attribute as description
                annot_files[target] = _create_pdf_attachment(
                    (target, None), url_fetcher)

    # TODO: splitting a link into multiple independent rectangular annotations
    # works well for pure links, but rather mediocre for other annotations and
    # fails completely for transformed (CSS) or complex link shapes (area).
    # It would be better to use /AP for all links and coalesce link shapes that
    # originate from the same HTML link. This would give a feeling similar to
    # what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            if link_type != 'attachment' or annot_files[target] is None:
                # Plain links, and attachments that could not be loaded,
                # become regular link annotations.
                annotation = PdfDict(
                    Type=PdfName('Annot'), Subtype=PdfName('Link'),
                    Rect=PdfArray(rectangle), Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (
                        target[0], PdfName('XYZ'), target[1], target[2], 0)
                    annotation.A = PdfDict(
                        Type=PdfName('Action'), S=PdfName('GoTo'),
                        D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(
                        Type=PdfName('Action'), S=PdfName('URI'),
                        URI=PdfString.encode(iri_to_uri(target)))
            else:
                assert annot_files[target] is not None
                ap = PdfDict(N=PdfDict(
                    BBox=PdfArray(rectangle), Subtype=PdfName('Form'),
                    Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(
                    Type=PdfName('Annot'), Subtype=PdfName('FileAttachment'),
                    T=PdfString.encode(''), Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)), FS=annot_files[target],
                    AP=ap)
            annotations.append(annotation)

        if annotations:
            page.Annots = annotations

    trailer.Info.Producer = VERSION_STRING
    for attr, key in (('title', 'Title'), ('description', 'Subject'),
                      ('generator', 'Creator')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, value)
    for attr, key in (('authors', 'Author'), ('keywords', 'Keywords')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, ', '.join(value))
    for attr, key in (('created', 'CreationDate'), ('modified', 'ModDate')):
        value = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if value is not None:
            setattr(trailer.Info, key, value)

    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = (float(value) for value in page.MediaBox)
        # Convert pixels into points
        # NOTE(review): this uses a fixed 0.75 factor rather than ``scale``;
        # confirm this is intended when the document is zoomed.
        bleed = {
            key: value * 0.75 for key, value in document_page.bleed.items()}

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray((trim_left, trim_top, trim_right, trim_bottom))

        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox) and
        # CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])
        page.BleedBox = PdfArray(
            (bleed_left, bleed_top, bleed_right, bleed_bottom))

    # Overwrite the original content and drop whatever is left of it past
    # the end of the new document.
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()
def w3c_date_to_pdf(string, attr_name):
    """Convert a W3C date string into ``YYYYMMDDHHmmSSOHH'mm'`` form.

    Only the components present in ``string`` are emitted, except that
    missing seconds are written as ``00`` when a time is given.

    :param string:
        The date to convert, or :obj:`None`.
    :param attr_name:
        Name of the metadata attribute, only used in the warning message.
    :return:
        The converted date as a string, or :obj:`None` if ``string`` is
        :obj:`None` or does not match ``W3C_DATE_RE``.

    """
    if string is None:
        return None
    match = W3C_DATE_RE.match(string)
    if match is None:
        LOGGER.warning('Invalid %s date: %r', attr_name, string)
        return None
    groups = match.groupdict()
    pdf_date = (groups['year'] +
                (groups['month'] or '') +
                (groups['day'] or '') +
                (groups['hour'] or '') +
                (groups['minute'] or '') +
                (groups['second'] or ''))
    # Seconds and time zone only make sense when a time was given: a bare
    # date must be left as it is.
    if groups['hour']:
        assert groups['minute']
        if not groups['second']:
            pdf_date += '00'
        if groups['tz_hour']:
            assert groups['tz_hour'].startswith(('+', '-'))
            assert groups['tz_minute']
            pdf_date += "%s'%s'" % (groups['tz_hour'], groups['tz_minute'])
        else:
            pdf_date += 'Z'  # UTC
    return pdf_date