1
1
mirror of https://github.com/Kozea/WeasyPrint.git synced 2024-09-11 20:47:56 +03:00

Reorganize anchors management

This commit is contained in:
Guillaume Ayoub 2023-01-15 21:59:13 +01:00
parent 168ed3b9a3
commit 0b1617edc8
8 changed files with 416 additions and 418 deletions

View File

@ -13,7 +13,7 @@ import py
import pytest
from PIL import Image
from weasyprint import CSS, HTML, __main__, default_url_fetcher
from weasyprint.links import resolve_links
from weasyprint.pdf.anchors import resolve_links
from weasyprint.urls import path2url
from .draw import parse_pixels

View File

@ -1,53 +1,12 @@
"""PDF links and bookmarks management."""
"""Find anchors, links, bookmarks and inputs in documents."""
import math
from .formatting_structure import boxes
from .layout.percent import percentage
from .logger import LOGGER
from .matrix import Matrix
def resolve_links(pages):
    """Resolve internal hyperlinks and collect the anchors of each page.

    Internal links whose anchor is not defined on any page are dropped and
    reported with ``LOGGER.error``. All other links are kept unchanged.

    If multiple anchors have the same name, the first one is used.

    :param pages:
        A sequence of objects with an ``anchors`` attribute (a dict mapping
        anchor names to ``(x, y)`` points) and a ``links`` attribute (a list
        of 4-tuples whose first two items are the link type and the link
        target). The sequence is iterated twice.
    :returns:
        A generator yielding one ``(links, anchors)`` tuple per page, where
        ``links`` is the filtered list of the page's links and ``anchors``
        is a list of ``(anchor_name, x, y)`` tuples for the anchors whose
        first definition is on this page.

    """
    known_anchors = set()
    paged_anchors = []
    for page in pages:
        page_anchors = []
        for anchor_name, (point_x, point_y) in page.anchors.items():
            # Only the first definition of an anchor name is kept.
            if anchor_name not in known_anchors:
                known_anchors.add(anchor_name)
                page_anchors.append((anchor_name, point_x, point_y))
        paged_anchors.append(page_anchors)

    # All the anchors have to be known before links can be checked, as a link
    # may target an anchor defined on a later page.
    for page, page_anchors in zip(pages, paged_anchors):
        page_links = []
        for link in page.links:
            link_type, anchor_name, _, _ = link
            if link_type == 'internal' and anchor_name not in known_anchors:
                LOGGER.error(
                    'No anchor #%s for internal URI reference',
                    anchor_name)
                continue
            page_links.append(link)
        yield page_links, page_anchors
def rectangle_aabb(matrix, pos_x, pos_y, width, height):
"""Apply a transformation matrix to an axis-aligned rectangle.
@ -68,8 +27,12 @@ def rectangle_aabb(matrix, pos_x, pos_y, width, height):
return box_x1, box_y1, box_x2, box_y2
def gather_links_and_bookmarks(box, anchors, links, bookmarks, inputs,
parent_matrix=None):
def gather_anchors(box, anchors, links, bookmarks, inputs, parent_matrix=None):
"""Gather anchors and other data related to specific positions in PDF.
Currently finds anchors, links, bookmarks and inputs.
"""
# Get box transformation matrix.
# "Transforms apply to block-level and atomic inline-level elements,
# but do not apply to elements which may be split into
@ -149,8 +112,7 @@ def gather_links_and_bookmarks(box, anchors, links, bookmarks, inputs,
anchors[anchor_name] = pos_x, pos_y
for child in box.all_children():
gather_links_and_bookmarks(
child, anchors, links, bookmarks, inputs, matrix)
gather_anchors(child, anchors, links, bookmarks, inputs, matrix)
def make_page_bookmark_tree(page, skipped_levels, last_by_depth,

View File

@ -5,6 +5,7 @@ import io
import shutil
from . import CSS
from .anchors import gather_anchors, make_page_bookmark_tree
from .css import get_all_computed_styles
from .css.counters import CounterStyle
from .css.targets import TargetCollector
@ -13,7 +14,6 @@ from .formatting_structure.build import build_formatting_structure
from .html import get_html_metadata
from .images import get_image_from_uri as original_get_image_from_uri
from .layout import LayoutContext, layout_document
from .links import gather_links_and_bookmarks, make_page_bookmark_tree
from .logger import PROGRESS_LOGGER
from .matrix import Matrix
from .pdf import generate_pdf
@ -72,7 +72,7 @@ class Page:
#: :obj:`dict` of HTML tag attributes and values.
self.inputs = []
gather_links_and_bookmarks(
gather_anchors(
page_box, self.anchors, self.links, self.bookmarks, self.inputs)
self._page_box = page_box
@ -247,7 +247,7 @@ class Document:
# Keep a reference to font_config to avoid its garbage collection until
# rendering is destroyed. This is needed as font_config.__del__ removes
# fonts that may be used when rendering
self._font_config = font_config
self.font_config = font_config
# Set of flags for PDF size optimization. Can contain "images" and
# "fonts".
self._optimize_size = optimize_size
@ -290,7 +290,7 @@ class Document:
elif not isinstance(pages, list):
pages = list(pages)
return type(self)(
pages, self.metadata, self.url_fetcher, self._font_config,
pages, self.metadata, self.url_fetcher, self.font_config,
self._optimize_size)
def make_bookmark_tree(self):

View File

@ -1,20 +1,15 @@
"""PDF generation management."""
import hashlib
import io
import zlib
from os.path import basename
from urllib.parse import unquote, urlsplit
import pydyf
from .. import Attachment, __version__
from .. import VERSION
from ..html import W3C_DATE_RE
from ..links import make_page_bookmark_tree, resolve_links
from ..logger import LOGGER, PROGRESS_LOGGER
from ..matrix import Matrix
from ..urls import URLFetchingError
from . import pdfa, pdfua
from .anchors import (
add_annotations, add_inputs, add_links, add_outlines, resolve_links,
write_pdf_attachment)
from .fonts import build_fonts_dictionary
from .stream import Stream
@ -53,71 +48,6 @@ def _w3c_date_to_pdf(string, attr_name):
return pdf_date
def _write_pdf_attachment(pdf, attachment, url_fetcher):
    """Write an attachment to the PDF stream.

    The attachment content is read by chunks, compressed with zlib and
    added to the PDF as an embedded file, followed by a file specification
    dictionary referencing it.

    :param pdf:
        The PDF document, providing ``add_object``.
    :param attachment:
        An ``Attachment``, a ``(url, description)`` tuple, or any other
        value that is given to ``Attachment(guess=...)``.
    :param url_fetcher:
        The URL fetcher given to the ``Attachment`` objects created here.
    :return:
        the attachment PDF dictionary, or ``None`` if fetching the
        attachment failed.

    """
    # Attachments from document links like <link> or <a> can only be URLs.
    # They're passed in as tuples
    url = ''
    if isinstance(attachment, tuple):
        url, description = attachment
        attachment = Attachment(
            url=url, url_fetcher=url_fetcher, description=description)
    elif not isinstance(attachment, Attachment):
        attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

    try:
        with attachment.source as (_source_type, source, url, _):
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            uncompressed_length = 0
            md5 = hashlib.md5()
            compress = zlib.compressobj()
            # Collect compressed chunks in a list and join them once:
            # repeated ``bytes +=`` copies the whole buffer at each step.
            chunks = []
            for data in iter(lambda: source.read(4096), b''):
                uncompressed_length += len(data)
                md5.update(data)
                chunks.append(compress.compress(data))
            chunks.append(compress.flush(zlib.Z_FINISH))
            compressed = b''.join(chunks)
            file_extra = pydyf.Dictionary({
                'Type': '/EmbeddedFile',
                'Filter': '/FlateDecode',
                'Params': pydyf.Dictionary({
                    'CheckSum': f'<{md5.hexdigest()}>',
                    'Size': uncompressed_length,
                })
            })
            file_stream = pydyf.Stream([compressed], file_extra)
            pdf.add_object(file_stream)
    except URLFetchingError as exception:
        LOGGER.error('Failed to load attachment: %s', exception)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename.
    url_path = urlsplit(url).path if url else ''
    filename = basename(unquote(url_path)) if url_path else 'attachment.bin'

    file_specification = pydyf.Dictionary({
        'Type': '/Filespec',
        'F': pydyf.String(),
        'UF': pydyf.String(filename),
        'EF': pydyf.Dictionary({'F': file_stream.reference}),
        'Desc': pydyf.String(attachment.description or ''),
    })
    pdf.add_object(file_specification)
    return file_specification
def _reference_resources(pdf, resources, images, fonts):
if 'Font' in resources:
assert resources['Font'] is None
@ -170,67 +100,6 @@ def _use_references(pdf, resources, images):
alpha['SMask']['G'] = alpha['SMask']['G'].reference
def _add_links(links, anchors, matrix, pdf, page, names, mark):
    """Add link annotations and named destinations of a page to the PDF.

    For each internal or external link, a ``/Link`` annotation is created,
    stored in the ``link_annotation`` attribute of the link's box, added to
    ``pdf`` and referenced in the ``Annots`` array of ``page``. Links of
    other types are ignored.

    For each ``(name, x, y)`` anchor, a ``[name, destination]`` pair
    targeting ``page`` is appended to ``names``.

    Coordinates are transformed with ``matrix.transform_point``. When
    ``mark`` is true, annotations also get a ``Contents`` entry holding the
    link target.

    """
    for link_type, target, rectangle, box in links:
        start_x, start_y = matrix.transform_point(*rectangle[:2])
        end_x, end_y = matrix.transform_point(*rectangle[2:])
        if link_type not in ('internal', 'external'):
            continue

        annotation = box.link_annotation = pydyf.Dictionary({
            'Type': '/Annot',
            'Subtype': '/Link',
            'Rect': pydyf.Array([start_x, start_y, end_x, end_y]),
            'BS': pydyf.Dictionary({'W': 0}),
        })
        if mark:
            annotation['Contents'] = pydyf.String(target)
        if link_type == 'internal':
            # Internal links point to a named destination.
            annotation['Dest'] = pydyf.String(target)
        else:
            # External links trigger a URI action.
            annotation['A'] = pydyf.Dictionary({
                'Type': '/Action',
                'S': '/URI',
                'URI': pydyf.String(target),
            })
        pdf.add_object(annotation)
        if 'Annots' not in page:
            page['Annots'] = pydyf.Array()
        page['Annots'].append(annotation.reference)

    for anchor_name, anchor_x, anchor_y in anchors:
        anchor_x, anchor_y = matrix.transform_point(anchor_x, anchor_y)
        destination = pydyf.Array(
            [page.reference, '/XYZ', anchor_x, anchor_y, 0])
        names.append([anchor_name, destination])
def _create_bookmarks(bookmarks, pdf, parent=None):
    """Recursively add outline items for a bookmark tree to the PDF.

    :param bookmarks:
        A list of ``(title, (page, x, y), children, state)`` tuples, where
        ``children`` is a list of the same shape.
    :param pdf:
        The PDF document, providing ``add_object`` and ``page_references``.
    :param parent:
        The outline item containing these bookmarks, if any.
    :returns:
        An ``(outlines, count)`` tuple: the outline items created for this
        level, and the number of items of this level plus the counts of
        the descendants of items that are not closed.

    """
    total = len(bookmarks)
    siblings = []
    for title, (page_number, x, y), children, state in bookmarks:
        destination = pydyf.Array(
            (pdf.page_references[page_number], '/XYZ', x, y, 0))
        outline = pydyf.Dictionary({
            'Title': pydyf.String(title),
            'Dest': destination,
        })
        pdf.add_object(outline)

        child_outlines, child_count = _create_bookmarks(
            children, pdf, parent=outline)
        if state == 'closed':
            # Closed items store a negative count, and their descendants are
            # not added to the count of the current level.
            outline['Count'] = child_count
            outline['Count'] *= -1
        else:
            outline['Count'] = child_count
            total += child_count

        if siblings:
            previous = siblings[-1]
            outline['Prev'] = previous.reference
            previous['Next'] = outline.reference
        if child_outlines:
            outline['First'] = child_outlines[0].reference
            outline['Last'] = child_outlines[-1].reference
        if parent is not None:
            outline['Parent'] = parent.reference
        siblings.append(outline)

    return siblings, total
def generate_pdf(document, target, zoom, attachments, optimize_size,
identifier, variant, version, custom_metadata):
# 0.75 = 72 PDF point per inch / 96 CSS pixel per inch
@ -264,43 +133,14 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
# Links and anchors
page_links_and_anchors = list(resolve_links(document.pages))
attachment_links = [
[link for link in page_links if link[0] == 'attachment']
for page_links, page_anchors in page_links_and_anchors]
# Annotations
annot_files = {}
# A single link can be split in multiple regions. We don't want to embed a
# file multiple times of course, so keep a reference to every embedded URL
# and reuse the object number.
for page_links in attachment_links:
for link_type, annot_target, rectangle, _ in page_links:
if link_type == 'attachment' and target not in annot_files:
# TODO: Use the title attribute as description. The comment
# above about multiple regions won't always be correct, because
# two links might have the same href, but different titles.
annot_files[annot_target] = _write_pdf_attachment(
pdf, (annot_target, None), document.url_fetcher)
# Bookmarks
root = []
# At one point in the document, for each "output" depth, how much to add to
# get the source level (CSS values of bookmark-level).
# E.g. with <h1> then <h3>, level_shifts == [0, 1]
# 1 means that <h3> has depth 3 - 1 = 2 in the output.
skipped_levels = []
last_by_depth = [root]
previous_level = 0
page_streams = []
for page_number, (page, links_and_anchors, page_links) in enumerate(
zip(document.pages, page_links_and_anchors, attachment_links)):
pdf_pages, page_streams = [], []
for page_number, (page, links_and_anchors) in enumerate(
zip(document.pages, page_links_and_anchors)):
# Draw from the top-left corner
matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)
# Links and anchors
links, anchors = links_and_anchors
page_width = scale * (
page.width + page.bleed['left'] + page.bleed['right'])
page_height = scale * (
@ -331,8 +171,14 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
pdf_page['Tabs'] = '/S'
pdf_page['StructParents'] = page_number
pdf.add_page(pdf_page)
pdf_pages.append(pdf_page)
_add_links(links, anchors, matrix, pdf, pdf_page, pdf_names, mark)
add_links(links_and_anchors, matrix, pdf, pdf_page, pdf_names, mark)
add_inputs(
page.inputs, matrix, pdf, pdf_page, resources, stream,
document.font_config.font_map)
add_annotations(
links_and_anchors[0], matrix, document, pdf, pdf_page, annot_files)
page.paint(stream, scale=scale)
# Bleed
@ -355,175 +201,13 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
pdf_page['BleedBox'] = pydyf.Array([
bleed_left, bleed_top, bleed_right, bleed_bottom])
# Inputs
if page.inputs:
if 'Annots' not in pdf_page:
pdf_page['Annots'] = pydyf.Array()
if 'AcroForm' not in pdf.catalog:
pdf.catalog['AcroForm'] = pydyf.Dictionary({
'Fields': pydyf.Array(),
'DR': resources.reference,
})
for element, style, rectangle in page.inputs:
rectangle = (
*matrix.transform_point(*rectangle[:2]),
*matrix.transform_point(*rectangle[2:]))
font_map = document._font_config.font_map
context = ffi.gc(
pango.pango_font_map_create_context(font_map),
gobject.g_object_unref)
font_description = ffi.gc(
pango.pango_font_description_new(),
pango.pango_font_description_free)
family_p, _ = unicode_to_char_p(','.join(style['font_family']))
pango.pango_font_description_set_family(font_description, family_p)
pango.pango_font_description_set_style(
font_description, PANGO_STYLE[style['font_style']])
pango.pango_font_description_set_stretch(
font_description, PANGO_STRETCH[style['font_stretch']])
pango.pango_font_description_set_weight(
font_description, style['font_weight'])
font = pango.pango_font_map_load_font(
font_map, context, font_description)
font = stream.add_font(font)
input_type = element.attrib.get('type')
if input_type == 'checkbox':
# Checkboxes
width = rectangle[2] - rectangle[0]
height = rectangle[1] - rectangle[3]
checked_stream = pydyf.Stream(extra={
'Resources': resources.reference,
'Type': '/XObject',
'Subtype': '/Form',
'BBox': pydyf.Array((0, 0, width, height)),
})
checked_stream.push_state()
checked_stream.begin_text()
checked_stream.set_color_rgb(*style['color'][:3])
checked_stream.set_font_size('ZaDi', style['font_size'])
x = (width - style['font_size']) / 1.3
y = (height - style['font_size']) / 1.3
checked_stream.stream.append(f'{x} {y} Td')
checked_stream.stream.append('(8) Tj')
checked_stream.end_text()
checked_stream.pop_state()
pdf.add_object(checked_stream)
unchecked_stream = pydyf.Stream()
unchecked_stream.push_state()
unchecked_stream.pop_state()
pdf.add_object(unchecked_stream)
checked = 'checked' in element.attrib
# field_stream = pydyf.Stream()
# field_stream.set_color_rgb(*style['color'][:3])
# field_stream.set_font_size('ZaDi', style['font_size'])
field = pydyf.Dictionary({
'Type': '/Annot',
'Subtype': '/Widget',
# 'F': 4,
'Rect': pydyf.Array(rectangle),
'FT': '/Btn',
'P': pdf_page.reference,
'T': pydyf.String(element.attrib.get('name', '')),
'V': '/Yes' if checked else '/Off',
# 'DV': '/Yes' if checked else '/Off',
'DR': resources.reference,
# 'DA': pydyf.String(b' '.join(field_stream.stream)),
# 'MK': pydyf.Dictionary({'CA': pydyf.String('8')}),
'AP': pydyf.Dictionary({'N': pydyf.Dictionary({
'Yes': checked_stream.reference,
'Off': unchecked_stream.reference,
})}),
'AS': '/Yes' if checked else '/Off',
})
else:
# Text, password, textarea, files, and unknown
field_stream = pydyf.Stream()
field_stream.set_color_rgb(*style['color'][:3])
field_stream.set_font_size(font.hash, style['font_size'])
value = (
element.attrib.get('value', '') if element.tag == 'input'
else element.text)
field = pydyf.Dictionary({
'FT': '/Tx',
'DA': pydyf.String(b' '.join(field_stream.stream)),
'Type': '/Annot',
'Subtype': '/Widget',
'Rect': pydyf.Array(rectangle),
'T': pydyf.String(element.attrib.get('name', 'unknown')),
'V': pydyf.String(value),
'P': pdf_page.reference,
})
if element.tag == 'textarea':
field['Ff'] = 2 ** (13 - 1)
elif input_type == 'password':
field['Ff'] = 2 ** (14 - 1)
elif input_type == 'file':
field['Ff'] = 2 ** (21 - 1)
pdf.add_object(field)
pdf_page['Annots'].append(field.reference)
pdf.catalog['AcroForm']['Fields'].append(field.reference)
# Annotations
# TODO: splitting a link into multiple independent rectangular
# annotations works well for pure links, but rather mediocre for
# other annotations and fails completely for transformed (CSS) or
# complex link shapes (area). It would be better to use /AP for all
# links and coalesce link shapes that originate from the same HTML
# link. This would give a feeling similar to what browsers do with
# links that span multiple lines.
for link_type, annot_target, rectangle, _ in page_links:
annot_file = annot_files[annot_target]
if link_type == 'attachment' and annot_file is not None:
rectangle = (
*matrix.transform_point(*rectangle[:2]),
*matrix.transform_point(*rectangle[2:]))
stream = pydyf.Stream([], {
'Type': '/XObject',
'Subtype': '/Form',
'BBox': pydyf.Array(rectangle),
'Length': 0,
})
pdf.add_object(stream)
annot = pydyf.Dictionary({
'Type': '/Annot',
'Rect': pydyf.Array(rectangle),
'Subtype': '/FileAttachment',
'T': pydyf.String(),
'FS': annot_file.reference,
'AP': pydyf.Dictionary({'N': stream.reference}),
'AS': '/N',
})
pdf.add_object(annot)
if 'Annots' not in pdf_page:
pdf_page['Annots'] = pydyf.Array()
pdf_page['Annots'].append(annot.reference)
# Bookmarks
previous_level = make_page_bookmark_tree(
page, skipped_levels, last_by_depth, previous_level, page_number,
matrix)
# Outlines
outlines, count = _create_bookmarks(root, pdf)
if outlines:
outlines_dictionary = pydyf.Dictionary({
'Count': count,
'First': outlines[0].reference,
'Last': outlines[-1].reference,
})
pdf.add_object(outlines_dictionary)
for outline in outlines:
outline['Parent'] = outlines_dictionary.reference
pdf.catalog['Outlines'] = outlines_dictionary.reference
add_outlines(pdf, document.make_bookmark_tree())
PROGRESS_LOGGER.info('Step 7 - Adding PDF metadata')
# PDF information
pdf.info['Producer'] = pydyf.String(f'WeasyPrint {__version__}')
pdf.info['Producer'] = pydyf.String(f'WeasyPrint {VERSION}')
metadata = document.metadata
if metadata.title:
pdf.info['Title'] = pydyf.String(metadata.title)
@ -554,7 +238,7 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
attachments = metadata.attachments + (attachments or [])
pdf_attachments = []
for attachment in attachments:
pdf_attachment = _write_pdf_attachment(
pdf_attachment = write_pdf_attachment(
pdf, attachment, document.url_fetcher)
if pdf_attachment is not None:
pdf_attachments.append(pdf_attachment)
@ -591,10 +275,9 @@ def generate_pdf(document, target, zoom, attachments, optimize_size,
name_array.append(pydyf.String(anchor[0]))
name_array.append(anchor[1])
dests = pydyf.Dictionary({'Names': name_array})
if 'Names' in pdf.catalog:
pdf.catalog['Names']['Dests'] = dests
else:
pdf.catalog['Names'] = pydyf.Dictionary({'Dests': dests})
if 'Names' not in pdf.catalog:
pdf.catalog['Names'] = pydyf.Dictionary()
pdf.catalog['Names']['Dests'] = dests
# Apply PDF variants functions
if variant:

345
weasyprint/pdf/anchors.py Normal file
View File

@ -0,0 +1,345 @@
"""Insert anchors, links, bookmarks and inputs in PDFs."""
import hashlib
import io
import zlib
from os.path import basename
from urllib.parse import unquote, urlsplit
import pydyf
from .. import Attachment
from ..logger import LOGGER
from ..text.ffi import ffi, gobject, pango
from ..text.fonts import get_font_description
from ..urls import URLFetchingError
def add_links(links_and_anchors, matrix, pdf, page, names, mark):
    """Add link annotations and named destinations of a page to the PDF.

    ``links_and_anchors`` is a ``(links, anchors)`` pair.

    For each internal or external link, a ``/Link`` annotation is created,
    stored in the ``link_annotation`` attribute of the link's box, added to
    ``pdf`` and referenced in the ``Annots`` array of ``page``. Links of
    other types are ignored.

    For each ``(name, x, y)`` anchor, a ``[name, destination]`` pair
    targeting ``page`` is appended to ``names``.

    Coordinates are transformed with ``matrix.transform_point``. When
    ``mark`` is true, annotations also get a ``Contents`` entry holding the
    link target.

    """
    page_links, page_anchors = links_and_anchors

    for link_type, target, rectangle, box in page_links:
        start_x, start_y = matrix.transform_point(*rectangle[:2])
        end_x, end_y = matrix.transform_point(*rectangle[2:])
        if link_type not in ('internal', 'external'):
            continue

        annotation = box.link_annotation = pydyf.Dictionary({
            'Type': '/Annot',
            'Subtype': '/Link',
            'Rect': pydyf.Array([start_x, start_y, end_x, end_y]),
            'BS': pydyf.Dictionary({'W': 0}),
        })
        if mark:
            annotation['Contents'] = pydyf.String(target)
        if link_type == 'internal':
            # Internal links point to a named destination.
            annotation['Dest'] = pydyf.String(target)
        else:
            # External links trigger a URI action.
            annotation['A'] = pydyf.Dictionary({
                'Type': '/Action',
                'S': '/URI',
                'URI': pydyf.String(target),
            })
        pdf.add_object(annotation)
        if 'Annots' not in page:
            page['Annots'] = pydyf.Array()
        page['Annots'].append(annotation.reference)

    for anchor_name, anchor_x, anchor_y in page_anchors:
        anchor_x, anchor_y = matrix.transform_point(anchor_x, anchor_y)
        destination = pydyf.Array(
            [page.reference, '/XYZ', anchor_x, anchor_y, 0])
        names.append([anchor_name, destination])
def add_outlines(pdf, bookmarks, parent=None):
    """Recursively add outline items for a bookmark tree to the PDF.

    When called without ``parent`` and at least one item is created, the
    root outlines dictionary is also added to ``pdf`` and referenced in
    ``pdf.catalog['Outlines']``.

    :param pdf:
        The PDF document, providing ``add_object``, ``page_references`` and
        ``catalog``.
    :param bookmarks:
        A list of ``(title, (page, x, y), children, state)`` tuples, where
        ``children`` is a list of the same shape.
    :param parent:
        The outline item containing these bookmarks, if any.
    :returns:
        An ``(outlines, count)`` tuple: the outline items created for this
        level, and the number of items of this level plus the counts of
        the descendants of items that are not closed.

    """
    total = len(bookmarks)
    siblings = []
    for title, (page_number, x, y), children, state in bookmarks:
        destination = pydyf.Array(
            (pdf.page_references[page_number], '/XYZ', x, y, 0))
        outline = pydyf.Dictionary({
            'Title': pydyf.String(title),
            'Dest': destination,
        })
        pdf.add_object(outline)

        child_outlines, child_count = add_outlines(
            pdf, children, parent=outline)
        if state == 'closed':
            # Closed items store a negative count, and their descendants are
            # not added to the count of the current level.
            outline['Count'] = child_count
            outline['Count'] *= -1
        else:
            outline['Count'] = child_count
            total += child_count

        if siblings:
            previous = siblings[-1]
            outline['Prev'] = previous.reference
            previous['Next'] = outline.reference
        if child_outlines:
            outline['First'] = child_outlines[0].reference
            outline['Last'] = child_outlines[-1].reference
        if parent is not None:
            outline['Parent'] = parent.reference
        siblings.append(outline)

    is_root = parent is None
    if is_root and siblings:
        root = pydyf.Dictionary({
            'Count': total,
            'First': siblings[0].reference,
            'Last': siblings[-1].reference,
        })
        pdf.add_object(root)
        for outline in siblings:
            outline['Parent'] = root.reference
        pdf.catalog['Outlines'] = root.reference

    return siblings, total
def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map):
    """Include form inputs in PDF.

    For each input, a widget annotation is added to ``pdf``, and referenced
    both in the ``Annots`` array of ``page`` and in the ``Fields`` array of
    the ``AcroForm`` catalog entry (created if missing). Nothing is done
    when ``inputs`` is empty.

    :param inputs:
        An iterable of ``(element, style, rectangle)`` tuples. ``element``
        provides ``tag``, ``attrib`` and ``text``, ``style`` is the mapping
        of style values of the input, and ``rectangle`` holds two points
        that are transformed with ``matrix.transform_point``.
    :param matrix:
        The transformation applied to the rectangles.
    :param pdf:
        The PDF document, providing ``add_object`` and ``catalog``.
    :param page:
        The PDF page dictionary.
    :param resources:
        The resources object referenced by the form and by checkboxes.
    :param stream:
        The page stream, used to register the font of each input.
    :param font_map:
        The Pango font map used to load fonts.

    """
    if not inputs:
        return

    if 'Annots' not in page:
        page['Annots'] = pydyf.Array()
    if 'AcroForm' not in pdf.catalog:
        pdf.catalog['AcroForm'] = pydyf.Dictionary({
            'Fields': pydyf.Array(),
            'DR': resources.reference,
        })

    context = ffi.gc(
        pango.pango_font_map_create_context(font_map),
        gobject.g_object_unref)
    for element, style, rectangle in inputs:
        rectangle = (
            *matrix.transform_point(*rectangle[:2]),
            *matrix.transform_point(*rectangle[2:]))

        font_description = get_font_description(style)
        font = pango.pango_font_map_load_font(
            font_map, context, font_description)
        font = stream.add_font(font)

        input_type = element.attrib.get('type')
        if input_type == 'checkbox':
            # Checkboxes
            width = rectangle[2] - rectangle[0]
            height = rectangle[1] - rectangle[3]
            checked_stream = pydyf.Stream(extra={
                'Resources': resources.reference,
                'Type': '/XObject',
                'Subtype': '/Form',
                'BBox': pydyf.Array((0, 0, width, height)),
            })
            checked_stream.push_state()
            checked_stream.begin_text()
            checked_stream.set_color_rgb(*style['color'][:3])
            checked_stream.set_font_size('ZaDi', style['font_size'])
            x = (width - style['font_size']) / 1.3
            y = (height - style['font_size']) / 1.3
            checked_stream.stream.append(f'{x} {y} Td')
            checked_stream.stream.append('(8) Tj')
            checked_stream.end_text()
            checked_stream.pop_state()
            pdf.add_object(checked_stream)

            # The unchecked appearance draws nothing.
            unchecked_stream = pydyf.Stream()
            unchecked_stream.push_state()
            unchecked_stream.pop_state()
            pdf.add_object(unchecked_stream)

            checked = 'checked' in element.attrib
            # field_stream = pydyf.Stream()
            # field_stream.set_color_rgb(*style['color'][:3])
            # field_stream.set_font_size('ZaDi', style['font_size'])
            field = pydyf.Dictionary({
                'Type': '/Annot',
                'Subtype': '/Widget',
                # 'F': 4,
                'Rect': pydyf.Array(rectangle),
                'FT': '/Btn',
                'P': page.reference,
                'T': pydyf.String(element.attrib.get('name', '')),
                'V': '/Yes' if checked else '/Off',
                # 'DV': '/Yes' if checked else '/Off',
                'DR': resources.reference,
                # 'DA': pydyf.String(b' '.join(field_stream.stream)),
                # 'MK': pydyf.Dictionary({'CA': pydyf.String('8')}),
                'AP': pydyf.Dictionary({'N': pydyf.Dictionary({
                    'Yes': checked_stream.reference,
                    'Off': unchecked_stream.reference,
                })}),
                'AS': '/Yes' if checked else '/Off',
            })
        else:
            # Text, password, textarea, files, and unknown
            field_stream = pydyf.Stream()
            field_stream.set_color_rgb(*style['color'][:3])
            field_stream.set_font_size(font.hash, style['font_size'])
            if element.tag == 'input':
                value = element.attrib.get('value', '')
            else:
                # NOTE(review): assumes an ElementTree-like element, whose
                # text is None when the element is empty. Fall back to an
                # empty string so that the PDF string stays valid.
                value = element.text or ''
            field = pydyf.Dictionary({
                'FT': '/Tx',
                'DA': pydyf.String(b' '.join(field_stream.stream)),
                'Type': '/Annot',
                'Subtype': '/Widget',
                'Rect': pydyf.Array(rectangle),
                'T': pydyf.String(element.attrib.get('name', 'unknown')),
                'V': pydyf.String(value),
                'P': page.reference,
            })
            # Field flags, set as 2 ** (bit position - 1).
            if element.tag == 'textarea':
                field['Ff'] = 2 ** (13 - 1)
            elif input_type == 'password':
                field['Ff'] = 2 ** (14 - 1)
            elif input_type == 'file':
                field['Ff'] = 2 ** (21 - 1)

        pdf.add_object(field)
        page['Annots'].append(field.reference)
        pdf.catalog['AcroForm']['Fields'].append(field.reference)
def add_annotations(links, matrix, document, pdf, page, annot_files):
    """Include file attachment annotations in PDF.

    Only links whose type is ``'attachment'`` are handled. The file of each
    target is written once with ``write_pdf_attachment`` and stored in
    ``annot_files``, a dict mapping targets to the written attachment (or
    ``None`` when writing failed), that is updated in place.

    For each attachment link whose file is available, a ``/FileAttachment``
    annotation is added to ``pdf`` and referenced in the ``Annots`` array of
    ``page``.

    """
    # TODO: splitting a link into multiple independent rectangular
    # annotations works well for pure links, but rather mediocre for
    # other annotations and fails completely for transformed (CSS) or
    # complex link shapes (area). It would be better to use /AP for all
    # links and coalesce link shapes that originate from the same HTML
    # link. This would give a feeling similar to what browsers do with
    # links that span multiple lines.
    for link_type, annot_target, rectangle, _ in links:
        if link_type != 'attachment':
            continue

        try:
            attached_file = annot_files[annot_target]
        except KeyError:
            # A single link can be split in multiple regions. We don't want
            # to embed a file multiple times of course, so keep a reference
            # to every embedded URL and reuse the object number.
            # TODO: Use the title attribute as description. The comment
            # above about multiple regions won't always be correct, because
            # two links might have the same href, but different titles.
            attached_file = annot_files[annot_target] = write_pdf_attachment(
                pdf, (annot_target, None), document.url_fetcher)
        if attached_file is None:
            continue

        top_left = matrix.transform_point(*rectangle[:2])
        bottom_right = matrix.transform_point(*rectangle[2:])
        rectangle = (*top_left, *bottom_right)

        # Empty appearance stream covering the link rectangle.
        appearance = pydyf.Stream([], {
            'Type': '/XObject',
            'Subtype': '/Form',
            'BBox': pydyf.Array(rectangle),
            'Length': 0,
        })
        pdf.add_object(appearance)

        annotation = pydyf.Dictionary({
            'Type': '/Annot',
            'Rect': pydyf.Array(rectangle),
            'Subtype': '/FileAttachment',
            'T': pydyf.String(),
            'FS': attached_file.reference,
            'AP': pydyf.Dictionary({'N': appearance.reference}),
            'AS': '/N',
        })
        pdf.add_object(annotation)

        if 'Annots' not in page:
            page['Annots'] = pydyf.Array()
        page['Annots'].append(annotation.reference)
def write_pdf_attachment(pdf, attachment, url_fetcher):
    """Write an attachment to the PDF stream.

    The attachment content is read by chunks, compressed with zlib and
    added to the PDF as an embedded file, followed by a file specification
    dictionary referencing it.

    :param pdf:
        The PDF document, providing ``add_object``.
    :param attachment:
        An ``Attachment``, a ``(url, description)`` tuple, or any other
        value that is given to ``Attachment(guess=...)``.
    :param url_fetcher:
        The URL fetcher given to the ``Attachment`` objects created here.
    :returns:
        The attachment PDF dictionary, or ``None`` if fetching the
        attachment failed.

    """
    # Attachments from document links like <link> or <a> can only be URLs.
    # They're passed in as tuples
    url = ''
    if isinstance(attachment, tuple):
        url, description = attachment
        attachment = Attachment(
            url=url, url_fetcher=url_fetcher, description=description)
    elif not isinstance(attachment, Attachment):
        attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

    try:
        with attachment.source as (_source_type, source, url, _):
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            uncompressed_length = 0
            md5 = hashlib.md5()
            compress = zlib.compressobj()
            # Collect compressed chunks in a list and join them once:
            # repeated ``bytes +=`` copies the whole buffer at each step.
            chunks = []
            for data in iter(lambda: source.read(4096), b''):
                uncompressed_length += len(data)
                md5.update(data)
                chunks.append(compress.compress(data))
            chunks.append(compress.flush(zlib.Z_FINISH))
            compressed = b''.join(chunks)
            file_extra = pydyf.Dictionary({
                'Type': '/EmbeddedFile',
                'Filter': '/FlateDecode',
                'Params': pydyf.Dictionary({
                    'CheckSum': f'<{md5.hexdigest()}>',
                    'Size': uncompressed_length,
                })
            })
            file_stream = pydyf.Stream([compressed], file_extra)
            pdf.add_object(file_stream)
    except URLFetchingError as exception:
        LOGGER.error('Failed to load attachment: %s', exception)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename.
    url_path = urlsplit(url).path if url else ''
    filename = basename(unquote(url_path)) if url_path else 'attachment.bin'

    file_specification = pydyf.Dictionary({
        'Type': '/Filespec',
        'F': pydyf.String(),
        'UF': pydyf.String(filename),
        'EF': pydyf.Dictionary({'F': file_stream.reference}),
        'Desc': pydyf.String(attachment.description or ''),
    })
    pdf.add_object(file_specification)
    return file_specification
def resolve_links(pages):
    """Resolve internal hyperlinks and collect the anchors of each page.

    Internal links whose anchor is not defined on any page are dropped and
    reported with ``LOGGER.error``. All other links are kept unchanged.

    If multiple anchors have the same name, the first one is used.

    :param pages:
        A sequence of objects with an ``anchors`` attribute (a dict mapping
        anchor names to ``(x, y)`` points) and a ``links`` attribute (a list
        of 4-tuples whose first two items are the link type and the link
        target). The sequence is iterated twice.
    :returns:
        A generator yielding one ``(links, anchors)`` tuple per page, where
        ``links`` is the filtered list of the page's links and ``anchors``
        is a list of ``(anchor_name, x, y)`` tuples for the anchors whose
        first definition is on this page.

    """
    known_anchors = set()
    paged_anchors = []
    for page in pages:
        page_anchors = []
        for anchor_name, (point_x, point_y) in page.anchors.items():
            # Only the first definition of an anchor name is kept.
            if anchor_name not in known_anchors:
                known_anchors.add(anchor_name)
                page_anchors.append((anchor_name, point_x, point_y))
        paged_anchors.append(page_anchors)

    # All the anchors have to be known before links can be checked, as a link
    # may target an anchor defined on a later page.
    for page, page_anchors in zip(pages, paged_anchors):
        page_links = []
        for link in page.links:
            link_type, anchor_name, _, _ = link
            if link_type == 'internal' and anchor_name not in known_anchors:
                LOGGER.error(
                    'No anchor #%s for internal URI reference',
                    anchor_name)
                continue
            page_links.append(link)
        yield page_links, page_anchors

View File

@ -140,8 +140,7 @@ def color(string):
def transform(transform_string, font_size, normalized_diagonal):
"""Get a matrix corresponding to the transform string."""
# TODO: merge with Page._gather_links_and_bookmarks and
# css.validation.properties.transform
# TODO: merge with gather_anchors and css.validation.properties.transform
transformations = re.findall(
r'(\w+) ?\( ?(.*?) ?\)', normalize(transform_string))
matrix = Matrix()

View File

@ -13,8 +13,10 @@ from ..logger import LOGGER
from ..urls import FILESYSTEM_ENCODING, fetch
from .constants import (
CAPS_KEYS, EAST_ASIAN_KEYS, FONTCONFIG_STRETCH, FONTCONFIG_STYLE,
FONTCONFIG_WEIGHT, LIGATURE_KEYS, NUMERIC_KEYS)
from .ffi import ffi, fontconfig, gobject, pangoft2
FONTCONFIG_WEIGHT, LIGATURE_KEYS, NUMERIC_KEYS, PANGO_STRETCH, PANGO_STYLE)
from .ffi import (
ffi, fontconfig, gobject, pango, pangoft2, unicode_to_char_p,
units_from_double)
def _check_font_configuration(font_config): # pragma: no cover
@ -326,3 +328,27 @@ def font_features(font_kerning='normal', font_variant_ligatures='normal',
features.update(dict(font_feature_settings))
return features
def get_font_description(style, font_size=None):
    """Get a Pango font description for the given style.

    The family, style, stretch and weight of the description are set from
    the ``font_family``, ``font_style``, ``font_stretch`` and
    ``font_weight`` values of ``style``. Font variations are set when
    ``font_variation_settings`` is not ``'normal'``.

    :param style:
        A mapping providing the font-related values listed above.
    :param font_size:
        The absolute size set on the description, or ``None`` to leave the
        size unset.
    :returns:
        A font description freed with ``pango_font_description_free`` when
        garbage-collected.

    """
    description = ffi.gc(
        pango.pango_font_description_new(),
        pango.pango_font_description_free)

    families = ','.join(style['font_family'])
    family_pointer, _family_bytes = unicode_to_char_p(families)
    pango.pango_font_description_set_family(description, family_pointer)

    pango_style = PANGO_STYLE[style['font_style']]
    pango.pango_font_description_set_style(description, pango_style)

    pango_stretch = PANGO_STRETCH[style['font_stretch']]
    pango.pango_font_description_set_stretch(description, pango_stretch)

    pango.pango_font_description_set_weight(
        description, style['font_weight'])

    if font_size is not None:
        pango.pango_font_description_set_absolute_size(
            description, units_from_double(font_size))

    variations = style['font_variation_settings']
    if variations != 'normal':
        settings = ','.join(
            f'{key}={value}' for key, value in variations).encode()
        pango.pango_font_description_set_variations(description, settings)

    return description

View File

@ -5,11 +5,11 @@ from math import inf
import pyphen
from .constants import LST_TO_ISO, PANGO_STRETCH, PANGO_STYLE, PANGO_WRAP_MODE
from .constants import LST_TO_ISO, PANGO_WRAP_MODE
from .ffi import (
ffi, gobject, pango, pangoft2, unicode_to_char_p, units_from_double,
units_to_double)
from .fonts import font_features
from .fonts import font_features, get_font_description
def line_size(line, style):
@ -78,9 +78,6 @@ class Layout:
pango.pango_font_map_create_context(font_map),
gobject.g_object_unref)
pango.pango_context_set_round_glyph_positions(pango_context, False)
self.layout = ffi.gc(
pango.pango_layout_new(pango_context),
gobject.g_object_unref)
if style['font_language_override'] != 'normal':
lang_p, lang = unicode_to_char_p(LST_TO_ISO.get(
@ -97,31 +94,17 @@ class Layout:
assert not isinstance(style['font_family'], str), (
'font_family should be a list')
self.font = ffi.gc(
pango.pango_font_description_new(),
pango.pango_font_description_free)
family_p, family = unicode_to_char_p(','.join(style['font_family']))
pango.pango_font_description_set_family(self.font, family_p)
pango.pango_font_description_set_style(
self.font, PANGO_STYLE[style['font_style']])
pango.pango_font_description_set_stretch(
self.font, PANGO_STRETCH[style['font_stretch']])
pango.pango_font_description_set_weight(
self.font, style['font_weight'])
pango.pango_font_description_set_absolute_size(
self.font, units_from_double(font_size))
if style['font_variation_settings'] != 'normal':
string = ','.join(
f'{key}={value}' for key, value in
style['font_variation_settings']).encode()
pango.pango_font_description_set_variations(self.font, string)
pango.pango_layout_set_font_description(self.layout, self.font)
font_description = get_font_description(style, font_size)
self.layout = ffi.gc(
pango.pango_layout_new(pango_context),
gobject.g_object_unref)
pango.pango_layout_set_font_description(self.layout, font_description)
text_decoration = style['text_decoration_line']
if text_decoration != 'none':
metrics = ffi.gc(
pango.pango_context_get_metrics(
pango_context, self.font, self.language),
pango_context, font_description, self.language),
pango.pango_font_metrics_unref)
self.ascent = units_to_double(
pango.pango_font_metrics_get_ascent(metrics))
@ -236,7 +219,7 @@ class Layout:
pango.pango_layout_set_tabs(self.layout, array)
def deactivate(self):
del self.layout, self.font, self.language, self.style
del self.layout, self.language, self.style
def reactivate(self, style):
self.setup(self.context, style['font_size'], style)