WeasyPrint/weasyprint/document.py

"""
    weasyprint.document
    -------------------

"""

import collections
import functools
import hashlib
import io
import math
import shutil
import zlib
from os.path import basename
from urllib.parse import unquote, urlsplit

import pydyf
from weasyprint.layout import LayoutContext

from . import Attachment, CSS, __version__
from .css import get_all_computed_styles
from .css.counters import CounterStyle
from .css.targets import TargetCollector
from .draw import draw_page, stacked
from .fonts import FontConfiguration
from .formatting_structure import boxes
from .formatting_structure.build import build_formatting_structure
from .html import W3C_DATE_RE
from .images import get_image_from_uri as original_get_image_from_uri
from .layout import layout_document
from .layout.percentages import percentage
from .logger import LOGGER, PROGRESS_LOGGER
from .text import ffi, pango
from .urls import URLFetchingError


def _w3c_date_to_pdf(string, attr_name):
    """Transform a W3C date string into the PDF date format.

    :param string:
        The date to convert, or :obj:`None`.
    :param attr_name:
        Name of the attribute the date comes from. Only used in the warning
        logged when the date is invalid.
    :returns:
        The date components concatenated from year down to the finest
        component given in ``string``, followed by the time zone when a
        time is given. No ``D:`` prefix is added here. :obj:`None` is
        returned when ``string`` is :obj:`None` or does not match
        ``W3C_DATE_RE``.

    """
    if string is None:
        return None
    match = W3C_DATE_RE.match(string)
    if match is None:
        LOGGER.warning('Invalid %s date: %r', attr_name, string)
        return None
    groups = match.groupdict()
    pdf_date = ''
    found = False
    # Walk from the finest to the coarsest component and prepend each one,
    # so that the result reads year first.
    for key in ('second', 'minute', 'hour', 'day', 'month', 'year'):
        if groups[key]:
            found = True
            pdf_date = groups[key] + pdf_date
        elif found:
            # A coarser component is missing while a finer one was given:
            # pad it. ``'%02i' % True`` is '01' (day and month start at 1),
            # ``'%02i' % False`` is '00'.
            pdf_date = '%02i' % (key in ('day', 'month')) + pdf_date
    if groups['hour']:
        assert groups['minute']
        tz_hour = groups['tz_hour']
        if tz_hour:
            assert tz_hour.startswith(('+', '-'))
            assert groups['tz_minute']
            # Copy the sign character rather than formatting a signed
            # integer: int('-00') is 0, which would turn an offset such as
            # -00:30 into +00'30.
            pdf_date += "%s%02i'%02i" % (
                tz_hour[0], int(tz_hour[1:]), int(groups['tz_minute']))
        else:
            pdf_date += 'Z'
    return pdf_date


class Font:
    """Font used in a PDF stream, with the set of glyph ids drawn with it."""

    def __init__(self, file_content, pango_font, glyph_item):
        metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
        description = ffi.gc(
            pango.pango_font_description_copy(
                pango.pango_font_describe(pango_font)),
            pango.pango_font_description_free)
        pango.pango_font_description_set_absolute_size(
            description, pango.pango_units_from_double(1))
        # Family name as bytes, as returned by ffi.string().
        family = ffi.string(
            pango.pango_font_description_get_family(description))

        self.hash = hash(file_content)
        self.file_content = file_content
        self.pango_font = pango_font
        self.glyph_item = glyph_item
        # When the font will be a font subset, the font name will have to be
        # like '/XXXXXX+font_family'
        self.name = b'/' + family.replace(b' ', b'')
        self.family = family
        self.flags = 4
        # NOTE(review): compute_glyphs_values() stores the bounding box in
        # ``self.bbox``, not in ``self.font_bbox`` -- confirm which one the
        # callers read.
        self.font_bbox = None
        self.italic_angle = 0
        self.ascent = pango.pango_font_metrics_get_ascent(metrics)
        self.descent = pango.pango_font_metrics_get_descent(metrics)
        self.cap_height = None
        self.stemv = 80
        self.stemh = 80
        self.glyphs = self._glyph_ids(glyph_item)
        # Filled in by compute_glyphs_values().
        self.first_char = None
        self.last_char = None
        self.widths = None

    @staticmethod
    def _glyph_ids(glyph_item):
        """Return the set of glyph ids found in ``glyph_item``."""
        glyph_string = glyph_item.glyphs
        ids = set()
        for index in range(glyph_string.num_glyphs):
            ids.add(glyph_string.glyphs[index].glyph)
        return ids

    def add_glyphs(self, glyph_item):
        """Add the glyph ids of ``glyph_item`` to the font's glyph set."""
        self.glyphs |= self._glyph_ids(glyph_item)

    def compute_glyphs_values(self):
        """Compute the bounding box and the widths of the known glyphs.

        Sets ``bbox``, ``cap_height``, ``first_char``, ``last_char`` and
        ``widths`` from the extents Pango reports for each glyph id in
        ``self.glyphs``.

        """
        lowest = min(self.glyphs)
        highest = max(self.glyphs)
        bbox = [0, 0, 0, 0]
        # One slot per glyph id between the lowest and the highest one; ids
        # that are not used keep a width of 0.
        widths = [0] * (highest - lowest + 1)
        ink = ffi.new('PangoRectangle *')
        logical = ffi.new('PangoRectangle *')

        for glyph in self.glyphs:
            pango.pango_font_get_glyph_extents(
                self.pango_font, glyph, ink, logical)

            # The y values of the ink rectangle are negated to get the box.
            left = ink.x
            bottom = -ink.y - ink.height
            right = ink.x + ink.width
            top = -ink.y
            bbox[0] = min(bbox[0], left)
            bbox[1] = min(bbox[1], bottom)
            bbox[2] = max(bbox[2], right)
            bbox[3] = max(bbox[3], top)

            width = pango.pango_units_to_double(logical.width) * 1000
            widths[glyph - lowest] = width

        ffi.release(ink)
        ffi.release(logical)
        self.bbox = bbox
        self.cap_height = bbox[1]
        self.first_char = lowest
        self.last_char = highest
        self.widths = widths


class Context(pydyf.Stream):
    """PDF stream object with context storing alpha states and fonts."""
    def __init__(self, alpha_states, *args, **kwargs):
        """Create the stream.

        :param alpha_states:
            Mapping where the graphics states created by :meth:`set_alpha`
            are stored. It is kept by reference, not copied.

        Other arguments are passed to ``pydyf.Stream``.

        """
        super().__init__(*args, **kwargs)
        self._alpha_states = alpha_states
        # Font objects indexed by the hash of the value given to add_font().
        self._fonts = {}

    def set_alpha(self, alpha, stroke=False):
        """Set the alpha value used for next fill or stroke operations.

        :param alpha:
            The alpha value stored in the graphics state.
        :param stroke:
            If true, set the stroking alpha (``CA``), otherwise set the
            non-stroking alpha (``ca``).

        """
        operator, prefix = ('CA', 'A') if stroke else ('ca', 'a')
        # The state name includes the kind of alpha. Indexing states by
        # value only would make a stroke alpha reuse a state created for
        # the same fill alpha (or the opposite), and that state only holds
        # the other operator.
        key = f'{prefix}{alpha}'
        if key not in self._alpha_states:
            self._alpha_states[key] = pydyf.Dictionary({operator: alpha})
        self.set_state(key)

    def add_font(self, font, pango_font, glyph_item):
        """Register the glyphs of ``glyph_item`` for ``font``.

        A :class:`Font` is created the first time a value with this hash is
        given, later calls only add the glyphs to the existing one.

        :returns:
            The :class:`Font` object corresponding to ``font``.

        """
        font_hash = hash(font)
        if font_hash not in self._fonts:
            self._fonts[font_hash] = Font(font, pango_font, glyph_item)
        else:
            self._fonts[font_hash].add_glyphs(glyph_item)
        return self._fonts[font_hash]


# Node of a bookmark tree, with a label, a destination, children and a state.
# NOTE(review): presumably the same shape as the tuples unpacked by
# create_bookmarks() -- confirm against the code building the tree.
BookmarkSubtree = collections.namedtuple(
    'BookmarkSubtree', ('label', 'destination', 'children', 'state'))


def _write_pdf_attachment(pdf, attachment, url_fetcher):
    """Write an attachment to the PDF stream.

    :param pdf:
        The PDF object the embedded file and its file specification are
        added to.
    :param attachment:
        An :class:`Attachment`, a ``(url, description)`` tuple, or any other
        value that is passed as ``guess`` to :class:`Attachment`.
    :param url_fetcher:
        The URL fetcher given to the :class:`Attachment` objects created
        here.
    :return:
        the attachment PDF dictionary, or :obj:`None` if the attachment
        could not be fetched (the error is logged).

    """
    # Attachments from document links like <link> or <a> can only be URLs.
    # They're passed in as tuples
    url = ''
    if isinstance(attachment, tuple):
        url, description = attachment
        attachment = Attachment(
            url=url, url_fetcher=url_fetcher, description=description)
    elif not isinstance(attachment, Attachment):
        attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

    try:
        with attachment.source as (source_type, source, url, _):
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            uncompressed_length = 0
            # Compressed chunks are collected in a list and joined once:
            # growing a bytes object with += copies it for each chunk.
            chunks = []
            md5 = hashlib.md5()
            compress = zlib.compressobj()
            for data in iter(lambda: source.read(4096), b''):
                uncompressed_length += len(data)
                md5.update(data)
                chunks.append(compress.compress(data))
            chunks.append(compress.flush(zlib.Z_FINISH))
            file_extra = pydyf.Dictionary({
                'Type': '/EmbeddedFile',
                'Filter': '/FlateDecode',
                'Params': pydyf.Dictionary({
                    'CheckSum': f'<{md5.hexdigest()}>',
                    'Size': uncompressed_length,
                })
            })
            file_stream = pydyf.Stream([b''.join(chunks)], file_extra)
            pdf.add_object(file_stream)

    except URLFetchingError as exception:
        LOGGER.error('Failed to load attachment: %s', exception)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename.
    # NOTE(review): the URL given by the source is presumably empty or None
    # for sources without a location (a file object, for example). The
    # default filename is used in that case.
    filename = basename(unquote(urlsplit(url or '').path)) or 'attachment.bin'

    file_specification = pydyf.Dictionary({
        'Type': '/Filespec',
        'F': pydyf.String(),
        'UF': pydyf.String(filename),
        'EF': pydyf.Dictionary({'F': file_stream.reference}),
        'Desc': pydyf.String(attachment.description or ''),
    })
    pdf.add_object(file_specification)
    return file_specification


def create_bookmarks(bookmarks, pdf, parent=None):
    """Add the outline objects of a list of sibling bookmarks to the PDF.

    :param bookmarks:
        Sequence of ``(title, (page, x, y), children, state)`` tuples, where
        ``children`` has the same structure.
    :param pdf:
        The PDF object the outline dictionaries are added to.
    :param parent:
        The outline dictionary of the parent bookmark, or :obj:`None`.
    :returns:
        An ``(outlines, count)`` tuple. ``outlines`` is the list of outline
        dictionaries created for ``bookmarks``. ``count`` is the number of
        these bookmarks, plus the counts of the descendants of the
        bookmarks that are not closed.

    """
    total = len(bookmarks)
    siblings = []
    for title, (page, x, y), children, state in bookmarks:
        # NOTE(review): ``page * 3`` presumably skips the three entries
        # stored in 'Kids' for each page reference -- confirm with pydyf.
        page_number = pdf.pages['Kids'][page * 3]
        page_reference = pdf.objects[page_number].reference
        outline = pydyf.Dictionary({
            'Title': pydyf.String(title),
            'Dest': pydyf.Array((page_reference, '/XYZ', x, y, 0)),
        })
        pdf.add_object(outline)

        sub_outlines, sub_count = create_bookmarks(
            children, pdf, parent=outline)
        if state == 'closed':
            # Closed bookmarks store a negative count, and their
            # descendants are not added to the total.
            outline['Count'] = -sub_count
        else:
            outline['Count'] = sub_count
            total += sub_count

        if siblings:
            previous = siblings[-1]
            outline['Prev'] = previous.reference
            previous['Next'] = outline.reference
        if sub_outlines:
            outline['First'] = sub_outlines[0].reference
            outline['Last'] = sub_outlines[-1].reference
        if parent is not None:
            outline['Parent'] = parent.reference
        siblings.append(outline)
    return siblings, total


def add_hyperlinks(links, anchors, matrix, pdf, page, names):
    """Include hyperlinks and anchors in current PDF page.

    :param links:
        Iterable of ``(link_type, target, rectangle)`` tuples. Only
        ``'internal'`` and ``'external'`` links get an annotation here. The
        first two and the last two values of ``rectangle`` are transformed
        as two points.
    :param anchors:
        Iterable of ``(anchor_name, x, y)`` tuples.
    :param matrix:
        Object whose ``transform_point`` method is applied to the link and
        anchor coordinates.
    :param pdf:
        The PDF object the annotations are added to.
    :param page:
        The PDF page dictionary. Its ``'Annots'`` entry is replaced by a new
        array.
    :param names:
        List where the name and the destination of each anchor are appended.

    """
    page['Annots'] = pydyf.Array()
    for link_type, link_target, rectangle in links:
        x1, y1 = matrix.transform_point(*rectangle[:2])
        x2, y2 = matrix.transform_point(*rectangle[2:])
        if link_type not in ('internal', 'external'):
            # Other link types get no annotation here.
            continue
        annot = pydyf.Dictionary({
            'Type': '/Annot',
            'Subtype': '/Link',
            'Rect': pydyf.Array([x1, y1, x2, y2]),
            'BS': pydyf.Dictionary({'W': 0}),
        })
        if link_type == 'external':
            annot['A'] = pydyf.Dictionary({
                'Type': '/Action',
                'S': '/URI',
                'URI': pydyf.String(link_target),
            })
        else:
            # Internal links are named destinations.
            annot['Dest'] = pydyf.String(link_target)
        pdf.add_object(annot)
        page['Annots'].append(annot.reference)

    for anchor_name, x, y in anchors:
        x, y = matrix.transform_point(x, y)
        names.extend((
            pydyf.String(anchor_name),
            pydyf.Array([page.reference, '/XYZ', x, y, 0])))


def rectangle_aabb(matrix, pos_x, pos_y, width, height):
    """Return the bounding box of a transformed axis-aligned rectangle.

    The four corners of the rectangle are transformed with
    ``matrix.transform_point``. The axis-aligned box containing them is
    returned as ``(x, y, width, height)``.

    """
    far_x = pos_x + width
    far_y = pos_y + height
    corners = [
        matrix.transform_point(x, y)
        for x, y in (
            (pos_x, pos_y), (far_x, pos_y), (pos_x, far_y), (far_x, far_y))]
    xs = [x for x, _ in corners]
    ys = [y for _, y in corners]
    min_x, min_y = min(xs), min(ys)
    return min_x, min_y, max(xs) - min_x, max(ys) - min_y


def resolve_links(pages):
    """Resolve internal hyperlinks.

    Links to a missing anchor are removed and logged as errors.

    If multiple anchors have the same name, the first one is used.

    :param pages:
        The pages, as objects with ``anchors`` and ``links`` attributes like
        those of :class:`Page`.
    :returns:
        A generator yielding one ``(page_links, page_anchors)`` tuple per
        page. ``page_links`` is a list like :attr:`Page.links`, without the
        internal links whose anchor is missing. ``page_anchors`` is a list
        of ``(anchor_name, x, y)`` tuples for the anchors whose first
        definition is in this page.

    """
    known_names = set()
    anchors_by_page = []
    for page in pages:
        first_definitions = []
        for name, (x, y) in page.anchors.items():
            if name in known_names:
                continue
            known_names.add(name)
            first_definitions.append((name, x, y))
        anchors_by_page.append(first_definitions)

    for page, page_anchors in zip(pages, anchors_by_page):
        kept_links = []
        for link in page.links:
            link_type, anchor_name, rectangle = link
            if link_type != 'internal':
                # External link
                kept_links.append(link)
            elif anchor_name in known_names:
                kept_links.append((link_type, anchor_name, rectangle))
            else:
                LOGGER.error(
                    'No anchor #%s for internal URI reference',
                    anchor_name)
        yield kept_links, page_anchors


class Matrix(list):
    """Matrix stored as a list of rows.

    Without ``matrix``, the rows are ``[a, b, 0]``, ``[c, d, 0]`` and
    ``[e, f, 1]``. With ``matrix``, the given rows are used as they are.

    """
    def __init__(self, a=1, b=0, c=0, d=1, e=0, f=0, matrix=None):
        rows = [[a, b, 0], [c, d, 0], [e, f, 1]] if matrix is None else matrix
        super().__init__(rows)

    def __matmul__(self, other):
        """Return the ``self @ other`` product, ``other`` being 3x3."""
        assert len(self[0]) == len(other) == len(other[0]) == 3
        product = []
        for row in self:
            product.append([
                sum(row[k] * other[k][column] for k in range(3))
                for column in range(3)])
        return Matrix(matrix=product)

    @property
    def determinant(self):
        """Determinant of a 3x3 matrix."""
        assert len(self) == len(self[0]) == 3
        (a, b, c), (d, e, f), (g, h, i) = self
        return (
            a * (e * i - f * h) -
            d * (b * i - c * h) +
            g * (b * f - c * e))

    def transform_point(self, x, y):
        """Return the ``[x, y]`` list of the transformed point."""
        row_vector = Matrix(matrix=[[x, y, 1]])
        transformed = row_vector @ self
        return transformed[0][:2]


class Page:
    """Represents a single rendered page.

    .. versionadded:: 0.15

    Should be obtained from :attr:`Document.pages` but not
    instantiated directly.

    """
    def __init__(self, page_box):
        #: The page width, including margins, in CSS pixels.
        self.width = page_box.margin_width()

        #: The page height, including margins, in CSS pixels.
        self.height = page_box.margin_height()

        #: The page bleed widths as a :obj:`dict` with ``'top'``, ``'right'``,
        #: ``'bottom'`` and ``'left'`` as keys, and values in CSS pixels.
        self.bleed = {
            side: page_box.style['bleed_%s' % side].value
            for side in ('top', 'right', 'bottom', 'left')}

        #: The :obj:`list` of ``(bookmark_level, bookmark_label, target,
        #: state)`` :obj:`tuples <tuple>`. ``bookmark_level``,
        #: ``bookmark_label`` and ``state`` are based on the CSS properties
        #: ``bookmark-level``, ``bookmark-label`` and ``bookmark-state``.
        #: ``target`` is an ``(x, y)`` point in CSS pixels from the top-left
        #: of the page.
        self.bookmarks = []

        #: The :obj:`list` of ``(link_type, target, rectangle)`` :obj:`tuples
        #: <tuple>`. A ``rectangle`` is ``(x1, y1, x2, y2)``, two opposite
        #: corners of the bounding box of the link area, in CSS pixels from
        #: the top-left of the page. ``link_type`` is one of three strings:
        #:
        #: * ``'external'``: ``target`` is an absolute URL
        #: * ``'internal'``: ``target`` is an anchor name (see
        #:   :attr:`Page.anchors`).
        #:   The anchor might be defined in another page,
        #:   in multiple pages (in which case the first occurrence is used),
        #:   or not at all.
        #: * ``'attachment'``: ``target`` is an absolute URL and points
        #:   to a resource to attach to the document.
        self.links = []

        #: The :obj:`dict` mapping each anchor name to its target, an
        #: ``(x, y)`` point in CSS pixels from the top-left of the page.
        self.anchors = {}

        self._gather_links_and_bookmarks(page_box)
        self._page_box = page_box

    def _gather_links_and_bookmarks(self, box, matrix=None):
        """Collect the links, bookmarks and anchors of a box and its children.

        The results are stored in :attr:`links`, :attr:`bookmarks` and
        :attr:`anchors`. The transformation matrix of transformed boxes is
        also stored in their ``transformation_matrix`` attribute.

        :param box:
            The box to inspect. Its children are inspected recursively.
        :param matrix:
            The transformation :class:`Matrix` applied to ``box`` by its
            ancestors, or :obj:`None` when there is none.

        """
        parent_matrix = matrix

        # Get box transformation matrix.
        # "Transforms apply to block-level and atomic inline-level elements,
        #  but do not apply to elements which may be split into
        #  multiple inline-level boxes."
        # http://www.w3.org/TR/css3-2d-transforms/#introduction
        if box.style['transform'] and not isinstance(box, boxes.InlineBox):
            border_width = box.border_width()
            border_height = box.border_height()
            origin_x, origin_y = box.style['transform_origin']
            offset_x = percentage(origin_x, border_width)
            offset_y = percentage(origin_y, border_height)
            origin_x = box.border_box_x() + offset_x
            origin_y = box.border_box_y() + offset_y

            matrix = Matrix(e=origin_x, f=origin_y)
            for name, args in box.style['transform']:
                a, b, c, d, e, f = 1, 0, 0, 1, 0, 0
                if name == 'scale':
                    a, d = args
                elif name == 'rotate':
                    a = d = math.cos(args)
                    b = math.sin(args)
                    c = -b
                elif name == 'translate':
                    e = percentage(args[0], border_width)
                    f = percentage(args[1], border_height)
                elif name == 'skew':
                    b, c = math.tan(args[1]), math.tan(args[0])
                else:
                    assert name == 'matrix'
                    a, b, c, d, e, f = args
                matrix = Matrix(a, b, c, d, e, f) @ matrix
            box.transformation_matrix = (
                Matrix(e=-origin_x, f=-origin_y) @ matrix)
            # The transformation of the box is applied first, then the
            # transformations inherited from its ancestors.
            if parent_matrix:
                matrix = box.transformation_matrix @ parent_matrix
            else:
                matrix = box.transformation_matrix
        else:
            matrix = parent_matrix

        bookmark_label = box.bookmark_label
        if box.style['bookmark_level'] == 'none':
            bookmark_level = None
        else:
            bookmark_level = box.style['bookmark_level']
        state = box.style['bookmark_state']
        link = box.style['link']
        anchor_name = box.style['anchor']
        has_bookmark = bookmark_label and bookmark_level
        # 'link' is inherited but redundant on text boxes
        has_link = link and not isinstance(box, boxes.TextBox)
        # In case of duplicate IDs, only the first is an anchor.
        has_anchor = anchor_name and anchor_name not in self.anchors
        is_attachment = hasattr(box, 'is_attachment') and box.is_attachment

        if has_bookmark or has_link or has_anchor:
            pos_x, pos_y, width, height = box.hit_area()
            if has_link:
                token_type, link = link
                assert token_type == 'url'
                link_type, target = link
                assert isinstance(target, str)
                if link_type == 'external' and is_attachment:
                    link_type = 'attachment'
                if matrix:
                    # rectangle_aabb() takes and returns (x, y, width,
                    # height), but link rectangles are stored as two
                    # corners, as in the branch below.
                    box_x, box_y, box_width, box_height = rectangle_aabb(
                        matrix, pos_x, pos_y, width, height)
                    rectangle = (
                        box_x, box_y, box_x + box_width, box_y + box_height)
                else:
                    rectangle = (
                        pos_x, pos_y, pos_x + width, pos_y + height)
                self.links.append((link_type, target, rectangle))
            if matrix and (has_bookmark or has_anchor):
                pos_x, pos_y = matrix.transform_point(pos_x, pos_y)
            if has_bookmark:
                self.bookmarks.append(
                    (bookmark_level, bookmark_label, (pos_x, pos_y), state))
            if has_anchor:
                self.anchors[anchor_name] = pos_x, pos_y

        # Children are drawn in the coordinate system of their transformed
        # ancestors, so they need the current matrix.
        for child in box.all_children():
            self._gather_links_and_bookmarks(child, matrix)

    def paint(self, context, left_x=0, top_y=0, scale=1, clip=False):
        """Paint the page into the PDF file.

        :type context: :class:`pdf.Context`
        :param context:
            A context object.
        :type left_x: float
        :param left_x:
            X coordinate of the left of the page, in PDF points.
        :type top_y: float
        :param top_y:
            Y coordinate of the top of the page, in PDF points.
        :type scale: float
        :param scale:
            Zoom scale.
        :type clip: bool
        :param clip:
            Whether to clip/cut content outside the page. If false or
            not provided, content can overflow.

        """
        with stacked(context):
            # Make (0, 0) the top-left corner, and make user units CSS pixels:
            context.transform(scale, 0, 0, scale, left_x, top_y)
            if clip:
                width = self.width
                height = self.height
                context.rectangle(0, 0, width, height)
                context.clip()
            draw_page(self._page_box, context)


class DocumentMetadata:
    """Meta-information belonging to a whole :class:`Document`.

    .. versionadded:: 0.20

    New attributes may be added in future versions of WeasyPrint.

    """
    def __init__(self, title=None, authors=None, description=None,
                 keywords=None, generator=None, created=None, modified=None,
                 attachments=None):
        """Store the metadata, using empty lists for missing list values."""
        #: The title of the document, as a string or :obj:`None`.
        #: Extracted from the ``<title>`` element in HTML
        #: and written to the ``/Title`` info field in PDF.
        self.title = title
        #: The authors of the document, as a list of strings.
        #: (Defaults to the empty list.)
        #: Extracted from the ``<meta name=author>`` elements in HTML
        #: and written to the ``/Author`` info field in PDF.
        self.authors = authors if authors else []
        #: The description of the document, as a string or :obj:`None`.
        #: Extracted from the ``<meta name=description>`` element in HTML
        #: and written to the ``/Subject`` info field in PDF.
        self.description = description
        #: Keywords associated with the document, as a list of strings.
        #: (Defaults to the empty list.)
        #: Extracted from ``<meta name=keywords>`` elements in HTML
        #: and written to the ``/Keywords`` info field in PDF.
        self.keywords = keywords if keywords else []
        #: The name of one of the software packages
        #: used to generate the document, as a string or :obj:`None`.
        #: Extracted from the ``<meta name=generator>`` element in HTML
        #: and written to the ``/Creator`` info field in PDF.
        self.generator = generator
        #: The creation date of the document, as a string or :obj:`None`.
        #: Dates are in one of the six formats specified in
        #: `W3C’s profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Extracted from the ``<meta name=dcterms.created>`` element in HTML
        #: and written to the ``/CreationDate`` info field in PDF.
        self.created = created
        #: The modification date of the document, as a string or :obj:`None`.
        #: Dates are in one of the six formats specified in
        #: `W3C’s profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Extracted from the ``<meta name=dcterms.modified>`` element in HTML
        #: and written to the ``/ModDate`` info field in PDF.
        self.modified = modified
        #: File attachments, as a list of tuples of URL and a description or
        #: :obj:`None`. (Defaults to the empty list.)
        #: Extracted from the ``<link rel=attachment>`` elements in HTML
        #: and written to the ``/EmbeddedFiles`` dictionary in PDF.
        #:
        #: .. versionadded:: 0.22
        self.attachments = attachments if attachments else []


class Document:
    """A rendered document ready to be painted on a cairo surface.

    Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`, but
    can also be instantiated directly with a list of :class:`pages <Page>`, a
    set of :class:`metadata <DocumentMetadata>`, a :func:`url_fetcher
    <weasyprint.default_url_fetcher>` function, and a :class:`font_config
    <weasyprint.fonts.FontConfiguration>`.

    """

    @classmethod
    def _build_layout_context(cls, html, stylesheets,
                              presentational_hints=False, font_config=None,
                              counter_style=None):
        """Return the :class:`LayoutContext` used to lay out ``html``.

        ``stylesheets`` items without a ``matcher`` attribute are given to
        :class:`CSS` as ``guess``. A new font configuration and a new
        counter style are created when they are not given.

        """
        font_config = (
            FontConfiguration() if font_config is None else font_config)
        counter_style = (
            CounterStyle() if counter_style is None else counter_style)
        target_collector = TargetCollector()
        page_rules = []
        user_stylesheets = [
            css if hasattr(css, 'matcher') else CSS(
                guess=css, media_type=html.media_type,
                font_config=font_config, counter_style=counter_style)
            for css in stylesheets or []]
        style_for = get_all_computed_styles(
            html, user_stylesheets, presentational_hints, font_config,
            counter_style, page_rules, target_collector)
        # The empty dict is the first positional argument bound to the
        # image getter.
        get_image_from_uri = functools.partial(
            original_get_image_from_uri, {}, html.url_fetcher)
        PROGRESS_LOGGER.info('Step 4 - Creating formatting structure')
        return LayoutContext(
            style_for, get_image_from_uri, font_config, counter_style,
            target_collector)

    @classmethod
    def _render(cls, html, stylesheets, presentational_hints=False,
                font_config=None, counter_style=None):
        """Lay out ``html`` and return the rendered document.

        The result is an instance of ``cls`` with one :class:`Page` per page
        box and the metadata given by ``html``.

        """
        font_config = (
            FontConfiguration() if font_config is None else font_config)
        counter_style = (
            CounterStyle() if counter_style is None else counter_style)

        layout_context = cls._build_layout_context(
            html, stylesheets, presentational_hints, font_config,
            counter_style)

        root_box = build_formatting_structure(
            html.etree_element, layout_context.style_for,
            layout_context.get_image_from_uri, html.base_url,
            layout_context.target_collector, counter_style)

        pages = [
            Page(page_box)
            for page_box in layout_document(html, root_box, layout_context)]
        metadata = DocumentMetadata(**html._get_metadata())
        return cls(pages, metadata, html.url_fetcher, font_config)

    def __init__(self, pages, metadata, url_fetcher, font_config):
        """Store the pages, the metadata and the objects used to render."""
        #: A list of :class:`Page` objects.
        self.pages = pages
        #: A :class:`DocumentMetadata` object.
        #: Contains information that does not belong to a specific page
        #: but to the whole document.
        self.metadata = metadata
        #: A function or other callable with the same signature as
        #: :func:`default_url_fetcher` called to fetch external resources such
        #: as stylesheets and images.  (See :ref:`url-fetchers`.)
        self.url_fetcher = url_fetcher
        # The font configuration has to stay alive as long as the rendering
        # does: font_config.__del__ removes fonts that may still be needed
        # when rendering, so a reference is kept here.
        self._font_config = font_config

    def copy(self, pages='all'):
        """Take a subset of the pages.

        .. versionadded:: 0.15

        :type pages: :term:`iterable`
        :param pages:
            An iterable of :class:`Page` objects from :attr:`pages`, or
            ``'all'`` (the default) to keep all the pages.
        :return:
            A new :class:`Document` object, sharing the metadata, the URL
            fetcher and the font configuration of this document.

        Examples:

        Write two PDF files for odd-numbered and even-numbered pages::

            # Python lists count from 0 but pages are numbered from 1.
            # [::2] is a slice of even list indexes but odd-numbered pages.
            document.copy(document.pages[::2]).write_pdf('odd_pages.pdf')
            document.copy(document.pages[1::2]).write_pdf('even_pages.pdf')

        Write each page to a numbered PDF file (``pages`` has to be an
        iterable, even for a single page)::

            for i, page in enumerate(document.pages):
                document.copy([page]).write_pdf('page_%s.pdf' % i)

        Combine multiple documents into one PDF file,
        using metadata from the first::

            all_pages = [p for doc in documents for p in doc.pages]
            documents[0].copy(all_pages).write_pdf('combined.pdf')

        """
        if pages == 'all':
            selected = self.pages
        elif isinstance(pages, list):
            selected = pages
        else:
            selected = list(pages)
        document_class = type(self)
        return document_class(
            selected, self.metadata, self.url_fetcher, self._font_config)

    def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None):
        """Paint the pages in a PDF file, with meta-data.

        PDF files written directly by cairo do not have meta-data such as
        bookmarks/outlines and hyperlinks.

        :type target: str, pathlib.Path or file object
        :param target:
            A filename where the PDF file is generated, a file object, or
            :obj:`None`.
        :type zoom: float
        :param zoom:
            The zoom factor in PDF units per CSS units.  **Warning**:
            All CSS units are affected, including physical units like
            ``cm`` and named sizes like ``A4``.  For values other than
            1, the physical CSS units will thus be "wrong".
        :type attachments: list
        :param attachments: A list of additional file attachments for the
            generated PDF document or :obj:`None`. The list's elements are
            :class:`Attachment` objects, filenames, URLs or file-like objects.
        :param finisher: A finisher function, that accepts the document and a
            ``pydyf.PDF`` object as parameters, can be passed to perform
            post-processing on the PDF right before the trailer is written.
        :returns:
            The PDF as :obj:`bytes` if ``target`` is not provided or
            :obj:`None`, otherwise :obj:`None` (the PDF is written to
            ``target``).

        """
        # 0.75 = 72 PDF point per inch / 96 CSS pixel per inch
        scale = zoom * 0.75

        PROGRESS_LOGGER.info('Step 6 - Creating PDF')

        pdf = pydyf.PDF()
        alpha_states = pydyf.Dictionary()
        pdf.add_object(alpha_states)
        resources = pydyf.Dictionary({'ExtGState': alpha_states.reference})
        pdf.add_object(resources)
        pdf_names = pydyf.Array()
        pdf.catalog['Names'] = pydyf.Dictionary(
            {'Dests': pydyf.Dictionary({'Names': pdf_names})})

        # Links and anchors
        paged_links_and_anchors = list(resolve_links(self.pages))
        attachment_links = [
            [link for link in page_links if link[0] == 'attachment']
            for page_links, page_anchors in paged_links_and_anchors]

        # Annotations
        annot_files = {}
        # A single link can be split in multiple regions. We don't want to
        # embed a file multiple times of course, so keep a reference to every
        # embedded URL and reuse the object number.
        for page_links in attachment_links:
            for link_type, annot_target, rectangle in page_links:
                if link_type == 'attachment' and target not in annot_files:
                    # TODO: Use the title attribute as description. The comment
                    # above about multiple regions won't always be correct,
                    # because two links might have the same href, but different
                    # titles.
                    annot_files[annot_target] = _write_pdf_attachment(
                        pdf, (annot_target, None), self.url_fetcher)

        # Bookmarks
        root = []
        # At one point in the document, for each "output" depth, how much
        # to add to get the source level (CSS values of bookmark-level).
        # E.g. with <h1> then <h3>, level_shifts == [0, 1]
        # 1 means that <h3> has depth 3 - 1 = 2 in the output.
        skipped_levels = []
        last_by_depth = [root]
        previous_level = 0

        for page_number, (page, links_and_anchors, page_links) in enumerate(
                zip(self.pages, paged_links_and_anchors, attachment_links)):
            # Draw from the top-left corner
            matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)

            # Links and anchors
            links, anchors = links_and_anchors

            page_width = scale * (
                page.width + page.bleed['left'] + page.bleed['right'])
            page_height = scale * (
                page.height + page.bleed['top'] + page.bleed['bottom'])
            left = -scale * page.bleed['left']
            top = -scale * page.bleed['top']
            right = left + page_width
            bottom = top + page_height

            stream = Context(alpha_states)
            stream.transform(1, 0, 0, -1, 0, page.height * scale)
            page.paint(stream, scale=scale)
            pdf.add_object(stream)

            pdf_page = pydyf.Dictionary({
                'Type': '/Page',
                'Parent': pdf.pages.reference,
                'MediaBox': pydyf.Array([left, top, right, bottom]),
                'Contents': stream.reference,
                'Resources': resources.reference,
                'Annots': pydyf.Array(),
            })
            pdf.add_page(pdf_page)

            add_hyperlinks(links, anchors, matrix, pdf, pdf_page, pdf_names)

            # Bleed
            bleed = {key: value * 0.75 for key, value in page.bleed.items()}

            trim_left = left + bleed['left']
            trim_top = top + bleed['top']
            trim_right = right - bleed['right']
            trim_bottom = bottom - bleed['bottom']

            # Arbitrarily set PDF BleedBox between CSS bleed box (MediaBox) and
            # CSS page box (TrimBox) at most 10 points from the TrimBox.
            bleed_left = trim_left - min(10, bleed['left'])
            bleed_top = trim_top - min(10, bleed['top'])
            bleed_right = trim_right + min(10, bleed['right'])
            bleed_bottom = trim_bottom + min(10, bleed['bottom'])

            pdf_page['TrimBox'] = pydyf.Array([
                trim_left, trim_top, trim_right, trim_bottom])
            pdf_page['BleedBox'] = pydyf.Array([
                bleed_left, bleed_top, bleed_right, bleed_bottom])

            # Annotations
            # TODO: splitting a link into multiple independent rectangular
            # annotations works well for pure links, but rather mediocre for
            # other annotations and fails completely for transformed (CSS) or
            # complex link shapes (area). It would be better to use /AP for all
            # links and coalesce link shapes that originate from the same HTML
            # link. This would give a feeling similar to what browsers do with
            # links that span multiple lines.
            for link_type, annot_target, rectangle in page_links:
                annot_file = annot_files[annot_target]
                if link_type == 'attachment' and annot_file is not None:
                    rectangle = (
                        *matrix.transform_point(*rectangle[:2]),
                        *matrix.transform_point(*rectangle[2:]))
                    annot = pydyf.Dictionary({
                        'Type': '/Annot',
                        'Rect': pydyf.Array(rectangle),
                        'Subtype': '/FileAttachment',
                        'T': pydyf.String(),
                        'FS': annot_file.reference,
                        'AP': pydyf.Dictionary({'N': pydyf.Stream([], {
                            'Type': '/XObject',
                            'Subtype': '/Form',
                            'BBox': pydyf.Array(rectangle),
                            'Length': 0,
                        })})
                    })
                    pdf.add_object(annot)
                    pdf_page['Annots'].append(annot.reference)

            # Bookmarks
            for level, label, (point_x, point_y), state in page.bookmarks:
                if level > previous_level:
                    # Example: if the previous bookmark is a <h2>, the next
                    # depth "should" be for <h3>. If now we get a <h6> we’re
                    # skipping two levels: append 6 - 3 - 1 = 2
                    skipped_levels.append(level - previous_level - 1)
                else:
                    temp = level
                    while temp < previous_level:
                        temp += 1 + skipped_levels.pop()
                    if temp > previous_level:
                        # We remove too many "skips", add some back:
                        skipped_levels.append(temp - previous_level - 1)

                previous_level = level
                depth = level - sum(skipped_levels)
                assert depth == len(skipped_levels)
                assert depth >= 1

                children = []
                point_x, point_y = matrix.transform_point(point_x, point_y)
                subtree = BookmarkSubtree(
                    label, (page_number, point_x, point_y), children, state)
                last_by_depth[depth - 1].append(subtree)
                del last_by_depth[depth:]
                last_by_depth.append(children)

        outlines, count = create_bookmarks(root, pdf)
        if outlines:
            pdf.catalog['Outlines'] = pydyf.Dictionary({
                'Count': count,
                'First': outlines[0].reference,
                'Last': outlines[-1].reference,
            })

        PROGRESS_LOGGER.info('Step 7 - Adding PDF metadata')

        # PDF information
        if self.metadata.title:
            pdf.info['Title'] = pydyf.String(self.metadata.title)
        if self.metadata.authors:
            pdf.info['Author'] = pydyf.String(
                ', '.join(self.metadata.authors))
        if self.metadata.description:
            pdf.info['Subject'] = pydyf.String(self.metadata.description)
        if self.metadata.keywords:
            pdf.info['Keywords'] = pydyf.String(
                ', '.join(self.metadata.keywords))
        if self.metadata.generator:
            pdf.info['Creator'] = pydyf.String(self.metadata.generator)
        pdf.info['Producer'] = pydyf.String(f'WeasyPrint {__version__}')
        if self.metadata.created:
            pdf.info['CreationDate'] = pydyf.String(
                _w3c_date_to_pdf(self.metadata.created, 'created'))
        if self.metadata.modified:
            pdf.info['ModDate'] = pydyf.String(
                _w3c_date_to_pdf(self.metadata.modified, 'modified'))

        # Embedded files
        attachments = self.metadata.attachments + (attachments or [])
        pdf_attachments = []
        for attachment in attachments:
            pdf_attachment = _write_pdf_attachment(
                pdf, attachment, self.url_fetcher)
            if pdf_attachment is not None:
                pdf_attachments.append(pdf_attachment)
        if pdf_attachments:
            content = pydyf.Dictionary({'Names': pydyf.Array()})
            for i, pdf_attachment in enumerate(pdf_attachments):
                content['Names'].append(pydyf.String(f'attachment{i}'))
                content['Names'].append(pdf_attachment.reference)
            pdf.add_object(content)
            pdf.catalog['Names']['EmbeddedFiles'] = content.reference

        # Embedded fonts
        resources['Font'] = pydyf.Dictionary()
        for font_hash, font in stream._fonts.items():
            compressed = zlib.compressobj().compress(font.file_content)
            font_extra = pydyf.Dictionary({
                'Filter': '/FlateDecode',
                'Length1': len(font.file_content),
            })
            font_stream = pydyf.Stream([compressed], font_extra)
            pdf.add_object(font_stream)

            font.compute_glyphs_values()
            subfont_dictionary = pydyf.Dictionary({
                'Type': '/Font',
                'Subtype': '/CIDFontType2',
                'BaseFont': font.name,
                'CIDSystemInfo': pydyf.Dictionary({
                    'Registry': pydyf.String('Adobe'),
                    'Ordering': pydyf.String('Identity'),
                    'Supplement': 0,
                }),
                'W': pydyf.Array([font.first_char, pydyf.Array(font.widths)]),
                'FontDescriptor': pydyf.Dictionary({
                    'FontName': font.name,
                    'FontFamily': pydyf.String(font.family),
                    'Flags': 32,
                    'FontBBox': pydyf.Array(font.bbox),
                    'ItalicAngle': font.italic_angle,
                    'Ascent': font.ascent,
                    'Descent': font.descent,
                    'CapHeight': font.cap_height,
                    'StemV': font.stemv,
                    'StemH': font.stemh,
                    'FontFile': font_stream.reference,
                }),
            })
            pdf.add_object(subfont_dictionary)
            font_dictionary = pydyf.Dictionary({
                'Type': '/Font',
                'Subtype': '/Type0',
                'BaseFont': font.name,
                'Encoding': '/Identity-H',
                'DescendantFonts': pydyf.Array([subfont_dictionary.reference]),
            })
            pdf.add_object(font_dictionary)
            resources['Font'][str(font_hash)] = font_dictionary.reference

        if finisher:
            finisher(self, pdf)

        file_obj = io.BytesIO()
        pdf.write(file_obj)

        if target is None:
            return file_obj.getvalue()
        else:
            file_obj.seek(0)
            if hasattr(target, 'write'):
                shutil.copyfileobj(file_obj, target)
            else:
                with open(target, 'wb') as fd:
                    shutil.copyfileobj(file_obj, fd)

    def write_png(self, target=None, resolution=96):
        """Render all pages into one PNG image, stacked top to bottom.

        Pages get no decoration besides what CSS ``@page`` rules specify.
        The resulting image is as wide as the widest page; every page is
        placed under the previous one and centered horizontally.

        .. note::
            This method is not implemented yet: calling it always raises
            :exc:`NotImplementedError`. The parameters and return value
            described below are the intended contract.

        :param target:
            A filename, file-like object, or :obj:`None`.
        :type resolution: float
        :param resolution:
            The output resolution in PNG pixels per CSS inch. At 96 dpi
            (the default), PNG pixels match the CSS ``px`` unit.
        :returns:
            A ``(png_bytes, png_width, png_height)`` tuple. ``png_bytes`` is
            a byte string when ``target`` is :obj:`None`, and :obj:`None`
            otherwise (the image is then written to ``target``).
            ``png_width`` and ``png_height`` give the size of the final
            image, in PNG pixels.
        :raises NotImplementedError:
            Always, until PNG output is written.

        """
        # TODO: write this
        raise NotImplementedError()
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
+								"""
 								    weasyprint.document
 								    -------------------
 								"""
-												Small code and style improvements around bookmark-state

Related to #870.

											
										
										
											2019-05-24 00:55:56 +03:00
+								import collections
-												Use isort in tests

											
										
										
											2017-03-25 02:33:36 +03:00
+								import functools
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								import hashlib
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
+								import io
 								import math
 								import shutil
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								import zlib
 								from os.path import basename
 								from urllib.parse import unquote, urlsplit
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								import pydyf
-												Refactor LayoutContext building

 this will provide a simpler API for testing context-using functions inside tests

											
										
										
											2019-07-23 08:12:08 +03:00
+								from weasyprint.layout import LayoutContext
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								from . import Attachment, CSS, __version__
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
+								from .css import get_all_computed_styles
-												Add default counter class

											
										
										
											2019-12-24 17:56:24 +03:00
+								from .css.counters import CounterStyle
-												Don't use a global target collector

											
										
										
											2018-03-28 01:34:34 +03:00
+								from .css.targets import TargetCollector
-												Use isort in tests

											
										
										
											2017-03-25 02:33:36 +03:00
+								from .draw import draw_page, stacked
-												Add the font configuration in LayoutContext

The font configuration is available (almost) everywhere it's needed,
@font-face doesn't rely on a global state anymore.

											
										
										
											2016-10-27 18:36:24 +03:00
+								from .fonts import FontConfiguration
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
+								from .formatting_structure import boxes
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
+								from .formatting_structure.build import build_formatting_structure
-												Don't use pdfrw anymore

pdfrw is a great piece of software, but we don't know PDF enough to debug the
problems we've met. It's safer to use the new cairo API and get back to manual
edition for attachments and bleed boxes.

We only have two regressions for now:
- some internal links are broken,
- PDF producer is not overwritten.

A mail has been sent to cairo's mailing-list about that:
https://lists.cairographics.org/archives/cairo/2018-August/028694.html

Fix #639, #615, fix #596, fix #565.

											
										
										
											2018-08-06 18:38:02 +03:00
+								from .html import W3C_DATE_RE
-												Fix circular imports

											
										
										
											2018-01-07 03:46:39 +03:00
+								from .images import get_image_from_uri as original_get_image_from_uri
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
+								from .layout import layout_document
-												Move percentage function into percentages module

											
										
										
											2019-06-02 19:06:25 +03:00
+								from .layout.percentages import percentage
-												Use a separate logger for generation progress

											
										
										
											2019-01-04 01:02:44 +03:00
+								from .logger import LOGGER, PROGRESS_LOGGER
-												Get FontBBox and CapHeight

											
										
										
											2020-05-08 01:11:19 +03:00
+								from .text import ffi, pango
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								from .urls import URLFetchingError
 								def _w3c_date_to_pdf(string, attr_name):
 								    """Tranform W3C date to PDF format."""
 								    if string is None:
 								        return None
 								    match = W3C_DATE_RE.match(string)
 								    if match is None:
 								        LOGGER.warning('Invalid %s date: %r', attr_name, string)
 								        return None
 								    groups = match.groupdict()
 								    pdf_date = ''
 								    found = False
 								    for key in ('second', 'minute', 'hour', 'day', 'month', 'year'):
 								        if groups[key]:
 								            found = True
 								            pdf_date = groups[key] + pdf_date
 								        elif found:
 								            pdf_date = '%02i' % (key in ('day', 'month')) + pdf_date
 								    if groups['hour']:
 								        assert groups['minute']
 								        if groups['tz_hour']:
 								            assert groups['tz_hour'].startswith(('+', '-'))
 								            assert groups['tz_minute']
 								            pdf_date += "%+03i'%02i" % (
 								                int(groups['tz_hour']), int(groups['tz_minute']))
 								        else:
 								            pdf_date += 'Z'
 								    return pdf_date
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
-												Remove useless parenthesis

											
										
										
											2020-05-08 01:31:50 +03:00
+								class Font:
-												Change some font attributes names

											
										
										
											2020-05-08 02:50:41 +03:00
+								    def __init__(self, file_content, pango_font, glyph_item):
-												Move getting font informations from text to document

											
										
										
											2020-05-07 20:33:54 +03:00
+								        pango_metrics = pango.pango_font_get_metrics(pango_font, ffi.NULL)
-												Always store 1pt fonts to get metrics

											
										
										
											2020-05-09 19:06:20 +03:00
+								        font_description = ffi.gc(
 								            pango.pango_font_description_copy(
 								                pango.pango_font_describe(pango_font)),
 								            pango.pango_font_description_free)
 								        pango.pango_font_description_set_absolute_size(
 								            font_description, pango.pango_units_from_double(1))
-												Font name, and save glyphs

											
										
										
											2020-05-08 00:27:43 +03:00
+								        font_family = ffi.string(pango.pango_font_description_get_family(
-												Fix font bounding box

											
										
										
											2020-05-08 02:30:07 +03:00
+								            font_description))
-												Font name, and save glyphs

											
										
										
											2020-05-08 00:27:43 +03:00
+								        glyph_string = glyph_item.glyphs
 								        num_glyphs = glyph_string.num_glyphs
-												Move getting font informations from text to document

											
										
										
											2020-05-07 20:33:54 +03:00
-												Fix text size and position

											
										
										
											2020-05-09 01:00:07 +03:00
+								        self.hash = hash(file_content)
-												Change some font attributes names

											
										
										
											2020-05-08 02:50:41 +03:00
+								        self.file_content = file_content
-												Move getting font informations from text to document

											
										
										
											2020-05-07 20:33:54 +03:00
+								        self.pango_font = pango_font
 								        self.glyph_item = glyph_item
-												Font name, and save glyphs

											
										
										
											2020-05-08 00:27:43 +03:00
+								        # When the font will be a font subset, the font name will have to be
 								        # like '/XXXXXX+font_family'
-												Remove useless font name prefix

											
										
										
											2020-05-08 02:58:43 +03:00
+								        self.name = b'/' + font_family.replace(b' ', b'')
-												Change some font attributes names

											
										
										
											2020-05-08 02:50:41 +03:00
+								        self.family = font_family
-												Set flags always to 4, symbolic

											
										
										
											2020-05-08 01:19:07 +03:00
+								        self.flags = 4
-												Move getting font informations from text to document

											
										
										
											2020-05-07 20:33:54 +03:00
+								        self.font_bbox = None
 								        self.italic_angle = 0
 								        self.ascent = pango.pango_font_metrics_get_ascent(pango_metrics)
 								        self.descent = pango.pango_font_metrics_get_descent(pango_metrics)
 								        self.cap_height = None
 								        self.stemv = 80
 								        self.stemh = 80
-												Use computed values

											
										
										
											2020-05-08 01:40:01 +03:00
+								        self.glyphs = {glyph_string.glyphs[x].glyph for x in range(num_glyphs)}
-												First and last characters, and widths

											
										
										
											2020-05-08 16:07:51 +03:00
+								        self.first_char = None
 								        self.last_char = None
 								        self.widths = None
-												Get FontBBox and CapHeight

											
										
										
											2020-05-08 01:11:19 +03:00
 								    def add_glyphs(self, glyph_item):
 								        glyph_string = glyph_item.glyphs
 								        num_glyphs = glyph_string.num_glyphs
-												Use font codepoints instead of Unicode codepoints

											
										
										
											2020-05-08 18:14:45 +03:00
+								        self.glyphs |= {
-												Use computed values

											
										
										
											2020-05-08 01:40:01 +03:00
+								            glyph_string.glyphs[x].glyph for x in range(num_glyphs)}
-												Get FontBBox and CapHeight

											
										
										
											2020-05-08 01:11:19 +03:00
-												Change some font attributes names

											
										
										
											2020-05-08 02:50:41 +03:00
+								    def compute_glyphs_values(self):
-												First and last characters, and widths

											
										
										
											2020-05-08 16:07:51 +03:00
+								        first_char = min(self.glyphs)
 								        last_char = max(self.glyphs)
-												Fix font bounding box

											
										
										
											2020-05-08 02:30:07 +03:00
+								        font_bbox = [0, 0, 0, 0]
-												First and last characters, and widths

											
										
										
											2020-05-08 16:07:51 +03:00
+								        widths = [0] * (last_char - first_char + 1)
-												Get FontBBox and CapHeight

											
										
										
											2020-05-08 01:11:19 +03:00
+								        ink_rect = ffi.new('PangoRectangle *')
-												First and last characters, and widths

											
										
										
											2020-05-08 16:07:51 +03:00
+								        logical_rect = ffi.new('PangoRectangle *')
-												Get FontBBox and CapHeight

											
										
										
											2020-05-08 01:11:19 +03:00
 								        for glyph in self.glyphs:
 								            pango.pango_font_get_glyph_extents(
-												First and last characters, and widths

											
										
										
											2020-05-08 16:07:51 +03:00
+								                self.pango_font, glyph, ink_rect, logical_rect)
-												Fix font bounding box

											
										
										
											2020-05-08 02:30:07 +03:00
+								            x1, y1, x2, y2 = (
 								                ink_rect.x, -ink_rect.y - ink_rect.height,
 								                ink_rect.x + ink_rect.width, -ink_rect.y)
 								            if x1 < font_bbox[0]:
 								                font_bbox[0] = x1
 								            if y1 < font_bbox[1]:
 								                font_bbox[1] = y1
 								            if x2 > font_bbox[2]:
 								                font_bbox[2] = x2
 								            if y2 > font_bbox[3]:
 								                font_bbox[3] = y2
-												Get FontBBox and CapHeight

											
										
										
											2020-05-08 01:11:19 +03:00
-												Always store 1pt fonts to get metrics

											
										
										
											2020-05-09 19:06:20 +03:00
+								            widths[glyph - first_char] = (
 								                pango.pango_units_to_double(logical_rect.width) * 1000)
-												First and last characters, and widths

											
										
										
											2020-05-08 16:07:51 +03:00
-												Release memory

											
										
										
											2020-05-08 01:28:30 +03:00
+								        ffi.release(ink_rect)
-												First and last characters, and widths

											
										
										
											2020-05-08 16:07:51 +03:00
+								        ffi.release(logical_rect)
-												Always store 1pt fonts to get metrics

											
										
										
											2020-05-09 19:06:20 +03:00
+								        self.bbox = font_bbox
-												Fix font bounding box

											
										
										
											2020-05-08 02:30:07 +03:00
+								        self.cap_height = font_bbox[1]
-												First and last characters, and widths

											
										
										
											2020-05-08 16:07:51 +03:00
+								        self.first_char = first_char
 								        self.last_char = last_char
 								        self.widths = widths
-												Move getting font informations from text to document

											
										
										
											2020-05-07 20:33:54 +03:00
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								class Context(pydyf.Stream):
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								    """PDF stream object with context storing alpha states."""
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								    def __init__(self, alpha_states, *args, **kwargs):
 								        super().__init__(*args, **kwargs)
 								        self._alpha_states = alpha_states
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								        self._fonts = {}
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
 								    def set_alpha(self, alpha, stroke=False):
 								        if alpha not in self._alpha_states:
 								            self._alpha_states[alpha] = pydyf.Dictionary(
 								                {'CA' if stroke else 'ca': alpha})
 								        self.set_state(alpha)
-												Warn users with cairo < 1.15.4

Related to #339, #565, #616.

											
										
										
											2018-04-13 11:44:19 +03:00
-												Get FontBBox and CapHeight

											
										
										
											2020-05-08 01:11:19 +03:00
+								    def add_font(self, font, pango_font, glyph_item):
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								        font_hash = hash(font)
 								        if font_hash not in self._fonts:
-												Get FontBBox and CapHeight

											
										
										
											2020-05-08 01:11:19 +03:00
+								            self._fonts[font_hash] = Font(font, pango_font, glyph_item)
 								        else:
 								            self._fonts[font_hash].add_glyphs(glyph_item)
-												Fix text size and position

											
										
										
											2020-05-09 01:00:07 +03:00
+								        return self._fonts[font_hash]
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								BookmarkSubtree = collections.namedtuple(
 								    'BookmarkSubtree', ('label', 'destination', 'children', 'state'))
-												Handle transforms and bookmarks

											
										
										
											2020-04-18 23:12:25 +03:00
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								def _write_pdf_attachment(pdf, attachment, url_fetcher):
 								    """Write an attachment to the PDF stream.
 								    :return:
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								        the attachment PDF dictionary.
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
 								    """
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								    # Attachments from document links like <link> or <a> can only be URLs.
 								    # They're passed in as tuples
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								    url = ''
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								    if isinstance(attachment, tuple):
 								        url, description = attachment
 								        attachment = Attachment(
 								            url=url, url_fetcher=url_fetcher, description=description)
 								    elif not isinstance(attachment, Attachment):
 								        attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								    try:
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								        with attachment.source as (source_type, source, url, _):
 								            if isinstance(source, bytes):
 								                source = io.BytesIO(source)
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								            uncompressed_length = 0
 								            stream = b''
 								            md5 = hashlib.md5()
 								            compress = zlib.compressobj()
 								            for data in iter(lambda: source.read(4096), b''):
 								                uncompressed_length += len(data)
 								                md5.update(data)
 								                compressed = compress.compress(data)
 								                stream += compressed
 								            compressed = compress.flush(zlib.Z_FINISH)
 								            stream += compressed
 								            file_extra = pydyf.Dictionary({
 								                'Type': '/EmbeddedFile',
 								                'Filter': '/FlateDecode',
 								                'Params': pydyf.Dictionary({
 								                    'CheckSum': f'<{md5.hexdigest()}>',
 								                    'Size': uncompressed_length,
 								                })
 								            })
 								            file_stream = pydyf.Stream([stream], file_extra)
 								            pdf.add_object(file_stream)
 								    except URLFetchingError as exception:
 								        LOGGER.error('Failed to load attachment: %s', exception)
 								        return
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
 								    # TODO: Use the result object from a URL fetch operation to provide more
 								    # details on the possible filename.
 								    filename = basename(unquote(urlsplit(url).path)) or 'attachment.bin'
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								    attachment = pydyf.Dictionary({
 								        'Type': '/Filespec',
 								        'F': pydyf.String(),
 								        'UF': pydyf.String(filename),
 								        'EF': pydyf.Dictionary({'F': file_stream.reference}),
 								        'Desc': pydyf.String(attachment.description or ''),
 								    })
 								    pdf.add_object(attachment)
 								    return attachment
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
 								def create_bookmarks(bookmarks, pdf, parent=None):
-												Handle transforms and bookmarks

											
										
										
											2020-04-18 23:12:25 +03:00
+								    count = len(bookmarks)
 								    outlines = []
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								    for title, (page, x, y), children, state in bookmarks:
-												Handle transforms and bookmarks

											
										
										
											2020-04-18 23:12:25 +03:00
+								        destination = pydyf.Array((
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            pdf.objects[pdf.pages['Kids'][page * 3]].reference,
-												Handle transforms and bookmarks

											
										
										
											2020-04-18 23:12:25 +03:00
+								            '/XYZ', x, y, 0))
 								        outline = pydyf.Dictionary({
 								            'Title': pydyf.String(title), 'Dest': destination})
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								        pdf.add_object(outline)
-												Handle transforms and bookmarks

											
										
										
											2020-04-18 23:12:25 +03:00
+								        children_outlines, children_count = create_bookmarks(
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            children, pdf, parent=outline)
-												Handle bookmark state

											
										
										
											2020-04-18 23:41:23 +03:00
+								        outline['Count'] = children_count
 								        if state == 'closed':
 								            outline['Count'] *= -1
 								        else:
 								            count += children_count
-												Handle transforms and bookmarks

											
										
										
											2020-04-18 23:12:25 +03:00
+								        if outlines:
 								            outline['Prev'] = outlines[-1].reference
 								            outlines[-1]['Next'] = outline.reference
 								        if children_outlines:
 								            outline['First'] = children_outlines[0].reference
 								            outline['Last'] = children_outlines[-1].reference
 								        if parent is not None:
 								            outline['Parent'] = parent.reference
 								        outlines.append(outline)
 								    return outlines, count
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
def add_hyperlinks(links, anchors, matrix, pdf, page, names):
    """Include hyperlinks in current PDF page.

    ``links`` holds ``(link_type, link_target, rectangle)`` tuples; the first
    two and the remaining values of ``rectangle`` are each transformed as a
    point with ``matrix``. Only ``'internal'`` and ``'external'`` links get
    an annotation, added to ``pdf`` and referenced from ``page['Annots']``.

    ``anchors`` holds ``(anchor_name, x, y)`` tuples; for each of them the
    name and an ``/XYZ`` destination on ``page`` are appended to ``names``.

    """
    annotations = pydyf.Array()
    page['Annots'] = annotations
    for link_type, link_target, rectangle in links:
        # Both corners are transformed before the link type is checked.
        x1, y1 = matrix.transform_point(*rectangle[:2])
        x2, y2 = matrix.transform_point(*rectangle[2:])
        if link_type not in ('internal', 'external'):
            continue
        annotation = pydyf.Dictionary({
            'Type': '/Annot',
            'Subtype': '/Link',
            'Rect': pydyf.Array([x1, y1, x2, y2]),
            'BS': pydyf.Dictionary({'W': 0}),
        })
        if link_type == 'external':
            annotation['A'] = pydyf.Dictionary({
                'Type': '/Action',
                'S': '/URI',
                'URI': pydyf.String(link_target),
            })
        else:
            annotation['Dest'] = pydyf.String(link_target)
        pdf.add_object(annotation)
        annotations.append(annotation.reference)

    for anchor_name, anchor_x, anchor_y in anchors:
        target_x, target_y = matrix.transform_point(anchor_x, anchor_y)
        names.append(pydyf.String(anchor_name))
        names.append(
            pydyf.Array([page.reference, '/XYZ', target_x, target_y, 0]))
-												Pre-compute transformation matrices.

… so that they are available when getting meta-data.

											
										
										
											2012-10-06 13:26:55 +04:00
-												Have metadata account for CSS transforms.

											
										
										
											2012-10-07 00:09:17 +04:00
def rectangle_aabb(matrix, pos_x, pos_y, width, height):
    """Transform an axis-aligned rectangle and return its bounding box.

    The four corners of the rectangle are transformed with
    ``matrix.transform_point``; the result is the axis-aligned box enclosing
    them, given as ``(x, y, width, height)``.

    """
    right = pos_x + width
    bottom = pos_y + height
    corners = [
        matrix.transform_point(corner_x, corner_y)
        for corner_x, corner_y in (
            (pos_x, pos_y), (right, pos_y), (pos_x, bottom), (right, bottom))]
    xs, ys = zip(*corners)
    left, top = min(xs), min(ys)
    return left, top, max(xs) - left, max(ys) - top
-												Remove resolve_links from Document class

											
										
										
											2020-04-19 19:26:49 +03:00
def resolve_links(pages):
    """Resolve internal hyperlinks.

    Links to a missing anchor are removed and an error is logged.

    If multiple anchors have the same name, the first one is used.

    :param pages:
        An iterable of objects with ``anchors`` (a mapping of anchor names
        to ``(x, y)`` points) and ``links`` (a list of ``(link_type, target,
        rectangle)`` tuples) attributes. It is copied into a list, so
        one-shot iterators are accepted.
    :returns:
        A generator yielding one ``(page_links, page_anchors)`` tuple per
        page. ``page_links`` is like ``page.links`` without the internal
        links whose anchor does not exist in any page. ``page_anchors`` is
        the list of ``(anchor_name, x, y)`` tuples for the anchors whose
        first definition is in this page.

    """
    # The pages are walked twice: once for anchors, once for links.
    pages = list(pages)

    known_anchors = set()
    paged_anchors = []
    for page in pages:
        page_anchors = []
        for anchor_name, (point_x, point_y) in page.anchors.items():
            # Only the first definition of a name is kept.
            if anchor_name not in known_anchors:
                known_anchors.add(anchor_name)
                page_anchors.append((anchor_name, point_x, point_y))
        paged_anchors.append(page_anchors)

    for page, page_anchors in zip(pages, paged_anchors):
        page_links = []
        for link in page.links:
            link_type, anchor_name, rectangle = link
            if link_type != 'internal':
                # External links and attachments are kept untouched.
                page_links.append(link)
            elif anchor_name in known_anchors:
                page_links.append((link_type, anchor_name, rectangle))
            else:
                LOGGER.error(
                    'No anchor #%s for internal URI reference',
                    anchor_name)
        yield page_links, page_anchors
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
class Matrix(list):
    """Matrix stored as a list of rows.

    Without ``matrix``, the rows are built from the six coefficients as
    ``[[a, b, 0], [c, d, 0], [e, f, 1]]``; otherwise the given rows are used
    as they are.

    """
    def __init__(self, a=1, b=0, c=0, d=1, e=0, f=0, matrix=None):
        rows = [[a, b, 0], [c, d, 0], [e, f, 1]] if matrix is None else matrix
        super().__init__(rows)

    def __matmul__(self, other):
        """Return the matrix product ``self @ other`` as a new matrix.

        ``other`` must have 3 rows and ``self`` 3 columns.

        """
        assert len(self[0]) == len(other) == len(other[0]) == 3
        columns = [[other[k][j] for k in range(3)] for j in range(3)]
        product = []
        for row in self:
            product.append([
                sum(row[k] * column[k] for k in range(3))
                for column in columns])
        return Matrix(matrix=product)

    @property
    def determinant(self):
        """Determinant of a 3×3 matrix, expanded along its first column."""
        assert len(self) == len(self[0]) == 3
        (m00, m01, m02), (m10, m11, m12), (m20, m21, m22) = self
        return (
            m00 * (m11 * m22 - m12 * m21) -
            m10 * (m01 * m22 - m02 * m21) +
            m20 * (m01 * m12 - m02 * m11))

    def transform_point(self, x, y):
        """Return ``[x, y]`` transformed by this matrix, as a list."""
        point = Matrix(matrix=[[x, y, 1]])
        transformed_row = (point @ self)[0]
        return transformed_row[:2]
-												Don't use pdfrw anymore

pdfrw is a great piece of software, but we don't know PDF enough to debug the
problems we've met. It's safer to use the new cairo API and get back to manual
edition for attachments and bleed boxes.

We only have two regressions for now:
- some internal links are broken,
- PDF producer is not overwritten.

A mail has been sent to cairo's mailing-list about that:
https://lists.cairographics.org/archives/cairo/2018-August/028694.html

Fix #639, #615, fix #596, fix #565.

											
										
										
											2018-08-06 18:38:02 +03:00
-												Remove useless explicit object inheritance

											
										
										
											2020-01-02 14:06:58 +03:00
class Page:
    """Represents a single rendered page.

    .. versionadded:: 0.15

    Should be obtained from :attr:`Document.pages` but not
    instantiated directly.

    """
    def __init__(self, page_box):
        #: The page width, including margins, in CSS pixels.
        self.width = page_box.margin_width()

        #: The page height, including margins, in CSS pixels.
        self.height = page_box.margin_height()

        #: The page bleed widths as a :obj:`dict` with ``'top'``, ``'right'``,
        #: ``'bottom'`` and ``'left'`` as keys, and values in CSS pixels.
        self.bleed = {
            side: page_box.style[f'bleed_{side}'].value
            for side in ('top', 'right', 'bottom', 'left')}

        #: The :obj:`list` of ``(bookmark_level, bookmark_label, target,
        #: state)`` :obj:`tuples <tuple>`. ``bookmark_level`` and
        #: ``bookmark_label`` are respectively an :obj:`int` and a
        #: :obj:`string <str>`, based on the CSS properties of the same
        #: names. ``target`` is an ``(x, y)`` point in CSS pixels from the
        #: top-left of the page. ``state`` is the value of the
        #: ``bookmark-state`` CSS property.
        self.bookmarks = []

        #: The :obj:`list` of ``(link_type, target, rectangle)`` :obj:`tuples
        #: <tuple>`. A ``rectangle`` is ``(x1, y1, x2, y2)``, two opposite
        #: corners in CSS pixels from the top-left of the page.
        #: ``link_type`` is one of three strings:
        #:
        #: * ``'external'``: ``target`` is an absolute URL
        #: * ``'internal'``: ``target`` is an anchor name (see
        #:   :attr:`Page.anchors`).
        #:   The anchor might be defined in another page,
        #:   in multiple pages (in which case the first occurrence is used),
        #:   or not at all.
        #: * ``'attachment'``: ``target`` is an absolute URL and points
        #:   to a resource to attach to the document.
        self.links = []

        #: The :obj:`dict` mapping each anchor name to its target, an
        #: ``(x, y)`` point in CSS pixels from the top-left of the page.
        self.anchors = {}

        self._gather_links_and_bookmarks(page_box)
        self._page_box = page_box

    def _gather_links_and_bookmarks(self, box, matrix=None):
        """Collect links, bookmarks and anchors of ``box`` and its children.

        ``matrix`` is the transformation inherited from the ancestors of
        ``box``, or :obj:`None` when no ancestor is transformed. When the box
        has a CSS transform, its own matrix is stored in
        ``box.transformation_matrix`` and combined with the inherited one.
        The resulting matrix is applied to the positions stored in
        :attr:`links`, :attr:`bookmarks` and :attr:`anchors`, and is handed
        down to the children of the box.

        """
        parent_matrix = matrix

        # Get box transformation matrix.
        # "Transforms apply to block-level and atomic inline-level elements,
        #  but do not apply to elements which may be split into
        #  multiple inline-level boxes."
        # http://www.w3.org/TR/css3-2d-transforms/#introduction
        if box.style['transform'] and not isinstance(box, boxes.InlineBox):
            border_width = box.border_width()
            border_height = box.border_height()
            origin_x, origin_y = box.style['transform_origin']
            offset_x = percentage(origin_x, border_width)
            offset_y = percentage(origin_y, border_height)
            origin_x = box.border_box_x() + offset_x
            origin_y = box.border_box_y() + offset_y

            # Points are row vectors multiplied on the left, so the leftmost
            # factor of the product is applied first.
            box_matrix = Matrix(e=origin_x, f=origin_y)
            for name, args in box.style['transform']:
                a, b, c, d, e, f = 1, 0, 0, 1, 0, 0
                if name == 'scale':
                    a, d = args
                elif name == 'rotate':
                    a = d = math.cos(args)
                    b = math.sin(args)
                    c = -b
                elif name == 'translate':
                    e = percentage(args[0], border_width)
                    f = percentage(args[1], border_height)
                elif name == 'skew':
                    b, c = math.tan(args[1]), math.tan(args[0])
                else:
                    assert name == 'matrix'
                    a, b, c, d, e, f = args
                box_matrix = Matrix(a, b, c, d, e, f) @ box_matrix
            box.transformation_matrix = (
                Matrix(e=-origin_x, f=-origin_y) @ box_matrix)

            # Apply the transformation of the box first, then the one
            # inherited from its ancestors.
            if parent_matrix is None:
                matrix = box.transformation_matrix
            else:
                matrix = box.transformation_matrix @ parent_matrix

        bookmark_label = box.bookmark_label
        if box.style['bookmark_level'] == 'none':
            bookmark_level = None
        else:
            bookmark_level = box.style['bookmark_level']
        state = box.style['bookmark_state']
        link = box.style['link']
        anchor_name = box.style['anchor']
        has_bookmark = bookmark_label and bookmark_level
        # 'link' is inherited but redundant on text boxes
        has_link = link and not isinstance(box, boxes.TextBox)
        # In case of duplicate IDs, only the first is an anchor.
        has_anchor = anchor_name and anchor_name not in self.anchors
        is_attachment = getattr(box, 'is_attachment', False)

        if has_bookmark or has_link or has_anchor:
            pos_x, pos_y, width, height = box.hit_area()
            if has_link:
                token_type, link = link
                assert token_type == 'url'
                link_type, target = link
                assert isinstance(target, str)
                if link_type == 'external' and is_attachment:
                    link_type = 'attachment'
                # Rectangles are always stored as two opposite corners.
                if matrix is None:
                    rectangle = (
                        pos_x, pos_y, pos_x + width, pos_y + height)
                else:
                    box_x, box_y, box_width, box_height = rectangle_aabb(
                        matrix, pos_x, pos_y, width, height)
                    rectangle = (
                        box_x, box_y, box_x + box_width, box_y + box_height)
                self.links.append((link_type, target, rectangle))
            if matrix is not None and (has_bookmark or has_anchor):
                pos_x, pos_y = matrix.transform_point(pos_x, pos_y)
            if has_bookmark:
                self.bookmarks.append(
                    (bookmark_level, bookmark_label, (pos_x, pos_y), state))
            if has_anchor:
                self.anchors[anchor_name] = pos_x, pos_y

        # Children inherit the transformation of their ancestors.
        for child in box.all_children():
            self._gather_links_and_bookmarks(child, matrix)

    def paint(self, context, left_x=0, top_y=0, scale=1, clip=False):
        """Paint the page into the PDF file.

        :type context: :class:`pdf.Context`
        :param context:
            A context object.
        :type left_x: float
        :param left_x:
            X coordinate of the left of the page, in PDF points.
        :type top_y: float
        :param top_y:
            Y coordinate of the top of the page, in PDF points.
        :type scale: float
        :param scale:
            Zoom scale.
        :type clip: bool
        :param clip:
            Whether to clip/cut content outside the page. If false or
            not provided, content can overflow.

        """
        with stacked(context):
            # Make (0, 0) the top-left corner, and make user units CSS pixels:
            context.transform(scale, 0, 0, scale, left_x, top_y)
            if clip:
                context.rectangle(0, 0, self.width, self.height)
                context.clip()
            draw_page(self._page_box, context)
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												Remove useless explicit object inheritance

											
										
										
											2020-01-02 14:06:58 +03:00
class DocumentMetadata:
    """Meta-information belonging to a whole :class:`Document`.

    .. versionadded:: 0.20

    New attributes may be added in future versions of WeasyPrint.

    """
    def __init__(self, title=None, authors=None, description=None,
                 keywords=None, generator=None, created=None, modified=None,
                 attachments=None):
        #: Document title: a string, or :obj:`None`.
        #: Comes from the ``<title>`` element in HTML
        #: and goes to the ``/Title`` info field in PDF.
        self.title = title

        #: Document authors: a list of strings, empty by default.
        #: Comes from the ``<meta name=author>`` elements in HTML
        #: and goes to the ``/Author`` info field in PDF.
        self.authors = authors if authors else []

        #: Document description: a string, or :obj:`None`.
        #: Comes from the ``<meta name=description>`` element in HTML
        #: and goes to the ``/Subject`` info field in PDF.
        self.description = description

        #: Document keywords: a list of strings, empty by default.
        #: Comes from the ``<meta name=keywords>`` elements in HTML
        #: and goes to the ``/Keywords`` info field in PDF.
        self.keywords = keywords if keywords else []

        #: Name of one of the software packages used to generate the
        #: document: a string, or :obj:`None`.
        #: Comes from the ``<meta name=generator>`` element in HTML
        #: and goes to the ``/Creator`` info field in PDF.
        self.generator = generator

        #: Creation date of the document: a string, or :obj:`None`.
        #: The date uses one of the six formats specified in
        #: `W3C’s profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Comes from the ``<meta name=dcterms.created>`` element in HTML
        #: and goes to the ``/CreationDate`` info field in PDF.
        self.created = created

        #: Modification date of the document: a string, or :obj:`None`.
        #: The date uses one of the six formats specified in
        #: `W3C’s profile of ISO 8601 <http://www.w3.org/TR/NOTE-datetime>`_.
        #: Comes from the ``<meta name=dcterms.modified>`` element in HTML
        #: and goes to the ``/ModDate`` info field in PDF.
        self.modified = modified

        #: File attachments: a list of tuples made of a URL and a description
        #: or :obj:`None`, empty by default.
        #: Comes from the ``<link rel=attachment>`` elements in HTML
        #: and goes to the ``/EmbeddedFiles`` dictionary in PDF.
        #:
        #: .. versionadded:: 0.22
        self.attachments = attachments if attachments else []
-												Add PDF metadata parsed from HTML. Fix #77.

<title> → /Title
<meta name=author> → /Author
<meta name=description> → /Subject
<meta name=keywords> → /Keywords
<meta name=generator> → /Creator
<meta name=dcterms.created> → /CreationDate
<meta name=dcterms.modified> → /ModDate
"WeasyPrint vX.Y" → /Producer

											
										
										
											2013-07-14 15:08:02 +04:00
-												Remove useless explicit object inheritance

											
										
										
											2020-01-02 14:06:58 +03:00
+								class Document:
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								    """A rendered document ready to be painted on a cairo surface.
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								    Typically obtained from :meth:`HTML.render() <weasyprint.HTML.render>`, but
 								    can also be instantiated directly with a list of :class:`pages <Page>`, a
 								    set of :class:`metadata <DocumentMetadata>`, a :func:`url_fetcher
 								    <weasyprint.default_url_fetcher>` function, and a :class:`font_config
 								    <weasyprint.fonts.FontConfiguration>`.
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
 								    """
-												Refactor LayoutContext building

 this will provide a simpler API for testing context-using functions inside tests

											
										
										
											2019-07-23 08:12:08 +03:00
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
+								    @classmethod
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								    def _build_layout_context(cls, html, stylesheets,
-												Add base code for @counter-style rules

											
										
										
											2019-12-24 16:39:40 +03:00
+								                              presentational_hints=False, font_config=None,
 								                              counter_style=None):
-												Add a font_config parameter to various render methods

Fix #506.

											
										
										
											2017-10-01 16:17:32 +03:00
+								        if font_config is None:
 								            font_config = FontConfiguration()
-												Add base code for @counter-style rules

											
										
										
											2019-12-24 16:39:40 +03:00
+								        if counter_style is None:
-												Add default counter class

											
										
										
											2019-12-24 17:56:24 +03:00
+								            counter_style = CounterStyle()
-												Don't use a global target collector

											
										
										
											2018-03-28 01:34:34 +03:00
+								        target_collector = TargetCollector()
-												Use cssselect2 instead of cssselect

											
										
										
											2017-06-30 18:54:02 +03:00
+								        page_rules = []
-												Use font config in stylesheets given in CLI

Related to #596.

											
										
										
											2018-03-24 01:57:33 +03:00
+								        user_stylesheets = []
 								        for css in stylesheets or []:
 								            if not hasattr(css, 'matcher'):
 								                css = CSS(
 								                    guess=css, media_type=html.media_type,
-												Add base code for @counter-style rules

											
										
										
											2019-12-24 16:39:40 +03:00
+								                    font_config=font_config, counter_style=counter_style)
-												Use font config in stylesheets given in CLI

Related to #596.

											
										
										
											2018-03-24 01:57:33 +03:00
+								            user_stylesheets.append(css)
-												Put media queries in a separate module, create a class for style_for

											
										
										
											2018-08-17 11:30:51 +03:00
+								        style_for = get_all_computed_styles(
-												Use font config in stylesheets given in CLI

Related to #596.

											
										
										
											2018-03-24 01:57:33 +03:00
+								            html, user_stylesheets, presentational_hints, font_config,
-												Add base code for @counter-style rules

											
										
										
											2019-12-24 16:39:40 +03:00
+								            counter_style, page_rules, target_collector)
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        get_image_from_uri = functools.partial(
-												Fix circular imports

											
										
										
											2018-01-07 03:46:39 +03:00
+								            original_get_image_from_uri, {}, html.url_fetcher)
-												Use a separate logger for generation progress

											
										
										
											2019-01-04 01:02:44 +03:00
+								        PROGRESS_LOGGER.info('Step 4 - Creating formatting structure')
-												Refactor LayoutContext building

 this will provide a simpler API for testing context-using functions inside tests

											
										
										
											2019-07-23 08:12:08 +03:00
+								        context = LayoutContext(
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								            style_for, get_image_from_uri, font_config, counter_style,
 								            target_collector)
-												Refactor LayoutContext building

 this will provide a simpler API for testing context-using functions inside tests

											
										
										
											2019-07-23 08:12:08 +03:00
+								        return context
 								    @classmethod
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								    def _render(cls, html, stylesheets, presentational_hints=False,
 								                font_config=None, counter_style=None):
-												Refactor LayoutContext building

 this will provide a simpler API for testing context-using functions inside tests

											
										
										
											2019-07-23 08:12:08 +03:00
+								        if font_config is None:
 								            font_config = FontConfiguration()
-												Add base code for @counter-style rules

											
										
										
											2019-12-24 16:39:40 +03:00
+								        if counter_style is None:
-												Add default counter class

											
										
										
											2019-12-24 17:56:24 +03:00
+								            counter_style = CounterStyle()
-												Add base code for @counter-style rules

											
										
										
											2019-12-24 16:39:40 +03:00
-												Refactor LayoutContext building

 this will provide a simpler API for testing context-using functions inside tests

											
										
										
											2019-07-23 08:12:08 +03:00
+								        context = cls._build_layout_context(
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								            html, stylesheets, presentational_hints, font_config,
 								            counter_style)
-												Refactor LayoutContext building

 this will provide a simpler API for testing context-using functions inside tests

											
										
										
											2019-07-23 08:12:08 +03:00
-												Avoid useless nested function call

											
										
										
											2018-08-08 18:47:47 +03:00
+								        root_box = build_formatting_structure(
-												Fix lint

											
										
										
											2019-07-23 19:07:14 +03:00
+								            html.etree_element, context.style_for, context.get_image_from_uri,
-												Add base code for @counter-style rules

											
										
										
											2019-12-24 16:39:40 +03:00
+								            html.base_url, context.target_collector, counter_style)
-												Refactor LayoutContext building

 this will provide a simpler API for testing context-using functions inside tests

											
										
										
											2019-07-23 08:12:08 +03:00
 								        page_boxes = layout_document(html, root_box, context)
-												Clean the font config after rendering the document

											
										
										
											2016-10-27 12:41:34 +03:00
+								        rendering = cls(
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								            [Page(page_box) for page_box in page_boxes],
-												Keep a reference to font_config in Document

Fix #566.

											
										
										
											2018-01-28 17:45:39 +03:00
+								            DocumentMetadata(**html._get_metadata()),
 								            html.url_fetcher, font_config)
-												Clean the font config after rendering the document

											
										
										
											2016-10-27 12:41:34 +03:00
+								        return rendering
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												Keep a reference to font_config in Document

Fix #566.

											
										
										
											2018-01-28 17:45:39 +03:00
+								    def __init__(self, pages, metadata, url_fetcher, font_config):
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
+								        #: A list of :class:`Page` objects.
 								        self.pages = pages
-												Add PDF metadata parsed from HTML. Fix #77.

<title> → /Title
<meta name=author> → /Author
<meta name=description> → /Subject
<meta name=keywords> → /Keywords
<meta name=generator> → /Creator
<meta name=dcterms.created> → /CreationDate
<meta name=dcterms.modified> → /ModDate
"WeasyPrint vX.Y" → /Producer

											
										
										
											2013-07-14 15:08:02 +04:00
+								        #: A :class:`DocumentMetadata` object.
 								        #: Contains information that does not belong to a specific page
 								        #: but to the whole document.
 								        self.metadata = metadata
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								        #: A function or other callable with the same signature as
 								        #: :func:`default_url_fetcher` called to fetch external resources such
 								        #: as stylesheets and images.  (See :ref:`url-fetchers`.)
-												Refactored the `url_fetcher` argument for `write_pdf` to an attribute of the `Document` class

											
										
										
											2014-04-18 17:11:45 +04:00
+								        self.url_fetcher = url_fetcher
-												Keep a reference to font_config in Document

Fix #566.

											
										
										
											2018-01-28 17:45:39 +03:00
+								        # Keep a reference to font_config to avoid its garbage collection until
 								        # rendering is destroyed. This is needed as font_config.__del__ removes
 								        # fonts that may be used when rendering
 								        self._font_config = font_config
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
 								    def copy(self, pages='all'):
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        """Take a subset of the pages.
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								        .. versionadded:: 0.15
 								        :type pages: :term:`iterable`
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        :param pages:
 								            An iterable of :class:`Page` objects from :attr:`pages`.
 								        :return:
 								            A new :class:`Document` object.
-												Docs: add an example combining documents into one PDF file.

											
										
										
											2013-07-14 12:17:40 +04:00
+								        Examples:
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
-												Docs: add an example combining documents into one PDF file.

											
										
										
											2013-07-14 12:17:40 +04:00
+								        Write two PDF files for odd-numbered and even-numbered pages::
 								            # Python lists count from 0 but pages are numbered from 1.
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								            # [::2] is a slice of even list indexes but odd-numbered pages.
 								            document.copy(document.pages[::2]).write_pdf('odd_pages.pdf')
 								            document.copy(document.pages[1::2]).write_pdf('even_pages.pdf')
-												Docs: add an example combining documents into one PDF file.

											
										
										
											2013-07-14 12:17:40 +04:00
+								        Write each page to a numbred PNG file::
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								            for i, page in enumerate(document.pages):
 								                document.copy(page).write_png('page_%s.png' % i)
-												Docs: add an example combining documents into one PDF file.

											
										
										
											2013-07-14 12:17:40 +04:00
+								        Combine multiple documents into one PDF file,
 								        using metadata from the first::
-												Docs: Fixed wrong nested list comprehension example

											
										
										
											2019-07-09 01:06:19 +03:00
+								            all_pages = [p for doc in documents for p in doc.pages]
-												Docs: add an example combining documents into one PDF file.

											
										
										
											2013-07-14 12:17:40 +04:00
+								            documents[0].copy(all_pages).write_pdf('combined.pdf')
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        """
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
+								        if pages == 'all':
 								            pages = self.pages
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        elif not isinstance(pages, list):
 								            pages = list(pages)
-												Keep a reference to font_config in Document

Fix #566.

											
										
										
											2018-01-28 17:45:39 +03:00
+								        return type(self)(
 								            pages, self.metadata, self.url_fetcher, self._font_config)
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												Support for post-processing by passing a finisher function to write_pdf

											
										
										
											2020-04-09 02:46:11 +03:00
+								    def write_pdf(self, target=None, zoom=1, attachments=None, finisher=None):
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        """Paint the pages in a PDF file, with meta-data.
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        PDF files written directly by cairo do not have meta-data such as
 								        bookmarks/outlines and hyperlinks.
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								        :type target: str, pathlib.Path or file object
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        :param target:
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								            A filename where the PDF file is generated, a file object, or
 								            :obj:`None`.
-												Rename PDF scale to zoom, have the default be 1 rather than 0.75

The 0.75 factor is an implementation detail that should not be exposed
in the API.

											
										
										
											2012-11-23 01:27:34 +04:00
+								        :type zoom: float
 								        :param zoom:
-												Fix typos and awkward grammar on Tutorial and API pages.

											
										
										
											2017-04-28 21:36:14 +03:00
+								            The zoom factor in PDF units per CSS units.  **Warning**:
 								            All CSS units are affected, including physical units like
 								            ``cm`` and named sizes like ``A4``.  For values other than
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								            1, the physical CSS units will thus be "wrong".
 								        :type attachments: list
-												Refactored `attachments` attribute from the `HTML` class to an argument for `write_pdf`

											
										
										
											2014-04-22 22:40:46 +04:00
+								        :param attachments: A list of additional file attachments for the
-												Added an `Attachment` class for attachments provided through the API instead of the URL/description tuples

											
										
										
											2014-04-26 01:35:43 +04:00
+								            generated PDF document or :obj:`None`. The list's elements are
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								            :class:`Attachment` objects, filenames, URLs or file-like objects.
-												Call finisher

											
										
										
											2020-04-19 11:01:27 +03:00
+								        :param finisher: A finisher function, that accepts the document and a
 								            ``pydyf.PDF`` object as parameters, can be passed to perform
 								            post-processing on the PDF right before the trailer is written.
-												Refine docstrings.

											
										
										
											2012-10-05 20:50:40 +04:00
+								        :returns:
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								            The PDF as :obj:`bytes` if ``target`` is not provided or
 								            :obj:`None`, otherwise :obj:`None` (the PDF is written to
 								            ``target``).
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
 								        """
-												Handle transforms and bookmarks

											
										
										
											2020-04-18 23:12:25 +03:00
+								        # 0.75 = 72 PDF point per inch / 96 CSS pixel per inch
-												Rename PDF scale to zoom, have the default be 1 rather than 0.75

The 0.75 factor is an implementation detail that should not be exposed
in the API.

											
										
										
											2012-11-23 01:27:34 +04:00
+								        scale = zoom * 0.75
-												Don't use pdfrw anymore

pdfrw is a great piece of software, but we don't know PDF enough to debug the
problems we've met. It's safer to use the new cairo API and get back to manual
edition for attachments and bleed boxes.

We only have two regressions for now:
- some internal links are broken,
- PDF producer is not overwritten.

A mail has been sent to cairo's mailing-list about that:
https://lists.cairographics.org/archives/cairo/2018-August/028694.html

Fix #639, #615, fix #596, fix #565.

											
										
										
											2018-08-06 18:38:02 +03:00
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								        PROGRESS_LOGGER.info('Step 6 - Creating PDF')
 								        pdf = pydyf.PDF()
 								        alpha_states = pydyf.Dictionary()
 								        pdf.add_object(alpha_states)
 								        resources = pydyf.Dictionary({'ExtGState': alpha_states.reference})
 								        pdf.add_object(resources)
 								        pdf_names = pydyf.Array()
 								        pdf.catalog['Names'] = pydyf.Dictionary(
 								            {'Dests': pydyf.Dictionary({'Names': pdf_names})})
-												Don't use pdfrw anymore

pdfrw is a great piece of software, but we don't know PDF enough to debug the
problems we've met. It's safer to use the new cairo API and get back to manual
edition for attachments and bleed boxes.

We only have two regressions for now:
- some internal links are broken,
- PDF producer is not overwritten.

A mail has been sent to cairo's mailing-list about that:
https://lists.cairographics.org/archives/cairo/2018-August/028694.html

Fix #639, #615, fix #596, fix #565.

											
										
										
											2018-08-06 18:38:02 +03:00
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								        # Links and anchors
-												Remove resolve_links from Document class

											
										
										
											2020-04-19 19:26:49 +03:00
+								        paged_links_and_anchors = list(resolve_links(self.pages))
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								        attachment_links = [
 								            [link for link in page_links if link[0] == 'attachment']
 								            for page_links, page_anchors in paged_links_and_anchors]
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								        # Annotations
 								        annot_files = {}
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								        # A single link can be split in multiple regions. We don't want to
 								        # embed a file multiple times of course, so keep a reference to every
 								        # embedded URL and reuse the object number.
 								        for page_links in attachment_links:
 								            for link_type, annot_target, rectangle in page_links:
 								                if link_type == 'attachment' and target not in annot_files:
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								                    # TODO: Use the title attribute as description. The comment
 								                    # above about multiple regions won't always be correct,
 								                    # because two links might have the same href, but different
 								                    # titles.
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								                    annot_files[annot_target] = _write_pdf_attachment(
 								                        pdf, (annot_target, None), self.url_fetcher)
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								        # Bookmarks
 								        root = []
 								        # At one point in the document, for each "output" depth, how much
 								        # to add to get the source level (CSS values of bookmark-level).
 								        # E.g. with <h1> then <h3>, skipped_levels == [0, 1]
 								        # 1 means that <h3> has depth 3 - 1 = 2 in the output.
 								        skipped_levels = []
 								        last_by_depth = [root]
 								        previous_level = 0
 								        for page_number, (page, links_and_anchors, page_links) in enumerate(
 								                zip(self.pages, paged_links_and_anchors, attachment_links)):
 								            # Draw from the top-left corner
 								            matrix = Matrix(scale, 0, 0, -scale, 0, page.height * scale)
 								            # Links and anchors
-												Use named destinations instead of pages and position for links

There's a limitation / bug in cairo: we can't add links to pages that have not
been created yet. We have to use named destinations instead as they work even
if the destination has not been created.

This change offers the advantage of advertising targets: generated PDF files
now embed the list of named targets (even if I don't know if PDF readers have a
UI for that feature).

Fix #678.

											
										
										
											2018-09-24 16:27:24 +03:00
+								            links, anchors = links_and_anchors
-												Handle hyperlinks and anchors

											
										
										
											2020-04-19 01:47:19 +03:00
-												Handle bleed box

											
										
										
											2020-04-19 10:55:39 +03:00
+								            page_width = scale * (
 								                page.width + page.bleed['left'] + page.bleed['right'])
 								            page_height = scale * (
 								                page.height + page.bleed['top'] + page.bleed['bottom'])
 								            left = -scale * page.bleed['left']
 								            top = -scale * page.bleed['top']
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            right = left + page_width
 								            bottom = top + page_height
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            stream = Context(alpha_states)
-												Fix transformation matrix with bleed box

											
										
										
											2020-04-19 15:40:30 +03:00
+								            stream.transform(1, 0, 0, -1, 0, page.height * scale)
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								            page.paint(stream, scale=scale)
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            pdf.add_object(stream)
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
-												Handle hyperlinks and anchors

											
										
										
											2020-04-19 01:47:19 +03:00
+								            pdf_page = pydyf.Dictionary({
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								                'Type': '/Page',
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								                'Parent': pdf.pages.reference,
 								                'MediaBox': pydyf.Array([left, top, right, bottom]),
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								                'Contents': stream.reference,
 								                'Resources': resources.reference,
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								                'Annots': pydyf.Array(),
-												Handle hyperlinks and anchors

											
										
										
											2020-04-19 01:47:19 +03:00
+								            })
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            pdf.add_page(pdf_page)
 								            add_hyperlinks(links, anchors, matrix, pdf, pdf_page, pdf_names)
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								            # Bleed
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            bleed = {key: value * 0.75 for key, value in page.bleed.items()}
 								            trim_left = left + bleed['left']
 								            trim_top = top + bleed['top']
 								            trim_right = right - bleed['right']
 								            trim_bottom = bottom - bleed['bottom']
-												Handle hyperlinks and anchors

											
										
										
											2020-04-19 01:47:19 +03:00
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            # Arbitrarily set PDF BleedBox between CSS bleed box (MediaBox) and
 								            # CSS page box (TrimBox) at most 10 points from the TrimBox.
 								            bleed_left = trim_left - min(10, bleed['left'])
 								            bleed_top = trim_top - min(10, bleed['top'])
 								            bleed_right = trim_right + min(10, bleed['right'])
 								            bleed_bottom = trim_bottom + min(10, bleed['bottom'])
-												Handle hyperlinks and anchors

											
										
										
											2020-04-19 01:47:19 +03:00
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            pdf_page['TrimBox'] = pydyf.Array([
 								                trim_left, trim_top, trim_right, trim_bottom])
 								            pdf_page['BleedBox'] = pydyf.Array([
 								                bleed_left, bleed_top, bleed_right, bleed_bottom])
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								            # Annotations
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								            # TODO: splitting a link into multiple independent rectangular
 								            # annotations works well for pure links, but rather mediocre for
 								            # other annotations and fails completely for transformed (CSS) or
 								            # complex link shapes (area). It would be better to use /AP for all
 								            # links and coalesce link shapes that originate from the same HTML
 								            # link. This would give a feeling similar to what browsers do with
 								            # links that span multiple lines.
 								            for link_type, annot_target, rectangle in page_links:
 								                annot_file = annot_files[annot_target]
 								                if link_type == 'attachment' and annot_file is not None:
 								                    rectangle = (
 								                        *matrix.transform_point(*rectangle[:2]),
 								                        *matrix.transform_point(*rectangle[2:]))
 								                    annot = pydyf.Dictionary({
 								                        'Type': '/Annot',
 								                        'Rect': pydyf.Array(rectangle),
 								                        'Subtype': '/FileAttachment',
 								                        'T': pydyf.String(),
 								                        'FS': annot_file.reference,
 								                        'AP': pydyf.Dictionary({'N': pydyf.Stream([], {
 								                            'Type': '/XObject',
 								                            'Subtype': '/Form',
 								                            'BBox': pydyf.Array(rectangle),
 								                            'Length': 0,
 								                        })})
 								                    })
 								                    pdf.add_object(annot)
 								                    pdf_page['Annots'].append(annot.reference)
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								            # Bookmarks
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            for level, label, (point_x, point_y), state in page.bookmarks:
 								                if level > previous_level:
 								                    # Example: if the previous bookmark is a <h2>, the next
 								                    # depth "should" be for <h3>. If now we get a <h6> we’re
 								                    # skipping two levels: append 6 - 3 - 1 = 2
 								                    skipped_levels.append(level - previous_level - 1)
 								                else:
 								                    temp = level
 								                    while temp < previous_level:
 								                        temp += 1 + skipped_levels.pop()
 								                    if temp > previous_level:
 								                        # We remove too many "skips", add some back:
 								                        skipped_levels.append(temp - previous_level - 1)
 								                previous_level = level
 								                depth = level - sum(skipped_levels)
 								                assert depth == len(skipped_levels)
 								                assert depth >= 1
 								                children = []
 								                point_x, point_y = matrix.transform_point(point_x, point_y)
 								                subtree = BookmarkSubtree(
 								                    label, (page_number, point_x, point_y), children, state)
 								                last_by_depth[depth - 1].append(subtree)
 								                del last_by_depth[depth:]
 								                last_by_depth.append(children)
 								        outlines, count = create_bookmarks(root, pdf)
-												Fix crash when there’s no outline

											
										
										
											2020-04-22 00:07:35 +03:00
+								        if outlines:
 								            pdf.catalog['Outlines'] = pydyf.Dictionary({
 								                'Count': count,
 								                'First': outlines[0].reference,
 								                'Last': outlines[-1].reference,
 								            })
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								        PROGRESS_LOGGER.info('Step 7 - Adding PDF metadata')
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								        # PDF information
-												Go through PDF pages only once during generation

											
										
										
											2020-04-21 23:47:55 +03:00
+								        if self.metadata.title:
 								            pdf.info['Title'] = pydyf.String(self.metadata.title)
 								        if self.metadata.authors:
 								            pdf.info['Author'] = pydyf.String(
 								                ', '.join(self.metadata.authors))
 								        if self.metadata.description:
 								            pdf.info['Subject'] = pydyf.String(self.metadata.description)
 								        if self.metadata.keywords:
 								            pdf.info['Keywords'] = pydyf.String(
 								                ', '.join(self.metadata.keywords))
 								        if self.metadata.generator:
 								            pdf.info['Creator'] = pydyf.String(self.metadata.generator)
 								        pdf.info['Producer'] = pydyf.String(f'WeasyPrint {__version__}')
 								        if self.metadata.created:
 								            pdf.info['CreationDate'] = pydyf.String(
 								                _w3c_date_to_pdf(self.metadata.created, 'created'))
 								        if self.metadata.modified:
 								            pdf.info['ModDate'] = pydyf.String(
 								                _w3c_date_to_pdf(self.metadata.modified, 'modified'))
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								        # Embedded files
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
+								        attachments = self.metadata.attachments + (attachments or [])
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								        pdf_attachments = []
 								        for attachment in attachments:
 								            pdf_attachment = _write_pdf_attachment(
 								                pdf, attachment, self.url_fetcher)
 								            if pdf_attachment is not None:
 								                pdf_attachments.append(pdf_attachment)
 								        if pdf_attachments:
 								            content = pydyf.Dictionary({'Names': pydyf.Array()})
 								            for i, pdf_attachment in enumerate(pdf_attachments):
 								                content['Names'].append(pydyf.String(f'attachment{i}'))
 								                content['Names'].append(pdf_attachment.reference)
 								            pdf.add_object(content)
 								            pdf.catalog['Names']['EmbeddedFiles'] = content.reference
 								        # Embedded fonts
 								        resources['Font'] = pydyf.Dictionary()
 								        for font_hash, font in stream._fonts.items():
-												Change some font attributes names

											
										
										
											2020-05-08 02:50:41 +03:00
+								            compressed = zlib.compressobj().compress(font.file_content)
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								            font_extra = pydyf.Dictionary({
 								                'Filter': '/FlateDecode',
-												Change some font attributes names

											
										
										
											2020-05-08 02:50:41 +03:00
+								                'Length1': len(font.file_content),
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								            })
 								            font_stream = pydyf.Stream([compressed], font_extra)
 								            pdf.add_object(font_stream)
-												Change some font attributes names

											
										
										
											2020-05-08 02:50:41 +03:00
+								            font.compute_glyphs_values()
-												Use font codepoints instead of Unicode codepoints

											
										
										
											2020-05-08 18:14:45 +03:00
+								            subfont_dictionary = pydyf.Dictionary({
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								                'Type': '/Font',
-												Use font codepoints instead of Unicode codepoints

											
										
										
											2020-05-08 18:14:45 +03:00
+								                'Subtype': '/CIDFontType2',
-												Fix font name

											
										
										
											2020-05-08 02:55:50 +03:00
+								                'BaseFont': font.name,
-												Use font codepoints instead of Unicode codepoints

											
										
										
											2020-05-08 18:14:45 +03:00
+								                'CIDSystemInfo': pydyf.Dictionary({
 								                    'Registry': pydyf.String('Adobe'),
 								                    'Ordering': pydyf.String('Identity'),
 								                    'Supplement': 0,
 								                }),
 								                'W': pydyf.Array([font.first_char, pydyf.Array(font.widths)]),
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								                'FontDescriptor': pydyf.Dictionary({
-												Fix font name

											
										
										
											2020-05-08 02:55:50 +03:00
+								                    'FontName': font.name,
 								                    'FontFamily': pydyf.String(font.family),
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								                    'Flags': 32,
-												Change some font attributes names

											
										
										
											2020-05-08 02:50:41 +03:00
+								                    'FontBBox': pydyf.Array(font.bbox),
-												Use computed values

											
										
										
											2020-05-08 01:40:01 +03:00
+								                    'ItalicAngle': font.italic_angle,
 								                    'Ascent': font.ascent,
 								                    'Descent': font.descent,
 								                    'CapHeight': font.cap_height,
 								                    'StemV': font.stemv,
 								                    'StemH': font.stemh,
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								                    'FontFile': font_stream.reference,
-												Use font codepoints instead of Unicode codepoints

											
										
										
											2020-05-08 18:14:45 +03:00
+								                }),
 								            })
 								            pdf.add_object(subfont_dictionary)
 								            font_dictionary = pydyf.Dictionary({
 								                'Type': '/Font',
 								                'Subtype': '/Type0',
 								                'BaseFont': font.name,
 								                'Encoding': '/Identity-H',
 								                'DescendantFonts': pydyf.Array([subfont_dictionary.reference]),
-												Include font streams

											
										
										
											2020-05-06 08:42:45 +03:00
+								            })
 								            pdf.add_object(font_dictionary)
 								            resources['Font'][str(font_hash)] = font_dictionary.reference
-												Add inline attachments

											
										
										
											2020-04-21 23:30:38 +03:00
-												Call finisher

											
										
										
											2020-04-19 11:01:27 +03:00
+								        if finisher:
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								            finisher(self, pdf)
-												Call finisher

											
										
										
											2020-04-19 11:01:27 +03:00
-												First steps to replace cairo with pydyf

											
										
										
											2020-04-18 01:19:35 +03:00
+								        file_obj = io.BytesIO()
-												Merge document and pdf modules

											
										
										
											2020-04-19 17:49:37 +03:00
+								        pdf.write(file_obj)
-												The Document is back! (But different.)

											
										
										
											2012-10-02 20:59:02 +04:00
 								        if target is None:
 								            return file_obj.getvalue()
 								        else:
 								            file_obj.seek(0)
 								            if hasattr(target, 'write'):
 								                shutil.copyfileobj(file_obj, target)
 								            else:
 								                with open(target, 'wb') as fd:
 								                    shutil.copyfileobj(file_obj, fd)
-												More WIP: cairocffi and pango cffi.

											
										
										
											2012-12-29 04:00:30 +04:00
+								    def write_png(self, target=None, resolution=96):
 								        """Paint the pages vertically to a single PNG image.
 								        There is no decoration around pages other than those specified in CSS
 								        with ``@page`` rules. The final image is as wide as the widest page.
 								        Each page is below the previous one, centered horizontally.
 								        :param target:
 								            A filename, file-like object, or :obj:`None`.
 								        :type resolution: float
 								        :param resolution:
 								            The output resolution in PNG pixels per CSS inch. At 96 dpi
 								            (the default), PNG pixels match the CSS ``px`` unit.
 								        :returns:
-												Clean a lot of things in API documentation

											
										
										
											2019-02-22 13:34:46 +03:00
+								            A ``(png_bytes, png_width, png_height)`` tuple. ``png_bytes`` is a
 								            byte string if ``target`` is :obj:`None`, otherwise :obj:`None`
 								            (the image is written to ``target``).  ``png_width`` and
 								            ``png_height`` are the size of the final image, in PNG pixels.
-												More WIP: cairocffi and pango cffi.

											
										
										
											2012-12-29 04:00:30 +04:00
 								        """
-												Remove some references to Cairo

											
										
										
											2020-04-19 09:09:22 +03:00
+								        # TODO: write this
 								        raise NotImplementedError