WeasyPrint/weasyprint/pdf.py

"""
    weasyprint.pdf
    --------------

    Post-process the PDF files created by cairo and add metadata such as
    hyperlinks and bookmarks.

    :copyright: Copyright 2011-2014 Simon Sapin and contributors, see AUTHORS.
    :license: BSD, see LICENSE for details.

"""

import hashlib
import io
import mimetypes
import zlib
from urllib.parse import unquote

import cairocffi as cairo
from pdfrw import PdfArray, PdfDict, PdfName, PdfReader, PdfString, PdfWriter

from . import VERSION_STRING, Attachment
from .html import W3C_DATE_RE
from .logger import LOGGER
from .urls import URLFetchingError, iri_to_uri, urlsplit


def convert_bookmarks_units(bookmarks, matrices):
    """Recursively map bookmark targets through the per-page matrices.

    :param bookmarks:
        A list of ``(label, (page, x, y), children)`` tuples, where
        ``children`` is a list of the same shape.
    :param matrices:
        A sequence indexed by page number; each item provides a
        ``transform_point(x, y)`` method returning the new ``(x, y)``.
    :return:
        A new list with the same structure and transformed coordinates.

    """
    def convert(bookmark):
        label, (page_number, source_x, source_y), children = bookmark
        # Transform this bookmark's own target before descending.
        new_x, new_y = matrices[page_number].transform_point(
            source_x, source_y)
        return (
            label, (page_number, new_x, new_y),
            convert_bookmarks_units(children, matrices))

    return [convert(bookmark) for bookmark in bookmarks]


def prepare_metadata(document, scale, pages):
    """Turn document metadata into structures close to the PDF objects.

    Coordinates are converted from WeasyPrint units (CSS pixels measured
    from the top-left corner) to PDF units (points measured from the
    bottom-left corner).

    :param document:
        Object providing ``pages``, ``resolve_links()`` and
        ``make_bookmark_tree()``.
    :param scale:
        PDF points per CSS pixels.
        Defaults to 0.75, but is affected by `zoom` in
        :meth:`weasyprint.document.Document.write_pdf`.
    :param pages:
        The PDF page objects, indexed by page number; their ``indirect``
        attribute is used as the target of internal links.
    :return:
        A ``(bookmarks, links)`` tuple, ``links`` holding one list of
        ``(link_type, target, (x0, y0, x1, y1))`` tuples per page.

    """
    # X and width unchanged;  Y’ = page_height - Y;  height’ = -height
    matrices = []
    for document_page in document.pages:
        matrices.append(cairo.Matrix(
            xx=scale, yy=-scale, y0=document_page.height * scale))

    def convert_link(link, page_matrix):
        link_type, target, rectangle = link
        if link_type == 'internal':
            # Internal targets point at another page, so they use that
            # page's matrix rather than the one of the linking page.
            target_page, target_x, target_y = target
            page_reference = pages[target_page].indirect
            target = (page_reference,) + matrices[
                target_page].transform_point(target_x, target_y)
        left, top, width, height = rectangle
        x0, y0 = page_matrix.transform_point(left, top)
        delta_x, delta_y = page_matrix.transform_distance(width, height)
        # x, y, w, h => x0, y0, x1, y1
        return link_type, target, (x0, y0, x0 + delta_x, y0 + delta_y)

    links = [
        [convert_link(link, matrix) for link in page_links]
        for page_links, matrix in zip(document.resolve_links(), matrices)]

    bookmark_tree = document.make_bookmark_tree()
    return convert_bookmarks_units(bookmark_tree, matrices), links


def _create_compressed_file_object(source):
    """
    Create a file like object as ``/EmbeddedFile`` compressing it with deflate.

    :param source:
        a binary file-like object; it is read in 4096-byte chunks until
        ``read`` returns ``b''``.
    :return:
        the object representing the compressed file stream object. Its
        ``/Params`` entry holds the MD5 checksum and the size of the
        uncompressed data.
    """
    md5 = hashlib.md5()
    compress = zlib.compressobj()

    pdf_file_object = PdfDict(
        Type=PdfName('EmbeddedFile'), Filter=PdfName('FlateDecode'))

    # Gather the compressed chunks and join them once at the end: appending
    # to ``pdf_file_object.stream`` for every chunk would copy the whole
    # accumulated string each time, which is quadratic for large files.
    compressed_chunks = []
    size = 0
    for data in iter(lambda: source.read(4096), b''):
        size += len(data)
        md5.update(data)
        compressed_chunks.append(compress.compress(data))
    compressed_chunks.append(compress.flush(zlib.Z_FINISH))

    # pdfrw needs Latin-1-decoded unicode strings in object.stream
    pdf_file_object.stream = b''.join(compressed_chunks).decode('latin-1')
    pdf_file_object.Params = PdfDict(
        CheckSum=PdfString('<{}>'.format(md5.hexdigest())), Size=size)
    return pdf_file_object


def _get_filename_from_result(url, result):
    """
    Derives a filename from a fetched resource. This is either the filename
    returned by the URL fetcher, the last URL path component or a synthetic
    name if the URL has no path
    """

    filename = None

    # A given filename will always take precedence
    if result:
        filename = result.get('filename')
        if filename:
            return filename

    # The URL path likely contains a filename, which is a good second guess
    if url:
        split = urlsplit(url)
        if split.scheme != 'data':
            filename = split.path.split("/")[-1]
            if filename == '':
                filename = None

    if filename is None:
        # The URL lacks a path altogether. Use a synthetic name.

        # Using guess_extension is a great idea, but sadly the extension is
        # probably random, depending on the alignment of the stars, which car
        # you're driving and which software has been installed on your machine.
        #
        # Unfortuneatly this isn't even imdepodent on one machine, because the
        # extension can depend on PYTHONHASHSEED if mimetypes has multiple
        # extensions to offer
        extension = None
        if result:
            mime_type = result.get('mime_type')
            if mime_type == 'text/plain':
                # text/plain has a phletora of extensions - all garbage
                extension = '.txt'
            else:
                extension = mimetypes.guess_extension(mime_type) or '.bin'
        else:
            extension = '.bin'

        filename = 'attachment' + extension
    else:
        filename = unquote(filename)

    return filename


def _create_pdf_attachment(attachment, url_fetcher):
    """
    Create an attachment to the PDF stream

    :param attachment:
        an ``Attachment``, a ``(url, description)`` tuple, or any other
        value, which is handed to ``Attachment`` as its ``guess`` argument.
    :param url_fetcher:
        the URL fetcher given to any ``Attachment`` created here.
    :return:
        the object representing the ``/Filespec`` object or :obj:`None` if the
        attachment couldn't be read.
    """
    try:
        if isinstance(attachment, tuple):
            # Attachments from document links like <link> or <a> can only
            # be URLs, and they arrive here as tuples.
            link_url, link_description = attachment
            attachment = Attachment(
                url=link_url, url_fetcher=url_fetcher,
                description=link_description)
        elif not isinstance(attachment, Attachment):
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (_, source, url, _):
            # Raw bytes are wrapped so that they can be read in chunks.
            readable = (
                io.BytesIO(source) if isinstance(source, bytes) else source)
            pdf_file_object = _create_compressed_file_object(readable)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    filename = _get_filename_from_result(url, None)
    description = attachment.description or ''
    return PdfDict(
        Type=PdfName('Filespec'), F=PdfString.encode(''),
        UF=PdfString.encode(filename),
        EF=PdfDict(F=pdf_file_object),
        Desc=PdfString.encode(description))


def create_bookmarks(bookmarks, pages, parent=None):
    """Build the PDF outline item objects for a bookmark tree.

    :param bookmarks:
        A list of ``(label, (page, x, y), children)`` tuples, where
        ``children`` is a list of the same shape.
    :param pages:
        The PDF page objects, indexed by page number.
    :param parent:
        The outline item that the created items belong to, or :obj:`None`
        for items without a ``/Parent`` entry.
    :return:
        An ``(objects, count)`` tuple: the items created for this level, and
        the number of items at this level plus the counts returned for
        their children.

    """
    total = len(bookmarks)
    items = []
    for label, target, children in bookmarks:
        go_to_action = PdfDict(
            Type=PdfName('Action'), S=PdfName('GoTo'),
            D=PdfArray((
                pages[target[0]].indirect,
                PdfName('XYZ'), target[1], target[2], 0)))
        item = PdfDict(Title=PdfString.encode(label), A=go_to_action)
        item.indirect = True

        child_items, child_total = create_bookmarks(
            children, pages, parent=item)
        item.Count = 1 + child_total

        # Chain this item to the previous sibling, in both directions.
        if items:
            previous_item = items[-1]
            item.Prev = previous_item
            previous_item.Next = item
        if child_items:
            item.First = child_items[0]
            item.Last = child_items[-1]
        if parent is not None:
            item.Parent = parent

        total += child_total
        items.append(item)
    return items, total


def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """Rewrite a seekable file-like object to add PDF metadata.

    The PDF is read back from ``fileobj`` with pdfrw, extended with
    bookmarks, embedded attachments, link annotations, document information
    and ``/TrimBox`` and ``/BleedBox`` page boxes, then written over the
    previous content of ``fileobj``.

    :param document:
        Object providing ``pages`` (each with ``height`` and ``bleed``),
        ``resolve_links()`` and ``make_bookmark_tree()``.
    :param fileobj:
        A seekable file-like object holding the PDF; it is modified in place.
    :param scale:
        PDF points per CSS pixels, used for every unit conversion here.
    :param metadata:
        Object providing the ``attachments``, ``title``, ``description``,
        ``generator``, ``authors``, ``keywords``, ``created`` and
        ``modified`` attributes.
    :param attachments:
        Extra attachments appended to ``metadata.attachments``, or a false
        value for none.
    :param url_fetcher:
        The URL fetcher used to load attachments.

    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        bookmark_objects, count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(
            Type=PdfName('Outlines'), Count=count,
            First=bookmark_objects[0], Last=bookmark_objects[-1])

    attachments = metadata.attachments + (attachments or [])
    if attachments:
        embedded_files = []
        for attachment in attachments:
            attachment_object = _create_pdf_attachment(attachment, url_fetcher)
            # Attachments that could not be loaded are skipped.
            if attachment_object is not None:
                embedded_files.append(PdfString.encode('attachment'))
                embedded_files.append(attachment_object)
        if embedded_files:
            trailer.Root.Names = PdfDict(
                EmbeddedFiles=PdfDict(Names=PdfArray(embedded_files)))

    # A single link can be split in multiple regions. We don't want to embed
    # a file multiple times of course, so keep a reference to every embedded
    # URL and reuse the object number.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, rectangle in page_links:
            if link_type == 'attachment' and target not in annot_files:
                # TODO: use the title attribute as description
                annot_files[target] = _create_pdf_attachment(
                    (target, None), url_fetcher)

    # TODO: splitting a link into multiple independent rectangular annotations
    # works well for pure links, but rather mediocre for other annotations and
    # fails completely for transformed (CSS) or complex link shapes (area).
    # It would be better to use /AP for all links and coalesce link shapes that
    # originate from the same HTML link. This would give a feeling similar to
    # what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            # Attachment links whose file could not be loaded fall back to
            # plain URI links.
            if link_type != 'attachment' or annot_files[target] is None:
                annotation = PdfDict(
                    Type=PdfName('Annot'), Subtype=PdfName('Link'),
                    Rect=PdfArray(rectangle), Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (
                        target[0], PdfName('XYZ'), target[1], target[2], 0)
                    annotation.A = PdfDict(
                        Type=PdfName('Action'), S=PdfName('GoTo'),
                        D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(
                        Type=PdfName('Action'), S=PdfName('URI'),
                        URI=PdfString.encode(iri_to_uri(target)))
            else:
                assert annot_files[target] is not None
                ap = PdfDict(N=PdfDict(
                    BBox=PdfArray(rectangle), Subtype=PdfName('Form'),
                    Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(
                    Type=PdfName('Annot'), Subtype=PdfName('FileAttachment'),
                    T=PdfString.encode(''), Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)), FS=annot_files[target],
                    AP=ap)
            annotations.append(annotation)

        if annotations:
            page.Annots = annotations

    trailer.Info.Producer = VERSION_STRING
    for attr, key in (('title', 'Title'), ('description', 'Subject'),
                      ('generator', 'Creator')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, value)
    for attr, key in (('authors', 'Author'), ('keywords', 'Keywords')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, ', '.join(value))
    for attr, key in (('created', 'CreationDate'), ('modified', 'ModDate')):
        value = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if value is not None:
            setattr(trailer.Info, key, value)

    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = (float(value) for value in page.MediaBox)
        # Convert pixels into points. This must use ``scale`` rather than a
        # fixed 0.75 so that the boxes stay correct when zoom is not 1.
        bleed = {
            key: value * scale for key, value in document_page.bleed.items()}

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray((trim_left, trim_top, trim_right, trim_bottom))

        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox)
        # and CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])
        page.BleedBox = PdfArray(
            (bleed_left, bleed_top, bleed_right, bleed_bottom))

    # Overwrite the file from the start and drop any leftover bytes.
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()


def w3c_date_to_pdf(string, attr_name):
    """Convert a date string matched by ``W3C_DATE_RE`` to a PDF date.

    The result has the form YYYYMMDDHHmmSSOHH'mm', truncated after the last
    date part present in the input.

    :param string:
        The date string to convert, or :obj:`None`.
    :param attr_name:
        Name of the metadata attribute, only used in the warning message.
    :return:
        The PDF date string, or :obj:`None` if ``string`` is :obj:`None` or
        does not match ``W3C_DATE_RE`` (a warning is logged in that case).

    """
    if string is None:
        return None

    match = W3C_DATE_RE.match(string)
    if match is None:
        LOGGER.warning('Invalid %s date: %r', attr_name, string)
        return None

    groups = match.groupdict()
    optional_parts = ('month', 'day', 'hour', 'minute', 'second')
    pdf_date = groups['year'] + ''.join(
        groups[part] or '' for part in optional_parts)

    # Without a time of day there are no seconds or timezone to add.
    if not groups['hour']:
        return pdf_date

    assert groups['minute']
    if not groups['second']:
        pdf_date += '00'

    tz_hour = groups['tz_hour']
    if not tz_hour:
        return pdf_date + 'Z'  # UTC

    assert tz_hour.startswith(('+', '-'))
    assert groups['tz_minute']
    return pdf_date + "%s'%s'" % (tz_hour, groups['tz_minute'])
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								"""
-												Start a more robust PDF parser.

											
										
										
											2012-05-19 16:50:38 +04:00
+								    weasyprint.pdf
 								    --------------
 								    Post-process the PDF files created by cairo and add metadata such as
 								    hyperlinks and bookmarks.
-.

											
										
										
											2014-01-10 18:27:02 +04:00
+								    :copyright: Copyright 2011-2014 Simon Sapin and contributors, see AUTHORS.
-												Start a more robust PDF parser.

											
										
										
											2012-05-19 16:50:38 +04:00
+								    :license: BSD, see LICENSE for details.
 								"""
-												Refactored `write_compressed_file_object` to a top-level function

											
										
										
											2014-04-07 21:55:23 +04:00
+								import hashlib
-												Added support for PDF attachments (v2)

											
										
										
											2014-04-04 14:32:21 +04:00
+								import io
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
+								import mimetypes
-												Refactored `write_compressed_file_object` to a top-level function

											
										
										
											2014-04-07 21:55:23 +04:00
+								import zlib
-												Drop Python 2 support

											
										
										
											2018-01-14 03:48:17 +03:00
+								from urllib.parse import unquote
-												Start a more robust PDF parser.

											
										
										
											2012-05-19 16:50:38 +04:00
-												More WIP: cairocffi and pango cffi.

											
										
										
											2012-12-29 04:00:30 +04:00
+								import cairocffi as cairo
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								from pdfrw import PdfArray, PdfDict, PdfName, PdfReader, PdfString, PdfWriter
-												Refactor the metadata stuff into the pdf module.

											
										
										
											2012-05-20 19:04:22 +04:00
-												Added an `Attachment` class for attachments provided through the API instead of the URL/description tuples

											
										
										
											2014-04-26 01:35:43 +04:00
+								from . import VERSION_STRING, Attachment
-												Add PDF metadata parsed from HTML. Fix #77.

<title> → /Title
<meta name=author> → /Author
<meta name=description> → /Subject
<meta name=keywords> → /Keywords
<meta name=generator> → /Creator
<meta name=dcterms.created> → /CreationDate
<meta name=dcterms.modified> → /ModDate
"WeasyPrint vX.Y" → /Producer

											
										
										
											2013-07-14 15:08:02 +04:00
+								from .html import W3C_DATE_RE
 								from .logger import LOGGER
-												Use isort in tests

											
										
										
											2017-03-25 02:33:36 +03:00
+								from .urls import URLFetchingError, iri_to_uri, urlsplit
-												Refactor the metadata stuff into the pdf module.

											
										
										
											2012-05-20 19:04:22 +04:00
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								def convert_bookmarks_units(bookmarks, matrices):
 								    converted_bookmarks = []
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
+								    for label, target, children in bookmarks:
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								        page, x, y = target
 								        x, y = matrices[target[0]].transform_point(x, y)
 								        children = convert_bookmarks_units(children, matrices)
 								        converted_bookmarks.append((label, (page, x, y), children))
 								    return converted_bookmarks
-												Refactor the metadata stuff into the pdf module.

											
										
										
											2012-05-20 19:04:22 +04:00
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								def prepare_metadata(document, scale, pages):
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
+								    """Change metadata into data structures closer to the PDF objects.
-												Refactor the metadata stuff into the pdf module.

											
										
										
											2012-05-20 19:04:22 +04:00
-												Fix PDF metadata with zoom != 1

											
										
										
											2012-11-23 01:49:30 +04:00
+								    In particular, convert from WeasyPrint units (CSS pixels from
 								    the top-left corner) to PDF units (points from the bottom-left corner.)
-												More WIP: cairocffi and pango cffi.

											
										
										
											2012-12-29 04:00:30 +04:00
-												Fix PDF metadata with zoom != 1

											
										
										
											2012-11-23 01:49:30 +04:00
+								    :param scale:
 								        PDF points per CSS pixels.
-												More WIP: cairocffi and pango cffi.

											
										
										
											2012-12-29 04:00:30 +04:00
+								        Defaults to 0.75, but is affected by `zoom` in
-												Fix PDF metadata with zoom != 1

											
										
										
											2012-11-23 01:49:30 +04:00
+								        :meth:`weasyprint.document.Document.write_pdf`.
-												Refactor the metadata stuff into the pdf module.

											
										
										
											2012-05-20 19:04:22 +04:00
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
+								    """
 								    # X and width unchanged;  Y’ = page_height - Y;  height’ = -height
-												Fix PDF metadata with zoom != 1

											
										
										
											2012-11-23 01:49:30 +04:00
+								    matrices = [cairo.Matrix(xx=scale, yy=-scale, y0=page.height * scale)
-												Move resolution to *.write_png only.

											
										
										
											2012-10-05 22:12:05 +04:00
+								                for page in document.pages]
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
+								    links = []
-												Drop Python 2 support

											
										
										
											2018-01-14 03:48:17 +03:00
+								    for page_links, matrix in zip(document.resolve_links(), matrices):
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
+								        new_page_links = []
 								        for link_type, target, rectangle in page_links:
 								            if link_type == 'internal':
 								                target_page, target_x, target_y = target
-												Flake8.

											
										
										
											2013-04-11 14:08:53 +04:00
+								                target = (
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								                    (pages[target_page].indirect,) +
-												Move resolution to *.write_png only.

											
										
										
											2012-10-05 22:12:05 +04:00
+								                    matrices[target_page].transform_point(target_x, target_y))
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
+								            rect_x, rect_y, width, height = rectangle
-												Move resolution to *.write_png only.

											
										
										
											2012-10-05 22:12:05 +04:00
+								            rect_x, rect_y = matrix.transform_point(rect_x, rect_y)
 								            width, height = matrix.transform_distance(width, height)
 								            # x, y, w, h => x0, y0, x1, y1
 								            rectangle = rect_x, rect_y, rect_x + width, rect_y + height
-												Add metadata in the low-level API.

											
										
										
											2012-10-04 13:35:25 +04:00
+								            new_page_links.append((link_type, target, rectangle))
 								        links.append(new_page_links)
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								    bookmarks = convert_bookmarks_units(
 								        document.make_bookmark_tree(), matrices)
-												Refactor the metadata stuff into the pdf module.

											
										
										
											2012-05-20 19:04:22 +04:00
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								    return bookmarks, links
-												Refactor the metadata stuff into the pdf module.

											
										
										
											2012-05-20 19:04:22 +04:00
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								def _create_compressed_file_object(source):
-												Refactored `write_compressed_file_object` to a top-level function

											
										
										
											2014-04-07 21:55:23 +04:00
+								    """
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								    Create a file like object as ``/EmbeddedFile`` compressing it with deflate.
-												Refactored `write_compressed_file_object` to a top-level function

											
										
										
											2014-04-07 21:55:23 +04:00
 								    :return:
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								        the object representing the compressed file stream object
-												Refactored `write_compressed_file_object` to a top-level function

											
										
										
											2014-04-07 21:55:23 +04:00
+								    """
 								    md5 = hashlib.md5()
 								    compress = zlib.compressobj()
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								    pdf_file_object = PdfDict(
 								        Type=PdfName('EmbeddedFile'), Filter=PdfName('FlateDecode'))
-												Use Latin-1-decoded strings instead of bytestrings in pdfrw streams

Fix #558.

											
										
										
											2018-01-28 18:21:48 +03:00
 								    # pdfrw needs Latin-1-decoded unicode strings in object.stream
 								    pdf_file_object.stream = ''
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								    size = 0
 								    for data in iter(lambda: source.read(4096), b''):
 								        size += len(data)
-												Refactored `write_compressed_file_object` to a top-level function

											
										
										
											2014-04-07 21:55:23 +04:00
+								        md5.update(data)
-												Use Latin-1-decoded strings instead of bytestrings in pdfrw streams

Fix #558.

											
										
										
											2018-01-28 18:21:48 +03:00
+								        pdf_file_object.stream += compress.compress(data).decode('latin-1')
 								    pdf_file_object.stream += compress.flush(zlib.Z_FINISH).decode('latin-1')
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
+								    pdf_file_object.Params = PdfDict(
 								        CheckSum=PdfString('<{}>'.format(md5.hexdigest())), Size=size)
 								    return pdf_file_object
-												Refactored `write_compressed_file_object` to a top-level function

											
										
										
											2014-04-07 21:55:23 +04:00
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
+								def _get_filename_from_result(url, result):
-												Added support for PDF file annotations.

											
										
										
											2014-04-04 20:46:00 +04:00
+								    """
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
+								    Derives a filename from a fetched resource. This is either the filename
 								    returned by the URL fetcher, the last URL path component or a synthetic
 								    name if the URL has no path
-												Added support for PDF file annotations.

											
										
										
											2014-04-04 20:46:00 +04:00
+								    """
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
-												Added an `Attachment` class for attachments provided through the API instead of the URL/description tuples

											
										
										
											2014-04-26 01:35:43 +04:00
+								    filename = None
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
+								    # A given filename will always take precedence
-												Added an `Attachment` class for attachments provided through the API instead of the URL/description tuples

											
										
										
											2014-04-26 01:35:43 +04:00
+								    if result:
 								        filename = result.get('filename')
 								        if filename:
 								            return filename
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
 								    # The URL path likely contains a filename, which is a good second guess
-												Added an `Attachment` class for attachments provided through the API instead of the URL/description tuples

											
										
										
											2014-04-26 01:35:43 +04:00
+								    if url:
 								        split = urlsplit(url)
 								        if split.scheme != 'data':
 								            filename = split.path.split("/")[-1]
 								            if filename == '':
 								                filename = None
 								    if filename is None:
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
+								        # The URL lacks a path altogether. Use a synthetic name.
 								        # Using guess_extension is a great idea, but sadly the extension is
 								        # probably random, depending on the alignment of the stars, which car
 								        # you're driving and which software has been installed on your machine.
 								        #
 								        # Unfortuneatly this isn't even imdepodent on one machine, because the
 								        # extension can depend on PYTHONHASHSEED if mimetypes has multiple
 								        # extensions to offer
 								        extension = None
-												Added an `Attachment` class for attachments provided through the API instead of the URL/description tuples

											
										
										
											2014-04-26 01:35:43 +04:00
+								        if result:
 								            mime_type = result.get('mime_type')
 								            if mime_type == 'text/plain':
 								                # text/plain has a phletora of extensions - all garbage
 								                extension = '.txt'
 								            else:
 								                extension = mimetypes.guess_extension(mime_type) or '.bin'
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
+								        else:
-												Added an `Attachment` class for attachments provided through the API instead of the URL/description tuples

											
										
										
											2014-04-26 01:35:43 +04:00
+								            extension = '.bin'
-												Change filename logic for PDF attachments

This patch honors the filename key of a fetched resource, which can be set by
the `Content-Disposition` or `Content-Type` headers and uses
`mimetypes.guess_extension` for resources that lack any indication of a
filename.

											
										
										
											2014-04-18 18:40:47 +04:00
 								        filename = 'attachment' + extension
-												Added support for PDF file annotations.

											
										
										
											2014-04-04 20:46:00 +04:00
+								    else:
-												Drop Python 2 support

											
										
										
											2018-01-14 03:48:17 +03:00
+								        filename = unquote(filename)
-												Added support for PDF file annotations.

											
										
										
											2014-04-04 20:46:00 +04:00
 								    return filename
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
def _create_pdf_attachment(attachment, url_fetcher):
    """Build the ``/Filespec`` dictionary that embeds one attachment.

    :param attachment:
        Either an :class:`Attachment`, a ``(url, description)`` tuple (the
        form used for attachments coming from document links such as
        ``<link>`` or ``<a>``), or any other value, which is handed to
        :class:`Attachment` as its ``guess`` argument.
    :param url_fetcher:
        The URL fetcher given to every :class:`Attachment` created here.
    :return:
        the object representing the ``/Filespec`` object or :obj:`None` if
        the attachment couldn't be read.

    """
    try:
        if isinstance(attachment, tuple):
            # Links from the document can only be URLs, passed as tuples.
            link_url, link_description = attachment
            attachment = Attachment(
                url=link_url, url_fetcher=url_fetcher,
                description=link_description)
        elif not isinstance(attachment, Attachment):
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (_source_type, payload, resolved_url, _):
            # The compressor is always given a file-like object.
            stream = (
                io.BytesIO(payload) if isinstance(payload, bytes)
                else payload)
            embedded_file = _create_compressed_file_object(stream)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    file_name = _get_filename_from_result(resolved_url, None)
    description = attachment.description or ''
    return PdfDict(
        Type=PdfName('Filespec'), F=PdfString.encode(''),
        UF=PdfString.encode(file_name),
        EF=PdfDict(F=embedded_file),
        Desc=PdfString.encode(description))
-												Added support for PDF file annotations.

											
										
										
											2014-04-04 20:46:00 +04:00
-												Use pdfrw to edit PDF metadata and test them

											
										
										
											2017-09-04 18:27:56 +03:00
def create_bookmarks(bookmarks, pages, parent=None):
    """Recursively turn a bookmark tree into linked PDF outline items.

    :param bookmarks:
        A sequence of ``(label, target, children)`` items, where ``target``
        is indexed as ``(page index, x, y)`` and ``children`` is a nested
        sequence of the same shape.
    :param pages:
        The PDF page objects, indexed by ``target[0]``.
    :param parent:
        The outline item owning these bookmarks, or :obj:`None` at the top
        level.
    :return:
        A ``(objects, count)`` tuple: the outline items created for this
        level, and the number of bookmarks at this level plus all of their
        descendants.

    """
    total = len(bookmarks)
    siblings = []
    previous = None
    for label, target, children in bookmarks:
        go_to = PdfDict(
            Type=PdfName('Action'), S=PdfName('GoTo'),
            D=PdfArray((
                pages[target[0]].indirect,
                PdfName('XYZ'), target[1], target[2], 0)))
        item = PdfDict(Title=PdfString.encode(label), A=go_to)
        item.indirect = True

        # Children are built first: their count feeds this item's /Count.
        child_items, descendant_count = create_bookmarks(
            children, pages, parent=item)
        item.Count = 1 + descendant_count

        # Chain this item to the sibling created just before it.
        if previous is not None:
            item.Prev = previous
            previous.Next = item
        if child_items:
            item.First = child_items[0]
            item.Last = child_items[-1]
        if parent is not None:
            item.Parent = parent

        total += descendant_count
        siblings.append(item)
        previous = item
    return siblings, total
-												Added support for PDF file annotations.

											
										
										
											2014-04-04 20:46:00 +04:00
-												Refactored `attachments` attribute from the `HTML` class to an argument for `write_pdf`

											
										
										
											2014-04-22 22:40:46 +04:00
def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """Rewrite the PDF held in ``fileobj`` in place, adding metadata.

    The existing PDF is parsed, then outlines (bookmarks), embedded files,
    link and file-attachment annotations, ``/Info`` entries and the
    ``/TrimBox`` and ``/BleedBox`` of every page are added before the
    result is written back from the start of ``fileobj``.

    :param document:
        The rendered document. Its ``pages`` provide the bleed areas, and
        it is handed to :func:`prepare_metadata` for bookmarks and links.
    :param fileobj:
        A seekable file-like object containing the PDF; it is read, then
        overwritten and truncated.
    :param scale:
        PDF points per CSS pixel.
    :param metadata:
        Object whose ``title``, ``description``, ``generator``,
        ``authors``, ``keywords``, ``created``, ``modified`` and
        ``attachments`` attributes are read.
    :param attachments:
        Extra attachments appended to ``metadata.attachments``, or a falsy
        value for none.
    :param url_fetcher:
        Passed to :func:`_create_pdf_attachment` to load attachments.

    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    bookmarks, links = prepare_metadata(document, scale, pages)

    # Document outline.
    if bookmarks:
        bookmark_objects, count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(
            Type=PdfName('Outlines'), Count=count,
            First=bookmark_objects[0], Last=bookmark_objects[-1])

    # Document-level attachments. Attachments that could not be loaded are
    # skipped; _create_pdf_attachment returns None for them.
    attachments = metadata.attachments + (attachments or [])
    if attachments:
        embedded_files = []
        for attachment in attachments:
            attachment_object = _create_pdf_attachment(attachment, url_fetcher)
            if attachment_object is not None:
                embedded_files.append(PdfString.encode('attachment'))
                embedded_files.append(attachment_object)
        if embedded_files:
            trailer.Root.Names = PdfDict(
                EmbeddedFiles=PdfDict(Names=PdfArray(embedded_files)))

    # A single link can be split in multiple regions. We don't want to embed
    # a file multiple times of course, so keep a reference to every embedded
    # URL and reuse the object.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, rectangle in page_links:
            if link_type == 'attachment' and target not in annot_files:
                # TODO: use the title attribute as description
                annot_files[target] = _create_pdf_attachment(
                    (target, None), url_fetcher)

    # TODO: splitting a link into multiple independent rectangular annotations
    # works well for pure links, but rather mediocre for other annotations and
    # fails completely for transformed (CSS) or complex link shapes (area).
    # It would be better to use /AP for all links and coalesce link shapes that
    # originate from the same HTML link. This would give a feeling similar to
    # what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            # Only attachment links have an entry in annot_files.
            file_spec = (
                annot_files[target] if link_type == 'attachment' else None)
            if file_spec is None:
                # Internal link, external link, or an attachment link whose
                # file could not be loaded: the latter falls back to a plain
                # URI link.
                annotation = PdfDict(
                    Type=PdfName('Annot'), Subtype=PdfName('Link'),
                    Rect=PdfArray(rectangle), Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (
                        target[0], PdfName('XYZ'), target[1], target[2], 0)
                    annotation.A = PdfDict(
                        Type=PdfName('Action'), S=PdfName('GoTo'),
                        D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(
                        Type=PdfName('Action'), S=PdfName('URI'),
                        URI=PdfString.encode(iri_to_uri(target)))
            else:
                ap = PdfDict(N=PdfDict(
                    BBox=PdfArray(rectangle), Subtype=PdfName('Form'),
                    Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(
                    Type=PdfName('Annot'), Subtype=PdfName('FileAttachment'),
                    T=PdfString.encode(''), Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)), FS=file_spec,
                    AP=ap)
            annotations.append(annotation)

        if annotations:
            page.Annots = annotations

    # Document information dictionary.
    trailer.Info.Producer = VERSION_STRING
    for attr, key in (('title', 'Title'), ('description', 'Subject'),
                      ('generator', 'Creator')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, value)
    for attr, key in (('authors', 'Author'), ('keywords', 'Keywords')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, ', '.join(value))
    for attr, key in (('created', 'CreationDate'), ('modified', 'ModDate')):
        value = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if value is not None:
            setattr(trailer.Info, key, value)

    # Page boxes.
    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = (float(value) for value in page.MediaBox)
        # Convert pixels into points. ``scale`` is used rather than a fixed
        # 0.75 so the boxes stay correct when the document is zoomed.
        bleed = {
            key: value * scale for key, value in document_page.bleed.items()}
        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray((trim_left, trim_top, trim_right, trim_bottom))
        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox)
        # and CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])
        page.BleedBox = PdfArray(
            (bleed_left, bleed_top, bleed_right, bleed_bottom))

    # Overwrite the original content; truncate in case the result is shorter.
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()
-												Add PDF metadata parsed from HTML. Fix #77.

<title> → /Title
<meta name=author> → /Author
<meta name=description> → /Subject
<meta name=keywords> → /Keywords
<meta name=generator> → /Creator
<meta name=dcterms.created> → /CreationDate
<meta name=dcterms.modified> → /ModDate
"WeasyPrint vX.Y" → /Producer

											
										
										
											2013-07-14 15:08:02 +04:00
 								def w3c_date_to_pdf(string, attr_name):
 								    """
 								    YYYYMMDDHHmmSSOHH'mm'
 								    """
 								    if string is None:
 								        return None
 								    match = W3C_DATE_RE.match(string)
 								    if match is None:
-												Be careful logging.warn is deprecated

											
										
										
											2013-08-19 16:38:09 +04:00
+								        LOGGER.warning('Invalid %s date: %r', attr_name, string)
-												Add PDF metadata parsed from HTML. Fix #77.

<title> → /Title
<meta name=author> → /Author
<meta name=description> → /Subject
<meta name=keywords> → /Keywords
<meta name=generator> → /Creator
<meta name=dcterms.created> → /CreationDate
<meta name=dcterms.modified> → /ModDate
"WeasyPrint vX.Y" → /Producer

											
										
										
											2013-07-14 15:08:02 +04:00
+								        return None
 								    groups = match.groupdict()
-												Fix many PEP8 errors and warnings

- Put line breaks after operators (that's OK)
- Don't assign lambdas, use functions (well, that's really explicit in
  PEP8, it's really verbose but why not)
- Put imports at the beginning of the file (only special cases for us)

											
										
										
											2016-01-15 14:47:03 +03:00
+								    pdf_date = (groups['year'] +
 								                (groups['month'] or '') +
 								                (groups['day'] or '') +
 								                (groups['hour'] or '') +
 								                (groups['minute'] or '') +
 								                (groups['second'] or ''))
-												Add PDF metadata parsed from HTML. Fix #77.

<title> → /Title
<meta name=author> → /Author
<meta name=description> → /Subject
<meta name=keywords> → /Keywords
<meta name=generator> → /Creator
<meta name=dcterms.created> → /CreationDate
<meta name=dcterms.modified> → /ModDate
"WeasyPrint vX.Y" → /Producer

											
										
										
											2013-07-14 15:08:02 +04:00
+								    if groups['hour']:
 								        assert groups['minute']
 								        if not groups['second']:
 								            pdf_date += '00'
 								        if groups['tz_hour']:
 								            assert groups['tz_hour'].startswith(('+', '-'))
 								            assert groups['tz_minute']
 								            pdf_date += "%s'%s'" % (groups['tz_hour'], groups['tz_minute'])
 								        else:
 								            pdf_date += 'Z'  # UTC
 								    return pdf_date