Only add AF values for PDF/A-2+

2024-09-11 20:47:56 +03:00 · 2024-02-02 16:42:10 +01:00 · 2024-02-02 16:42:10 +01:00 · cec6e5ec3e
commit cec6e5ec3e
parent 05a34e7d9b
4 changed files with 74 additions and 57 deletions
--- a/weasyprint/__init__.py
+++ b/weasyprint/__init__.py
@@ -6,8 +6,8 @@ importing sub-modules.
 """

 import contextlib
-import datetime
-import os
+from datetime import datetime
+from os.path import getctime, getmtime
 from pathlib import Path
 from urllib.parse import urljoin

@@ -310,46 +310,46 @@ class Attachment:

    An instance is created in the same way as :class:`HTML`, except that the
    HTML specific arguments (``encoding`` and ``media_type``) are not
-    supported. An optional description can be provided with the ``description``
-    argument.
+    supported.

-    :param description:
+    :param str description:
        A description of the attachment to be included in the PDF document.
        May be :obj:`None`.
+    :type created: :obj:`datetime.datetime`
+    :param created:
+        Creation date and time. Default is current date and time.
+    :type modified: :obj:`datetime.datetime`
+    :param modified:
+        Modification date and time. Default is current date and time.
+    :param str relationship:
+        A string that represents the relationship between the attachment and
+        the PDF it is embedded in. Default is 'Unspecified', other common
+        values are defined in ISO-32000-2:2020, 7.11.3.

    """
    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, base_url=None, url_fetcher=default_url_fetcher,
                 description=None, created=None, modified=None,
-                 af_relationship="Source"):
+                 relationship='Unspecified'):
        self.source = _select_source(
            guess, filename, url, file_obj, string, base_url=base_url,
            url_fetcher=url_fetcher)
        self.description = description
-        self.af_relationship = af_relationship
+        self.relationship = relationship
+        self.md5 = None

-        def epoch_to_pdf(epoch):
-            dt_object = datetime.datetime.fromtimestamp(epoch)
-            return datetime_to_pdf(dt_object)
-
-        def datetime_to_pdf(dt_object):
-            return dt_object.strftime("D:%Y%m%d%H%M%SZ")
-
-        if created:
-            self.created = created
-        else:
+        if created is None:
            if filename:
-                self.created = epoch_to_pdf(os.path.getctime(filename))
+                created = datetime.fromtimestamp(getctime(filename))
            else:
-                self.created = datetime_to_pdf(datetime.datetime.now())
-
-        if modified:
-            self.modified = modified
-        else:
+                created = datetime.now()
+        if modified is None:
            if filename:
-                self.modified = epoch_to_pdf(os.path.getmtime(filename))
+                modified = datetime.fromtimestamp(getmtime(filename))
            else:
-                self.modified = datetime_to_pdf(datetime.datetime.now())
+                modified = datetime.now()
+        self.created = created.strftime('D:%Y%m%d%H%M%SZ')
+        self.modified = modified.strftime('D:%Y%m%d%H%M%SZ')


@contextlib.contextmanager
--- a/weasyprint/pdf/__init__.py
+++ b/weasyprint/pdf/__init__.py
@@ -254,12 +254,9 @@ def generate_pdf(document, target, zoom, **options):
            pdf_attachments.append(pdf_attachment)
    if pdf_attachments:
        content = pydyf.Dictionary({'Names': pydyf.Array()})
-        if 'AF' not in pdf.catalog:
-            pdf.catalog['AF'] = pydyf.Array()
        for i, pdf_attachment in enumerate(pdf_attachments):
            content['Names'].append(pydyf.String(f'attachment{i}'))
            content['Names'].append(pdf_attachment.reference)
-            pdf.catalog['AF'].append(pdf_attachment.reference)
        pdf.add_object(content)
        if 'Names' not in pdf.catalog:
            pdf.catalog['Names'] = pydyf.Dictionary()
--- a/weasyprint/pdf/anchors.py
+++ b/weasyprint/pdf/anchors.py
@@ -293,58 +293,54 @@ def write_pdf_attachment(pdf, attachment, url_fetcher, compress):
    elif not isinstance(attachment, Attachment):
        attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

+    uncompressed_length = 0
+    stream = b''
+    md5 = hashlib.md5()
    try:
-        with attachment.source as (source_type, source, url, _):
+        with attachment.source as (_, source, url, _):
            if isinstance(source, bytes):
                source = io.BytesIO(source)
-            uncompressed_length = 0
-            stream = b''
-            md5 = hashlib.md5()
            for data in iter(lambda: source.read(4096), b''):
                uncompressed_length += len(data)
                md5.update(data)
                stream += data
-            mime_type, _ = mimetypes.guess_type(url, strict=False)
-            if not mime_type:
-                mime_type = 'application/octet-stream'
-            mime_type = '/' + mime_type.replace('/', '#2f')
-            file_extra = pydyf.Dictionary({
-                'Type': '/EmbeddedFile',
-                "Subtype": mime_type,
-                'Params': pydyf.Dictionary({
-                    'CheckSum': f'<{md5.hexdigest()}>',
-                    'Size': uncompressed_length,
-                    'CreationDate': attachment.created,
-                    'ModDate': attachment.modified,
-                })
-            })
-            file_stream = pydyf.Stream([stream], file_extra, compress=compress)
-            pdf.add_object(file_stream)
-
    except URLFetchingError as exception:
        LOGGER.error('Failed to load attachment: %s', exception)
        return
+    attachment.md5 = md5.hexdigest()

    # TODO: Use the result object from a URL fetch operation to provide more
-    # details on the possible filename.
+    # details on the possible filename and MIME type.
    if url and urlsplit(url).path:
        filename = basename(unquote(urlsplit(url).path))
    else:
        filename = 'attachment.bin'
+    mime_type = mimetypes.guess_type(filename, strict=False)[0]
+    if not mime_type:
+        mime_type = 'application/octet-stream'

-    attachment = pydyf.Dictionary({
+    file_extra = pydyf.Dictionary({
+        'Type': '/EmbeddedFile',
+        'Subtype': f'/{mime_type.replace("/", "#2f")}',
+        'Params': pydyf.Dictionary({
+            'CheckSum': f'<{attachment.md5}>',
+            'Size': uncompressed_length,
+            'CreationDate': attachment.created,
+            'ModDate': attachment.modified,
+        })
+    })
+    file_stream = pydyf.Stream([stream], file_extra, compress=compress)
+    pdf.add_object(file_stream)
+
+    pdf_attachment = pydyf.Dictionary({
        'Type': '/Filespec',
        'F': pydyf.String(),
        'UF': pydyf.String(filename),
-        "AFRelationship": "/"+attachment.af_relationship,
        'EF': pydyf.Dictionary({'F': file_stream.reference}),
        'Desc': pydyf.String(attachment.description or ''),
    })
-    pdf.add_object(attachment)
-    if "AF" not in pdf.catalog:
-        pdf.catalog["AF"] = pydyf.Array()
-    pdf.catalog["AF"].append(attachment.reference)
-    return attachment
+    pdf.add_object(pdf_attachment)
+    return pdf_attachment


 def resolve_links(pages):
--- a/weasyprint/pdf/pdfa.py
+++ b/weasyprint/pdf/pdfa.py
@@ -34,6 +34,30 @@ def pdfa(pdf, metadata, document, page_streams, compress, version):
        }),
    ])

+    # Add AF for attachments
+    if version >= 2:
+        attachments = []
+        if 'Names' in pdf.catalog and 'EmbeddedFiles' in pdf.catalog['Names']:
+            reference = int(pdf.catalog['Names']['EmbeddedFiles'].split()[0])
+            names = pdf.objects[reference]
+            for name in names[1::2]:
+                attachments.append(name)
+        relationships = {
+            attachment.md5: attachment.relationship
+            for attachment in document.metadata.attachments
+            if attachment.md5}
+        for pdf_object in pdf.objects:
+            if isinstance(pdf_object, dict):
+                if pdf_object.get('Type') == '/Filespec':
+                    checksum = pdf_object['CheckSum']
+                    relationship = relationships.get(checksum, 'Unspecified')
+                    pdf_object['AFRelationship'] = f'/{relationship}'
+                    attachments.append(pdf_object.reference)
+        if attachments:
+            if 'AF' not in pdf.catalog:
+                pdf.catalog['AF'] = pydyf.Array()
+            pdf.catalog['AF'].extend(attachments)
+
    # Print annotations
    for pdf_object in pdf.objects:
        if isinstance(pdf_object, dict) and pdf_object.get('Type') == '/Annot':